From c61fa290250fd32e076b46aa91f22836da6cc4df Mon Sep 17 00:00:00 2001 From: circleci-runai Date: Mon, 3 Feb 2025 15:55:55 +0000 Subject: [PATCH] Deployed 4636a74 to v2.20 with MkDocs 1.6.1 and mike 2.1.3 --- v2.20/404.html | 2 +- .../quickstart-inference/index.html | 6 +- .../quickstart-overview/index.html | 6 +- .../Walkthroughs/quickstart-vscode/index.html | 6 +- .../walkthrough-build-ports/index.html | 6 +- .../Walkthroughs/walkthrough-build/index.html | 6 +- .../walkthrough-fractions/index.html | 6 +- .../walkthrough-overquota/index.html | 6 +- .../walkthrough-queue-fairness/index.html | 6 +- .../bare-metal-to-docker-images/index.html | 6 +- .../convert-to-unattended/index.html | 6 +- .../best-practices/env-variables/index.html | 6 +- .../researcher-notifications/index.html | 6 +- .../save-dl-checkpoints/index.html | 6 +- .../secrets-as-env-var-in-cli/index.html | 6 +- .../cli-reference/Introduction/index.html | 6 +- .../new-cli/cli-examples/index.html | 6 +- .../index.html | 6 +- .../cli-reference/new-cli/overview/index.html | 6 +- .../cli-reference/new-cli/runai/index.html | 6 +- .../new-cli/runai_attach/index.html | 6 +- .../new-cli/runai_cluster/index.html | 6 +- .../new-cli/runai_cluster_list/index.html | 6 +- .../new-cli/runai_cluster_set/index.html | 6 +- .../new-cli/runai_config/index.html | 6 +- .../new-cli/runai_config_generate/index.html | 6 +- .../new-cli/runai_config_project/index.html | 6 +- .../new-cli/runai_config_set/index.html | 6 +- .../new-cli/runai_describe/index.html | 6 +- .../new-cli/runai_describe_job/index.html | 6 +- .../new-cli/runai_describe_node/index.html | 6 +- .../new-cli/runai_exec/index.html | 6 +- .../new-cli/runai_kubeconfig/index.html | 6 +- .../new-cli/runai_kubeconfig_set/index.html | 6 +- .../new-cli/runai_list/index.html | 6 +- .../new-cli/runai_list_clusters/index.html | 6 +- .../new-cli/runai_list_jobs/index.html | 6 +- .../new-cli/runai_list_nodes/index.html | 6 +- .../new-cli/runai_list_projects/index.html | 6 +- .../new-cli/runai_login/index.html | 6 +- .../runai_login_application/index.html | 6 +- .../new-cli/runai_login_sso/index.html | 6 +- .../new-cli/runai_login_user/index.html | 6 +- .../new-cli/runai_logout/index.html | 6 +- .../new-cli/runai_logs/index.html | 6 +- .../new-cli/runai_mpi/index.html | 6 +- .../new-cli/runai_mpi_attach/index.html | 6 +- .../new-cli/runai_mpi_bash/index.html | 6 +- .../new-cli/runai_mpi_delete/index.html | 6 +- .../new-cli/runai_mpi_describe/index.html | 6 +- .../new-cli/runai_mpi_exec/index.html | 6 +- .../new-cli/runai_mpi_list/index.html | 6 +- .../new-cli/runai_mpi_logs/index.html | 6 +- .../new-cli/runai_mpi_port-forward/index.html | 6 +- .../new-cli/runai_mpi_resume/index.html | 6 +- .../new-cli/runai_mpi_submit/index.html | 6 +- .../new-cli/runai_mpi_suspend/index.html | 6 +- .../new-cli/runai_node/index.html | 6 +- .../new-cli/runai_node_list/index.html | 6 +- .../new-cli/runai_nodepool/index.html | 6 +- .../new-cli/runai_nodepool_list/index.html | 6 +- .../new-cli/runai_port-forward/index.html | 6 +- .../new-cli/runai_project/index.html | 6 +- .../new-cli/runai_project_list/index.html | 6 +- .../new-cli/runai_project_set/index.html | 6 +- .../new-cli/runai_pytorch/index.html | 6 +- .../new-cli/runai_pytorch_attach/index.html | 6 +- .../new-cli/runai_pytorch_bash/index.html | 6 +- .../new-cli/runai_pytorch_delete/index.html | 6 +- .../new-cli/runai_pytorch_describe/index.html | 6 +- .../new-cli/runai_pytorch_exec/index.html | 6 +- .../new-cli/runai_pytorch_list/index.html | 6 +- 
.../new-cli/runai_pytorch_logs/index.html | 6 +- .../runai_pytorch_port-forward/index.html | 6 +- .../new-cli/runai_pytorch_resume/index.html | 6 +- .../new-cli/runai_pytorch_submit/index.html | 6 +- .../new-cli/runai_pytorch_suspend/index.html | 6 +- .../new-cli/runai_report/index.html | 6 +- .../new-cli/runai_report_metrics/index.html | 6 +- .../runai_report_metrics_clear/index.html | 6 +- .../runai_report_metrics_config/index.html | 6 +- .../runai_report_metrics_output/index.html | 6 +- .../new-cli/runai_submit/index.html | 6 +- .../new-cli/runai_tensorflow/index.html | 6 +- .../runai_tensorflow_attach/index.html | 6 +- .../new-cli/runai_tensorflow_bash/index.html | 6 +- .../runai_tensorflow_delete/index.html | 6 +- .../runai_tensorflow_describe/index.html | 6 +- .../new-cli/runai_tensorflow_exec/index.html | 6 +- .../new-cli/runai_tensorflow_list/index.html | 6 +- .../new-cli/runai_tensorflow_logs/index.html | 6 +- .../runai_tensorflow_port-forward/index.html | 6 +- .../runai_tensorflow_resume/index.html | 6 +- .../runai_tensorflow_submit/index.html | 6 +- .../runai_tensorflow_suspend/index.html | 6 +- .../new-cli/runai_training/index.html | 6 +- .../new-cli/runai_training_attach/index.html | 6 +- .../new-cli/runai_training_bash/index.html | 6 +- .../new-cli/runai_training_delete/index.html | 6 +- .../runai_training_describe/index.html | 6 +- .../new-cli/runai_training_exec/index.html | 6 +- .../new-cli/runai_training_list/index.html | 6 +- .../new-cli/runai_training_logs/index.html | 6 +- .../new-cli/runai_training_mpi/index.html | 6 +- .../runai_training_mpi_attach/index.html | 6 +- .../runai_training_mpi_bash/index.html | 6 +- .../runai_training_mpi_delete/index.html | 6 +- .../runai_training_mpi_describe/index.html | 6 +- .../runai_training_mpi_exec/index.html | 6 +- .../runai_training_mpi_list/index.html | 6 +- .../runai_training_mpi_logs/index.html | 6 +- .../index.html | 6 +- .../runai_training_mpi_resume/index.html | 6 +- .../runai_training_mpi_submit/index.html | 6 +- .../runai_training_mpi_suspend/index.html | 6 +- .../runai_training_port-forward/index.html | 6 +- .../new-cli/runai_training_pytorch/index.html | 6 +- .../runai_training_pytorch_attach/index.html | 6 +- .../runai_training_pytorch_bash/index.html | 6 +- .../runai_training_pytorch_delete/index.html | 6 +- .../index.html | 6 +- .../runai_training_pytorch_exec/index.html | 6 +- .../runai_training_pytorch_list/index.html | 6 +- .../runai_training_pytorch_logs/index.html | 6 +- .../index.html | 6 +- .../runai_training_pytorch_resume/index.html | 6 +- .../runai_training_pytorch_submit/index.html | 6 +- .../runai_training_pytorch_suspend/index.html | 6 +- .../new-cli/runai_training_resume/index.html | 6 +- .../runai_training_standard/index.html | 6 +- .../runai_training_standard_attach/index.html | 6 +- .../runai_training_standard_bash/index.html | 6 +- .../runai_training_standard_delete/index.html | 6 +- .../index.html | 6 +- .../runai_training_standard_exec/index.html | 6 +- .../runai_training_standard_list/index.html | 6 +- .../runai_training_standard_logs/index.html | 6 +- .../index.html | 6 +- .../runai_training_standard_resume/index.html | 6 +- .../runai_training_standard_submit/index.html | 6 +- .../index.html | 6 +- .../new-cli/runai_training_submit/index.html | 6 +- .../new-cli/runai_training_suspend/index.html | 6 +- .../runai_training_tensorflow/index.html | 6 +- .../index.html | 6 +- .../runai_training_tensorflow_bash/index.html | 6 +- .../index.html | 6 +- .../index.html | 6 +- 
.../runai_training_tensorflow_exec/index.html | 6 +- .../runai_training_tensorflow_list/index.html | 6 +- .../runai_training_tensorflow_logs/index.html | 6 +- .../index.html | 6 +- .../index.html | 6 +- .../index.html | 6 +- .../index.html | 6 +- .../new-cli/runai_training_xgboost/index.html | 6 +- .../runai_training_xgboost_attach/index.html | 6 +- .../runai_training_xgboost_bash/index.html | 6 +- .../runai_training_xgboost_delete/index.html | 6 +- .../index.html | 6 +- .../runai_training_xgboost_exec/index.html | 6 +- .../runai_training_xgboost_list/index.html | 6 +- .../runai_training_xgboost_logs/index.html | 6 +- .../index.html | 6 +- .../runai_training_xgboost_resume/index.html | 6 +- .../runai_training_xgboost_submit/index.html | 6 +- .../runai_training_xgboost_suspend/index.html | 6 +- .../new-cli/runai_upgrade/index.html | 6 +- .../new-cli/runai_version/index.html | 6 +- .../new-cli/runai_whoami/index.html | 6 +- .../new-cli/runai_workload/index.html | 6 +- .../new-cli/runai_workload_attach/index.html | 6 +- .../runai_workload_describe/index.html | 6 +- .../new-cli/runai_workload_exec/index.html | 6 +- .../new-cli/runai_workload_list/index.html | 6 +- .../new-cli/runai_workload_logs/index.html | 6 +- .../runai_workload_port-forward/index.html | 6 +- .../new-cli/runai_workspace/index.html | 6 +- .../new-cli/runai_workspace_attach/index.html | 6 +- .../new-cli/runai_workspace_bash/index.html | 6 +- .../new-cli/runai_workspace_delete/index.html | 6 +- .../runai_workspace_describe/index.html | 6 +- .../new-cli/runai_workspace_exec/index.html | 6 +- .../new-cli/runai_workspace_list/index.html | 6 +- .../new-cli/runai_workspace_logs/index.html | 6 +- .../runai_workspace_port-forward/index.html | 6 +- .../new-cli/runai_workspace_resume/index.html | 6 +- .../new-cli/runai_workspace_submit/index.html | 6 +- .../runai_workspace_suspend/index.html | 6 +- .../new-cli/runai_xgboost/index.html | 6 +- .../new-cli/runai_xgboost_attach/index.html | 6 +- .../new-cli/runai_xgboost_bash/index.html | 6 +- .../new-cli/runai_xgboost_delete/index.html | 6 +- .../new-cli/runai_xgboost_describe/index.html | 6 +- .../new-cli/runai_xgboost_exec/index.html | 6 +- .../new-cli/runai_xgboost_list/index.html | 6 +- .../new-cli/runai_xgboost_logs/index.html | 6 +- .../runai_xgboost_port-forward/index.html | 6 +- .../new-cli/runai_xgboost_resume/index.html | 6 +- .../new-cli/runai_xgboost_submit/index.html | 6 +- .../new-cli/runai_xgboost_suspend/index.html | 6 +- .../cli-reference/runai-attach/index.html | 6 +- .../cli-reference/runai-bash/index.html | 6 +- .../cli-reference/runai-config/index.html | 6 +- .../cli-reference/runai-delete/index.html | 6 +- .../cli-reference/runai-describe/index.html | 6 +- .../cli-reference/runai-exec/index.html | 6 +- .../cli-reference/runai-list/index.html | 6 +- .../cli-reference/runai-login/index.html | 6 +- .../cli-reference/runai-logout/index.html | 6 +- .../cli-reference/runai-logs/index.html | 6 +- .../runai-port-forwarding/index.html | 6 +- .../cli-reference/runai-resume/index.html | 6 +- .../runai-submit-dist-TF/index.html | 6 +- .../runai-submit-dist-mpi/index.html | 6 +- .../runai-submit-dist-pytorch/index.html | 6 +- .../runai-submit-dist-xgboost/index.html | 6 +- .../cli-reference/runai-submit/index.html | 6 +- .../cli-reference/runai-suspend/index.html | 6 +- .../cli-reference/runai-top-node/index.html | 6 +- .../cli-reference/runai-update/index.html | 6 +- .../cli-reference/runai-version/index.html | 6 +- .../cli-reference/runai-whoami/index.html | 6 +- 
.../Researcher/overview-researcher/index.html | 6 +- .../GPU-time-slicing-scheduler/index.html | 6 +- .../allocation-of-cpu-and-memory/index.html | 6 +- .../dynamic-gpu-fractions/index.html | 6 +- .../scheduling/fractions/index.html | 6 +- .../scheduling/gpu-memory-swap/index.html | 6 +- .../node-level-scheduler/index.html | 6 +- .../schedule-to-aws-groups/index.html | 6 +- .../scheduling/the-runai-scheduler/index.html | 6 +- v2.20/Researcher/tools/dev-jupyter/index.html | 6 +- v2.20/Researcher/tools/dev-pycharm/index.html | 6 +- .../tools/dev-tensorboard/index.html | 6 +- v2.20/Researcher/tools/dev-vscode/index.html | 6 +- .../tools/dev-x11forward-pycharm/index.html | 6 +- v2.20/Researcher/use-cases/index.html | 6 +- .../workloads/assets/compute/index.html | 6 +- .../workloads/assets/credentials/index.html | 6 +- .../workloads/assets/data-volumes/index.html | 6 +- .../workloads/assets/datasources/index.html | 6 +- .../workloads/assets/environments/index.html | 6 +- .../workloads/assets/overview/index.html | 6 +- .../workloads/assets/templates/index.html | 6 +- .../inference/custom-inference/index.html | 6 +- .../hugging-face-inference/index.html | 6 +- .../inference/inference-overview/index.html | 6 +- .../inference/nim-inference/index.html | 6 +- .../introduction-to-workloads/index.html | 6 +- .../overviews/managing-workloads/index.html | 6 +- .../overviews/workload-types/index.html | 6 +- .../distributed-training/index.html | 6 +- .../index.html | 6 +- .../quickstart-standard-training/index.html | 6 +- .../standard-training/trainings-v2/index.html | 6 +- .../workspaces/quickstart-jupyter/index.html | 6 +- .../workspaces/workspace-v2/index.html | 6 +- .../authentication/accessrules/index.html | 6 +- .../authentication/applications/index.html | 6 +- .../authentication-overview/index.html | 6 +- .../non-root-containers/index.html | 6 +- .../researcher-authentication/index.html | 6 +- v2.20/admin/authentication/roles/index.html | 6 +- .../sso/openidconnect/index.html | 6 +- .../authentication/sso/openshift/index.html | 6 +- .../admin/authentication/sso/saml/index.html | 6 +- v2.20/admin/authentication/users/index.html | 6 +- v2.20/admin/config/access-roles/index.html | 6 +- v2.20/admin/config/admin-messages/index.html | 6 +- .../config/advanced-cluster-config/index.html | 6 +- .../index.html | 6 +- .../admin/config/cli-admin-install/index.html | 6 +- .../admin/config/cluster-wide-pvc/index.html | 6 +- v2.20/admin/config/clusters/index.html | 6 +- .../create-k8s-assets-in-advance/index.html | 6 +- .../admin/config/default-scheduler/index.html | 6 +- v2.20/admin/config/dr/index.html | 6 +- v2.20/admin/config/ha/index.html | 6 +- v2.20/admin/config/large-clusters/index.html | 6 +- .../config/limit-to-node-group/index.html | 6 +- .../index.html | 6 +- v2.20/admin/config/node-roles/index.html | 6 +- v2.20/admin/config/notifications/index.html | 6 +- v2.20/admin/config/org-cert/index.html | 6 +- v2.20/admin/config/overview/index.html | 6 +- v2.20/admin/config/secure-cluster/index.html | 6 +- v2.20/admin/config/shared-storage/index.html | 6 +- .../workload-ownership-protection/index.html | 6 +- .../maintenance/alert-monitoring/index.html | 6 +- v2.20/admin/maintenance/audit-log/index.html | 6 +- .../maintenance/node-downtime/index.html | 6 +- v2.20/admin/maintenance/overview/index.html | 6 +- v2.20/admin/overview-administrator/index.html | 6 +- .../researcher-setup/cli-install/index.html | 6 +- .../docker-to-runai/index.html | 6 +- .../new-cli-install/index.html | 6 +- 
.../researcher-setup-intro/index.html | 6 +- .../cluster-setup/cluster-delete/index.html | 6 +- .../cluster-setup/cluster-install/index.html | 6 +- .../cluster-prerequisites/index.html | 6 +- .../cluster-setup-intro/index.html | 6 +- .../cluster-setup/cluster-upgrade/index.html | 6 +- .../customize-cluster-install/index.html | 6 +- .../cluster-setup/dgx-bundle/index.html | 6 +- .../cluster-setup/network-req/index.html | 6 +- .../project-management/index.html | 6 +- .../runai-setup/installation-types/index.html | 6 +- .../k8s/additional-clusters/index.html | 6 +- .../self-hosted/k8s/backend/index.html | 6 +- .../self-hosted/k8s/cluster/index.html | 6 +- .../self-hosted/k8s/next-steps/index.html | 6 +- .../self-hosted/k8s/preparations/index.html | 6 +- .../self-hosted/k8s/prerequisites/index.html | 6 +- .../k8s/project-management/index.html | 6 +- .../self-hosted/k8s/uninstall/index.html | 6 +- .../self-hosted/k8s/upgrade/index.html | 6 +- .../ocp/additional-clusters/index.html | 6 +- .../self-hosted/ocp/backend/index.html | 6 +- .../self-hosted/ocp/cluster/index.html | 6 +- .../self-hosted/ocp/next-steps/index.html | 6 +- .../self-hosted/ocp/preparations/index.html | 6 +- .../self-hosted/ocp/prerequisites/index.html | 6 +- .../ocp/project-management/index.html | 6 +- .../self-hosted/ocp/uninstall/index.html | 6 +- .../self-hosted/ocp/upgrade/index.html | 6 +- .../self-hosted/overview/index.html | 6 +- .../troubleshooting/diagnostics/index.html | 6 +- .../logs-collection/index.html | 6 +- .../troubleshooting/index.html | 6 +- ...5090c770.min.js => bundle.f13b1293.min.js} | 4 +- ....min.js.map => bundle.f13b1293.min.js.map} | 6 +- .../assets/stylesheets/main.a40c8224.min.css | 1 - .../stylesheets/main.a40c8224.min.css.map | 1 - .../assets/stylesheets/main.d7758b05.min.css | 1 + .../stylesheets/main.d7758b05.min.css.map | 1 + .../admin-rest-api/overview/index.html | 6 +- .../cluster-api/other-resources/index.html | 6 +- .../cluster-api/reference/index.html | 6 +- .../cluster-api/submit-rest/index.html | 6 +- .../cluster-api/submit-yaml/index.html | 6 +- .../workload-overview-dev/index.html | 6 +- .../developer/metrics/metrics-api/index.html | 6 +- v2.20/developer/metrics/metrics/index.html | 6 +- v2.20/developer/overview-developer/index.html | 6 +- v2.20/developer/rest-auth/index.html | 6 +- v2.20/developer/user-applications/index.html | 6 +- v2.20/home/changelog/hotfixes-2-13/index.html | 6 +- v2.20/home/changelog/hotfixes-2-15/index.html | 6 +- v2.20/home/changelog/hotfixes-2-16/index.html | 6 +- v2.20/home/changelog/hotfixes-2-17/index.html | 6 +- v2.20/home/changelog/hotfixes-2-18/index.html | 6 +- v2.20/home/changelog/hotfixes-2-19/index.html | 6 +- v2.20/home/changelog/hotfixes-2-20/index.html | 6 +- v2.20/home/components/index.html | 6 +- v2.20/home/data-privacy-details/index.html | 6 +- v2.20/home/overview/index.html | 6 +- v2.20/home/saas-updates/index.html | 8 +- v2.20/home/whats-new-2-13/index.html | 6 +- v2.20/home/whats-new-2-15/index.html | 6 +- v2.20/home/whats-new-2-16/index.html | 6 +- v2.20/home/whats-new-2-17/index.html | 6 +- v2.20/home/whats-new-2-18/index.html | 6 +- v2.20/home/whats-new-2-19/index.html | 6 +- v2.20/home/whats-new-2-20/index.html | 6 +- .../aiinitiatives/org/departments/index.html | 6 +- .../aiinitiatives/org/projects/index.html | 6 +- .../org/scheduling-rules/index.html | 6 +- .../aiinitiatives/overview/index.html | 6 +- .../configuring-mig-profiles/index.html | 6 +- .../resources/node-pools/index.html | 6 +- .../aiinitiatives/resources/nodes/index.html 
| 6 +- .../authentication/accessrules/index.html | 6 +- .../authentication/applications/index.html | 6 +- .../authentication/roles/index.html | 6 +- .../authentication/users/index.html | 6 +- .../integration-overview/index.html | 6 +- .../integrations/karpenter/index.html | 6 +- v2.20/platform-admin/overview/index.html | 6 +- .../performance/dashboard-analysis/index.html | 6 +- .../performance/reports/index.html | 6 +- .../workloads/assets/compute/index.html | 6 +- .../workloads/assets/credentials/index.html | 6 +- .../workloads/assets/data-volumes/index.html | 6 +- .../workloads/assets/datasources/index.html | 6 +- .../workloads/assets/environments/index.html | 6 +- .../workloads/assets/overview/index.html | 6 +- .../workloads/assets/templates/index.html | 6 +- .../introduction-to-workloads/index.html | 6 +- .../overviews/managing-workloads/index.html | 6 +- .../overviews/workload-types/index.html | 6 +- .../policies/old-policies/index.html | 6 +- .../workloads/policies/overview/index.html | 6 +- .../policies/policy-examples/index.html | 6 +- .../policies/policy-reference/index.html | 6 +- .../policies/workspaces-policy/index.html | 6 +- v2.20/search/search_index.json | 2 +- v2.20/sitemap.xml | 778 +++++++++--------- v2.20/sitemap.xml.gz | Bin 2851 -> 2851 bytes versions.json | 4 +- 400 files changed, 1568 insertions(+), 1568 deletions(-) rename v2.20/assets/javascripts/{bundle.5090c770.min.js => bundle.f13b1293.min.js} (85%) rename v2.20/assets/javascripts/{bundle.5090c770.min.js.map => bundle.f13b1293.min.js.map} (88%) delete mode 100644 v2.20/assets/stylesheets/main.a40c8224.min.css delete mode 100644 v2.20/assets/stylesheets/main.a40c8224.min.css.map create mode 100644 v2.20/assets/stylesheets/main.d7758b05.min.css create mode 100644 v2.20/assets/stylesheets/main.d7758b05.min.css.map diff --git a/v2.20/404.html b/v2.20/404.html index ae67097aab..b69b93fb79 100644 --- a/v2.20/404.html +++ b/v2.20/404.html @@ -1 +1 @@ - 404 - Not found

Document Not Found

The link you have used does not point to an existing document. Please search for the content at the top right, use the navigation bar to find what you are looking for, or submit a ticket here.

\ No newline at end of file diff --git a/v2.20/Researcher/Walkthroughs/quickstart-inference/index.html b/v2.20/Researcher/Walkthroughs/quickstart-inference/index.html index 1143006fb9..1a841d3821 100644 --- a/v2.20/Researcher/Walkthroughs/quickstart-inference/index.html +++ b/v2.20/Researcher/Walkthroughs/quickstart-inference/index.html @@ -1,4 +1,4 @@ - Inference - Run:ai Documentation Library
\ No newline at end of file diff --git a/v2.20/Researcher/Walkthroughs/quickstart-overview/index.html b/v2.20/Researcher/Walkthroughs/quickstart-overview/index.html index 3ced88cbb1..47af5470db 100644 --- a/v2.20/Researcher/Walkthroughs/quickstart-overview/index.html +++ b/v2.20/Researcher/Walkthroughs/quickstart-overview/index.html @@ -1,4 +1,4 @@ - Run:ai Quickstart Guides - Run:ai Documentation Library

Run:ai Quickstart Guides

Below is a set of Quickstart documents. The purpose of these documents is to get you acquainted with an aspect of Run:ai in the simplest possible form.

Note

The Quickstart documents are based solely on the command-line interface. The same functionality can be achieved by using the Workloads user interface, which allows for Workload submission and log viewing.

Follow the Quickstart documents below to learn more:

Most quickstarts rely on an image called runai.jfrog.io/demo/quickstart. The image is based on TensorFlow Release 20-08. This TensorFlow image has minimum requirements for the CUDA version and NVIDIA Compute Capability.

If your GPUs do not meet these requirements, use runai.jfrog.io/demo/quickstart:legacy instead.
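
For illustration, a quickstart training workload is typically submitted with the legacy V1 CLI roughly as shown below; the workload names are arbitrary and the exact flags may differ between CLI versions:

runai submit quickstart-train -i runai.jfrog.io/demo/quickstart -g 1

# on GPUs that do not meet the CUDA / Compute Capability requirements
runai submit quickstart-legacy -i runai.jfrog.io/demo/quickstart:legacy -g 1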

\ No newline at end of file diff --git a/v2.20/Researcher/Walkthroughs/quickstart-vscode/index.html b/v2.20/Researcher/Walkthroughs/quickstart-vscode/index.html index 470951f308..02b282a976 100644 --- a/v2.20/Researcher/Walkthroughs/quickstart-vscode/index.html +++ b/v2.20/Researcher/Walkthroughs/quickstart-vscode/index.html @@ -1,4 +1,4 @@ - Visual Studio Code Web - Run:ai Documentation Library
\ No newline at end of file diff --git a/v2.20/Researcher/Walkthroughs/walkthrough-build-ports/index.html b/v2.20/Researcher/Walkthroughs/walkthrough-build-ports/index.html index 2e04d775b1..c633b3c615 100644 --- a/v2.20/Researcher/Walkthroughs/walkthrough-build-ports/index.html +++ b/v2.20/Researcher/Walkthroughs/walkthrough-build-ports/index.html @@ -1,4 +1,4 @@ - Build with Connected Ports - Run:ai Documentation Library
\ No newline at end of file diff --git a/v2.20/Researcher/Walkthroughs/walkthrough-build/index.html b/v2.20/Researcher/Walkthroughs/walkthrough-build/index.html index bfbada1704..61c7fe8de2 100644 --- a/v2.20/Researcher/Walkthroughs/walkthrough-build/index.html +++ b/v2.20/Researcher/Walkthroughs/walkthrough-build/index.html @@ -1,4 +1,4 @@ - Basics - Run:ai Documentation Library
\ No newline at end of file diff --git a/v2.20/Researcher/Walkthroughs/walkthrough-fractions/index.html b/v2.20/Researcher/Walkthroughs/walkthrough-fractions/index.html index 39aa904a76..e32c371c8c 100644 --- a/v2.20/Researcher/Walkthroughs/walkthrough-fractions/index.html +++ b/v2.20/Researcher/Walkthroughs/walkthrough-fractions/index.html @@ -1,4 +1,4 @@ - GPU Fractions - Run:ai Documentation Library
\ No newline at end of file diff --git a/v2.20/Researcher/Walkthroughs/walkthrough-overquota/index.html b/v2.20/Researcher/Walkthroughs/walkthrough-overquota/index.html index d5f163f176..d6c5273151 100644 --- a/v2.20/Researcher/Walkthroughs/walkthrough-overquota/index.html +++ b/v2.20/Researcher/Walkthroughs/walkthrough-overquota/index.html @@ -1,4 +1,4 @@ - Over-Quota, Basic Fairness & Bin-Packing - Run:ai Documentation Library
\ No newline at end of file diff --git a/v2.20/Researcher/Walkthroughs/walkthrough-queue-fairness/index.html b/v2.20/Researcher/Walkthroughs/walkthrough-queue-fairness/index.html index 12b93b69a5..184385ffd1 100644 --- a/v2.20/Researcher/Walkthroughs/walkthrough-queue-fairness/index.html +++ b/v2.20/Researcher/Walkthroughs/walkthrough-queue-fairness/index.html @@ -1,4 +1,4 @@ - Queue Fairness - Run:ai Documentation Library
\ No newline at end of file diff --git a/v2.20/Researcher/best-practices/bare-metal-to-docker-images/index.html b/v2.20/Researcher/best-practices/bare-metal-to-docker-images/index.html index 79301e231f..420d41778d 100644 --- a/v2.20/Researcher/best-practices/bare-metal-to-docker-images/index.html +++ b/v2.20/Researcher/best-practices/bare-metal-to-docker-images/index.html @@ -1,4 +1,4 @@ - Bare-Metal to Docker Images - Run:ai Documentation Library

Best Practice: From Bare Metal to Docker Images

Introduction

Some Researchers do data science on bare metal. The term bare metal refers to connecting to a server and working directly on its operating system and disks.

This is the fastest way to start working, but it introduces problems when the data science organization scales:

  • More Researchers mean that the machine resources need to be efficiently shared
  • Researchers need to collaborate and share data, code, and results

To overcome this, people working on bare metal typically write scripts to gather data, code, and code dependencies. This soon becomes an overwhelming task.

Why Use Docker Images?

Docker images, and containerization in general, provide a level of abstraction which, by and large, frees developers and Researchers from the mundane tasks of setting up an environment. The image carries its own operating system, and thus the 'environment' is, by and large, part of the image.

When a docker image is instantiated, it creates a container. A container is the running manifestation of a docker image.

Moving a Data Science Environment to Docker

A data science environment typically includes:

  • Training data
  • Machine Learning (ML) code and inputs
  • Libraries: Code dependencies that must be installed before the ML code can be run
    Training data

    Training data is usually very large (from several gigabytes to petabytes) and is read-only in nature. Thus, training data is typically left outside of the docker image. Instead, the data is mounted into the container when the image is instantiated. Mounting a volume allows the code within the container to access the data as though it were in a directory on the local file system.

    The best practice is to store the training data on a shared file system. This allows the data to be accessed uniformly on whichever machine the Researcher is currently using, allowing the Researcher to easily migrate between machines.
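
    For illustration only, such a mount might look roughly like the sketch below; the /mnt/shared path and dataset name are hypothetical and stand in for your shared file system layout:

    docker run -it -v /mnt/shared/datasets/my-dataset:/data:ro "the well known image" bash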

    Organizations without a shared file system typically write scripts to copy data from machine to machine.

    Machine Learning Code and Inputs

    As a rule, code needs to be saved and versioned in a code repository.

    There are two alternative practices:

    • The code resides in the image and is periodically pulled from the repository. This practice requires building a new container image each time a change is introduced to the code.
    • When a shared file system exists, the code can reside outside the image on a shared disk and be mounted into the container via a volume.

    Both practices are valid.

    Inputs to machine learning models and artifacts of training sessions, like model checkpoints, are also better stored in and loaded from a shared file system.
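
    As a sketch of the same idea for code and for training artifacts such as checkpoints (again, the host paths are hypothetical and should be adapted to your shared file system):

    docker run -it \
      -v /mnt/shared/code/my-project:/workspace/code \
      -v /mnt/shared/checkpoints/my-project:/workspace/checkpoints \
      "the well known image" bash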

    Code Dependencies

    Any code has dependencies. These libraries must be installed for the code to run. As the code changes, so do its dependencies.

    ML code is typically written in Python, and Python dependencies are typically declared together in a single requirements.txt file which is saved together with the code.

    The best practice is to have your docker startup script (see below) install from this file using pip install -r requirements.txt. This allows the flexibility of adding and removing code dependencies dynamically.
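
    A minimal sketch of such a startup script, assuming the code and its requirements.txt are available inside the container under /workspace/code (a hypothetical path):

    #!/bin/bash
    # startup.sh - install the current dependencies, then launch the ML code
    set -e
    pip install -r /workspace/code/requirements.txt
    python /workspace/code/train.py "$@"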

    ML Lifecycle: Build and Train

    Deep learning workloads can be divided into two generic types:

  • Interactive "build" sessions. With these types of workloads, the data scientist opens an interactive session, via bash, Jupyter Notebook, remote PyCharm, or similar and accesses GPU resources directly. Build workloads are typically meant for debugging and development sessions.
  • Unattended "training" sessions. Training is characterized by a machine learning run that has a start and a finish. With these types of workloads, the data scientist prepares a self-running workload and sends it for execution. During the execution, the data scientist can examine the results. A Training session can take from a few minutes to a couple of days. It can be interrupted in the middle and later restored (though the data scientist should save checkpoints for that purpose). Training workloads typically utilize large percentages of the GPU and at the end of the run automatically frees the resources.
  • Getting your docker ready is also a matter of which type of workload you are currently running.

    Build Workloads

    With "build" you are actually coding and debugging small experiments. You are interactive. In that mode, you can typically take a well known standard image (e.g. https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) and use it directly.

    Start a docker container by running:

    docker run -it .... -v /where/my/code/resides:/code "the well known image" bash

    You get a shell prompt into a container with the directory where your code resides mounted as a volume. You can then install your prerequisites and run your code via ssh.

    You can also access the container remotely from tools such as PyCharm, Jupyter Notebook, and more. In this case, the docker image needs to be customized to install the "server software" (e.g. a Jupyter Notebook service).
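
    For example, assuming the image already includes Jupyter (or has been customized to include it), the service can be started with the container port published to the host; the port number and mount path are only illustrative:

    docker run -it -p 8888:8888 -v /where/my/code/resides:/workspace "the well known image" \
      jupyter notebook --ip=0.0.0.0 --port=8888 --no-browser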

    Training Workloads

    For training workloads, you can use a well-known image (e.g. the TensorFlow image from the link above), but more often than not, you want to create your own docker image. The best practice is to use the well-known image (e.g. TensorFlow from above) as a base image and add your own customizations on top of it. To achieve that, you create a Dockerfile. A Dockerfile is a declarative way to build a docker image and is built in layers. For example:

    1. Base image is nvidia-tensorflow
    2. Install popular software
    3. (Optional) Run a script

    The script can be part of the image or can be provided on the command line when running the docker container. It will typically include additional dependencies to install as well as a reference to the ML code to be run.
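
    A rough sketch of such a Dockerfile is shown below; the NGC base-image tag, the installed packages, and the script path are assumptions and should be adapted to your environment:

    # 1. Base image is nvidia-tensorflow (pick the release tag you need)
    FROM nvcr.io/nvidia/tensorflow:20.08-tf2-py3
    # 2. Install popular software
    RUN pip install --no-cache-dir pandas scikit-learn
    # 3. (Optional) Run a script at container start
    COPY startup.sh /workspace/startup.sh
    CMD ["bash", "/workspace/startup.sh"]

    The image is then built with docker build -t my-training-image . and used like any other image.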

    The best practice for running training workloads is to test the container image in a "build" session and then send it for execution as a training Job. For further information on how to set up and parameterize a training workload via docker or Run:ai, see Converting your Workload to use Unattended Training Execution.
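
    As a sketch of that flow with the image built above (the workload name and flags are illustrative; in the V1 CLI, -i and -g select the image and GPU count, and your CLI version may differ):

    # test interactively in a "build" session
    docker run -it my-training-image bash

    # then submit the same image as an unattended training Job
    # (after pushing the image to a registry your cluster can access)
    runai submit train1 -i my-training-image -g 1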

    \ No newline at end of file diff --git a/v2.20/Researcher/best-practices/convert-to-unattended/index.html b/v2.20/Researcher/best-practices/convert-to-unattended/index.html index 6d3cc2224a..dd1af20a24 100644 --- a/v2.20/Researcher/best-practices/convert-to-unattended/index.html +++ b/v2.20/Researcher/best-practices/convert-to-unattended/index.html @@ -1,4 +1,4 @@ - Convert a Workload to Run Unattended - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/best-practices/env-variables/index.html b/v2.20/Researcher/best-practices/env-variables/index.html index b16d3272e6..6fd215f817 100644 --- a/v2.20/Researcher/best-practices/env-variables/index.html +++ b/v2.20/Researcher/best-practices/env-variables/index.html @@ -1,4 +1,4 @@ - Environment Variables - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/best-practices/researcher-notifications/index.html b/v2.20/Researcher/best-practices/researcher-notifications/index.html index c4eb61632b..f217d78492 100644 --- a/v2.20/Researcher/best-practices/researcher-notifications/index.html +++ b/v2.20/Researcher/best-practices/researcher-notifications/index.html @@ -1,4 +1,4 @@ - Researcher Email Notifications - Run:ai Documentation Library

    Email Notifications

    Importance of Email Notifications for Data Scientists

    Managing numerous data science workloads requires monitoring various stages, including submission, scheduling, initialization, execution, and completion. Additionally, handling suspensions and failures is crucial for ensuring timely workload completion. Email Notifications address this need by sending alerts for critical workload life cycle changes. This empowers data scientists to take necessary actions and prevent delays.

    Once the system administrator configures email notifications, users receive notifications when their jobs transition from one status to another. In addition, users get warning notifications before a workload is terminated due to project-defined timeouts. Details included in the email are:

    • Workload type
    • Project and cluster information
    • Event timestamp

    To configure the types of email notifications you can receive:

    1. Log in to your account.
    2. Press the user icon, then select Settings.
    3. Under Email notifications, in the Send me an email about my workloads when section, select the relevant workload statuses.
    4. When complete, press Save.

    \ No newline at end of file diff --git a/v2.20/Researcher/best-practices/save-dl-checkpoints/index.html b/v2.20/Researcher/best-practices/save-dl-checkpoints/index.html index 13aba14ff1..97ce480bc0 100644 --- a/v2.20/Researcher/best-practices/save-dl-checkpoints/index.html +++ b/v2.20/Researcher/best-practices/save-dl-checkpoints/index.html @@ -1,4 +1,4 @@ - Save Deep Learning Checkpoints - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/best-practices/secrets-as-env-var-in-cli/index.html b/v2.20/Researcher/best-practices/secrets-as-env-var-in-cli/index.html index 0726688459..0f8cefcbb1 100644 --- a/v2.20/Researcher/best-practices/secrets-as-env-var-in-cli/index.html +++ b/v2.20/Researcher/best-practices/secrets-as-env-var-in-cli/index.html @@ -1,4 +1,4 @@ - Secrets as Environment Variables (CLI) - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/Introduction/index.html b/v2.20/Researcher/cli-reference/Introduction/index.html index bf4bbda8f1..6916893909 100644 --- a/v2.20/Researcher/cli-reference/Introduction/index.html +++ b/v2.20/Researcher/cli-reference/Introduction/index.html @@ -1,4 +1,4 @@ - Introduction - Run:ai Documentation Library

    Introduction

    The Run:ai Command-line Interface (CLI) is one of the ways for a Researcher to send deep learning workloads, acquire GPU-based containers, list jobs, etc.

    To install and configure the Run:ai CLI, see Researcher Setup - Start Here

    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/cli-examples/index.html b/v2.20/Researcher/cli-reference/new-cli/cli-examples/index.html index 267e9c33b8..4f88484ae3 100644 --- a/v2.20/Researcher/cli-reference/new-cli/cli-examples/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/cli-examples/index.html @@ -1,4 +1,4 @@ - CLI Examples - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/guides/set-kubeconfig-with-oidc-parameters/index.html b/v2.20/Researcher/cli-reference/new-cli/guides/set-kubeconfig-with-oidc-parameters/index.html index 86948a4533..d2085c96e6 100644 --- a/v2.20/Researcher/cli-reference/new-cli/guides/set-kubeconfig-with-oidc-parameters/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/guides/set-kubeconfig-with-oidc-parameters/index.html @@ -1,4 +1,4 @@ - Set cluster authorization - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/overview/index.html b/v2.20/Researcher/cli-reference/new-cli/overview/index.html index cf9de71aeb..b216bedc30 100644 --- a/v2.20/Researcher/cli-reference/new-cli/overview/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/overview/index.html @@ -1,4 +1,4 @@ - Run:ai V2 Command-line Interface - Run:ai Documentation Library

    Overview

    The Run:ai Command-line Interface (CLI) is a tool for Researchers to send deep learning workloads, acquire GPU-based containers, list jobs, and access other features in the Run:ai platform.

    The new V2 Command-line interface

    This command-line interface is a complete revamp of the previous command-line interface. A few highlights:

    • The CLI internally uses the Control-plane API. This provides a single point of view on Workloads, removing dissimilarities between the user interface, the programming interface, and the command-line interface.
    • As such, it also removes the need to configure the Kubernetes API server for authentication.
    • The CLI is only available for Run:ai cluster version 2.18 and up.
    • The new V2 CLI is backward compatible with the older V1 CLI.

    Installing the Improved Command Line Interface

    See installation instructions here.

    Reference

    List of all commands can be found here
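
    As a brief, illustrative sketch of a first session (command names are taken from the reference; the argument forms and login flow depend on your environment):

    runai login                    # authenticate via SSO or user credentials, per your setup
    runai project set <project>    # choose the project used by subsequent commands
    runai version                  # verify the installed CLI version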

    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai/index.html b/v2.20/Researcher/cli-reference/new-cli/runai/index.html index 9d65a60710..4f165750cb 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai/index.html @@ -1,4 +1,4 @@ - CLI Reference - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_attach/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_attach/index.html index 8982fd8c49..398c192c84 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_attach/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_attach/index.html @@ -1,4 +1,4 @@ - Runai attach - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_cluster/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_cluster/index.html index 42c02ac4e7..ad892b1657 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_cluster/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_cluster/index.html @@ -1,4 +1,4 @@ - Runai cluster - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_cluster_list/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_cluster_list/index.html index a001a3af19..0a7c096300 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_cluster_list/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_cluster_list/index.html @@ -1,4 +1,4 @@ - Runai cluster list - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_cluster_set/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_cluster_set/index.html index c51a9aa1ad..4b05651127 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_cluster_set/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_cluster_set/index.html @@ -1,4 +1,4 @@ - Runai cluster set - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_config/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_config/index.html index 8eef7dea92..4bf24ed09f 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_config/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_config/index.html @@ -1,4 +1,4 @@ - Runai config - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_config_generate/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_config_generate/index.html index f5dc0ac0b5..d6003b11a6 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_config_generate/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_config_generate/index.html @@ -1,4 +1,4 @@ - Runai config generate - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_config_project/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_config_project/index.html index 7f7d13148e..1bd08a8007 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_config_project/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_config_project/index.html @@ -1,4 +1,4 @@ - Runai config project - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_config_set/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_config_set/index.html index e1cd9cc11c..5c1f244d0d 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_config_set/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_config_set/index.html @@ -1,4 +1,4 @@ - Runai config set - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_describe/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_describe/index.html index 402d689d99..00af0ee4bb 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_describe/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_describe/index.html @@ -1,4 +1,4 @@ - Runai describe - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_describe_job/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_describe_job/index.html index cb25754763..af4917268e 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_describe_job/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_describe_job/index.html @@ -1,4 +1,4 @@ - Runai describe job - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_describe_node/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_describe_node/index.html index d26cbb03bd..e95e2b7395 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_describe_node/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_describe_node/index.html @@ -1,4 +1,4 @@ - Runai describe node - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_exec/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_exec/index.html index 2769bea1e3..3605978b7d 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_exec/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_exec/index.html @@ -1,4 +1,4 @@ - Runai exec - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_kubeconfig/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_kubeconfig/index.html index 0236dd3977..a6887533b0 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_kubeconfig/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_kubeconfig/index.html @@ -1,4 +1,4 @@ - Runai kubeconfig - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_kubeconfig_set/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_kubeconfig_set/index.html index eb102bd90a..0692b118c1 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_kubeconfig_set/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_kubeconfig_set/index.html @@ -1,4 +1,4 @@ - Runai kubeconfig set - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_list/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_list/index.html index 63e6aa3a13..d2c1612e52 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_list/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_list/index.html @@ -1,4 +1,4 @@ - Runai list - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_list_clusters/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_list_clusters/index.html index 92c2b357d4..b3924ff6b8 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_list_clusters/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_list_clusters/index.html @@ -1,4 +1,4 @@ - Runai list clusters - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_list_jobs/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_list_jobs/index.html index 7220387fac..b3ac972e0c 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_list_jobs/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_list_jobs/index.html @@ -1,4 +1,4 @@ - Runai list jobs - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_list_nodes/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_list_nodes/index.html index d845cabfbc..0002e603da 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_list_nodes/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_list_nodes/index.html @@ -1,4 +1,4 @@ - Runai list nodes - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_list_projects/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_list_projects/index.html index 96a42daabd..8e1043a8ac 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_list_projects/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_list_projects/index.html @@ -1,4 +1,4 @@ - Runai list projects - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_login/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_login/index.html index a857f17a1e..35c46dfac8 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_login/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_login/index.html @@ -1,4 +1,4 @@ - Runai login - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_login_application/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_login_application/index.html index ebc917327e..30a1b25c50 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_login_application/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_login_application/index.html @@ -1,4 +1,4 @@ - Runai login application - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_login_sso/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_login_sso/index.html index 8edfccfcdb..3b7098f182 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_login_sso/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_login_sso/index.html @@ -1,4 +1,4 @@ - Runai login sso - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_login_user/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_login_user/index.html index 55cb0a4c13..8323bacc0a 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_login_user/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_login_user/index.html @@ -1,4 +1,4 @@ - Runai login user - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_logout/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_logout/index.html index 48518632ae..f797a39877 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_logout/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_logout/index.html @@ -1,4 +1,4 @@ - Runai logout - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_logs/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_logs/index.html index e7bd7f9a42..7377fa4ce7 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_logs/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_logs/index.html @@ -1,4 +1,4 @@ - Runai logs - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_mpi/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_mpi/index.html index 1efdd052e4..93746b0794 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_mpi/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_mpi/index.html @@ -1,4 +1,4 @@ - Runai mpi - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_attach/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_attach/index.html index c622700722..9c85e3098a 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_attach/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_attach/index.html @@ -1,4 +1,4 @@ - Runai mpi attach - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_bash/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_bash/index.html index 4afce7c5a8..1095b191ad 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_bash/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_bash/index.html @@ -1,4 +1,4 @@ - Runai mpi bash - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_delete/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_delete/index.html index f5d86ca15e..f4af163ec8 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_delete/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_delete/index.html @@ -1,4 +1,4 @@ - Runai mpi delete - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_describe/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_describe/index.html index 9e50ad080f..abbbd4b3ed 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_describe/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_describe/index.html @@ -1,4 +1,4 @@ - Runai mpi describe - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_exec/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_exec/index.html index e407dfced0..b6ca2a6f90 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_exec/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_exec/index.html @@ -1,4 +1,4 @@ - Runai mpi exec - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_list/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_list/index.html index e8c90badde..87cf1c0416 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_list/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_list/index.html @@ -1,4 +1,4 @@ - Runai mpi list - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_logs/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_logs/index.html index e6dc1a9ffd..9a392439f0 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_logs/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_logs/index.html @@ -1,4 +1,4 @@ - Runai mpi logs - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_port-forward/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_port-forward/index.html index a3028e3d89..b30e09823c 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_port-forward/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_port-forward/index.html @@ -1,4 +1,4 @@ - Runai mpi port forward - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_resume/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_resume/index.html index 1b52b82b89..0ac6b08631 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_resume/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_resume/index.html @@ -1,4 +1,4 @@ - Runai mpi resume - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_submit/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_submit/index.html index 31b372fe96..35f2304cd6 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_submit/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_submit/index.html @@ -1,4 +1,4 @@ - Runai mpi submit - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_suspend/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_suspend/index.html index 2bc435e2ef..b211424b00 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_mpi_suspend/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_mpi_suspend/index.html @@ -1,4 +1,4 @@ - Runai mpi suspend - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_node/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_node/index.html index 3d201fbc58..e69086b9b7 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_node/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_node/index.html @@ -1,4 +1,4 @@ - Runai node - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_node_list/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_node_list/index.html index 74809ba107..608d7dbda2 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_node_list/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_node_list/index.html @@ -1,4 +1,4 @@ - Runai node list - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_nodepool/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_nodepool/index.html index 7c427113c0..b251612736 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_nodepool/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_nodepool/index.html @@ -1,4 +1,4 @@ - Runai nodepool - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_nodepool_list/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_nodepool_list/index.html index 54a853a17e..bdc6e9f67f 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_nodepool_list/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_nodepool_list/index.html @@ -1,4 +1,4 @@ - Runai nodepool list - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_port-forward/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_port-forward/index.html index 573dc3e0e2..102cd4ba1d 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_port-forward/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_port-forward/index.html @@ -1,4 +1,4 @@ - Runai port forward - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_project/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_project/index.html index ea14f763b2..cff0236743 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_project/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_project/index.html @@ -1,4 +1,4 @@ - Runai project - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_project_list/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_project_list/index.html index b2f0278fd8..50a4b653c5 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_project_list/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_project_list/index.html @@ -1,4 +1,4 @@ - Runai project list - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_project_set/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_project_set/index.html index 7c311f60d5..b8a62befce 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_project_set/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_project_set/index.html @@ -1,4 +1,4 @@ - Runai project set - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch/index.html index 51251ed9f9..012c74940c 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch/index.html @@ -1,4 +1,4 @@ - Runai pytorch - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_attach/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_attach/index.html index 30028ffa3f..ba68ab6f49 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_attach/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_attach/index.html @@ -1,4 +1,4 @@ - Runai pytorch attach - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_bash/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_bash/index.html index 39c72676a0..bc44fd1874 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_bash/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_bash/index.html @@ -1,4 +1,4 @@ - Runai pytorch bash - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_delete/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_delete/index.html index b61e7451d0..1f181017a4 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_delete/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_delete/index.html @@ -1,4 +1,4 @@ - Runai pytorch delete - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_describe/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_describe/index.html index 121f9db74e..ecb75558ad 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_describe/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_describe/index.html @@ -1,4 +1,4 @@ - Runai pytorch describe - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_exec/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_exec/index.html index f90205ac93..a9f4bcfa24 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_exec/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_exec/index.html @@ -1,4 +1,4 @@ - Runai pytorch exec - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_list/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_list/index.html index 3d4e3dc0d9..26b23a9c7b 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_list/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_list/index.html @@ -1,4 +1,4 @@ - Runai pytorch list - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_logs/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_logs/index.html index 3b124da6ef..2f6faabbd7 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_logs/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_logs/index.html @@ -1,4 +1,4 @@ - Runai pytorch logs - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_port-forward/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_port-forward/index.html index 963d2371be..f1b4c52f1a 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_port-forward/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_port-forward/index.html @@ -1,4 +1,4 @@ - Runai pytorch port forward - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_resume/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_resume/index.html index 26ce7a4f2f..387d73c22c 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_resume/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_resume/index.html @@ -1,4 +1,4 @@ - Runai pytorch resume - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_submit/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_submit/index.html index 885353aaed..0c9edb7942 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_submit/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_submit/index.html @@ -1,4 +1,4 @@ - Runai pytorch submit - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_suspend/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_suspend/index.html index fb0c1ace9b..555dce0413 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_suspend/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_suspend/index.html @@ -1,4 +1,4 @@ - Runai pytorch suspend - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_report/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_report/index.html index 8c33cf3b27..1b50200789 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_report/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_report/index.html @@ -1,4 +1,4 @@ - Runai report - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_report_metrics/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_report_metrics/index.html index 65ac05fe4c..7f475e433c 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_report_metrics/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_report_metrics/index.html @@ -1,4 +1,4 @@ - Runai report metrics - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_report_metrics_clear/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_report_metrics_clear/index.html index 978164c13a..a2634b84ea 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_report_metrics_clear/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_report_metrics_clear/index.html @@ -1,4 +1,4 @@ - Runai report metrics clear - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_report_metrics_config/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_report_metrics_config/index.html index 60c54d2bf0..e9eeaeb98b 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_report_metrics_config/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_report_metrics_config/index.html @@ -1,4 +1,4 @@ - Runai report metrics config - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_report_metrics_output/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_report_metrics_output/index.html index acc64386ba..416bf15464 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_report_metrics_output/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_report_metrics_output/index.html @@ -1,4 +1,4 @@ - Runai report metrics output - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_submit/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_submit/index.html index fa3f9d1634..e7b5e27400 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_submit/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_submit/index.html @@ -1,4 +1,4 @@ - Runai submit - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow/index.html index 21e22b223f..62dda8720a 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow/index.html @@ -1,4 +1,4 @@ - Runai tensorflow - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_attach/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_attach/index.html index 395add05b8..cac87505cb 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_attach/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_attach/index.html @@ -1,4 +1,4 @@ - Runai tensorflow attach - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_bash/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_bash/index.html index 58fac5f24e..ca347a96af 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_bash/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_bash/index.html @@ -1,4 +1,4 @@ - Runai tensorflow bash - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_delete/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_delete/index.html index e558aae638..b9a03d5039 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_delete/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_delete/index.html @@ -1,4 +1,4 @@ - Runai tensorflow delete - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_describe/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_describe/index.html index 5285109e28..37ae01668d 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_describe/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_describe/index.html @@ -1,4 +1,4 @@ - Runai tensorflow describe - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_exec/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_exec/index.html index addcb1b281..9c75574467 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_exec/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_exec/index.html @@ -1,4 +1,4 @@ - Runai tensorflow exec - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_list/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_list/index.html index aa9cb3f0b6..4ea5905274 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_list/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_list/index.html @@ -1,4 +1,4 @@ - Runai tensorflow list - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_logs/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_logs/index.html index 80e07b3aa1..53035ec798 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_logs/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_logs/index.html @@ -1,4 +1,4 @@ - Runai tensorflow logs - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_port-forward/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_port-forward/index.html index bccfe94348..1d059f4d47 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_port-forward/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_port-forward/index.html @@ -1,4 +1,4 @@ - Runai tensorflow port forward - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_resume/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_resume/index.html index b64038b6be..6071cdb65a 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_resume/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_resume/index.html @@ -1,4 +1,4 @@ - Runai tensorflow resume - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_submit/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_submit/index.html index 7096e1870b..8ab16a6c55 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_submit/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_submit/index.html @@ -1,4 +1,4 @@ - Runai tensorflow submit - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_suspend/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_suspend/index.html index a5a7d88104..c18a64df5e 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_suspend/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_suspend/index.html @@ -1,4 +1,4 @@ - Runai tensorflow suspend - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training/index.html index f0219ee6fd..3b73a8126a 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training/index.html @@ -1,4 +1,4 @@ - Runai training - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_attach/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_attach/index.html index fc9479f719..92a2318bf5 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_attach/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_attach/index.html @@ -1,4 +1,4 @@ - Runai training attach - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_bash/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_bash/index.html index b4c46c530a..bc23722af2 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_bash/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_bash/index.html @@ -1,4 +1,4 @@ - Runai training bash - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_delete/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_delete/index.html index b4fbc01fa8..3137c1cc70 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_delete/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_delete/index.html @@ -1,4 +1,4 @@ - Runai training delete - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_describe/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_describe/index.html index f41483e6cd..8ceb84e7c8 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_describe/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_describe/index.html @@ -1,4 +1,4 @@ - Runai training describe - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_exec/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_exec/index.html index 2499be380c..3c82588c40 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_exec/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_exec/index.html @@ -1,4 +1,4 @@ - Runai training exec - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_list/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_list/index.html index 132894a90a..f490d9bc93 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_list/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_list/index.html @@ -1,4 +1,4 @@ - Runai training list - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_logs/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_logs/index.html index fda7b96dc7..a3da9b574e 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_logs/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_logs/index.html @@ -1,4 +1,4 @@ - Runai training logs - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi/index.html index b53caa3a06..b4d8bdaf8a 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi/index.html @@ -1,4 +1,4 @@ - Runai training mpi - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_attach/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_attach/index.html index 263aea4b8e..64441d3fb4 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_attach/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_attach/index.html @@ -1,4 +1,4 @@ - Runai training mpi attach - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_bash/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_bash/index.html index e800c20fac..dd5c570e4b 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_bash/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_bash/index.html @@ -1,4 +1,4 @@ - Runai training mpi bash - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_delete/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_delete/index.html index 56a974688d..325d608456 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_delete/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_delete/index.html @@ -1,4 +1,4 @@ - Runai training mpi delete - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_describe/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_describe/index.html index 9286cc9f92..9373f78b38 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_describe/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_describe/index.html @@ -1,4 +1,4 @@ - Runai training mpi describe - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_exec/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_exec/index.html index f37568e087..a31dd03a66 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_exec/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_exec/index.html @@ -1,4 +1,4 @@ - Runai training mpi exec - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_list/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_list/index.html index c751fdc497..94b67898fa 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_list/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_list/index.html @@ -1,4 +1,4 @@ - Runai training mpi list - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_logs/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_logs/index.html index e4e6efada8..9a4c77ab9f 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_logs/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_logs/index.html @@ -1,4 +1,4 @@ - Runai training mpi logs - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_port-forward/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_port-forward/index.html index bcbc70375c..c5d9c4df6b 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_port-forward/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_port-forward/index.html @@ -1,4 +1,4 @@ - Runai training mpi port forward - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_resume/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_resume/index.html index 27f4ed9128..abfe7a211d 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_resume/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_resume/index.html @@ -1,4 +1,4 @@ - Runai training mpi resume - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_submit/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_submit/index.html index 1693045179..ac3f0660ec 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_submit/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_submit/index.html @@ -1,4 +1,4 @@ - Runai training mpi submit - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_suspend/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_suspend/index.html index fa308bd29c..d953e6197f 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_suspend/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_suspend/index.html @@ -1,4 +1,4 @@ - Runai training mpi suspend - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_port-forward/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_port-forward/index.html index e8c9c7d264..73b9be9243 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_port-forward/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_port-forward/index.html @@ -1,4 +1,4 @@ - Runai training port forward - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch/index.html index d1b359175b..d7a6724fb3 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch/index.html @@ -1,4 +1,4 @@ - Runai training pytorch - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_attach/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_attach/index.html index 9c14d8f3ec..fc2723554e 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_attach/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_attach/index.html @@ -1,4 +1,4 @@ - Runai training pytorch attach - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_bash/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_bash/index.html index b9bc60c879..3183b117d6 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_bash/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_bash/index.html @@ -1,4 +1,4 @@ - Runai training pytorch bash - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_delete/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_delete/index.html index d90a50c36d..be4cbb3ca4 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_delete/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_delete/index.html @@ -1,4 +1,4 @@ - Runai training pytorch delete - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_describe/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_describe/index.html index a65f77ef6e..cf980aab47 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_describe/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_describe/index.html @@ -1,4 +1,4 @@ - Runai training pytorch describe - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_exec/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_exec/index.html index cda7d542a3..dc7ffd409c 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_exec/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_exec/index.html @@ -1,4 +1,4 @@ - Runai training pytorch exec - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_list/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_list/index.html index 98742f6e8f..6fed5df797 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_list/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_list/index.html @@ -1,4 +1,4 @@ - Runai training pytorch list - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_logs/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_logs/index.html index 03b3ad3d1f..efd57f9154 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_logs/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_logs/index.html @@ -1,4 +1,4 @@ - Runai training pytorch logs - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_port-forward/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_port-forward/index.html index 4beece67c5..e3caaf0b60 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_port-forward/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_port-forward/index.html @@ -1,4 +1,4 @@ - Runai training pytorch port forward - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_resume/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_resume/index.html index 56682c48e0..e66cac2200 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_resume/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_resume/index.html @@ -1,4 +1,4 @@ - Runai training pytorch resume - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_submit/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_submit/index.html index f2ad81b365..d302b5a000 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_submit/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_submit/index.html @@ -1,4 +1,4 @@ - Runai training pytorch submit - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_suspend/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_suspend/index.html index ddd6116892..491d10aa84 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_suspend/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_pytorch_suspend/index.html @@ -1,4 +1,4 @@ - Runai training pytorch suspend - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_resume/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_resume/index.html index 0ed13d9b0a..0b48b8d885 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_resume/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_resume/index.html @@ -1,4 +1,4 @@ - Runai training resume - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard/index.html index 8f65fc2a42..c792a30fda 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard/index.html @@ -1,4 +1,4 @@ - Runai training standard - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_attach/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_attach/index.html index 6a5fd3d738..3fe9e69d0e 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_attach/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_attach/index.html @@ -1,4 +1,4 @@ - Runai training standard attach - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_bash/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_bash/index.html index b72ed80e4a..1f68f2e6ac 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_bash/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_bash/index.html @@ -1,4 +1,4 @@ - Runai training standard bash - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_delete/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_delete/index.html index 414c3e300e..0abbb77835 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_delete/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_delete/index.html @@ -1,4 +1,4 @@ - Runai training standard delete - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_describe/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_describe/index.html index ea1f20a3ab..96dc0b9f7d 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_describe/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_describe/index.html @@ -1,4 +1,4 @@ - Runai training standard describe - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_exec/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_exec/index.html index 4cf51b0f52..6fbf58eecd 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_exec/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_exec/index.html @@ -1,4 +1,4 @@ - Runai training standard exec - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_list/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_list/index.html index b9c76a065e..7bc4a0a89b 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_list/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_list/index.html @@ -1,4 +1,4 @@ - Runai training standard list - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_logs/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_logs/index.html index ec6e2c99b7..47aaf739ef 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_logs/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_logs/index.html @@ -1,4 +1,4 @@ - Runai training standard logs - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_port-forward/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_port-forward/index.html index fef5531e3f..78304ca94b 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_port-forward/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_port-forward/index.html @@ -1,4 +1,4 @@ - Runai training standard port forward - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_resume/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_resume/index.html index 54471d2737..6a63b7ba8d 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_resume/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_resume/index.html @@ -1,4 +1,4 @@ - Runai training standard resume - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_submit/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_submit/index.html index dbda410b68..9aae6063c0 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_submit/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_submit/index.html @@ -1,4 +1,4 @@ - Runai training standard submit - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_suspend/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_suspend/index.html index 8ffda7cd17..cd2ab89995 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_suspend/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_standard_suspend/index.html @@ -1,4 +1,4 @@ - Runai training standard suspend - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_submit/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_submit/index.html index 0c7168a866..aec171ec43 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_submit/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_submit/index.html @@ -1,4 +1,4 @@ - Runai training submit - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_suspend/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_suspend/index.html index 6deca63864..f59105c7b1 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_suspend/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_suspend/index.html @@ -1,4 +1,4 @@ - Runai training suspend - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow/index.html index f3b6c6a22c..19b04d700b 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow/index.html @@ -1,4 +1,4 @@ - Runai training tensorflow - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_attach/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_attach/index.html index 5a6a50afab..fc3e55bf13 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_attach/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_attach/index.html @@ -1,4 +1,4 @@ - Runai training tensorflow attach - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_bash/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_bash/index.html index bc436e564f..1ec356b74a 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_bash/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_bash/index.html @@ -1,4 +1,4 @@ - Runai training tensorflow bash - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_delete/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_delete/index.html index c2bb1d07ea..a43ec29527 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_delete/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_delete/index.html @@ -1,4 +1,4 @@ - Runai training tensorflow delete - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_describe/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_describe/index.html index d35b984acc..f7960344be 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_describe/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_describe/index.html @@ -1,4 +1,4 @@ - Runai training tensorflow describe - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_exec/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_exec/index.html index 1579cbf96a..c43df55a63 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_exec/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_exec/index.html @@ -1,4 +1,4 @@ - Runai training tensorflow exec - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_list/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_list/index.html index da5b4fcb9f..7deaededdc 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_list/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_list/index.html @@ -1,4 +1,4 @@ - Runai training tensorflow list - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_logs/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_logs/index.html index fa769a7b6a..e8f52e06da 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_logs/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_logs/index.html @@ -1,4 +1,4 @@ - Runai training tensorflow logs - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_port-forward/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_port-forward/index.html index 9ee3bcbe07..238223a158 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_port-forward/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_port-forward/index.html @@ -1,4 +1,4 @@ - Runai training tensorflow port forward - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_resume/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_resume/index.html index 3ce6c861c8..3243311312 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_resume/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_resume/index.html @@ -1,4 +1,4 @@ - Runai training tensorflow resume - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_submit/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_submit/index.html index 928dd81c81..5352f9d1e4 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_submit/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_submit/index.html @@ -1,4 +1,4 @@ - Runai training tensorflow submit - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_suspend/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_suspend/index.html index 72ee64bd7a..d15447df3c 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_suspend/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_tensorflow_suspend/index.html @@ -1,4 +1,4 @@ - Runai training tensorflow suspend - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost/index.html index 4136304366..1e646324bc 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost/index.html @@ -1,4 +1,4 @@ - Runai training xgboost - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_attach/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_attach/index.html index e12de6e12d..a663162e6b 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_attach/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_attach/index.html @@ -1,4 +1,4 @@ - Runai training xgboost attach - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_bash/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_bash/index.html index 63e77c701a..25ee876db9 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_bash/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_bash/index.html @@ -1,4 +1,4 @@ - Runai training xgboost bash - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_delete/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_delete/index.html index d0b736de55..811faeb1e5 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_delete/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_delete/index.html @@ -1,4 +1,4 @@ - Runai training xgboost delete - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_describe/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_describe/index.html index 3871eabba1..81e99ddf7c 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_describe/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_describe/index.html @@ -1,4 +1,4 @@ - Runai training xgboost describe - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_exec/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_exec/index.html index b72b2e7052..f09188e304 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_exec/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_exec/index.html @@ -1,4 +1,4 @@ - Runai training xgboost exec - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_list/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_list/index.html index c498806926..f4a25698a3 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_list/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_list/index.html @@ -1,4 +1,4 @@ - Runai training xgboost list - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_logs/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_logs/index.html index 3250088a55..782182d146 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_logs/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_logs/index.html @@ -1,4 +1,4 @@ - Runai training xgboost logs - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_port-forward/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_port-forward/index.html index 2f14a7d0d5..7440b67b34 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_port-forward/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_port-forward/index.html @@ -1,4 +1,4 @@ - Runai training xgboost port forward - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_resume/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_resume/index.html index c8e94b62e2..982a0b51fc 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_resume/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_resume/index.html @@ -1,4 +1,4 @@ - Runai training xgboost resume - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_submit/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_submit/index.html index 58264fb524..a83fb0d0c3 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_submit/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_submit/index.html @@ -1,4 +1,4 @@ - Runai training xgboost submit - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_suspend/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_suspend/index.html index d4bc5c6b8b..6df5ae306c 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_suspend/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_training_xgboost_suspend/index.html @@ -1,4 +1,4 @@ - Runai training xgboost suspend - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_upgrade/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_upgrade/index.html index f9248d6c7f..307f89a5d9 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_upgrade/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_upgrade/index.html @@ -1,4 +1,4 @@ - Runai upgrade - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_version/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_version/index.html index 64151a48f7..191ba1ef2a 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_version/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_version/index.html @@ -1,4 +1,4 @@ - Runai version - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_whoami/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_whoami/index.html index 94005cb226..bdf8562631 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_whoami/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_whoami/index.html @@ -1,4 +1,4 @@ - Runai whoami - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_workload/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_workload/index.html index 1740396081..2e53f8da18 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_workload/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_workload/index.html @@ -1,4 +1,4 @@ - Runai workload - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_workload_attach/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_workload_attach/index.html index 8512dca05a..c332b26679 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_workload_attach/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_workload_attach/index.html @@ -1,4 +1,4 @@ - Runai workload attach - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_workload_describe/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_workload_describe/index.html index dcc4d29418..72d0617b16 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_workload_describe/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_workload_describe/index.html @@ -1,4 +1,4 @@ - Runai workload describe - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_workload_exec/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_workload_exec/index.html index 31ce6150bd..b1bdf56dc8 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_workload_exec/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_workload_exec/index.html @@ -1,4 +1,4 @@ - Runai workload exec - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_workload_list/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_workload_list/index.html index 04605365de..58bb06aab4 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_workload_list/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_workload_list/index.html @@ -1,4 +1,4 @@ - Runai workload list - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_workload_logs/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_workload_logs/index.html index c850b39086..8002e8462d 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_workload_logs/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_workload_logs/index.html @@ -1,4 +1,4 @@ - Runai workload logs - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_workload_port-forward/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_workload_port-forward/index.html index c37a82a8f4..5eee703bb3 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_workload_port-forward/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_workload_port-forward/index.html @@ -1,4 +1,4 @@ - Runai workload port forward - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_workspace/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_workspace/index.html index 51598d78e3..5526f01fdb 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_workspace/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_workspace/index.html @@ -1,4 +1,4 @@ - Runai workspace - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_attach/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_attach/index.html index af9c74a185..352f06ece6 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_attach/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_attach/index.html @@ -1,4 +1,4 @@ - Runai workspace attach - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_bash/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_bash/index.html index 9f7538df84..1fae335aa4 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_bash/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_bash/index.html @@ -1,4 +1,4 @@ - Runai workspace bash - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_delete/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_delete/index.html index 09160b1fb2..89721a9387 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_delete/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_delete/index.html @@ -1,4 +1,4 @@ - Runai workspace delete - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_describe/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_describe/index.html index 6bb5b097c7..21c7d97292 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_describe/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_describe/index.html @@ -1,4 +1,4 @@ - Runai workspace describe - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_exec/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_exec/index.html index 887da8c220..05161651e8 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_exec/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_exec/index.html @@ -1,4 +1,4 @@ - Runai workspace exec - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_list/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_list/index.html index 78cf8fca3b..40ce2f1eb7 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_list/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_list/index.html @@ -1,4 +1,4 @@ - Runai workspace list - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_logs/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_logs/index.html index bd99d19dbd..d28a07acd2 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_logs/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_logs/index.html @@ -1,4 +1,4 @@ - Runai workspace logs - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_port-forward/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_port-forward/index.html index 4eec9ab8ef..85c9503088 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_port-forward/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_port-forward/index.html @@ -1,4 +1,4 @@ - Runai workspace port forward - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_resume/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_resume/index.html index 82988d3be3..47b9a77811 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_resume/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_resume/index.html @@ -1,4 +1,4 @@ - Runai workspace resume - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_submit/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_submit/index.html index c4d93e51c2..29ebbd8936 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_submit/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_submit/index.html @@ -1,4 +1,4 @@ - Runai workspace submit - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_suspend/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_suspend/index.html index b29b272380..e380310c1f 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_workspace_suspend/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_workspace_suspend/index.html @@ -1,4 +1,4 @@ - Runai workspace suspend - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost/index.html index d6fbed687d..f9d7d4e824 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost/index.html @@ -1,4 +1,4 @@ - Runai xgboost - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_attach/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_attach/index.html index 830955402d..66e1dd1360 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_attach/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_attach/index.html @@ -1,4 +1,4 @@ - Runai xgboost attach - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_bash/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_bash/index.html index 03a6d51470..7c16a9af29 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_bash/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_bash/index.html @@ -1,4 +1,4 @@ - Runai xgboost bash - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_delete/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_delete/index.html index 64efe6c9ec..f119cc3b5e 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_delete/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_delete/index.html @@ -1,4 +1,4 @@ - Runai xgboost delete - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_describe/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_describe/index.html index fecd8de5d5..d3bf8b46e7 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_describe/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_describe/index.html @@ -1,4 +1,4 @@ - Runai xgboost describe - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_exec/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_exec/index.html index ee80050f2a..2cf466da92 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_exec/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_exec/index.html @@ -1,4 +1,4 @@ - Runai xgboost exec - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_list/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_list/index.html index 727364ba48..eaa8803b30 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_list/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_list/index.html @@ -1,4 +1,4 @@ - Runai xgboost list - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_logs/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_logs/index.html index 0522a4d220..5ddfc71ea3 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_logs/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_logs/index.html @@ -1,4 +1,4 @@ - Runai xgboost logs - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_port-forward/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_port-forward/index.html index 59fbc80fb8..ede4341b9e 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_port-forward/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_port-forward/index.html @@ -1,4 +1,4 @@ - Runai xgboost port forward - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_resume/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_resume/index.html index c27835c383..fadf322495 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_resume/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_resume/index.html @@ -1,4 +1,4 @@ - Runai xgboost resume - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_submit/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_submit/index.html index 07f819c259..fc93f1fd45 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_submit/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_submit/index.html @@ -1,4 +1,4 @@ - Runai xgboost submit - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_suspend/index.html b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_suspend/index.html index 36618a5e6b..21125d9d18 100644 --- a/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_suspend/index.html +++ b/v2.20/Researcher/cli-reference/new-cli/runai_xgboost_suspend/index.html @@ -1,4 +1,4 @@ - Runai xgboost suspend - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-attach/index.html b/v2.20/Researcher/cli-reference/runai-attach/index.html index 0564e2e870..21664f2603 100644 --- a/v2.20/Researcher/cli-reference/runai-attach/index.html +++ b/v2.20/Researcher/cli-reference/runai-attach/index.html @@ -1,4 +1,4 @@ - runai attach - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-bash/index.html b/v2.20/Researcher/cli-reference/runai-bash/index.html index c5bcd7ef1f..264e011255 100644 --- a/v2.20/Researcher/cli-reference/runai-bash/index.html +++ b/v2.20/Researcher/cli-reference/runai-bash/index.html @@ -1,4 +1,4 @@ - runai bash - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-config/index.html b/v2.20/Researcher/cli-reference/runai-config/index.html index b5d9ef45c4..699897b0d9 100644 --- a/v2.20/Researcher/cli-reference/runai-config/index.html +++ b/v2.20/Researcher/cli-reference/runai-config/index.html @@ -1,4 +1,4 @@ - runai config - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-delete/index.html b/v2.20/Researcher/cli-reference/runai-delete/index.html index eecf26f7ab..1b0656183c 100644 --- a/v2.20/Researcher/cli-reference/runai-delete/index.html +++ b/v2.20/Researcher/cli-reference/runai-delete/index.html @@ -1,4 +1,4 @@ - runai delete - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-describe/index.html b/v2.20/Researcher/cli-reference/runai-describe/index.html index 43c7b8afe5..8fd0732af3 100644 --- a/v2.20/Researcher/cli-reference/runai-describe/index.html +++ b/v2.20/Researcher/cli-reference/runai-describe/index.html @@ -1,4 +1,4 @@ - runai describe - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-exec/index.html b/v2.20/Researcher/cli-reference/runai-exec/index.html index ae52248a92..dd46b745bf 100644 --- a/v2.20/Researcher/cli-reference/runai-exec/index.html +++ b/v2.20/Researcher/cli-reference/runai-exec/index.html @@ -1,4 +1,4 @@ - runai exec - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-list/index.html b/v2.20/Researcher/cli-reference/runai-list/index.html index 935c1d6b14..a426ed2a8d 100644 --- a/v2.20/Researcher/cli-reference/runai-list/index.html +++ b/v2.20/Researcher/cli-reference/runai-list/index.html @@ -1,4 +1,4 @@ - runai list - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-login/index.html b/v2.20/Researcher/cli-reference/runai-login/index.html index faed080b77..cd1571162d 100644 --- a/v2.20/Researcher/cli-reference/runai-login/index.html +++ b/v2.20/Researcher/cli-reference/runai-login/index.html @@ -1,4 +1,4 @@ - runai login - Run:ai Documentation Library

    runai login

    Description

    Login to Run:ai

When Researcher Authentication is enabled, you must log in to Run:ai with your username and password before accessing resources.

    Synopsis

    runai login 
         [--loglevel value]
         [--help | -h]

    Options

    Global Flags

    --loglevel (string)

    Set the logging level. One of: debug | info | warn | error (default "info").

    --help | -h

    Show help text.

    Output

You will be prompted for a username and password.

    See Also

    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-logout/index.html b/v2.20/Researcher/cli-reference/runai-logout/index.html index c4c6be5577..8bc1026f92 100644 --- a/v2.20/Researcher/cli-reference/runai-logout/index.html +++ b/v2.20/Researcher/cli-reference/runai-logout/index.html @@ -1,4 +1,4 @@ - runai logout - Run:ai Documentation Library

    runai logout

    Description

    Log out from Run:ai

    Synopsis

    runai logout 
         [--loglevel value]
         [--help | -h]

    Options

    Global Flags

    --loglevel (string)

    Set the logging level. One of: debug | info | warn | error (default "info").

    --help | -h

    Show help text.

    Output

    You will be logged out from Run:ai

    See Also

    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-logs/index.html b/v2.20/Researcher/cli-reference/runai-logs/index.html index ee3acf34f9..4c6e0909af 100644 --- a/v2.20/Researcher/cli-reference/runai-logs/index.html +++ b/v2.20/Researcher/cli-reference/runai-logs/index.html @@ -1,4 +1,4 @@ - runai logs - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-port-forwarding/index.html b/v2.20/Researcher/cli-reference/runai-port-forwarding/index.html index dcae94853b..56c3315be5 100644 --- a/v2.20/Researcher/cli-reference/runai-port-forwarding/index.html +++ b/v2.20/Researcher/cli-reference/runai-port-forwarding/index.html @@ -1,4 +1,4 @@ - runai port-forward - Run:ai Documentation Library

    runai port-forward

    Description

    Forward one or more local ports to the selected job or a pod within the job. The forwarding session ends when the selected job terminates or the terminal is interrupted.

    Examples

1. Port forward connections from localhost:8080 (localhost is the default) to the selected job on port 8090.

      runai port-forward <job-name> --port 8080:8090

2. Port forward connections from 192.168.1.23:8080 to the selected job on port 8080.

      runai port-forward <job-name> --port 8080 --address 192.168.1.23

3. Port forward multiple connections, from localhost:8080 to the selected job on port 8090 and from localhost:6443 to port 443.

      runai port-forward <job-name> --port 8080:8090 --port 6443:443

    4. Port forward into a specific pod in a multi-pod job.

      runai port-forward <job-name> --port 8080:8090 --pod <pod-name>

    Global flags

--loglevel <string>—Set the logging level. Choose: debug | info | warn | error (default "info").

    -p | --project <string>—Specify the project name. To change the default project use runai config project <project name>.

    Flags

--address <string> | [local-interface-ip\host] | localhost | 0.0.0.0 [privileged]—The listening address of your local machine (default "localhost").

    -h | --help—Help for the command.

    --port—forward ports based on one of the following arguments:

    • <stringArray>—a list of port forwarding combinations.

    • [local-port]:[remote-port]—different local and remote ports.

    • [local-port=remote-port]—the same port is used for both local and remote.

    --pod—Specify a pod of a running job. To get a list of the pods of a specific job, run the command runai describe <job-name>.

--pod-running-timeout—The length of time to wait until the pod is running (for example 5s, 2m, or 3h; must be greater than zero). Default is 10 minutes.
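
For example, a sketch combining the flags above (the job and pod names are placeholders): forward local port 8080 to port 8090 on a specific pod and wait up to two minutes for that pod to start:

  runai port-forward <job-name> --port 8080:8090 --pod <pod-name> --pod-running-timeout 2m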

    Filter based flags

    --mpi—search only for mpi jobs.

    --interactive—search only for interactive jobs.

    --pytorch—search only for pytorch jobs.

    --tf—search only for tensorflow jobs.

    --train—search only for training jobs.

    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-resume/index.html b/v2.20/Researcher/cli-reference/runai-resume/index.html index 1ff0ef1aef..14d8d1de91 100644 --- a/v2.20/Researcher/cli-reference/runai-resume/index.html +++ b/v2.20/Researcher/cli-reference/runai-resume/index.html @@ -1,4 +1,4 @@ - runai resume - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-submit-dist-TF/index.html b/v2.20/Researcher/cli-reference/runai-submit-dist-TF/index.html index 141fa837b3..999a5c0be1 100644 --- a/v2.20/Researcher/cli-reference/runai-submit-dist-TF/index.html +++ b/v2.20/Researcher/cli-reference/runai-submit-dist-TF/index.html @@ -1,4 +1,4 @@ - runai submit-dist tf - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-submit-dist-mpi/index.html b/v2.20/Researcher/cli-reference/runai-submit-dist-mpi/index.html index 3f6ba4c45d..e78143e194 100644 --- a/v2.20/Researcher/cli-reference/runai-submit-dist-mpi/index.html +++ b/v2.20/Researcher/cli-reference/runai-submit-dist-mpi/index.html @@ -1,4 +1,4 @@ - runai submit-dist mpi - Run:ai Documentation Library

    runai submit-dist mpi

    Description

    Submit a Distributed Training (MPI) Run:ai Job to run.

    Note

    To use distributed training you need to have installed the Kubeflow MPI Operator as specified in Distributed training.

    Syntax notes:

    • Options with a value type of stringArray mean that you can add multiple values. You can either separate values with a comma or add the flag twice.

    Examples

You can start an unattended MPI training Job named dist1, under Project team-a, using a quickstart-distributed image:

    runai submit-dist mpi --name dist1 --workers=2 -g 1 \
         -i runai.jfrog.io/demo/quickstart-distributed:v0.3.0 -e RUNAI_SLEEP_SECS=60
     

    (see: distributed training Quickstart).

    Options

    Distributed

--clean-pod-policy <string>

    The CleanPodPolicy controls deletion of pods when a job terminates. The policy can be one of the following values:

    • Running—only pods still running when a job completes (for example, parameter servers) will be deleted immediately. Completed pods will not be deleted so that the logs will be preserved. (Default)
    • All—all (including completed) pods will be deleted immediately when the job finishes.
    • None—no pods will be deleted when the job completes.

--workers <int>

The number of worker pods to launch for the distributed training Job.

--slots-per-worker <int>

Number of slots to allocate for each worker.

    Naming and Shortcuts

    --job-name-prefix <string>

    The prefix to use to automatically generate a Job name with an incremental index. When a Job name is omitted Run:ai will generate a Job name. The optional --job-name-prefix flag creates Job names with the provided prefix.

    --name <string>

    The name of the Job.

    --template <string>

    Load default values from a workload.

    Container Definition

    --add-capability <stringArray>

    Add linux capabilities to the container.

    -a | --annotation <stringArray>

Set annotations in the container.

    --attach

    Default is false. If set to true, wait for the Pod to start running. When the pod starts running, attach to the Pod. The flag is equivalent to the command runai attach.

    The --attach flag also sets --tty and --stdin to true.

    --command

Overrides the image's entry point with the command supplied after '--'. When not using the --command flag, the entry point will not be overridden and the string after -- will be appended as arguments to the entry point command.

Example:

--command -- run.sh 1 54 will start the container and run run.sh 1 54

-- script.py 10000 will append script.py 10000 as arguments to the entry point command (e.g. python)
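
A sketch of a full submission that uses --command (the image name and the script arguments are placeholders, not values from this guide):

  runai submit-dist mpi --name dist1 --workers=2 -g 1 -i <image_name> --command -- python train.py 1 54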

    --create-home-dir

    Create a temporary home directory for the user in the container. Data saved in this directory will not be saved when the container exits. For more information see non root containers.

-e <stringArray> | --environment

    Define environment variables to be set in the container. To set multiple values add the flag multiple times (-e BATCH_SIZE=50 -e LEARNING_RATE=0.2).

    --image <string> | -i <string>

    Image to use when creating the container for this Job

    --image-pull-policy <string>

    Pulling policy of the image when starting a container. Options are:

    • Always (default): force image pulling to check whether local image already exists. If the image already exists locally and has the same digest, then the image will not be downloaded.
    • IfNotPresent: the image is pulled only if it is not already present locally.
    • Never: the image is assumed to exist locally. No attempt is made to pull the image.

    For more information see Kubernetes documentation.

    -l | --label <stringArray>

Set labels in the container.

--master-args <string>

    Arguments to pass to the master pod container command. If used together with --command, overrides the image's entrypoint of the master pod container with the given command.

    --master-environment <stringArray>

Set environment variables in the master pod container. To prevent a worker environment variable from being set in the master, use the format: name=-.

    --master-extended-resource <stringArray>

    Request access to an extended resource in the master pod. Use the format: resource_name=quantity.

    --master-gpu <float>

    GPU units to allocate for the master pod.

    --master-no-pvcs

    Do not mount any persistent volumes in the master pod.

    --preferred-pod-topology-key <string>

    If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values.

    --required-pod-topology-key <string>

    Enforce scheduling pods of this job onto nodes that have a label with this key and identical values.

    --stdin

Keep stdin open for the container(s) in the pod, even if nothing is attached.

    -t | --tty

    Allocate a pseudo-TTY.

    --working-dir <string>

    Starts the container with the specified directory as the current directory.

    Resource Allocation

--cpu <double>

CPU units to allocate for the Job (0.5, 1, etc.). The Job will receive at least this amount of CPU. Note that the Job will not be scheduled unless the system can guarantee this amount of CPUs to the Job.

--cpu-limit <double>

Limitations on the number of CPUs consumed by the Job (for example 0.5, 1). The system guarantees that this Job will not be able to consume more than this amount of CPUs.

--extended-resource

Request access to an extended resource. Syntax: <resource-name>=<resource_quantity>.

-g | --gpu <float>

GPU units to allocate for the Job (0.5, 1).

--gpu-memory

GPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of GPU memory to the Job.

--memory <string>

CPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive at least this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of memory to the Job.

--memory-limit

CPU memory to allocate for this Job (1G, 20M, etc.). The system guarantees that this Job will not be able to consume more than this amount of memory. The Job will receive an error when trying to allocate more memory than this limit.

--mig-profile <string> (Deprecated)

MIG profile to allocate for the job (1g.5gb, 2g.10gb, 3g.20gb, 4g.20gb, 7g.40gb).

    Job Lifecycle

    --backoff-limit <int>

    The number of times the Job will be retried before failing. The default is 6.

--ttl-after-finish <duration>

    The duration, after which a finished job is automatically deleted (e.g. 5s, 2m, 3h).

    Storage

    --git-sync <stringArray>

    Clone a git repository into the container running the Job. The parameter should follow the syntax: source=REPOSITORY,branch=BRANCH_NAME,rev=REVISION,username=USERNAME,password=PASSWORD,target=TARGET_DIRECTORY_TO_CLONE.
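
For illustration only (a hypothetical public repository; this sketch assumes the rev, username, and password fields may be omitted when they are not needed):

  --git-sync source=https://github.com/example/repo.git,branch=main,target=/workspace/code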

    --large-shm

    Mount a large /dev/shm device.

    --mount-propagation

    Enable HostToContainer mount propagation for all container volumes

    --nfs-server <string>

    Use this flag to specify a default NFS host for --volume flag. Alternatively, you can specify NFS host for each volume individually (see --volume for details).

    --pvc [Storage_Class_Name]:Size:Container_Mount_Path:[ro]

    --pvc Pvc_Name:Container_Mount_Path:[ro]

    Mount a persistent volume claim into a container.

    Note

    This option is being deprecated from version 2.10 and above. To mount existing or newly created Persistent Volume Claim (PVC), use the parameters --pvc-exists and --pvc-new.

    The 2 syntax types of this command are mutually exclusive. You can either use the first or second form, but not a mixture of both.

    Storage_Class_Name is a storage class name that can be obtained by running kubectl get storageclasses.storage.k8s.io. This parameter may be omitted if there is a single storage class in the system, or you are using the default storage class.

    Size is the volume size you want to allocate. See Kubernetes documentation for how to specify volume sizes

    Container_Mount_Path. A path internal to the container where the storage will be mounted

    Pvc_Name. The name of a pre-existing Persistent Volume Claim to mount into the container

    Examples:

    --pvc :3Gi:/tmp/john:ro - Allocate 3GB from the default Storage class. Mount it to /tmp/john as read-only

    --pvc my-storage:3Gi:/tmp/john:ro - Allocate 3GB from the my-storage storage class. Mount it to /tmp/john as read-only

    --pvc :3Gi:/tmp/john - Allocate 3GB from the default storage class. Mount it to /tmp/john as read-write

    --pvc my-pvc:/tmp/john - Use a Persistent Volume Claim named my-pvc. Mount it to /tmp/john as read-write

    --pvc my-pvc-2:/tmp/john:ro - Use a Persistent Volume Claim named my-pvc-2. Mount it to /tmp/john as read-only

    --pvc-exists <string>

    Mount a persistent volume. You must include a claimname and path.

• claim name—The name of the persistent volume claim. Can be obtained by running

kubectl get pvc

    • path—the path internal to the container where the storage will be mounted

    Use the format:

    claimname=<CLAIM_NAME>,path=<PATH>

    --pvc-new <string>

    Mount a persistent volume claim (PVC). If the PVC does not exist, it will be created based on the parameters entered. If a PVC exists, it will be used with its defined attributes and the parameters in the command will be ignored.

• claim name—The name of the persistent volume claim.
    • storage class—A storage class name that can be obtained by running

    kubectl get storageclasses.storage.k8s.io.

    storageclass may be omitted if there is a single storage class in the system, or you are using the default storage class.

    • size—The volume size you want to allocate for the PVC when creating it. See Kubernetes documentation to specify volume sizes.
• accessmode—The description of the desired volume capabilities for the PVC.
    • ro—Mount the PVC with read-only access.
    • ephemeral—The PVC will be created as volatile temporary storage which is only present during the running lifetime of the job.

    Use the format:

storageclass=<storageclass>,size=<size>,path=<path>,ro,accessmode=rwm
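
For example, a sketch following the format above (the storage class name standard is an assumption about your cluster), creating or reusing a 10Gi claim mounted read-only at /data:

  --pvc-new storageclass=standard,size=10Gi,path=/data,ro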

    --s3 <string>

    Mount an S3 compatible storage into the container running the job. The parameter should follow the syntax:

    bucket=BUCKET,key=KEY,secret=SECRET,url=URL,target=TARGET_PATH

    All the fields, except url=URL, are mandatory. Default for url is

url=https://s3.amazonaws.com

    -v | --volume 'Source:Container_Mount_Path:[ro]:[nfs-host]'

    Volumes to mount into the container.

    Examples:

-v /raid/public/john/data:/root/data:ro

Mount the local path /raid/public/john/data into the container at /root/data, read-only.

-v /public/data:/root/data::nfs.example.com

Mount the NFS path /public/data from the NFS server nfs.example.com into the container at /root/data, read-write.

--configmap-volume name=<name>,path=<path>

    Mount a ConfigMap object for use as a data volume.

    Network

    --address <string>

    Comma separated list of IP addresses to listen to when running with --service-type portforward (default: localhost)

    --host-ipc

    Use the host's ipc namespace. Controls whether the pod containers can share the host IPC namespace. IPC (POSIX/SysV IPC) namespace provides separation of named shared memory segments, semaphores, and message queues. Shared memory segments are used to accelerate inter-process communication at memory speed, rather than through pipes or the network stack.

    For further information see docker run reference documentation.

    --host-network

Use the host's network stack inside the container. For further information see the docker run reference documentation.

    --port <stringArray>

    Expose ports from the Job container.

    -s | --service-type <string>

    External access type to interactive jobs. Options are: portforward, loadbalancer, nodeport, ingress.

    Access Control

    --allow-privilege-escalation

    Allow the job to gain additional privileges after start.

    --run-as-user

    Run in the context of the current user running the Run:ai command rather than the root user. While the default container user is root (same as in Docker), this command allows you to submit a Job running under your Linux user. This would manifest itself in access to operating system resources, in the owner of new folders created under shared directories, etc. Alternatively, if your cluster is connected to Run:ai via SAML, you can map the container to use the Linux UID/GID which is stored in the organization's directory. For more information see non root containers.

    Scheduling

    --node-pools <string>

Instructs the scheduler to run this workload using a specific set of nodes which are part of a Node Pool. You can specify one or more node pools to form a prioritized list of node pools that the scheduler will use to find one node pool that can provide the workload's specification. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group or use existing node labels, then create a node-pool and assign the label to the node-pool. This flag can be used in conjunction with node-type and Project-based affinity. In this case, the flag is used to refine the list of allowable node groups set from a node-pool. For more information see: Working with Projects.

    --node-type <string>

    Allows defining specific Nodes (machines) or a group of Nodes on which the workload will run. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group.

    --toleration <string>

    Specify one or more toleration criteria, to ensure that the workload is not scheduled onto an inappropriate node. This is done by matching the workload tolerations to the taints defined for each node. For further details see Kubernetes Taints and Tolerations Guide.

    The format of the string:

    operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]
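
For example, a sketch following this format (the taint key and value are assumptions about how your nodes are tainted):

  --toleration operator=Equal,key=dedicated,value=gpu,effect=NoSchedule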

    Global Flags

    --loglevel (string)

    Set the logging level. One of: debug | info | warn | error (default "info")

    --project | -p (string)

    Specify the Project to which the command applies. Run:ai Projects are used by the scheduler to calculate resource eligibility. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

    --help | -h

    Show help text.

    Output

The command will attempt to submit an MPI Job. You can follow up on the Job by running runai list jobs or runai describe job <job-name>.

    See Also

    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-submit-dist-pytorch/index.html b/v2.20/Researcher/cli-reference/runai-submit-dist-pytorch/index.html index 39fd99e6a0..5a3beb742a 100644 --- a/v2.20/Researcher/cli-reference/runai-submit-dist-pytorch/index.html +++ b/v2.20/Researcher/cli-reference/runai-submit-dist-pytorch/index.html @@ -1,4 +1,4 @@ - runai submit-dist pytorch - Run:ai Documentation Library

    runai submit-dist pytorch

    Description

Submit a distributed PyTorch training Run:ai Job to run.

Note

To use distributed training you need to have installed the PyTorch operator as specified in Distributed training.

    Syntax notes:

    • Options with a value type of stringArray mean that you can add multiple values. You can either separate values with a comma or add the flag twice.

    Examples

    runai submit-dist pytorch --name distributed-job --workers=2 -g 1 \
         -i <image_name>
     

    Options

    Distributed

--clean-pod-policy <string>

    The CleanPodPolicy controls deletion of pods when a job terminates. The policy can be one of the following values:

    • Running—only pods still running when a job completes (for example, parameter servers) will be deleted immediately. Completed pods will not be deleted so that the logs will be preserved. (Default)
    • All—all (including completed) pods will be deleted immediately when the job finishes.
    • None—no pods will be deleted when the job completes.

--max-replicas <int>

Maximum number of replicas for an elastic PyTorch job.

--min-replicas <int>

Minimum number of replicas for an elastic PyTorch job.

--workers <int>

The number of worker pods to launch for the distributed training Job.
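
For example, a sketch of an elastic submission that may scale between two and four replicas (the image name is a placeholder):

  runai submit-dist pytorch --name elastic-job --min-replicas 2 --max-replicas 4 -g 1 \
       -i <image_name>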

    Naming and Shortcuts

    --job-name-prefix <string>

    The prefix to use to automatically generate a Job name with an incremental index. When a Job name is omitted Run:ai will generate a Job name. The optional --job-name-prefix flag creates Job names with the provided prefix.

    --name <string>

    The name of the Job.

    --template <string>

    Load default values from a workload.

    Container Definition

    --add-capability <stringArray>

    Add linux capabilities to the container.

    -a | --annotation <stringArray>

Set annotations in the container.

    --attach

    Default is false. If set to true, wait for the Pod to start running. When the pod starts running, attach to the Pod. The flag is equivalent to the command runai attach.

    The --attach flag also sets --tty and --stdin to true.

    --command

Overrides the image's entry point with the command supplied after '--'. When not using the --command flag, the entry point will not be overridden and the string after -- will be appended as arguments to the entry point command.

Example:

--command -- run.sh 1 54 will start the container and run run.sh 1 54

-- script.py 10000 will append script.py 10000 as arguments to the entry point command (e.g. python)

    --create-home-dir

    Create a temporary home directory for the user in the container. Data saved in this directory will not be saved when the container exits. For more information see non root containers.

-e <stringArray> | --environment

    Define environment variables to be set in the container. To set multiple values add the flag multiple times (-e BATCH_SIZE=50 -e LEARNING_RATE=0.2).

    --image <string> | -i <string>

    Image to use when creating the container for this Job.

    --image-pull-policy <string>

    Pulling policy of the image when starting a container. Options are:

    • Always (default): force image pulling to check whether local image already exists. If the image already exists locally and has the same digest, then the image will not be downloaded.
    • IfNotPresent: the image is pulled only if it is not already present locally.
    • Never: the image is assumed to exist locally. No attempt is made to pull the image.

    For more information see Kubernetes documentation.

    -l | --label <stringArray>

Set labels in the container.

--master-args <string>

    Arguments to pass to the master pod container command. If used together with --command, overrides the image's entrypoint of the master pod container with the given command.

    --master-environment <stringArray>

Set environment variables in the master pod container. To prevent a worker environment variable from being set in the master, use the format: name=-.

    --master-extended-resource <stringArray>

    Request access to an extended resource in the master pod. Use the format: resource_name=quantity.

    --master-gpu <float>

    GPU units to allocate for the master pod.

    --master-no-pvcs

    Do not mount any persistent volumes in the master pod.

    --no-master

    Do not create a separate pod for the master.

    --preferred-pod-topology-key <string>

    If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values.

    --required-pod-topology-key <string>

    Enforce scheduling pods of this job onto nodes that have a label with this key and identical values.

    --stdin

Keep stdin open for the container(s) in the pod, even if nothing is attached.

    -t | --tty

    Allocate a pseudo-TTY.

    --working-dir <string>

    Starts the container with the specified directory as the current directory.

    Resource Allocation

--cpu <double>

CPU units to allocate for the Job (0.5, 1, etc.). The Job will receive at least this amount of CPU. Note that the Job will not be scheduled unless the system can guarantee this amount of CPUs to the Job.

--cpu-limit <double>

Limitations on the number of CPUs consumed by the Job (for example 0.5, 1). The system guarantees that this Job will not be able to consume more than this amount of CPUs.

--extended-resource

Request access to an extended resource. Syntax: <resource-name>=<resource_quantity>.

-g | --gpu <float>

GPU units to allocate for the Job (0.5, 1).

--gpu-memory

GPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of GPU memory to the Job.

--memory <string>

CPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive at least this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of memory to the Job.

--memory-limit

CPU memory to allocate for this Job (1G, 20M, etc.). The system guarantees that this Job will not be able to consume more than this amount of memory. The Job will receive an error when trying to allocate more memory than this limit.

--mig-profile <string> (Deprecated)

MIG profile to allocate for the job (1g.5gb, 2g.10gb, 3g.20gb, 4g.20gb, 7g.40gb).

    Job Lifecycle

    --backoff-limit <int>

    The number of times the Job will be retried before failing. The default is 6.

--ttl-after-finish <duration>

    The duration, after which a finished job is automatically deleted (e.g. 5s, 2m, 3h).

    Storage

    --git-sync <stringArray>

    Clone a git repository into the container running the Job. The parameter should follow the syntax: source=REPOSITORY,branch=BRANCH_NAME,rev=REVISION,username=USERNAME,password=PASSWORD,target=TARGET_DIRECTORY_TO_CLONE.

    --large-shm

    Mount a large /dev/shm device.

    --mount-propagation

    Enable HostToContainer mount propagation for all container volumes

    --nfs-server <string>

    Use this flag to specify a default NFS host for --volume flag. Alternatively, you can specify NFS host for each volume individually (see --volume for details).

    --pvc [Storage_Class_Name]:Size:Container_Mount_Path:[ro]

    --pvc Pvc_Name:Container_Mount_Path:[ro]

    Mount a persistent volume claim into a container.

    Note

    This option is being deprecated from version 2.10 and above. To mount existing or newly created Persistent Volume Claim (PVC), use the parameters --pvc-exists and --pvc-new.

    The 2 syntax types of this command are mutually exclusive. You can either use the first or second form, but not a mixture of both.

    Storage_Class_Name is a storage class name that can be obtained by running kubectl get storageclasses.storage.k8s.io. This parameter may be omitted if there is a single storage class in the system, or you are using the default storage class.

    Size is the volume size you want to allocate. See Kubernetes documentation for how to specify volume sizes

    Container_Mount_Path. A path internal to the container where the storage will be mounted

    Pvc_Name. The name of a pre-existing Persistent Volume Claim to mount into the container

    Examples:

    --pvc :3Gi:/tmp/john:ro - Allocate 3GB from the default Storage class. Mount it to /tmp/john as read-only

    --pvc my-storage:3Gi:/tmp/john:ro - Allocate 3GB from the my-storage storage class. Mount it to /tmp/john as read-only

    --pvc :3Gi:/tmp/john - Allocate 3GB from the default storage class. Mount it to /tmp/john as read-write

    --pvc my-pvc:/tmp/john - Use a Persistent Volume Claim named my-pvc. Mount it to /tmp/john as read-write

    --pvc my-pvc-2:/tmp/john:ro - Use a Persistent Volume Claim named my-pvc-2. Mount it to /tmp/john as read-only

    --pvc-exists <string>

    Mount a persistent volume. You must include a claimname and path.

• claim name—The name of the persistent volume claim. Can be obtained by running

kubectl get pvc

    • path—the path internal to the container where the storage will be mounted

    Use the format:

    claimname=<CLAIM_NAME>,path=<PATH>
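
For example, a sketch using a hypothetical claim name, mounted at /data:

  --pvc-exists claimname=my-claim,path=/data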

    --pvc-new <string>

    Mount a persistent volume claim (PVC). If the PVC does not exist, it will be created based on the parameters entered. If a PVC exists, it will be used with its defined attributes and the parameters in the command will be ignored.

• claim name—The name of the persistent volume claim.
    • storage class—A storage class name that can be obtained by running

    kubectl get storageclasses.storage.k8s.io.

    storageclass may be omitted if there is a single storage class in the system, or you are using the default storage class.

    • size—The volume size you want to allocate for the PVC when creating it. See Kubernetes documentation to specify volume sizes.
• accessmode—The description of the desired volume capabilities for the PVC.
    • ro—Mount the PVC with read-only access.
    • ephemeral—The PVC will be created as volatile temporary storage which is only present during the running lifetime of the job.

    Use the format:

storageclass=<storageclass>,size=<size>,path=<path>,ro,accessmode=rwm

    --s3 <string>

    Mount an S3 compatible storage into the container running the job. The parameter should follow the syntax:

    bucket=BUCKET,key=KEY,secret=SECRET,url=URL,target=TARGET_PATH

    All the fields, except url=URL, are mandatory. Default for url is

url=https://s3.amazonaws.com
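
For illustration only (bucket name, credentials, and target path are placeholders; url is omitted so the default above applies):

  --s3 bucket=my-bucket,key=<ACCESS_KEY>,secret=<SECRET_KEY>,target=/data/s3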

    -v | --volume 'Source:Container_Mount_Path:[ro]:[nfs-host]'

    Volumes to mount into the container.

    Examples:

-v /raid/public/john/data:/root/data:ro

Mount the local path /raid/public/john/data into the container at /root/data, read-only.

-v /public/data:/root/data::nfs.example.com

Mount the NFS path /public/data from the NFS server nfs.example.com into the container at /root/data, read-write.

--configmap-volume name=<name>,path=<path>

    Mount a ConfigMap object for use as a data volume.

    Network

    --address <string>

    Comma separated list of IP addresses to listen to when running with --service-type portforward (default: localhost)

    --host-ipc

    Use the host's ipc namespace. Controls whether the pod containers can share the host IPC namespace. IPC (POSIX/SysV IPC) namespace provides separation of named shared memory segments, semaphores, and message queues. Shared memory segments are used to accelerate inter-process communication at memory speed, rather than through pipes or the network stack.

    For further information see docker run reference documentation.

    --host-network

Use the host's network stack inside the container. For further information see the docker run reference documentation.

    --port <stringArray>

    Expose ports from the Job container.

    -s | --service-type <string>

    External access type to interactive jobs. Options are: portforward, loadbalancer, nodeport, ingress.

    Access Control

    --allow-privilege-escalation

    Allow the job to gain additional privileges after start.

    --run-as-user

    Run in the context of the current user running the Run:ai command rather than the root user. While the default container user is root (same as in Docker), this command allows you to submit a Job running under your Linux user. This would manifest itself in access to operating system resources, in the owner of new folders created under shared directories, etc. Alternatively, if your cluster is connected to Run:ai via SAML, you can map the container to use the Linux UID/GID which is stored in the organization's directory. For more information see non root containers.

    Scheduling

    --node-pools <string>

Instructs the scheduler to run this workload using a specific set of nodes which are part of a Node Pool. You can specify one or more node pools to form a prioritized list of node pools that the scheduler will use to find one node pool that can provide the workload's specification. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group or use existing node labels, then create a node-pool and assign the label to the node-pool. This flag can be used in conjunction with node-type and Project-based affinity. In this case, the flag is used to refine the list of allowable node groups set from a node-pool. For more information see: Working with Projects.

    --node-type <string>

    Allows defining specific Nodes (machines) or a group of Nodes on which the workload will run. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group.

    --toleration <string>

    Specify one or more toleration criteria, to ensure that the workload is not scheduled onto an inappropriate node. This is done by matching the workload tolerations to the taints defined for each node. For further details see Kubernetes Taints and Tolerations Guide.

    The format of the string:

    operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]

    Global Flags

    --loglevel (string)

    Set the logging level. One of: debug | info | warn | error (default "info")

    --project | -p (string)

    Specify the Project to which the command applies. Run:ai Projects are used by the scheduler to calculate resource eligibility. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

    --help | -h

    Show help text.

    Output

The command will attempt to submit a distributed PyTorch workload. You can follow up on the workload by running runai list jobs or runai describe job <job-name>.

    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-submit-dist-xgboost/index.html b/v2.20/Researcher/cli-reference/runai-submit-dist-xgboost/index.html index ca128f3a64..0b51e594c6 100644 --- a/v2.20/Researcher/cli-reference/runai-submit-dist-xgboost/index.html +++ b/v2.20/Researcher/cli-reference/runai-submit-dist-xgboost/index.html @@ -1,4 +1,4 @@ - runai submit-dist xgboost - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-submit/index.html b/v2.20/Researcher/cli-reference/runai-submit/index.html index e4177984de..924a218060 100644 --- a/v2.20/Researcher/cli-reference/runai-submit/index.html +++ b/v2.20/Researcher/cli-reference/runai-submit/index.html @@ -1,4 +1,4 @@ - runai submit - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-suspend/index.html b/v2.20/Researcher/cli-reference/runai-suspend/index.html index ca3e0c6d3b..adc1b84148 100644 --- a/v2.20/Researcher/cli-reference/runai-suspend/index.html +++ b/v2.20/Researcher/cli-reference/runai-suspend/index.html @@ -1,4 +1,4 @@ - runai suspend - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-top-node/index.html b/v2.20/Researcher/cli-reference/runai-top-node/index.html index b90f6a0717..5d1819957a 100644 --- a/v2.20/Researcher/cli-reference/runai-top-node/index.html +++ b/v2.20/Researcher/cli-reference/runai-top-node/index.html @@ -1,4 +1,4 @@ - runai top node - Run:ai Documentation Library

    runai top node

    Description

    Show list of Nodes (machines), their capacity and utilization.

    Synopsis

    runai top node 
         [--help | -h]
         [--details | -d]

    Options

    Global Flags

    --loglevel (string)

    Set the logging level. One of: debug | info | warn | error (default "info").

    --help | -h

    Show help text.

    --details | -d

    Show additional details.

    Output

Shows a list of Nodes, their capacity and utilization.

    See Also

    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-update/index.html b/v2.20/Researcher/cli-reference/runai-update/index.html index e6bb5c05cd..3f0228b7af 100644 --- a/v2.20/Researcher/cli-reference/runai-update/index.html +++ b/v2.20/Researcher/cli-reference/runai-update/index.html @@ -1,4 +1,4 @@ - runai update - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-version/index.html b/v2.20/Researcher/cli-reference/runai-version/index.html index 11d24dd8ed..02961abc6f 100644 --- a/v2.20/Researcher/cli-reference/runai-version/index.html +++ b/v2.20/Researcher/cli-reference/runai-version/index.html @@ -1,4 +1,4 @@ - runai version - Run:ai Documentation Library

    runai version

    Description

    Show the version of this utility.

    Synopsis

    runai version 
         [--loglevel value] 
         [--help | -h]

    Options

    --loglevel (string)

    Set the logging level. One of: debug | info | warn | error (default "info").

    --help | -h

    Show help text.

    Output

    The version of the Run:ai command-line interface.

    See Also

    \ No newline at end of file diff --git a/v2.20/Researcher/cli-reference/runai-whoami/index.html b/v2.20/Researcher/cli-reference/runai-whoami/index.html index 435629e0a8..44dc5f97f1 100644 --- a/v2.20/Researcher/cli-reference/runai-whoami/index.html +++ b/v2.20/Researcher/cli-reference/runai-whoami/index.html @@ -1,4 +1,4 @@ - runai whoami - Run:ai Documentation Library

    runai whoami

    Description

Show the name of the user currently logged in.

    Synopsis

    runai whoami 
         [--loglevel value] 
         [--help | -h]

    Options

    --loglevel (string)

    Set the logging level. One of: debug | info | warn | error (default "info").

    --help | -h

    Show help text.

    Output

    The name of the User currently logged in with the Run:ai command-line interface.

    See Also

    \ No newline at end of file diff --git a/v2.20/Researcher/overview-researcher/index.html b/v2.20/Researcher/overview-researcher/index.html index 7d6c577cad..d28df7079c 100644 --- a/v2.20/Researcher/overview-researcher/index.html +++ b/v2.20/Researcher/overview-researcher/index.html @@ -1,4 +1,4 @@ - Researcher Documentation Overview - Run:ai Documentation Library

    Overview: Researcher Documentation

    Researchers, or AI practitioners, use Run:ai to submit Workloads.

    As part of the Researcher documentation you will find:

    • Quickstart Guides which provide step-by-step guides to Run:ai technology.
    • Command line interface reference documentation.
    • Best Practices for Deep Learning with Run:ai.
    • Information about the Run:ai Scheduler.
    • Using Run:ai with various developer tools.

    \ No newline at end of file diff --git a/v2.20/Researcher/scheduling/GPU-time-slicing-scheduler/index.html b/v2.20/Researcher/scheduling/GPU-time-slicing-scheduler/index.html index d4d27376b6..5844584304 100644 --- a/v2.20/Researcher/scheduling/GPU-time-slicing-scheduler/index.html +++ b/v2.20/Researcher/scheduling/GPU-time-slicing-scheduler/index.html @@ -1,4 +1,4 @@ - GPU Time Slicing Scheduler - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/scheduling/allocation-of-cpu-and-memory/index.html b/v2.20/Researcher/scheduling/allocation-of-cpu-and-memory/index.html index 6c45667340..2898f15b71 100644 --- a/v2.20/Researcher/scheduling/allocation-of-cpu-and-memory/index.html +++ b/v2.20/Researcher/scheduling/allocation-of-cpu-and-memory/index.html @@ -1,4 +1,4 @@ - Allocation of CPU and Memory - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/scheduling/dynamic-gpu-fractions/index.html b/v2.20/Researcher/scheduling/dynamic-gpu-fractions/index.html index d6b4effb4f..67be4a1d33 100644 --- a/v2.20/Researcher/scheduling/dynamic-gpu-fractions/index.html +++ b/v2.20/Researcher/scheduling/dynamic-gpu-fractions/index.html @@ -1,4 +1,4 @@ - Dynamic GPU Fractions - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/scheduling/fractions/index.html b/v2.20/Researcher/scheduling/fractions/index.html index ae035dbe7c..df34449a2e 100644 --- a/v2.20/Researcher/scheduling/fractions/index.html +++ b/v2.20/Researcher/scheduling/fractions/index.html @@ -1,4 +1,4 @@ - Allocation of GPU Fractions - Run:ai Documentation Library

    Allocation of GPU Fractions

    Introduction

A single GPU has a significant amount of memory, ranging from a couple of gigabytes in older generations up to 80GB per GPU in the latest NVIDIA models. A single GPU also has a vast amount of computing power.

This amount of memory and computing power is important for processing large amounts of data, such as when training deep learning models. However, quite a few applications do not need this much power; examples include inference workloads and the model-creation phase. It would therefore be convenient to divide a GPU between various workloads, thus achieving better GPU utilization.

This article describes the Run:ai Fractions technology, which allows GPUs to be divided, and how to use it with Run:ai.

    Run:ai Fractions

Run:ai provides the capability to allocate a container with a specific amount of GPU RAM. As a researcher, if you know that your code needs 4GB of GPU RAM, you can submit a job with the flag --gpu-memory 4G to specify the exact portion of the GPU memory that you need. Run:ai allocates that specific amount of GPU RAM to your container. Attempting to reach beyond your allotted RAM results in an out-of-memory exception.

You can also use the flag --gpu 0.2 to get 20% of the memory on the GPU assigned to you.
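For example (a minimal sketch; the job names and image are placeholders):

    runai submit frac-mem -i <image> --gpu-memory 4G      # exactly 4GB of GPU memory
    runai submit frac-portion -i <image> --gpu 0.2        # 20% of the GPU's memory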

    For more details on Run:ai fractions see the fractions quickstart.

    Limitation

With the Fractions technology, all running workloads that utilize the GPU share the compute in parallel and, on average, get an even share of it. For example, given two containers, one with a 0.25 GPU workload and the other with a 0.75 GPU workload, both get (on average) an equal part of the computation power. If one of the workloads does not utilize the GPU, the other workload gets the entire GPU's compute power.

    Info

For interoperability with other Kubernetes schedulers, Run:ai creates special reservation pods. Once a workload has been allocated a fraction of a GPU, Run:ai creates a pod in a dedicated runai-reservation namespace that holds the full GPU as a resource. This lets other schedulers understand that the GPU is reserved.
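If you have kubectl access to the cluster, you can see these reservation pods directly:

    kubectl get pods -n runai-reservation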

    Multi-GPU Fractions

Run:ai also supports workload submission using multi-GPU fractions. Multi-GPU fractions work similarly to fractional single-GPU workloads, except that the Run:ai Scheduler allocates the same fraction size on multiple GPU devices within the same node. For example, if practitioners develop a new model that uses 8 GPUs and requires 40GB of memory per GPU, they can allocate 8×40GB with multi-GPU fractions instead of reserving the full memory of each GPU (for example, 80GB). This leaves 40GB of GPU memory available on each of the 8 GPUs for other workloads within that node. This is useful during model development, where memory requirements are usually lower due to experimentation with smaller models or configurations.

This approach significantly improves GPU utilization and availability, enabling more precise and often smaller quota requirements for the end user. Time sharing, where a single GPU can serve multiple fractional workloads, remains unchanged; now the GPU can serve workloads that use multiple GPUs per workload, a single GPU per workload, or a mix of both.

    Configuring Multi-GPU Fractions

Using the compute resources asset, you can define a compute requirement that spans multiple GPU devices by specifying either a fraction (percentage) of the overall memory or a memory request (GB, MB) per device.

    See Also

    \ No newline at end of file diff --git a/v2.20/Researcher/scheduling/gpu-memory-swap/index.html b/v2.20/Researcher/scheduling/gpu-memory-swap/index.html index 6ca9c8b73f..8eb1c1713a 100644 --- a/v2.20/Researcher/scheduling/gpu-memory-swap/index.html +++ b/v2.20/Researcher/scheduling/gpu-memory-swap/index.html @@ -1,4 +1,4 @@ - GPU Memory SWAP - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/scheduling/node-level-scheduler/index.html b/v2.20/Researcher/scheduling/node-level-scheduler/index.html index 6f697bc1e1..c2031ddba9 100644 --- a/v2.20/Researcher/scheduling/node-level-scheduler/index.html +++ b/v2.20/Researcher/scheduling/node-level-scheduler/index.html @@ -1,4 +1,4 @@ - Optimize performance with Node Level Scheduler - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/scheduling/schedule-to-aws-groups/index.html b/v2.20/Researcher/scheduling/schedule-to-aws-groups/index.html index e8c9784b92..3b46590a73 100644 --- a/v2.20/Researcher/scheduling/schedule-to-aws-groups/index.html +++ b/v2.20/Researcher/scheduling/schedule-to-aws-groups/index.html @@ -1,4 +1,4 @@ - Scheduling workloads to AWS placement groups - Run:ai Documentation Library

    Scheduling workloads to AWS placement groups

    Run:ai supports AWS placement groups when building and submitting a job. AWS Placement Groups are used to maximize throughput and performance of distributed training workloads.

    To enable and configure this feature:

1. Press Jobs | New job.
2. In Scheduling and lifecycle, enable Topology aware scheduling.
3. In Topology key, enter the node label that defines the topology (see the example after this list).
4. In Scheduling rule, choose Required or Preferred from the drop-down.

  • Required—all pods must be scheduled to the same placement group.
  • Preferred—a best-effort attempt to place as many pods as possible in the same placement group.
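The topology key refers to a node label shared by all nodes that belong to the same placement group. As a sketch (the label key and value below are assumptions; use the label your Administrator actually applied), nodes can be labeled with:

    kubectl label node <node-name> placement-group=pg-1

The label key (placement-group in this sketch) is then the value entered in the Topology key field.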

    \ No newline at end of file diff --git a/v2.20/Researcher/scheduling/the-runai-scheduler/index.html b/v2.20/Researcher/scheduling/the-runai-scheduler/index.html index 95086a5833..ecb1532bab 100644 --- a/v2.20/Researcher/scheduling/the-runai-scheduler/index.html +++ b/v2.20/Researcher/scheduling/the-runai-scheduler/index.html @@ -1,4 +1,4 @@ - The Run:ai Scheduler - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/tools/dev-jupyter/index.html b/v2.20/Researcher/tools/dev-jupyter/index.html index f80b2b8f28..c79dbf4b4a 100644 --- a/v2.20/Researcher/tools/dev-jupyter/index.html +++ b/v2.20/Researcher/tools/dev-jupyter/index.html @@ -1,4 +1,4 @@ - Jupyter Notebook - Run:ai Documentation Library

    Use a Jupyter Notebook with a Run:ai Job

    See the Jupyter Notebook Quickstart here.

    \ No newline at end of file diff --git a/v2.20/Researcher/tools/dev-pycharm/index.html b/v2.20/Researcher/tools/dev-pycharm/index.html index afb943af2a..0b7b2602c8 100644 --- a/v2.20/Researcher/tools/dev-pycharm/index.html +++ b/v2.20/Researcher/tools/dev-pycharm/index.html @@ -1,4 +1,4 @@ - PyCharm - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/tools/dev-tensorboard/index.html b/v2.20/Researcher/tools/dev-tensorboard/index.html index 648342bc51..aeacd0e79b 100644 --- a/v2.20/Researcher/tools/dev-tensorboard/index.html +++ b/v2.20/Researcher/tools/dev-tensorboard/index.html @@ -1,4 +1,4 @@ - TensorBoard - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/tools/dev-vscode/index.html b/v2.20/Researcher/tools/dev-vscode/index.html index 4b12c57ac6..bc9fde65ca 100644 --- a/v2.20/Researcher/tools/dev-vscode/index.html +++ b/v2.20/Researcher/tools/dev-vscode/index.html @@ -1,4 +1,4 @@ - Visual Studio Code - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/tools/dev-x11forward-pycharm/index.html b/v2.20/Researcher/tools/dev-x11forward-pycharm/index.html index 5193d6c281..53671e4754 100644 --- a/v2.20/Researcher/tools/dev-x11forward-pycharm/index.html +++ b/v2.20/Researcher/tools/dev-x11forward-pycharm/index.html @@ -1,4 +1,4 @@ - X11 & PyCharm - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/use-cases/index.html b/v2.20/Researcher/use-cases/index.html index db40ebe1e8..68826a804c 100644 --- a/v2.20/Researcher/use-cases/index.html +++ b/v2.20/Researcher/use-cases/index.html @@ -1,4 +1,4 @@ - Use Cases - Run:ai Documentation Library

    Use Cases

    This is a collection of various client-requested use cases. Each use case is accompanied by a short live-demo video, along with all the files used.

    Note

    For the most up-to-date information, check out the official Run:ai use-cases GitHub page.

    • MLflow with Run:ai: experiment management is important for Data Scientists. This is a demo of how to set up and use MLflow with Run:ai.
    • Introduction to Docker: Run:ai runs using Docker images. This is a brief introduction to Docker, image creation, and how to use them in the context of Run:ai. Please also check out the Persistent Environments use case if you wish to keep the creation of Docker images to a minimum.
• Tensorboard with Jupyter (ResNet demo): Many Data Scientists like to use Tensorboard to keep an eye on their current training experiments. They also like to have it side-by-side with Jupyter. In this demo, we will show how to integrate Tensorboard and Jupyter Lab within the context of Run:ai.
    • Persistent Environments (with Conda/Mamba & Jupyter): Some Data Scientists find creating Docker images for every single one of their environments a bit of a hindrance. They would often prefer the ability to create and alter environments on the fly and to have those environments remain, even after an image has finished running in a job. This demo shows users how they can create and persist Conda/Mamba environments using an NFS.
• Weights & Biases with Run:ai: W&B (Weights & Biases) is one of the best tools for experiment tracking and management. W&B is an official Run:ai partner. In this tutorial, we will demo how to use W&B alongside Run:ai.

    \ No newline at end of file diff --git a/v2.20/Researcher/workloads/assets/compute/index.html b/v2.20/Researcher/workloads/assets/compute/index.html index 6c582aa525..6f0ca2c3e8 100644 --- a/v2.20/Researcher/workloads/assets/compute/index.html +++ b/v2.20/Researcher/workloads/assets/compute/index.html @@ -1,4 +1,4 @@ - Compute Resources - Run:ai Documentation Library

    Compute Resources

    This article explains what compute resources are and how to create and use them.

    Compute resources are one type of workload asset. A compute resource is a template that simplifies how workloads are submitted and can be used by AI practitioners when they submit their workloads.

    A compute resource asset is a preconfigured building block that encapsulates all the specifications of compute requirements for the workload including:

    • GPU devices and GPU memory
    • CPU memory and CPU compute

    Compute resource table

    The Compute resource table can be found under Workload manager in the Run:ai UI.

    The Compute resource table provides a list of all the compute resources defined in the platform and allows you to manage them.

    The Compute resource table consists of the following columns:

• Compute resource - The name of the compute resource
• Description - A description of the essence of the compute resource
• GPU devices request per pod - The number of requested physical devices per pod of the workload that uses this compute resource
• GPU memory request per device - The amount of GPU memory per requested device that is granted to each pod of the workload that uses this compute resource
• CPU memory request - The minimum amount of CPU memory per pod of the workload that uses this compute resource
• CPU memory limit - The maximum amount of CPU memory per pod of the workload that uses this compute resource
• CPU compute request - The minimum number of CPU cores per pod of the workload that uses this compute resource
• CPU compute limit - The maximum number of CPU cores per pod of the workload that uses this compute resource
• Scope - The scope of this compute resource within the organizational tree. Click the name of the scope to view the organizational tree diagram
• Workload(s) - The list of workloads associated with the compute resource
• Template(s) - The list of workload templates that use this compute resource
• Created by - The name of the user who created the compute resource
• Creation time - The timestamp of when the compute resource was created
• Last updated - The timestamp of when the compute resource was last updated
• Cluster - The cluster that the compute resource is associated with

    Workloads associated with the compute resource

    Click one of the values in the Workload(s) column to view the list of workloads and their parameters.

• Workload - The workload that uses the compute resource
• Type - Workspace/Training/Inference
• Status - Represents the workload lifecycle. See the full list of workload statuses.

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table

Adding a new compute resource

    To add a new compute resource:

    1. Go to the Compute resource table
    2. Click +NEW COMPUTE RESOURCE
    3. Select under which cluster to create the compute resource
    4. Select a scope
    5. Enter a name for the compute resource. The name must be unique.
    6. Optional: Provide a description of the essence of the compute resource
    7. Set the resource types needed within a single node
      (the Run:ai scheduler tries to match a single node that complies with the compute resource for each of the workload’s pods)

      • GPU

        • GPU devices per pod
          The number of devices (physical GPUs) per pod
          (for example, if you requested 3 devices per pod and the running workload using this compute resource consists of 3 pods, there are 9 physical GPU devices used in total)

        Note

• When set to zero, the workload using this compute resource neither requests nor uses GPU resources while running
• You can set any number of GPU devices, and specify the memory requirement either as a portion of a device (1..100%) or as a memory size in GB or MB per device
        • GPU memory per device
          • Select the memory request format
            • % (of device) - Fraction of a GPU device’s memory
            • MB (memory size) - An explicit GPU memory unit
            • GB (memory size) - An explicit GPU memory unit
• Set the memory Request - The minimum amount of GPU memory that is provisioned per device. This means that any pod of a running workload that uses this compute resource receives this amount of GPU memory for each device the pod utilizes
• Optional: Set the memory Limit - The maximum amount of GPU memory that is provisioned per device. This means that any pod of a running workload that uses this compute resource receives at most this amount of GPU memory for each device the pod utilizes.
            To set a Limit, first enable the limit toggle. The limit value must be equal to or higher than the request.

        Note

        • GPU memory limit is disabled by default. If you cannot see the Limit toggle in the compute resource form, then it must be enabled by your Administrator, under General settings → Resources → GPU resource optimization
        • When a Limit is set and is bigger than the Request, the scheduler allows each pod to reach the maximum amount of GPU memory in an opportunistic manner (only upon availability).
• If the GPU Memory Limit is bigger than the Request, the pod is prone to being killed by the Run:ai toolkit (out-of-memory signal). The greater the difference between the GPU memory used and the request, the higher the risk of being killed
        • If GPU resource optimization is turned off, the minimum and maximum are in fact equal
      • CPU

        • CPU compute per pod
          • Select the units for the CPU compute (Cores / Millicores)
          • Set the CPU compute Request - the minimum amount of CPU compute that is provisioned per pod. This means that any pod of a running workload that uses this compute resource, receives this amount of CPU compute for each pod.
          • Optional: Set the CPU compute Limit - The maximum amount of CPU compute that is provisioned per pod. This means that any pod of a running workload that uses this compute resource, receives at most this amount of CPU compute.
            To set a Limit, first enable the limit toggle. The limit value must be equal to or higher than the request.
            By default, the limit is set to “Unlimited” - which means that the pod may consume all the node's free CPU compute resources.
        • CPU memory per pod
          • Select the units for the CPU memory (MB / GB)
          • Set the CPU memory Request - The minimum amount of CPU memory that is provisioned per pod. This means that any pod of a running workload that uses this compute resource, receives this amount of CPU memory for each pod.
          • Optional: Set the CPU memory Limit - The maximum amount of CPU memory that is provisioned per pod. This means that any pod of a running workload that uses this compute resource, receives at most this amount of CPU memory.
            To set a Limit, first enable the limit toggle. The limit value must be equal to or higher than the request.
            By default, the limit is set to “Unlimited” - Meaning that the pod may consume all the node's free CPU memory resources.

        Note

If the CPU Memory Limit is bigger than the Request, the pod is prone to being killed by the operating system (out-of-memory signal). The greater the difference between the CPU memory used and the request, the higher the risk of being killed.

    8. Optional: More settings

      • Increase shared memory size
        When enabled, the shared memory size available to the pod is increased from the default 64MB to the node's total available memory or the CPU memory limit, if set above.
      • Set extended resource(s)
        Click +EXTENDED RESOURCES to add resource/quantity pairs. For more information on how to set extended resources, see the Extended resources and Quantity guides
    9. Click CREATE COMPUTE RESOURCE

      Note

      It is also possible to add compute resources directly when creating a specific Workspace, training or inference workload.

    Editing a compute resource

    To edit a compute resource:

    1. Select the compute resource you want to edit
    2. Click Edit
    3. Click SAVE COMPUTE RESOURCE

    Note

Workloads that are already using this asset are not affected.

    Copying a compute resource

    To make a copy of an existing compute resource:

    1. Select the compute resource you want to copy
    2. Click MAKE A COPY
3. Enter a name for the compute resource. The name must be unique.
4. Update the compute resource as needed
    5. Click CREATE COMPUTE RESOURCE

    Deleting a compute resource

    1. Select the compute resource you want to delete
    2. Click DELETE
    3. On the dialog, click DELETE to confirm

    Note

Workloads that are already using this asset are not affected.

    Using API

    Go to the Compute resources API reference to view the available actions
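As an illustrative sketch only (the endpoint path, host, and token handling are assumptions; the API reference is the authoritative source), listing compute resource assets might look like:

    curl -H "Authorization: Bearer $TOKEN" \
         https://<company-url>/api/v1/asset/compute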

    \ No newline at end of file diff --git a/v2.20/Researcher/workloads/assets/credentials/index.html b/v2.20/Researcher/workloads/assets/credentials/index.html index 282a3e34bf..f45ae4385e 100644 --- a/v2.20/Researcher/workloads/assets/credentials/index.html +++ b/v2.20/Researcher/workloads/assets/credentials/index.html @@ -1,4 +1,4 @@ - Credentials - Run:ai Documentation Library

    Credentials

    This article explains what credentials are and how to create and use them.

Credentials are a workload asset that simplifies the complexities of Kubernetes secrets. They store and mask sensitive access information, such as passwords, tokens, and access keys, which is necessary for gaining access to various resources.

    Credentials are crucial for the security of AI workloads and the resources they require, as they restrict access to authorized users, verify identities, and ensure secure interactions. By enforcing the protection of sensitive data, credentials help organizations comply with industry regulations, fostering a secure environment overall.

    Essentially, credentials enable AI practitioners to access relevant protected resources, such as private data sources and Docker images, thereby streamlining the workload submission process.

    Credentials table

    The Credentials table can be found under Workload manager in the Run:ai User interface.

    The Credentials table provides a list of all the credentials defined in the platform and allows you to manage them.

    The Credentials table comprises the following columns:

• Credentials - The name of the credentials
• Description - A description of the credentials
• Type - The type of credentials, e.g., Docker registry
• Status - The different lifecycle phases and representation of the credentials’ condition
• Scope - The scope of these credentials within the organizational tree. Click the name of the scope to view the organizational tree diagram
• Kubernetes name - The unique name of the credentials’ Kubernetes resource as it appears in the cluster
• Environment(s) - The environment(s) that are associated with the credentials
• Data source(s) - The private data source(s) that are accessed using the credentials
• Created by - The user who created the credentials
• Creation time - The timestamp of when the credentials were created
• Cluster - The cluster with which the credentials are associated

    Credentials status

    The following table describes the credentials’ condition and whether they were created successfully for the selected scope.

• No issues found - No issues were found while creating the credentials (this status may change while propagating the credentials to the selected scope)
• Issues found - Issues found while propagating the credentials
• Issues found - Failed to access the cluster
• Creating… - Credentials are being created
• Deleting… - Credentials are being deleted
• No status - When the credentials’ scope is an account, or the current version of the cluster is not up to date, the status cannot be displayed

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click ‘Download as CSV’. Export to CSV is limited to 20,000 rows.
    • Refresh - Click REFRESH to update the table with the latest data

    Adding new credentials

    Creating credentials is limited to specific roles.

    To add a new credential:

    1. Go to the Credentials table:
    2. Click +NEW CREDENTIALS
    3. Select the credential type from the list
      Follow the step-by-step guide for each credential type:

    Docker registry

    These credentials allow users to authenticate and pull images from a Docker registry, enabling access to containerized applications and services.

After the credentials are created, they are used automatically when pulling images.

    1. Select a scope.
    2. Enter a name for the credential. The name must be unique.
    3. Optional: Provide a description of the credentials
    4. Set how the credential is created
      • Existing secret (in the cluster)
        This option applies when the purpose is to create credentials based on an existing secret
        • Select a secret from the list (The list is empty if no secrets were created in advance)
      • New secret (recommended)
        A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
        • Enter the username, password, and Docker registry URL
    5. Click CREATE CREDENTIALS

    After the credentials are created, check their status to monitor their proper creation across the selected scope.

    Access key

    These credentials are unique identifiers used to authenticate and authorize access to cloud services or APIs, ensuring secure communication between applications. They typically consist of two parts:

    • An access key ID
    • A secret access key

    The purpose of this credential type is to allow access to restricted data.

    1. Select a scope.
    2. Enter a name for the credential. The name must be unique.
    3. Optional: Provide a description of the credential
    4. Set how the credential is created
      • Existing secret (in the cluster)
        This option applies when the purpose is to create credentials based on an existing secret
        • Select a secret from the list (The list is empty if no secrets were created in advance)
      • New secret (recommended)
        A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
        • Enter the Access key and Access secret
    5. Click CREATE CREDENTIALS

    After the credentials are created, check their status to monitor their proper creation across the selected scope.

    Username & password

    These credentials require a username and corresponding password to access various resources, ensuring that only authorized users can log in.

    The purpose of this credential type is to allow access to restricted data.

    1. Select a scope
    2. Enter a name for the credential. The name must be unique.
    3. Optional: Provide a description of the credentials
    4. Set how the credential is created
      • Existing secret (in the cluster)
        This option applies when the purpose is to create credentials based on an existing secret
        • Select a secret from the list (The list is empty if no secrets were created in advance)
      • New secret (recommended)
        A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
        • Enter the username and password
    5. Click CREATE CREDENTIALS

    After the credentials are created, check their status to monitor their proper creation across the selected scope.

    Generic secret

    These credentials are a flexible option that consists of multiple keys & values and can store various sensitive information, such as API keys or configuration data, to be used securely within applications.

    The purpose of this credential type is to allow access to restricted data.

    1. Select a scope
    2. Enter a name for the credential. The name must be unique.
    3. Optional: Provide a description of the credentials
    4. Set how the credential is created
      • Existing secret (in the cluster)
        This option applies when the purpose is to create credentials based on an existing secret
        • Select a secret from the list (The list is empty if no secrets were created in advance)
      • New secret (recommended)
        A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
        • Click +KEY & VALUE - to add key/value pairs to store in the new secret
    5. Click CREATE CREDENTIALS

    Editing credentials

    To rename a credential:

    1. Select the credential from the table
    2. Click Rename to edit its name and description

    Deleting credentials

    To delete a credential:

    1. Select the credential you want to delete
    2. Click DELETE
    3. In the dialog, click DELETE to confirm

    Note

Credentials cannot be deleted if they are being used by a workload or template.

    Using credentials

    You can use credentials (secrets) in various ways within the system

    Access private data sources

    To access private data sources, attach credentials to data sources of the following types: Git, S3 Bucket

    Use directly within the container

    To use the secret directly from within the container, you can choose between the following options

    1. Get the secret mounted to the file system by using the Generic secret data source
    2. Get the secret as an environment variable injected into the container. There are two equivalent ways to inject the environment variable.

  a. By adding it to the Environment asset
  b. By adding it ad-hoc as part of the workload (see the sketch below)
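As a sketch of the ad-hoc option (the job name, image, variable, secret, and key names are placeholders, and the SECRET: reference syntax shown is an assumption; see the secrets-as-environment-variables best practice for the exact format):

    runai submit my-job -i <image> -e MY_TOKEN=SECRET:my-generic-secret,token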


    Creating secrets in advance

    Add secrets in advance to be used when creating credentials via the Run:ai UI.

    Follow the steps below for each required scope:

Cluster scope:

1. Create the secret in the Run:ai namespace (runai)
2. To authorize Run:ai to use the secret, label it: run.ai/cluster-wide: "true"
3. Label the secret with the correct credential type:
  1. Docker registry - run.ai/resource: "docker-registry"
  2. Access key - run.ai/resource: "access-key"
  3. Username and password - run.ai/resource: "password"
  4. Generic secret - run.ai/resource: "generic"

Department scope:

1. Create the secret in the Run:ai namespace (runai)
2. To authorize Run:ai to use the secret, label it: run.ai/department: "<department id>"
3. Label the secret with the correct credential type:
  1. Docker registry - run.ai/resource: "docker-registry"
  2. Access key - run.ai/resource: "access-key"
  3. Username and password - run.ai/resource: "password"
  4. Generic secret - run.ai/resource: "generic"

Project scope:

1. Create the secret in the project’s namespace
2. Label the secret with the correct credential type:
  1. Docker registry - run.ai/resource: "docker-registry"
  2. Access key - run.ai/resource: "access-key"
  3. Username and password - run.ai/resource: "password"
  4. Generic secret - run.ai/resource: "generic"

    The secret is now displayed for that scope in the list of existing secrets.
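For example, a Docker registry secret created in advance at the cluster scope could be prepared as follows (a minimal sketch; the secret name, registry URL, and credentials are placeholders):

    kubectl create secret docker-registry my-registry-creds \
         --namespace runai \
         --docker-server=<registry-url> \
         --docker-username=<user> \
         --docker-password=<password>
    kubectl label secret my-registry-creds --namespace runai \
         run.ai/cluster-wide="true" run.ai/resource="docker-registry"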

    Using API

    To view the available actions, go to the Credentials API reference

    Credentials

    This article explains what credentials are and how to create and use them.

    Credentials are a workload asset that simplify the complexities of Kubernetes secrets. They consist of and mask sensitive access information, such as passwords, tokens, and access keys, which are necessary for gaining access to various resources.

    Credentials are crucial for the security of AI workloads and the resources they require, as they restrict access to authorized users, verify identities, and ensure secure interactions. By enforcing the protection of sensitive data, credentials help organizations comply with industry regulations, fostering a secure environment overall.

    Essentially, credentials enable AI practitioners to access relevant protected resources, such as private data sources and Docker images, thereby streamlining the workload submission process.

    Credentials table

    The Credentials table can be found under Workload manager in the Run:ai User interface.

    The Credentials table provides a list of all the credentials defined in the platform and allows you to manage them.

    The Credentials table comprises the following columns:

    Column Description
    Credentials The name of the credentials
    Description A description of the credentials
    Type The type of credentials, e.g., Docker registry
    Status The different lifecycle phases and representation of the credentials’ condition
    Scope The scope of this compute resource within the organizational tree. Click the name of the scope to view the organizational tree diagram
    Kubernetes name The unique name of the credentials Kubernetes name as it appears in the cluster
    Environment(s) The environment(s) that are associated with the credentials
    Data source(s) The private data source(s) that are accessed using the credentials
    Created by The user who created the credentials
    Creation time The timestamp of when the credentials were created
    Cluster The cluster with which the credentials are associated

    Credentials status

    The following table describes the credentials’ condition and whether they were created successfully for the selected scope.

    Status Description
    No issues found No issues were found while creating the credentials (this status may change while propagating the credentials to the selected scope)
    Issues found Issues found while propagating the credentials
    Issues found Failed to access the cluster
    Creating… Credentials are being created
    Deleting… Credentials are being deleted
    No status When the credentials’ scope is an account, or the current version of the cluster is not up to date, the status cannot be displayed

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click ‘Download as CSV’. Export to CSV is limited to 20,000 rows.
    • Refresh - Click REFRESH to update the table with the latest data

    Adding new credentials

    Creating credentials is limited to specific roles.

    To add a new credential:

    1. Go to the Credentials table:
    2. Click +NEW CREDENTIALS
    3. Select the credential type from the list
      Follow the step-by-step guide for each credential type:

    Docker registry

    These credentials allow users to authenticate and pull images from a Docker registry, enabling access to containerized applications and services.

    After creating the credentials, it is used automatically when pulling images.

    1. Select a scope.
    2. Enter a name for the credential. The name must be unique.
    3. Optional: Provide a description of the credentials
    4. Set how the credential is created
      • Existing secret (in the cluster)
        This option applies when the purpose is to create credentials based on an existing secret
        • Select a secret from the list (The list is empty if no secrets were created in advance)
      • New secret (recommended)
        A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
        • Enter the username, password, and Docker registry URL
    5. Click CREATE CREDENTIALS

    After the credentials are created, check their status to monitor their proper creation across the selected scope.

    Access key

    These credentials are unique identifiers used to authenticate and authorize access to cloud services or APIs, ensuring secure communication between applications. They typically consist of two parts:

    • An access key ID
    • A secret access key

    The purpose of this credential type is to allow access to restricted data.

    1. Select a scope.
    2. Enter a name for the credential. The name must be unique.
    3. Optional: Provide a description of the credential
    4. Set how the credential is created
      • Existing secret (in the cluster)
        This option applies when the purpose is to create credentials based on an existing secret
        • Select a secret from the list (The list is empty if no secrets were created in advance)
      • New secret (recommended)
        A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
        • Enter the Access key and Access secret
    5. Click CREATE CREDENTIALS

    After the credentials are created, check their status to monitor their proper creation across the selected scope.
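    For illustration only, the following sketch shows how an access key pair like this is typically consumed by client code, here using the boto3 library against an S3-compatible service. The endpoint URL, bucket name, and the environment variable names through which the keys reach the code are assumptions, not part of the Run:ai configuration above.

    ```python
    import os

    import boto3  # AWS SDK for Python, used here only to illustrate access-key usage

    # Assumption: the access key pair created above is exposed to the workload,
    # for example as environment variables (see "Use directly within the container").
    s3 = boto3.client(
        "s3",
        aws_access_key_id=os.environ["MY_ACCESS_KEY_ID"],          # hypothetical variable name
        aws_secret_access_key=os.environ["MY_SECRET_ACCESS_KEY"],  # hypothetical variable name
        endpoint_url="https://s3.example.com",                     # placeholder endpoint
    )

    # List objects in a hypothetical private bucket the credentials grant access to
    for obj in s3.list_objects_v2(Bucket="my-private-bucket").get("Contents", []):
        print(obj["Key"])
    ```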

    Username & password

    These credentials require a username and corresponding password to access various resources, ensuring that only authorized users can log in.

    The purpose of this credential type is to allow access to restricted data.

    1. Select a scope
    2. Enter a name for the credential. The name must be unique.
    3. Optional: Provide a description of the credentials
    4. Set how the credential is created
      • Existing secret (in the cluster)
        This option applies when the purpose is to create credentials based on an existing secret
        • Select a secret from the list (The list is empty if no secrets were created in advance)
      • New secret (recommended)
        A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
        • Enter the username and password
    5. Click CREATE CREDENTIALS

    After the credentials are created, check their status to monitor their proper creation across the selected scope.

    Generic secret

    These credentials are a flexible option that consists of multiple keys & values and can store various sensitive information, such as API keys or configuration data, to be used securely within applications.

    The purpose of this credential type is to allow access to restricted data.

    1. Select a scope
    2. Enter a name for the credential. The name must be unique.
    3. Optional: Provide a description of the credentials
    4. Set how the credential is created
      • Existing secret (in the cluster)
        This option applies when the purpose is to create credentials based on an existing secret
        • Select a secret from the list (The list is empty if no secrets were created in advance)
      • New secret (recommended)
        A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
        • Click +KEY & VALUE - to add key/value pairs to store in the new secret
    5. Click CREATE CREDENTIALS

    Editing credentials

    To rename a credential:

    1. Select the credential from the table
    2. Click Rename to edit its name and description

    Deleting credentials

    To delete a credential:

    1. Select the credential you want to delete
    2. Click DELETE
    3. In the dialog, click DELETE to confirm

    Note

    Credentials cannot be deleted if they are being used by a workload or a template.

    Using credentials

    You can use credentials (secrets) in various ways within the system:

    Access private data sources

    To access private data sources, attach credentials to data sources of the following types: Git, S3 Bucket

    Use directly within the container

    To use the secret directly from within the container, you can choose between the following options (see the sketch after this list):

    1. Get the secret mounted to the file system by using the Generic secret data source
    2. Get the secret as an environment variable injected into the container. There are two equivalent ways to inject the environment variable:
      a. By adding it to the Environment asset
      b. By adding it ad-hoc as part of the workload
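    As a minimal sketch of what these options look like from inside the container: the mount path and environment variable name below are hypothetical and depend on how the Generic secret data source or the environment variable was configured.

    ```python
    import os
    from pathlib import Path

    # Option 1 (hypothetical mount path): a Generic secret data source mounts each
    # key of the secret as a file under the container path chosen when it was created.
    api_key_from_file = Path("/etc/my-secret/API_KEY").read_text().strip()

    # Option 2 (hypothetical variable name): the secret key is injected as an
    # environment variable, via the Environment asset or ad-hoc at submission time.
    api_key_from_env = os.environ.get("API_KEY", "")

    print(bool(api_key_from_file), bool(api_key_from_env))
    ```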


    Creating secrets in advance

    Add secrets in advance to be used when creating credentials via the Run:ai UI.

    Follow the steps below for each required scope:

    Cluster scope:

    1. Create the secret in the Run:ai namespace (runai)
    2. To authorize Run:ai to use the secret, label it: run.ai/cluster-wide: "true"
    3. Label the secret with the correct credential type:
      1. Docker registry - run.ai/resource: "docker-registry"
      2. Access key - run.ai/resource: "access-key"
      3. Username and password - run.ai/resource: "password"
      4. Generic secret - run.ai/resource: "generic"

    Department scope:

    1. Create the secret in the Run:ai namespace (runai)
    2. To authorize Run:ai to use the secret, label it: run.ai/department: "<department id>"
    3. Label the secret with the correct credential type:
      1. Docker registry - run.ai/resource: "docker-registry"
      2. Access key - run.ai/resource: "access-key"
      3. Username and password - run.ai/resource: "password"
      4. Generic secret - run.ai/resource: "generic"

    Project scope:

    1. Create the secret in the project’s namespace
    2. Label the secret with the correct credential type:
      1. Docker registry - run.ai/resource: "docker-registry"
      2. Access key - run.ai/resource: "access-key"
      3. Username and password - run.ai/resource: "password"
      4. Generic secret - run.ai/resource: "generic"

    The secret is now displayed for that scope in the list of existing secrets.
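    The following is a minimal sketch, using the Kubernetes Python client, of creating and labeling such a secret for a project scope; the namespace, secret name, and registry details are placeholders, and kubectl or any other Kubernetes tooling works equally well.

    ```python
    import base64
    import json

    from kubernetes import client, config

    config.load_kube_config()  # assumes a kubeconfig with access to the cluster
    v1 = client.CoreV1Api()

    # Placeholder values - replace with the project's namespace and real registry details.
    # For the cluster scope, create the secret in the runai namespace and add the label
    # run.ai/cluster-wide: "true"; for a department scope, add run.ai/department: "<department id>".
    namespace = "runai-my-project"
    docker_config = {
        "auths": {
            "registry.example.com": {
                "username": "my-user",
                "password": "my-password",
                "auth": base64.b64encode(b"my-user:my-password").decode(),
            }
        }
    }

    secret = client.V1Secret(
        metadata=client.V1ObjectMeta(
            name="my-registry-secret",
            labels={"run.ai/resource": "docker-registry"},  # credential type label from the list above
        ),
        type="kubernetes.io/dockerconfigjson",
        string_data={".dockerconfigjson": json.dumps(docker_config)},
    )

    v1.create_namespaced_secret(namespace=namespace, body=secret)
    ```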

    Using API

    To view the available actions, go to the Credentials API reference


    Data Volumes

    Data volumes offer a powerful solution for storing, managing, and sharing AI training data within the Run:ai platform. They promote collaboration, simplify data access control, and streamline the AI development lifecycle.

    Data volumes are snapshots of datasets stored in Kubernetes Persistent Volume Claims (PVCs). They act as a central repository for training data.

    Why use a data volume?

    1. Sharing with multiple scopes
      Unlike other Run:ai data sources, data volumes can be shared across projects, departments, or clusters, encouraging data reuse and collaboration within the organization.
    2. Storage saving
      A single copy of the data can be used across multiple scopes

    Typical use cases

    1. Sharing large data sets
      In large organizations, the data is often stored in a remote location, which can be a barrier for large model training. Even if the data is transferred into the cluster, sharing it easily with multiple users is still challenging. Data volumes can help share the data seamlessly, with maximum security and control.
    2. Sharing data with colleagues
      When you need to share training results, generated data sets, or other artifacts with team members, data volumes make the data easily available.

    (Figure: data volumes architecture)

    Prerequisites

    To create a data volume, there must be a project with a PVC in its namespace.

    Working with data volumes is currently available using the API. To view the available actions, go to the Data volumes API reference.
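    Since data volumes are managed through the API, a request may look like the hedged sketch below. The base URL, endpoint path, response shape, and token handling are placeholders to be verified against the Data volumes API reference; they are not a documented contract.

    ```python
    import requests

    # Placeholder values - consult the Data volumes API reference for the actual
    # endpoint paths, request schema, and how to obtain an API token.
    BASE_URL = "https://my-org.run.ai"  # hypothetical control-plane URL
    TOKEN = "<api-token>"               # obtained via your authentication flow

    resp = requests.get(
        f"{BASE_URL}/api/v1/data-volumes",  # hypothetical path - verify in the API reference
        headers={"Authorization": f"Bearer {TOKEN}"},
        timeout=30,
    )
    resp.raise_for_status()
    print(resp.json())
    ```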

    Adding a new data volume

    Data volume creation is limited to specific roles

    Adding scopes for a data volume

    Data volume sharing (adding scopes) is limited to specific roles

    Once created, the data volume is available to its originating project (see the prerequisites above).

    Data volumes can be shared with additional scopes in the organization.

    Who can use a data volume?

    Data volumes are used when submitting workloads. Any user, application or SSO group with a role that has permissions to create workloads can also use data volumes.

    Researchers can list available data volumes within their permitted scopes for easy selection.


    Data Sources

    This article explains what data sources are and how to create and use them.

    Data sources are a type of workload asset and represent a location where data is actually stored. They may represent a remote data location, such as NFS, Git, or S3, or a Kubernetes local resource, such as PVC, ConfigMap, HostPath, or Secret.

    This configuration simplifies the mapping of the data into the workload’s file system and handles the mounting process during workload creation for reading and writing. These data sources are reusable and can be easily integrated and used by AI practitioners while submitting workloads across various scopes.

    Data sources table

    The data sources table can be found under Workload manager in the Run:ai platform.

    The data sources table provides a list of all the data sources defined in the platform and allows you to manage them.

    The data sources table comprises the following columns:

    Column Description
    Data source The name of the data source
    Description A description of the data source
    Type The type of data source connected – e.g., S3 bucket, PVC, or others
    Status The different lifecycle phases and representation of the data source condition
    Scope The scope of the data source within the organizational tree. Click the scope name to view the organizational tree diagram
    Kubernetes name The unique name of the data source’s Kubernetes resource as it appears in the cluster
    Workload(s) The list of existing workloads that use the data source
    Template(s) The list of workload templates that use the data source
    Created by The user who created the data source
    Creation time The timestamp for when the data source was created
    Cluster The cluster that the data source is associated with

    Data sources status

    The following table describes the data sources' condition and whether they were created successfully for the selected scope.

    Status Description
    No issues found No issues were found while creating the data source
    Issues found Issues were found while propagating the data source credentials
    Issues found The data source couldn’t be created at the cluster
    Creating… The data source is being created
    No status / “-” When the data source’s scope is an account, the current version of the cluster is not up to date, or the asset is not a cluster-syncing entity, the status can’t be displayed

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click ‘Download as CSV’
    • Refresh - Click REFRESH to update the table with the latest data

    Adding a new data source

    To create a new data source:

    1. Click +NEW DATA SOURCE
    2. Select the data source type from the list. Follow the step-by-step guide for each data source type:

    NFS

    A Network File System (NFS) is a Kubernetes concept used for sharing storage in the cluster among different pods. Like a PVC, the NFS volume’s content remains preserved, even outside the lifecycle of a single pod. However, unlike PVCs, which abstract storage management, NFS provides a method for network-based file sharing. The NFS volume can be pre-populated with data and can be mounted by multiple pod writers simultaneously. At Run:ai, an NFS-type data source is an abstraction that is mapped directly to a Kubernetes NFS volume. This integration allows multiple workloads under various scopes to mount and present the NFS data source.

    1. Select the cluster under which to create this data source
    2. Select a scope
    3. Enter a name for the data source. The name must be unique.
    4. Optional: Provide a description of the data source
    5. Set the data origin
      • Enter the NFS server (host name or host IP)
      • Enter the NFS path
    6. Set the data target location
      • Container path
    7. Optional: Restrictions
      • Prevent data modification - When enabled, the data will be mounted with read-only permissions
    8. Click CREATE DATA SOURCE
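    To make the mapping concrete, the sketch below shows the kind of Kubernetes NFS volume and mount such a data source corresponds to, expressed with the Kubernetes Python client. The server, path, and container path are placeholders, and this illustrates the underlying concept rather than the exact objects Run:ai generates.

    ```python
    from kubernetes import client

    # Placeholder NFS details, mirroring the "data origin" and "data target location" steps
    nfs_volume = client.V1Volume(
        name="my-nfs-data",
        nfs=client.V1NFSVolumeSource(
            server="nfs.example.com",  # NFS server (host name or host IP)
            path="/exports/datasets",  # NFS path
            read_only=True,            # corresponds to "Prevent data modification"
        ),
    )

    nfs_mount = client.V1VolumeMount(
        name="my-nfs-data",
        mount_path="/data",            # container path
        read_only=True,
    )

    print(nfs_volume, nfs_mount)
    ```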

    PVC

    A Persistent Volume Claim (PVC) is a Kubernetes concept used for managing storage in the cluster, which can be provisioned by an administrator or dynamically by Kubernetes using a StorageClass. PVCs allow users to request specific sizes and access modes (read/write once, read-only many). Run:ai ensures that data remains consistent and accessible across various scopes and workloads, beyond the lifecycle of individual pods, which is efficient while working with large datasets typically associated with AI projects.

    1. Select the cluster under which to create this data source
    2. Select a scope
    3. Enter a name for the data source. The name must be unique.
    4. Optional: Provide a description of the data source
    5. Select PVC:
      • Existing PVC
        This option is relevant when the purpose is to create a PVC-type data source based on an existing PVC in the cluster
        • Select a PVC from the list - (The list is empty if no existing PVCs were created in advance)
      • New PVC - creates a new PVC in the cluster. New PVCs are not added to the Existing PVCs list.
        When creating a PVC-type data source and selecting the ‘New PVC’ option, the PVC is immediately created in the cluster (even if no workload has requested this PVC).
    6. Select the storage class
      • None - Proceed without defining a storage class
      • Custom storage class - This option applies when selecting a storage class based on existing storage classes.
        To add new storage classes to the storage class list, and for additional information, check Kubernetes storage classes
    7. Select the access mode(s) (multiple modes can be selected)
      • Read-write by one node - The volume can be mounted as read-write by a single node.
      • Read-only by many nodes - The volume can be mounted as read-only by many nodes.
      • Read-write by many nodes - The volume can be mounted as read-write by many nodes.
    8. Set the claim size and its units
    9. Select the volume mode
      • File system (default) - allows the volume to be mounted as a filesystem, enabling the usage of directories and files.
      • Block - exposes the volume as a block storage, which can be formatted or used by applications directly without a filesystem.
    10. Set the data target location
      • container path
    11. Optional: Prevent data modification - When enabled, the data will be mounted with read-only permission.
    12. Click CREATE DATA SOURCE

    After the data source is created, check its status to monitor its proper creation across the selected scope.
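    For the Existing PVC option, a claim can be created in the cluster in advance. The following is a minimal sketch using the Kubernetes Python client, with the namespace, storage class, and size as placeholders; it mirrors the access mode, volume mode, and claim size choices above.

    ```python
    from kubernetes import client, config

    config.load_kube_config()  # assumes a kubeconfig with access to the cluster
    v1 = client.CoreV1Api()

    pvc = client.V1PersistentVolumeClaim(
        metadata=client.V1ObjectMeta(name="my-dataset-pvc"),
        spec=client.V1PersistentVolumeClaimSpec(
            access_modes=["ReadWriteMany"],    # "Read-write by many nodes"
            storage_class_name="standard",     # placeholder storage class
            volume_mode="Filesystem",          # or "Block"
            resources=client.V1ResourceRequirements(
                requests={"storage": "100Gi"}  # claim size and units
            ),
        ),
    )

    # Placeholder namespace - use the namespace of the relevant project
    v1.create_namespaced_persistent_volume_claim(namespace="runai-my-project", body=pvc)
    ```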

    S3 Bucket

    The S3 bucket data source enables the mapping of a remote S3 bucket into the workload’s file system. Similar to a PVC, this mapping remains accessible across different workload executions, extending beyond the lifecycle of individual pods. However, unlike PVCs, data stored in an S3 bucket resides remotely, which may lead to decreased performance during the execution of heavy machine learning workloads. As part of the Run:ai connection to the S3 bucket, you can create credentials in order to access and map private buckets.

    1. Select the cluster under which to create this data source
    2. Select a scope
    3. Enter a name for the data source. The name must be unique.
    4. Optional: Provide a description of the data source
    5. Set the data origin
      • Set the S3 service URL
      • Select the credentials
        • None - for public buckets
        • Credential names - This option is relevant for private buckets based on existing credentials that were created for the scope.
          To add new credentials to the credentials list, and for additional information, check the Credentials article.
      • Enter the bucket name
    6. Set the data target location
      • container path
    7. Click CREATE DATA SOURCE

    After a private data source is created, check its status to monitor its proper creation across the selected scope.

    Git

    A Git-type data source is a Run:ai integration that enables code to be copied from a Git branch into a dedicated folder in the container. It is mainly used to provide the workload with the latest code repository. As part of the integration with Git, in order to access private repositories, you can add predefined credentials to the data source mapping.

    1. Select the cluster under which to create this data source
    2. Select a scope
    3. Enter a name for the data source. The name must be unique.
    4. Optional: Provide a description of the data source
    5. Set the data origin
      • Set the Repository URL
      • Set the Revision (branch, tag, or hash) - If left empty, HEAD (the latest commit) is used
      • Select the credentials
        • None - for public repositories
        • Credential names - This option applies to private repositories based on existing credentials that were created for the scope.
          To add new credentials to the credentials list, and for additional information, check the Credentials article.
    6. Set the data target location
      • container path
    7. Click CREATE DATA SOURCE

    After a private data source is created, check its status to monitor its proper creation across the selected scope.

    Host path

    A Host path volume is a Kubernetes concept that enables mounting a host path file or a directory on the workload’s file system. Like a PVC, the host path volume’s data persists across workloads under various scopes. It also enables data serving from the hosting node.

    1. Select the cluster under which to create this data source
    2. Select a scope
    3. Enter a name for the data source. The name must be unique.
    4. Optional: Provide a description of the data source
    5. Set the data origin
      • host path
    6. Set the data target location
      • container path
    7. Optional: Prevent data modification - When enabled, the data will be mounted with read-only permissions.
    8. Click CREATE DATA SOURCE

    ConfigMap

    A ConfigMap data source is a Run:ai abstraction for the Kubernetes ConfigMap concept. The ConfigMap is used mainly for storage that can be mounted on the workload container for non-confidential data. It is usually represented as key-value pairs (e.g., environment variables, command-line arguments, etc.). It allows you to decouple environment-specific system configurations from your container images, so that your applications are easily portable. ConfigMaps must be created on the cluster prior to being used within the Run:ai system.

    1. Select the cluster under which to create this data source
    2. Select a scope
    3. Enter a name for the data source. The name must be unique.
    4. Optional: Provide a description of the data source
    5. Set the data origin
      • Select the ConfigMap name (The list is empty if no existing ConfigMaps were created in advance).
    6. Set the data target location
      • container path
    7. Click CREATE DATA SOURCE
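    Because ConfigMaps must exist in the cluster before they can be selected in the steps above, here is a minimal sketch of creating one with the Kubernetes Python client; the namespace, name, and keys are placeholders.

    ```python
    from kubernetes import client, config

    config.load_kube_config()  # assumes a kubeconfig with access to the cluster
    v1 = client.CoreV1Api()

    config_map = client.V1ConfigMap(
        metadata=client.V1ObjectMeta(name="my-training-config"),
        data={                 # non-confidential key-value pairs
            "LOG_LEVEL": "info",
            "BATCH_SIZE": "64",
        },
    )

    # Placeholder namespace - use the namespace of the relevant project
    v1.create_namespaced_config_map(namespace="runai-my-project", body=config_map)
    ```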

    Secret

    A secret-type data source enables the mapping of a credential into the workload’s file system. Credentials are a workload asset that simplify the complexities of Kubernetes Secrets. The credentials mask sensitive access information, such as passwords, tokens, and access keys, which are necessary for gaining access to various resources.

    1. Select the cluster under which to create this data source
    2. Select a scope
    3. Enter a name for the data source. The name must be unique.
    4. Optional: Provide a description of the data source
    5. Set the data origin
      • Select the credentials
        To add new credentials, and for additional information, check the Credentials article.
    6. Set the data target location
      • container path
    7. Click CREATE DATA SOURCE

    After the data source is created, check its status to monitor its proper creation across the selected scope.

    Note

    It is also possible to add data sources directly when creating a specific workspace, training or inference workload

    Editing a data source

    To edit a data source:

    1. Select the data source from the table
    2. Click Rename to provide it with a new name
    3. Click Copy & Edit to make any changes to the data source

    Deleting a data source

    To delete a data source:

    1. Select the data source you want to delete
    2. Click DELETE
    3. Confirm you want to delete the data source

    Note

    It is not possible to delete a data source being used by an existing workload or template.

    Using API

    To view the available actions, go to the Data sources API reference.


    Environments

    This article explains what environments are and how to create and use them.

    Environments are one type of workload asset. An environment consists of a configuration that simplifies how workloads are submitted and can be used by AI practitioners when they submit their workloads.

    An environment asset is a preconfigured building block that encapsulates aspects for the workload such as:

    • Container image and container configuration
    • Tools and connections
    • The type of workload it serves

    Environments table

    The Environments table can be found under Workload manager in the Run:ai platform.

    The Environments table provides a list of all the environments defined in the platform and allows you to manage them.

    The Environments table consists of the following columns:

    Column Description
    Environment The name of the environment
    Description A description of the environment
    Scope The scope of this environment within the organizational tree. Click the name of the scope to view the organizational tree diagram
    Image The application or service to be run by the workload
    Workload Architecture This can be either standard for running workloads on a single node or distributed for running distributed workloads on multiple nodes
    Tool(s) The tools and connection types the environment exposes
    Workload(s) The list of existing workloads that use the environment
    Workload types The workload types that can use the environment (Workspace/ Training / Inference)
    Template(s) The list of workload templates that use this environment
    Created by The user who created the environment. By default, the Run:ai UI comes with preinstalled environments created by Run:ai
    Creation time The timestamp of when the environment was created
    Last updated The timestamp of when the environment was last updated
    Cluster The cluster with which the environment is associated

    Tools associated with the environment

    Click one of the values in the tools column to view the list of tools and their connection type.

    Column Description
    Tool name The name of the tool or application the AI practitioner can set up within the environment.
    Connection type The method by which you can access and interact with the running workload. It's essentially the "doorway" through which you can reach and use the tools the workload provides (e.g., node port, external URL)

    Workloads associated with the environment

    Click one of the values in the Workload(s) column to view the list of workloads and their parameters.

    Column Description
    Workload The workload that uses the environment
    Type The workload type (Workspace/Training/Inference)
    Status Represents the workload lifecycle. See the full list of workload statuses

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then Click Download as CSV. Export to CSV is limited to 20,000 rows.

    Environments created by Run:ai

    When installing Run:ai, you automatically get the environments created by Run:ai to ease the onboarding process and support different use cases out of the box.
    These environments are created at the scope of the account.

    Environment Image
    Jupyter-lab jupyter/scipy-notebook
    jupyter-tensorboard gcr.io/run-ai-demo/jupyter-tensorboard
    tensorboard tensorflow/tensorflow:latest
    llm-server runai.jfrog.io/core-llm/runai-vllm:v0.6.4-0.10.0
    chatbot-ui runai.jfrog.io/core-llm/llm-app
    gpt2 runai.jfrog.io/core-llm/quickstart-inference:gpt2-cpu

    Adding a new environment

    Environment creation is limited to specific roles

    To add a new environment:

    1. Go to the Environments table
    2. Click +NEW ENVIRONMENT
    3. Select under which cluster to create the environment
    4. Select a scope
    5. Enter a name for the environment. The name must be unique.
    6. Optional: Provide a description of the essence of the environment
    7. Enter the Image URL
      If a token or secret is required to pull the image, it is possible to create it via credentials of type docker registry. These credentials are automatically used once the image is pulled (which happens when the workload is submitted)
    8. Set the image pull policy - the condition for when to pull the image from the registry
    9. Set the workload architecture:
      • Standard
        Only standard workloads can use the environment. A standard workload consists of a single process.
      • Distributed
        Only distributed workloads can use the environment. A distributed workload consists of multiple processes working together. These processes can run on different nodes.
      • Select a framework from the list.
    10. Set the workload type:
      • Workspace
      • Training
      • Inference
      • When inference is selected, define the endpoint of the model by providing both the protocol and the container’s serving port
    11. Optional: Set the connection for your tool(s). The tools must be configured in the image. When submitting a workload using the environment, it is possible to connect to these tools (see the sketch after these steps)
      • Select the tool from the list (the available tools vary from IDEs to experiment tracking and more, including a custom tool of your choice)
      • Select the connection type
        • External URL
          • Auto generate
            A unique URL is automatically created for each workload using the environment
          • Custom URL
            The URL is set manually
        • Node port
          • Auto generate
            A unique port is automatically exposed for each workload using the environment
          • Custom port
            Set the port manually
        • Set the container port
    12. Optional: Set a command and arguments for the container running the pod
      • When no command is added, the default command of the image is used (the image entrypoint)
      • The command can be modified while submitting a workload using the environment
      • The argument(s) can be modified while submitting a workload using the environment
    13. Optional: Set the environment variable(s)
      • Click +ENVIRONMENT VARIABLE
      • Enter a name
      • Select the source for the environment variable
      • Custom
        • Enter a value
        • Leave empty
        • Add instructions for the expected value if any
      • Credentials - Select existing credentials as the environment variable
        • Select a credential name
          To add new credentials to the credentials list, and for additional information, see Credentials.
        • Select a secret key
      • The environment variables can be modified and new variables can be added while submitting a workload using the environment
    14. Optional: Set the container’s working directory to define where the container’s process starts running. When left empty, the default directory is used.
    15. Optional: Set where the UID, GID, and supplementary groups are taken from. This can be:
      • From the image
      • From the IdP token (only available in SSO installations)
      • Custom (manually set) - decide whether the submitter can modify these values upon submission.
      • Set the User ID (UID), Group ID (GID) and the supplementary groups that can run commands in the container
        • Enter UID
        • Enter GID
        • Add Supplementary groups (multiple groups can be added, separated by commas)
        • Disable 'Allow the values above to be modified within the workload' if you want the values above to be used as the default
    16. Optional: Select Linux capabilities - Grant certain privileges to a container without granting all the privileges of the root user.
    17. Click CREATE ENVIRONMENT
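    As a sketch of how a tool relates to its connection (step 11), the snippet below is a stand-in for a tool baked into the image that listens on the declared container port. The port number and response are placeholders; in practice the tool would be JupyterLab, TensorBoard, a custom UI, or similar.

    ```python
    from http.server import BaseHTTPRequestHandler, HTTPServer

    PORT = 8888  # placeholder - must match the container port declared for the tool


    class ToolHandler(BaseHTTPRequestHandler):
        def do_GET(self):
            # Stand-in for whatever the real tool serves (IDE, dashboard, etc.)
            self.send_response(200)
            self.send_header("Content-Type", "text/plain")
            self.end_headers()
            self.wfile.write(b"custom tool is up\n")


    if __name__ == "__main__":
        # The external URL or node port connection configured above routes traffic here.
        HTTPServer(("0.0.0.0", PORT), ToolHandler).serve_forever()
    ```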

    Note

    It is also possible to add environments directly when creating a specific workspace, training or inference workload.

    Editing an environment

    To edit an environment:

    1. Select the environment you want to edit
    2. Click Edit
    3. Click SAVE ENVIRONMENT

    Note

    • Workloads that are already using this asset will not be affected.
    • llm-server and chatbot-ui environments cannot be edited.

    Copying an environment

    To make a copy of an existing environment:

    1. Select the environment you want to copy
    2. Click MAKE A COPY
    3. Enter a name for the environment. The name must be unique.
    4. Update the environment
    5. Click CREATE ENVIRONMENT

    Deleting an environment

    To delete an environment:

    1. Select the environment you want to delete
    2. Click DELETE
    3. On the dialog, click DELETE to confirm

    Note

    Workloads that are already using this asset will not be affected.

    Using API

    Go to the Environment API reference to view the available actions


    Overview

    Workload assets enable organizations to:

    • Create and reuse preconfigured setup for code, data, storage and resources to be used by AI practitioners to simplify the process of submitting workloads
    • Share the preconfigured setup with a wide audience of AI practitioners with similar needs

    Note

    • The creation of assets is possible only via API and the Run:ai UI
    • The submission of workloads using assets is possible only via the Run:ai UI

    Workload asset types

    There are four workload asset types used by the workload:

    • Environments
      The container image, tools and connections for the workload
    • Data sources
      The type of data, its origin and the target storage location such as PVCs or cloud storage buckets where datasets are stored
    • Compute resources
      The compute specification, including GPU and CPU compute and memory
    • Credentials
      The secrets used to access sensitive data, services, and applications, such as a Docker registry or S3 buckets

    Asset scope

    When a workload asset is created, a scope is required. The scope defines who in the organization can view and/or use the asset.

    Note

    When an asset is created via API, the scope can be the entire account. This is currently an experimental feature.

    Who can create an asset?

    Any subject (user, application, or SSO group) with a role that has permissions to Create an asset can do so within their scope.

    Who can use an asset?

    Assets are used when submitting workloads. Any subject (user, application or SSO group) with a role that has permissions to Create workloads can also use assets.

    Who can view an asset?

    Any subject (user, application, or SSO group) with a role that has permission to View an asset can do so within their scope.

    \ No newline at end of file diff --git a/v2.20/Researcher/workloads/assets/templates/index.html b/v2.20/Researcher/workloads/assets/templates/index.html index 8795cf0ff1..bb7db305c5 100644 --- a/v2.20/Researcher/workloads/assets/templates/index.html +++ b/v2.20/Researcher/workloads/assets/templates/index.html @@ -1,4 +1,4 @@ - Workspace Templates - Run:ai Documentation Library

    Workspace Templates

    This article explains the procedure to manage templates.

    A template is a pre-set configuration used to quickly configure and submit workloads based on existing assets. A template consists of all the assets a workload needs, allowing researchers to submit a workload in a single click, or to make small adjustments that differentiate one workload from another.

    Workspace templates table

    The Templates table can be found under Workload manager in the Run:ai User interface.

    The Templates table provides a list of all the templates defined in the platform, and allows you to manage them.

    Flexible Management

    It is also possible to manage templates directly for a specific user, application, project, or department.

    The Templates table consists of the following columns:

    Column - Description
    Scope - The scope to which the subject has access. Click the name of the scope to see the scope and its subordinates
    Environment - The name of the environment related to the workspace template
    Compute resource - The name of the compute resource connected to the workspace template
    Data source(s) - The name of the data source(s) connected to the workspace template
    Created by - The subject that created the template
    Creation time - The timestamp for when the template was created
    Cluster - The cluster name containing the template

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
    • Refresh (optional) - Click REFRESH to update the table with the latest data
    • Show/Hide details (optional) - Click to view additional information on the selected row

    Adding a new workspace template

    To add a new template:

    1. Click +NEW TEMPLATE
    2. Set the scope for the template
    3. Enter a name for the template
    4. Select the environment for your workload
    5. Select the node resources needed to run your workload
      - or -
      Click +NEW COMPUTE RESOURCE

    6. Set the volume needed for your workload

    7. Create a new data source
    8. Set auto-deletion, annotations and labels, as required
    9. Click CREATE TEMPLATE

    Editing a template

    To edit a template:

    1. Select the template from the table
    2. Click Rename to provide it with a new name
    3. Click Copy & Edit to make any changes to the template

    Deleting a template

    To delete a template:

    1. Select the template you want to delete
    2. Click DELETE
    3. Confirm you want to delete the template

    Using API

    Go to the Workload template API reference to view the available actions.
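
    For example, the sketch below shows what listing workspace templates with curl might look like. The endpoint path and placeholders (RUNAI_URL, RUNAI_TOKEN) are illustrative assumptions only; confirm the exact path and parameters in the Workload template API reference.

      # Assumed endpoint - verify against the Workload template API reference
      curl -X GET "https://$RUNAI_URL/api/v1/asset/workload-template" \
        -H "Authorization: Bearer $RUNAI_TOKEN" \
        -H "Accept: application/json"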

    \ No newline at end of file diff --git a/v2.20/Researcher/workloads/inference/custom-inference/index.html b/v2.20/Researcher/workloads/inference/custom-inference/index.html index 25fff01e83..e36da07aa4 100644 --- a/v2.20/Researcher/workloads/inference/custom-inference/index.html +++ b/v2.20/Researcher/workloads/inference/custom-inference/index.html @@ -1,4 +1,4 @@ - Deploy a Custom Inference Workload - Run:ai Documentation Library

    Deploy a custom inference workload

    This article explains how to create a custom inference workload via the Run:ai UI.

    An inference workload provides the setup and configuration needed to deploy your trained model for real-time or batch predictions. It includes specifications for the container image, data sets, network settings, and resource requests required to serve your models.

    The inference workload is assigned to a project and is affected by the project’s quota.

    To learn more about the inference workload type in Run:ai and determine whether it is the most suitable workload type for your goals, see Workload types.

    Creating a custom inference workload

    Before you start, make sure you have a project.

    To add a new custom inference workload:

    1. Go to the Workload manager → Workloads
    2. Click +NEW WORKLOAD and select Inference
      Within the new inference form:
    3. Select under which cluster to create the inference workload
    4. Select the project in which your inference will run
    5. Select custom inference from Inference type

      Note

      Selecting the Inference type is disabled by default. If you cannot see it in the menu, then it must be enabled by your Administrator, under General settings → Workloads → Models.

    6. Enter a unique name for the inference workload (if the name already exists in the project, you will be requested to submit a different name)

    7. Click CONTINUE
      In the next step:
    8. Select the environment for your inference workload

      • Select an environment or click +NEW ENVIRONMENT to add a new environment to the gallery.
        For a step-by-step guide on adding environments to the gallery, see Environments. Once created, the new environment will be automatically selected.
      • Set an inference serving endpoint. The connection protocol and the container port are defined within the environment

        • Optional: Modify who can access the endpoint

          • Public (default)

            Everyone within the network can access the endpoint with no authentication

          • All authenticated users

            Everyone within the organization’s account that can log in (to Run:ai or SSO)

          • Specific group(s)

            • Click +GROUP
            • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the endpoint.
          • Specific user(s)

            • Click +USER
            • Enter a valid email address or username. If you remove yourself, you will lose access to the endpoint.
      • Set the connection for your tool(s). The tools are configured as part of the environment.

        • External URL
          • Custom URL
            • Set the URL
          • Optional: Modify who can access the tool:
            • All authenticated users (default)
              Everyone within the organization’s account
            • Specific group(s)
              • Click +GROUP
              • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the tool.
            • Specific user(s)
              • Click +USER
              • Enter a valid email address or username. If you remove yourself, you will lose access to the tool.
        • Node port
          • Custom port
            • Set the node port (enter a port between 30000 and 32767; if the node port is already in use, the workload will fail and display an error message)
      • Optional: Set the command and arguments for the container running the workload
        If no command is added, the container will use the image’s default command (entry-point).
        • Modify the existing command or click +COMMAND & ARGUMENTS to add a new command.
        • Set multiple arguments separated by spaces, using the following format (e.g.: --arg1=val1).
      • Set the environment variable(s)
        • Modify the existing environment variable(s). The existing environment variables may include instructions to guide you with entering the correct values.
        • (Optional) Add new variables
          • Click +ENVIRONMENT VARIABLE
          • Enter a name
          • Select the source for the environment variable
            • Custom
              • Enter a value according to the provided instructions
            • Credentials - Select existing credentials as the environment variable
              • Select a credential name To add new credentials to the credentials list, and for additional information, see Credentials.
              • Select a secret key
    9. Select the compute resource for your inference workload

      • Select a compute resource or click +NEW COMPUTE RESOURCE to add a new compute resource to the gallery.
        For a step-by-step guide on adding compute resources to the gallery, see compute resources. Once created, the new compute resource will be automatically selected.
      • Optional: Set the minimum and maximum number of replicas to be scaled up and down to meet the changing demands of inference services.
      • If the minimum and maximum number of replicas are different, autoscaling will be triggered and you'll need to set conditions for creating a new replica. A replica will be created every time a condition is met. When a condition is no longer met after a replica was created, the replica will be automatically deleted to save resources.

        • Select a variable - The variable's values will be monitored via the container's port.
          • Latency (milliseconds)
          • Throughput (Requests/sec)
          • Concurrency (Requests)
        • Set a value - This value is the threshold at which autoscaling is triggered.
      • Optional: Set when the replicas should be automatically scaled down to zero. This allows compute resources to be freed up when the model is inactive (i.e., there are no requests being sent). When automatic scaling to zero is enabled, the minimum number of replicas set in the previous step automatically changes to 0.

      • Optional: Set the order of priority for the node pools on which the scheduler tries to run the workload.
        When a workload is created, the scheduler will try to run it on the first node pool on the list. If the node pool doesn't have free resources, the scheduler will move on to the next one until it finds one that is available.
        • Drag and drop them to change the order, remove unwanted ones, or reset to the default order defined in the project.
        • Click +NODE POOL to add a new node pool from the list of node pools that were defined on the cluster.
          To configure a new node pool and for additional information, see node pools.
      • Select a node affinity to schedule the workload on a specific node type.
        If the administrator added a ‘node type (affinity)’ scheduling rule to the project/department, then this field is mandatory.
        Otherwise, entering a node type (affinity) is optional. Nodes must be tagged with a label that matches the node type key and value.
      • Optional: Set toleration(s) to allow the workload to be scheduled on a node with a matching taint (see the node-taint example after these steps)

        Note

        Tolerations are disabled by default. If you cannot see Tolerations in the menu, it must be enabled by your Administrator under General settings → Workloads → Tolerations

        • Click +TOLERATION
        • Enter a key
        • Select the operator
          • Exists - If the key exists on the node, the effect will be applied.
          • Equals - If the key and the value set below match the taint's key and value on the node, the effect will be applied
            • Enter a value matching the value on the node
        • Select the effect for the toleration
          • NoExecute - Pods that do not tolerate this taint are evicted immediately.
          • NoSchedule - No new pods will be scheduled on the tainted node unless they have a matching toleration. Pods currently running on the node will not be evicted.
          • PreferNoSchedule - The control plane will try to avoid placing a pod that does not tolerate the taint on the node, but it is not guaranteed.
          • Any - All effects above match.
        • Optional: Select data sources for your inference workload. Select a data source or click +NEW DATA SOURCE to add a new data source to the gallery. If there are issues with the connectivity to the cluster, or issues while creating the data source, the data source won't be available for selection.
          For a step-by-step guide on adding data sources to the gallery, see data sources.
          Once created, the new data source will be automatically selected.
          • Optional: Modify the data target location for the selected data source(s).
    10. Optional - General settings:

      • Set the timeframe for auto-deletion after workload completion or failure. The time after which a completed or failed workload is deleted; if this field is set to 0 seconds, the workload will be deleted automatically.
      • Set annotation(s)
        Kubernetes annotations are key-value pairs attached to the workload. They are used for storing additional descriptive metadata to enable documentation, monitoring and automation.
        • Click +ANNOTATION
        • Enter a name
        • Enter a value
      • Set label(s)
        Kubernetes labels are key-value pairs attached to the workload. They are used for categorizing to enable querying.
        • Enter a name
        • Enter a value
    11. Click CREATE INFERENCE
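
    As noted in the tolerations step above, a toleration only has an effect when a node carries a matching taint. The node name, key and value below are examples only; the kubectl commands themselves are standard Kubernetes and are applied by a cluster administrator, not through the Run:ai UI.

      # Taint a node so that only workloads tolerating gpu-type=a100 can be scheduled on it
      kubectl taint nodes node-1 gpu-type=a100:NoSchedule

      # Remove the taint later if needed (note the trailing "-")
      kubectl taint nodes node-1 gpu-type=a100:NoSchedule-

    A workload whose toleration uses the Equals operator with key gpu-type, value a100 and effect NoSchedule can then be scheduled on that node.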

    Managing and monitoring

    After the inference workload is created, it is added to the Workloads table, where it can be managed and monitored.

    Using API

    To view the available actions, see the Inferences API reference.
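
    For example, the sketch below shows what submitting a custom inference workload with curl might look like. The endpoint path, field names and placeholders (RUNAI_URL, RUNAI_TOKEN, the project and cluster IDs, the image and the serving port) are illustrative assumptions only; confirm the exact path and request schema in the Inferences API reference.

      # Assumed endpoint and payload - verify against the Inferences API reference
      curl -X POST "https://$RUNAI_URL/api/v1/workloads/inferences" \
        -H "Authorization: Bearer $RUNAI_TOKEN" \
        -H "Content-Type: application/json" \
        -d '{
              "name": "my-inference",
              "projectId": "1",
              "clusterId": "<CLUSTER-UUID>",
              "spec": {
                "image": "myrepo/my-model-server:latest",
                "servingPort": { "protocol": "http", "port": 8000 },
                "autoscaling": { "minReplicas": 1, "maxReplicas": 2 }
              }
            }'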

    \ No newline at end of file diff --git a/v2.20/Researcher/workloads/inference/hugging-face-inference/index.html b/v2.20/Researcher/workloads/inference/hugging-face-inference/index.html index abc41bc51e..f6b4dd7d7b 100644 --- a/v2.20/Researcher/workloads/inference/hugging-face-inference/index.html +++ b/v2.20/Researcher/workloads/inference/hugging-face-inference/index.html @@ -1,4 +1,4 @@ - Deploy Inference Workloads from Hugging Face - Run:ai Documentation Library

    Deploy inference workloads from Hugging Face

    This article explains how to create an inference workload via the Run:ai UI using Hugging Face inference models.

    An inference workload provides the setup and configuration needed to deploy your trained model for real-time or batch predictions. It includes specifications for the container image, data sets, network settings, and resource requests required to serve your models.

    The inference workload is assigned to a project and is affected by the project’s quota.

    To learn more about the inference workload type in Run:ai and determine whether it is the most suitable workload type for your goals, see Workload types.

    Creating a Hugging Face inference workload

    Before you start, make sure you have a project.

    To add a new inference workload:

    1. Go to the Workload manager → Workloads
    2. Click +NEW WORKLOAD and select Inference
      Within the new inference form:
    3. Select under which cluster to create the inference workload
    4. Select the project in which your inference will run
    5. Select Hugging Face from Inference type

      Note

      Selecting the Inference type is disabled by default. If you cannot see it in the menu, then it must be enabled by your Administrator, under General settings → Workloads → Models.

    6. Enter a unique name for the inference workload (if the name already exists in the project, you will be requested to submit a different name)

    7. Click CONTINUE
      In the next step:
    8. Set the model and how to access it

      • Set the model name as displayed in Hugging Face. The model must be supported by vLLM version 0.6.4.
        • Enter a name
      • Set how to access Hugging Face

        • Provide a token
          • Access token
            • Enter a token
        • Select credentials
          • Select existing credentials. Make sure the existing credentials contain an HF_TOKEN key
          • Add new credentials with an HF_TOKEN

            Within the new credentials form:

            • Enter a name for the credential. The name must be unique.
            • Optional: Provide a description of the credentials
            • Set how the credential is created

              • Existing secret (in the cluster)

                This option applies when the purpose is to create credentials based on an existing secret

              • New secret

                A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.

                • Enter a key
                • Enter the HF_TOKEN as the value
      • Optional: Modify who can access the inference serving endpoint

        • Public (default)

          Everyone within the network can access the endpoint with no authentication

        • All authenticated users

          Everyone within the organization’s account that can log in (to Run:ai or SSO)

        • Specific group(s)

          • Click +GROUP
          • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the endpoint.
        • Specific user(s)

          • Click +USER
          • Enter a valid email address or username. If you remove yourself, you will lose access to the endpoint.
    9. Select the compute resource for your inference workload

      • Select a compute resource or click +NEW COMPUTE RESOURCE to add a new compute resource to the gallery.
        For a step-by-step guide on adding compute resources to the gallery, see compute resources. Once created, the new compute resource will be automatically selected.
      • Optional: Set the minimum and maximum number of replicas to be scaled up and down to meet the changing demands of inference services.
      • If the minimum and maximum number of replicas are different, autoscaling will be triggered and you'll need to set conditions for creating a new replica. A replica will be created every time a condition is met. When a condition is no longer met after a replica was created, the replica will be automatically deleted to save resources.

        • Select a variable - The variable's values will be monitored via the container's port.
          • Latency (milliseconds)
          • Throughput (Requests/sec)
          • Concurrency (Requests)
        • Set a value - This value is the threshold at which autoscaling is triggered.
      • Optional: Set when the replicas should be automatically scaled down to zero. This allows the compute resources to be freed up when the model is inactive (i.e., there are no requests being sent). When automatic scaling to zero is enabled, the minimum number of replicas set in the previous step automatically changes to 0.

      • Optional: Set the order of priority for the node pools on which the scheduler tries to run the workload.
        When a workload is created, the scheduler will try to run it on the first node pool on the list. If the node pool doesn't have free resources, the scheduler will move on to the next one until it finds one that is available.
        • Drag and drop them to change the order, remove unwanted ones, or reset to the default order defined in the project.
        • Click +NODE POOL to add a new node pool from the list of node pools that were defined on the cluster.
          To configure a new node pool and for additional information, see node pools.
      • Select a node affinity to schedule the workload on a specific node type.
        If the administrator added a ‘node type (affinity)’ scheduling rule to the project/department, then this field is mandatory.
        Otherwise, entering a node type (affinity) is optional. Nodes must be tagged with a label that matches the node type key and value.
      • Optional: Set toleration(s) to allow the workload to be scheduled on a node with a matching taint

        Note

        Tolerations are disabled by default. If you cannot see Tolerations in the menu, it must be enabled by your Administrator under General settings → Workloads → Tolerations

        • Click +TOLERATION
        • Enter a key
        • Select the operator
          • Exists - If the key exists on the node, the effect will be applied.
          • Equals - If the key and the value set below match the taint's key and value on the node, the effect will be applied
            • Enter a value matching the value on the node
        • Select the effect for the toleration
          • NoExecute - Pods that do not tolerate this taint are evicted immediately.
          • NoSchedule - No new pods will be scheduled on the tainted node unless they have a matching toleration. Pods currently running on the node will not be evicted.
          • PreferNoSchedule - The control plane will try to avoid placing a pod that does not tolerate the taint on the node, but it is not guaranteed.
          • Any - All effects above match.
    10. Optional - General settings:

      • Set the timeframe for auto-deletion after workload completion or failure. The time after which a completed or failed workload is deleted; if this field is set to 0 seconds, the workload will be deleted automatically.
      • Set annotation(s)
        Kubernetes annotations are key-value pairs attached to the workload. They are used for storing additional descriptive metadata to enable documentation, monitoring and automation.
        • Click +ANNOTATION
        • Enter a name
        • Enter a value
      • Set label(s)
        Kubernetes labels are key-value pairs attached to the workload. They are used for categorizing to enable querying.
        • Enter a name
        • Enter a value
    11. Click CREATE INFERENCE

    Managing and monitoring

    After the inference workload is created, it is added to the Workloads table, where it can be managed and monitored.
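
    Once the workload is running, you can send a test request to its serving endpoint. The sketch below assumes the endpoint exposes vLLM's OpenAI-compatible API; the URL, access token, model name and path are placeholders for illustration, and the actual authentication depends on the endpoint access settings chosen above.

      # Assumed OpenAI-compatible chat completions path served by vLLM
      curl -X POST "https://$INFERENCE_URL/v1/chat/completions" \
        -H "Authorization: Bearer $ACCESS_TOKEN" \
        -H "Content-Type: application/json" \
        -d '{
              "model": "<MODEL-NAME-AS-SET-IN-THE-WORKLOAD>",
              "messages": [{ "role": "user", "content": "Hello!" }]
            }'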

    \ No newline at end of file diff --git a/v2.20/Researcher/workloads/inference/inference-overview/index.html b/v2.20/Researcher/workloads/inference/inference-overview/index.html index 50ca1de81f..49e0fb7be5 100644 --- a/v2.20/Researcher/workloads/inference/inference-overview/index.html +++ b/v2.20/Researcher/workloads/inference/inference-overview/index.html @@ -1,4 +1,4 @@ - Inference overview - Run:ai Documentation Library

    Overview

    What is Inference

    Machine learning (ML) inference is the process of running live data points into a machine-learning algorithm to calculate an output.

    With Inference workloads, you are taking a trained Model and deploying it into a production environment. The deployment must align with the organization's production standards, such as average and 95th-percentile response time, as well as uptime.

    Inference and GPUs

    The Inference process is a subset of the original Training algorithm on a single datum (e.g. one sentence or one image), or a small batch. As such, GPU memory requirements are typically smaller than a full-blown Training process.

    Given that, Inference lends itself nicely to the usage of Run:ai Fractions. You can, for example, run 4 instances of an Inference server on a single GPU, each employing a fourth of the memory.

    Inference @Run:ai

    Run:ai provides Inference services on an equal footing with the other two Workload types: Train and Build.

    • Inference is considered a high-priority workload as it is customer-facing. Running an Inference workload (within the Project's quota) will preempt any Run:ai Workload marked as Training.

    • Inference workloads will receive priority over Train and Build workloads during scheduling.

    • Inference is implemented as a Kubernetes Deployment object with a defined number of replicas. The replicas are load-balanced by Kubernetes so adding more replicas will improve the overall throughput of the system.

    • Multiple replicas will appear in Run:ai as a single Inference workload. The workload will appear in all Run:ai dashboards and views as well as the Command-line interface.

    • Inference workloads can be submitted via Run:ai user interface as well as Run:ai API. Internally, spawning an Inference workload also creates a Kubernetes Service. The service is an end-point to which clients can connect.

    Autoscaling

    To meet SLAs, Inference workloads are typically configured with autoscaling. Autoscaling is the ability to add more computing power (Kubernetes pods) when the load increases and shrink allocated resources when the system is idle. There are several ways to trigger autoscaling. Run:ai supports the following:

    Metric - Units
    Latency - Milliseconds
    Throughput - Requests/sec
    Concurrency - Requests

    The Minimum and Maximum number of replicas can be configured as part of the autoscaling configuration.

    Autoscaling also supports a scale-to-zero policy with Throughput and Concurrency metrics, meaning that given enough time under the target threshold, the number of replicas will be scaled down to 0.

    This has the benefit of conserving resources at the risk of a delay from "cold starting" the model when traffic resumes.

    Rolling inference updates

    When deploying models and running inference workloads, you may need to update the workload configuration in a live manner, without impacting the important services that are provided by the workload.

    This means you can submit updates to an existing inference workload whether it is currently running, pending, or in any other status.

    The following are a few examples of updates that can be implemented:

    • Changing the container image to deploy a new version of the model
    • Changing different parameters (such as environment variables)
    • Changing compute resources to improve performance
    • Changing the number of replicas and scale plan to adapt to requirement changes and scales

    During the update and until its successful completion, the service that the workload provides is not jeopardized as these are production-grade workloads. Hence, consumers can continue using the model (interact with the LLM) during the update process.

    During the update process of an inference workload, a new revision of pod(s) is created. This revision is the new desired specification of the workload. Although several updates can be submitted consecutively, even if the previous update has not yet completed, the target is always the last submitted update. This means the previous updates are ignored.

    Once the new revision is fully created and up and running, all request traffic is routed to the new revision, the original revision is terminated, and its resources are returned to the shared pool. Only then is the update process considered complete.

    It is important to note that:

    • To finish the inference workload update successfully, the project must have sufficient free GPU quota for the update. For example:

      • The existing workload uses 3 replicas: A running inference workload with 3 replicas, assuming that each replica is equal to 1 GPU, means the project is already using 3 GPUs of its quota. For the sake of simplicity, we will refer to this revision as revision #1.

      • The workload is updated to use 8 replicas: This means, to complete the update, an additional 8 GPUs of free quota are needed. Only when the update is complete are the 3 GPUs used for the initial revision (revision #1) reclaimed.

    • In the UI, the Workloads table displays the configuration of the latest submitted update. For example, if you change the container image, the image column in the running / requested pods will display the name of the updated image. The status of the workload continues to reflect the operational state of the service the workload exposes. For instance, during an update, the workload status remains "Running" if the service is still being delivered to consumers. Hovering over the workload's status in the grid will display the phase message for the update, offering additional insights into its update state.

    • The submission of inference updates is currently possible only via API (a sketch of an update call appears after this list). The following are the API fields that can be updated:

      • Command
      • Args
      • Image
      • imagePullPolicy
      • workingDir
      • createHomeDir
      • Probes
      • environmentVariables
      • Autoscaling
    • As long as the update process is not completed, GPUs are not allocated to the replicas of the new revision. This prevents the allocation of idle GPUs so that others are not deprived of using them.

    • If the update process is not completed within the default time limit of 10 minutes, it will automatically stop. At that point, all replicas of the new revision will be removed, and the original revision will continue to run normally.
    • The default time limit for updates is configurable. Consider setting a longer duration if your workload requires extended time to pull the image due to its size, if the workload takes additional time to reach a 'READY' state due to a long initialization process, or if your cluster depends on autoscaling to allocate resources for new replicas. For example, to set the time limit to 30 minutes, you can run the following command:
      kubectl patch ConfigMap config-deployment -n knative-serving --type='merge' -p '{"data": {"progress-deadline": "1800s"}}'
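
    The sketch below shows what such an update call with curl might look like. The endpoint path, HTTP method, field names and placeholders (RUNAI_URL, RUNAI_TOKEN, the workload ID and the new image) are illustrative assumptions only; confirm the exact path and payload in the Inferences API reference.

      # Assumed update endpoint - verify against the Inferences API reference
      curl -X PUT "https://$RUNAI_URL/api/v1/workloads/inferences/$WORKLOAD_ID" \
        -H "Authorization: Bearer $RUNAI_TOKEN" \
        -H "Content-Type: application/json" \
        -d '{
              "spec": { "image": "myrepo/my-model-server:v2" }
            }'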

    Inference workloads with Knative new behavior in v2.19

    Starting in version 2.19, all pods of a single Knative revision are grouped under a single Pod-Group. This means that when a new Knative revision is created:

    • It either succeeds in allocating the minimum number of pods; or
    • It fails and moves into a pending state, to retry again later to allocate all pods with their resources.

    The resources (GPUs, CPUs) are not occupied by a new Knative revision until it succeeds in allocating all pods. The older revision pods are then terminated and release their resources (GPUs, CPUs) back to the cluster to be used by other workloads.

    See Also

    \ No newline at end of file diff --git a/v2.20/Researcher/workloads/inference/nim-inference/index.html b/v2.20/Researcher/workloads/inference/nim-inference/index.html index 5229c0a7ed..4bcf36e8ca 100644 --- a/v2.20/Researcher/workloads/inference/nim-inference/index.html +++ b/v2.20/Researcher/workloads/inference/nim-inference/index.html @@ -1,4 +1,4 @@ - Deploy Inference Workloads with NVIDIA NIM - Run:ai Documentation Library

    Deploy inference workloads with NVIDIA NIM

    This article explains how to deploy a GenAI model from NVIDIA NIM as an inference workload via the Run:ai UI.

    An inference workload provides the setup and configuration needed to deploy your trained model for real-time or batch predictions. It includes specifications for the container image, data sets, network settings, and resource requests required to serve your models.

    The inference workload is assigned to a project and is affected by the project’s quota.

    To learn more about the inference workload type in Run:ai and determine whether it is the most suitable workload type for your goals, see Workload types.

    Creating a NIM inference workload

    Before you start, make sure you have a project.

    To add a new inference workload:

    1. Go to the Workload manager → Workloads
    2. Click +NEW WORKLOAD and select Inference
      Within the new inference form:
    3. Select under which cluster to create the inference workload
    4. Select the project in which your inference will run
    5. Select NIM from Inference type

      Note

      Selecting the Inference type is disabled by default. If you cannot see it in the menu, then it must be enabled by your Administrator, under General settings → Workloads → Models.

    6. Enter a unique name for the inference workload (if the name already exists in the project, you will be requested to submit a different name)

    7. Click CONTINUE
      In the next step:
    8. Select the NIM model and set how to access it

      • Set the model name by selecting a model or entering the model name as displayed in NIM
      • Set how the model profile should be selected

        A NIM model profile sets compatible model engines and criteria for engine selection, such as precision, latency, throughput optimization, and GPU requirements. Profiles are optimized to balance either latency or throughput, with quantized profiles (e.g., fp8) preferred to reduce memory usage and enhance performance.

        • Automatically (recommended) - NIM is designed to automatically select the most suitable profile from the list of compatible profiles based on the detected hardware. Each profile consists of different parameters that influence the selection process.
        • Manually
          • Enter profile name or hash
      • Optional: Modify who can access the inference serving endpoint

        • Public (default)

          Everyone within the network can access the endpoint with no authentication

        • All authenticated users

          Everyone within the organization’s account that can log in (to Run:ai or SSO)

        • Specific group(s)

          • Click +GROUP
          • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the endpoint.
        • Specific user(s)

          • Click +USER
          • Enter a valid email address or username. If you remove yourself, you will lose access to the endpoint.
    9. Select how to access the model store

      • From NVIDIA NGC - The model is downloaded when the workload starts running
        • Set the NVIDIA NGC API key
          • Enter a key
          • (Optional) Click Storage - When downloading a model from NVIDIA NGC, selecting storage is recommended.
            Select a data source where the model is already cached to reduce loading time or click +NEW DATA SOURCE to add a new data source to the gallery. This will cache the model and reduce loading time for future use. If there are issues with the connectivity to the cluster, or issues while creating the data source, the data source won't be available for selection.
            For a step-by-step guide on adding data sources to the gallery, see data sources.
            Once created, the new data source will be automatically selected.
      • From storage - The model is accessed directly and without being downloaded
        • Storage - Set where to load the model
          Select a data source or click +NEW DATA SOURCE to add a new data source to the gallery. If there are issues with the connectivity to the cluster, or issues while creating the data source, the data source won't be available for selection.
          For a step-by-step guide on adding data sources to the gallery, see data sources.
          Once created, the new data source will be automatically selected.
    10. Select the compute resource for your inference workload

      • Select a compute resource or click +NEW COMPUTE RESOURCE to add a new compute resource to the gallery.
        For a step-by-step guide on adding compute resources to the gallery, see compute resources. Once created, the new compute resource will be automatically selected.
      • Optional: Set the minimum and maximum number of replicas to be scaled up and down to meet the changing demands of inference services.
      • If the minimum and maximum number of replicas are different, autoscaling will be triggered and you'll need to set conditions for creating a new replica. A replica will be created every time a condition is met. When a condition is no longer met after a replica was created, the replica will be automatically deleted to save resources (for an illustrative sketch of this scaling logic, see the example after step 12).

        • Select a variable - The variable's values will be monitored via the container's port.
          • Latency (milliseconds)
          • Throughput (Requests/sec)
          • Concurrency (Requests)
        • Set a value - This value is the threshold at which autoscaling is triggered
      • Optional: Set when the replicas should be automatically scaled down to zero. This allows the compute resources to be freed up when the model is inactive (i.e., no requests are being sent). When automatic scaling to zero is enabled, the minimum number of replicas set in the previous step automatically changes to 0.

      • Optional: Set the order of priority for the node pools on which the scheduler tries to run the workload.
        When a workload is created, the scheduler will try to run it on the first node pool on the list. If the node pool doesn't have free resources, the scheduler will move on to the next one until it finds one that is available.
        • Drag and drop them to change the order, remove unwanted ones, or reset to the default order defined in the project.
        • Click +NODE POOL to add a new node pool from the list of node pools that were defined on the cluster.
          To configure a new node pool and for additional information, see node pools.
      • Select a node affinity to schedule the workload on a specific node type.
        If the administrator added a ‘node type (affinity)’ scheduling rule to the project/department, then this field is mandatory.
        Otherwise, entering a node type (affinity) is optional. Nodes must be tagged with a label that matches the node type key and value.
      • Optional: Set toleration(s) to allow the workload to be scheduled on a node with a matching taint

        Note

        Tolerations are disabled by default. If you cannot see Tolerations in the menu, it must be enabled by your Administrator under General settings → Workloads → Tolerations

        • Click +TOLERATION
        • Enter a key
        • Select the operator
          • Exists - If the key exists on the node, the effect will be applied.
          • Equals - If the key and the value set below match the key and value on the node, the effect will be applied
            • Enter a value matching the value on the node
        • Select the effect for the toleration
          • NoExecute - Pods that do not tolerate this taint are evicted immediately.
          • NoSchedule - No new pods will be scheduled on the tainted node unless they have a matching toleration. Pods currently running on the node will not be evicted.
          • PreferNoSchedule - The control plane will try to avoid placing a pod that does not tolerate the taint on the node, but it is not guaranteed.
          • Any - Matches any of the effects above.
    11. Optional - General settings:

      • Set the timeframe for auto-deletion after workload completion or failure - the time after which a completed or failed workload is deleted. If this field is set to 0 seconds, the workload is deleted automatically as soon as it completes or fails.
      • Set annotation(s)
        Kubernetes annotations are key-value pairs attached to the workload. They are used for storing additional descriptive metadata to enable documentation, monitoring and automation.
        • Click +ANNOTATION
        • Enter a name
        • Enter a value
      • Set label(s)
        Kubernetes labels are key-value pairs attached to the workload. They are used to categorize workloads and enable querying.
        • Enter a name
        • Enter a value
    12. Click CREATE INFERENCE
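    The autoscaling conditions above follow a simple rule: replicas are added while the monitored variable exceeds the threshold you set, and removed (down to the minimum, or to zero if scale-to-zero is enabled) when it no longer does. The minimal Python sketch below illustrates that logic for the Concurrency variable, assuming the threshold behaves as a per-replica target (as in Knative-style autoscaling); the function and values are illustrative only and are not part of the Run:ai platform.

    ```python
    import math

    def desired_replicas(observed_concurrency: float,
                         target_per_replica: float,
                         min_replicas: int,
                         max_replicas: int) -> int:
        """Illustrative only: approximate how a concurrency threshold maps to a
        replica count. The real decision is made by the platform's autoscaler."""
        if target_per_replica <= 0:
            raise ValueError("the threshold must be a positive number")
        # One replica per 'target_per_replica' concurrent requests, rounded up.
        needed = math.ceil(observed_concurrency / target_per_replica)
        # Clamp to the minimum/maximum replica bounds set in the form.
        return max(min_replicas, min(needed, max_replicas))

    # Example: 35 concurrent requests with a threshold of 10 requests per replica,
    # bounded between 0 (scale to zero enabled) and 8 replicas -> 4 replicas.
    print(desired_replicas(35, 10, min_replicas=0, max_replicas=8))
    ```

    With zero observed requests, the same formula returns the minimum number of replicas, which is 0 when scale-to-zero is enabled, matching the behavior described in the scale-to-zero option above.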

    Managing and monitoring

    After the inference workload is created, it is added to the Workloads table, where it can be managed and monitored.

    diff --git a/v2.20/Researcher/workloads/overviews/introduction-to-workloads/index.html b/v2.20/Researcher/workloads/overviews/introduction-to-workloads/index.html

    Introduction to Workloads - Run:ai Documentation Library

    Introduction to Workloads

    Run:ai enhances visibility and simplifies management by monitoring, presenting and orchestrating all AI workloads in the clusters it is installed on. Workloads are the fundamental building blocks for consuming resources, enabling AI practitioners such as researchers, data scientists and engineers to efficiently support the entire life cycle of an AI initiative.

    Workloads across the AI lifecycle

    A typical AI initiative progresses through several key stages, each with distinct workloads and objectives. With Run:ai, research and engineering teams can host and manage all these workloads to achieve the following:

    • Data preparation: Aggregating, cleaning, normalizing, and labeling data to prepare for training.
    • Training: Conducting resource-intensive model development and iterative performance optimization.
    • Fine-tuning: Adapting pre-trained models to domain-specific data sets while balancing efficiency and performance.
    • Inference: Deploying models for real-time or batch predictions with a focus on low latency and high throughput.
    • Monitoring and optimization: Ensuring ongoing performance by addressing data drift, usage patterns, and retraining as needed.

    What is a workload?

    A workload runs in the cluster, is associated with a namespace, and operates to fulfill its targets, whether that is running to completion for a batch job, allocating resources for experimentation in an integrated development environment (IDE)/notebook, or serving inference requests in production.

    The workload, defined by the AI practitioner, consists of:

    • Container images: This includes the application, its dependencies, and the runtime environment.
    • Compute resources: CPU, GPU, and RAM to execute efficiently and address the workload’s needs.
    • Data sets: The data needed for processing, such as training data sets or input from external databases.
    • Credentials: The access to certain data sources or external services, ensuring proper authentication and authorization.

    Workload scheduling and orchestration

    Run:ai’s core mission is to optimize AI resource usage at scale. This is achieved through efficient scheduling and orchestrating of all cluster workloads using the Run:ai Scheduler. The Scheduler allows the prioritization of workloads across different departments and projects within the organization at large scales, based on the resource distribution set by the system administrator.

    Run:ai and third-party workloads

    • Run:ai workloads: These workloads are submitted via the Run:ai platform. They are represented by Kubernetes Custom Resource Definitions (CRDs) and APIs. When using Run:ai workloads, a complete Workload and Scheduling Policy solution is offered for administrators to ensure optimizations, governance and security standards are applied.
    • Third-party workloads: These workloads are submitted via third-party applications that use the Run:ai Scheduler. The Run:ai platform manages and monitors these workloads. They enable seamless integrations with external tools, allowing teams and individuals flexibility.
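    Because Run:ai workloads are represented by Kubernetes CRDs, you can inspect which of these resource definitions are installed on a cluster. The sketch below uses the official Kubernetes Python client; the assumption that the relevant API groups contain run.ai reflects typical Run:ai deployments and should be verified against your own cluster.

    ```python
    from kubernetes import client, config

    def list_runai_crds() -> list[str]:
        """List CustomResourceDefinitions whose API group looks Run:ai-related.

        Assumes a working kubeconfig and that Run:ai CRD groups contain 'run.ai'
        (an assumption; verify the group names on your cluster).
        """
        config.load_kube_config()  # use the current kubeconfig context
        api = client.ApiextensionsV1Api()
        crds = api.list_custom_resource_definition()
        return [crd.metadata.name for crd in crds.items if "run.ai" in crd.spec.group]

    if __name__ == "__main__":
        for name in list_runai_crds():
            print(name)
    ```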

    Levels of support

    Different types of workloads have different levels of support. It is important to understand which capabilities you need before selecting the workload type to work with. The table below details the level of support for each workload type in Run:ai. Run:ai workloads are fully supported with all of Run:ai's advanced features and capabilities, while third-party workloads are partially supported. The list of capabilities can change between different Run:ai versions.

    Functionality | Training - Standard (Run:ai) | Workspace (Run:ai) | Inference (Run:ai) | Training - distributed (Run:ai) | Third-party workloads
    Fairness | v | v | v | v | v
    Priority and preemption | v | v | v | v | v
    Over quota | v | v | v | v | v
    Node pools | v | v | v | v | v
    Bin packing / Spread | v | v | v | v | v
    Multi-GPU fractions | v | v | v | v | v
    Multi-GPU dynamic fractions | v | v | v | v | v
    Node level scheduler | v | v | v | v | v
    Multi-GPU memory swap | v | v | v | v | v
    Elastic scaling | NA | NA | v | v | v
    Gang scheduling | v | v | v | v | v
    Monitoring | v | v | v | v | v
    RBAC | v | v | v | v | -
    Workload awareness | v | v | v | v | -
    Workload submission | v | v | v | v | -
    Workload actions (stop/run) | v | v | v | v | -
    Workload Policies | v | v | v | v | -
    Scheduling rules | v | v | v | v | -

    Note

    Workload awareness

    Specific workload-aware visibility, so that different pods are identified and treated as a single workload (for example GPU utilization, workload view, dashboards).

    diff --git a/v2.20/Researcher/workloads/overviews/managing-workloads/index.html b/v2.20/Researcher/workloads/overviews/managing-workloads/index.html

    Workloads - Run:ai Documentation Library

    Workloads

    This article explains the procedure for managing workloads.

    Workloads table

    The Workloads table can be found under Workload manager in the Run:ai platform.

    The Workloads table provides a list of all the workloads scheduled on the Run:ai Scheduler and allows you to manage them.

    The Workloads table consists of the following columns:

    Column | Description
    Workload | The name of the workload
    Type | The workload type
    Preemptible | Whether the workload is preemptible
    Status | The workload's current phase in its life cycle
    Project | The project in which the workload runs
    Department | The department that the workload is associated with. This column is visible only if the department toggle is enabled by your administrator
    Created by | The user who created the workload
    Running/requested pods | The number of running pods out of the requested number
    Creation time | The timestamp for when the workload was created
    Completion time | The timestamp at which the workload reached a terminal state (failed/completed)
    Connection(s) | The method by which you can access and interact with the running workload. It is essentially the "doorway" through which you can reach and use the tools the workload provides (e.g., node port, external URL). Click one of the values in the column to view the list of connections and their parameters
    Data source(s) | Data resources used by the workload
    Environment | The environment used by the workload
    Workload architecture | Standard or distributed. A standard workload consists of a single process. A distributed workload consists of multiple processes working together, which can run on different nodes
    GPU compute request | Amount of GPU devices requested
    GPU compute allocation | Amount of GPU devices allocated
    GPU memory request | Amount of GPU memory requested
    GPU memory allocation | Amount of GPU memory allocated
    Idle GPU devices | The number of allocated GPU devices that have been idle for more than 5 minutes
    CPU compute request | Amount of CPU cores requested
    CPU compute allocation | Amount of CPU cores allocated
    CPU memory request | Amount of CPU memory requested
    CPU memory allocation | Amount of CPU memory allocated
    Cluster | The cluster that the workload is associated with

    Workload status

    The following table describes the different phases in a workload life cycle. The UI provides additional details for some of the workload statuses below, which can be viewed by clicking the icon next to the status.

    Status | Description | Entry condition | Exit condition
    Creating | Workload setup is initiated in the cluster. Resources and pods are now provisioning | A workload is submitted | A multi-pod group is created
    Pending | Workload is queued and awaiting resource allocation | A pod group exists | All pods are scheduled
    Initializing | Workload is retrieving images, starting containers, and preparing pods | All pods are scheduled | All pods are initialized or a failure to initialize is detected
    Running | Workload is currently in progress with all pods operational | All pods initialized (all containers in pods are ready) | Workload completion or failure
    Degraded | Pods may not align with specifications, network services might be incomplete, or persistent volumes may be detached. Check your logs for specific details | Pending - All pods are running but have issues. Running - All pods are running with no issues | Running - All resources are OK. Completed - Workload finished with fewer resources. Failed - Workload failure or user-defined rules
    Deleting | Workload and its associated resources are being decommissioned from the cluster | Deleting the workload | Resources are fully deleted
    Stopped | Workload is on hold and resources are intact but inactive | Stopping the workload without deleting resources | Transitioning back to the initializing phase or proceeding to deleting the workload
    Failed | Image retrieval failed or containers experienced a crash. Check your logs for specific details | An error occurs preventing the successful completion of the workload | Terminal state
    Completed | Workload has successfully finished its execution | The workload has finished processing without errors | Terminal state
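    As the table shows, Completed and Failed are terminal states, while the other statuses describe a workload that is still moving through its life cycle. The minimal Python sketch below waits for a workload to reach a terminal state; get_workload_status is a hypothetical caller-supplied function (for example, a wrapper around the Workloads API mentioned later in this article), not part of any Run:ai SDK.

    ```python
    import time

    TERMINAL_STATUSES = {"Completed", "Failed"}  # terminal states from the table above

    def wait_until_terminal(get_workload_status, workload_id: str,
                            poll_seconds: int = 30, timeout_seconds: int = 3600) -> str:
        """Poll a workload's status until it reaches a terminal state or times out.

        `get_workload_status` is a hypothetical function supplied by the caller
        that returns the current status string for the given workload ID.
        """
        deadline = time.time() + timeout_seconds
        while time.time() < deadline:
            status = get_workload_status(workload_id)
            if status in TERMINAL_STATUSES:
                return status
            time.sleep(poll_seconds)
        raise TimeoutError(f"workload {workload_id} did not reach a terminal state in time")
    ```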

    Pods Associated with Workload

    Click one of the values in the Running/requested pods column, to view the list of pods and their parameters.

    Column | Description
    Pod | Pod name
    Status | Pod lifecycle stages
    Node | The node on which the pod resides
    Node pool | The node pool in which the pod resides (applicable if node pools are enabled)
    Image | The pod’s main image
    GPU compute allocation | Amount of GPU devices allocated for the pod
    GPU memory allocation | Amount of GPU memory allocated for the pod

    Connections Associated with Workload

    A connection refers to the method by which you can access and interact with the running workloads. It is essentially the "doorway" through which you can reach and use the applications (tools) these workloads provide.

    Click one of the values in the Connection(s) column, to view the list of connections and their parameters. Connections are network interfaces that communicate with the application running in the workload. Connections are either the URL the application exposes or the IP and the port of the node that the workload is running on.

    Column | Description
    Name | The name of the application running on the workload
    Connection type | The network connection type selected for the workload
    Access | Who is authorized to use this connection (everyone, specific groups/users)
    Address | The connection URL
    Copy button | Copy the URL to the clipboard
    Connect button | Enabled only for supported tools

    Data Sources Associated with Workload

    Click one of the values in the Data source(s) column, to view the list of data sources and their parameters.

    Column | Description
    Data source | The name of the data source mounted to the workload
    Type | The data source type

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
    • Refresh - Click REFRESH to update the table with the latest data
    • Show/Hide details - Click to view additional information on the selected row

    Show/Hide details

    Click a row in the Workloads table and then click the SHOW DETAILS button at the upper-right side of the action bar. The details pane appears, presenting the following tabs:

    Event History

    Displays the workload status over time. It displays events describing the workload lifecycle and alerts on notable events. Use the filter to search through the history for specific events.

    Metrics

    • GPU utilization
      A per-GPU graph and an average-of-all-GPUs graph, shown on the same chart over an adjustable period, let you see the trends of GPU compute utilization (percentage of GPU compute) for this workload.
    • GPU memory utilization
      A per-GPU graph and an average-of-all-GPUs graph, shown on the same chart over an adjustable period, let you see the trends of GPU memory usage (percentage of GPU memory) for this workload.
    • CPU compute utilization
      A graph of the average compute utilization across all CPU cores, over an adjustable period, lets you see the trend of CPU compute utilization (percentage of CPU compute) for this workload.
    • CPU memory utilization
      A single graph of memory utilization across all CPUs, over an adjustable period, lets you see the trend of CPU memory utilization (percentage of CPU memory) for this workload.
    • CPU memory usage
      A single graph of memory usage across all CPUs, over an adjustable period, lets you see the trend of CPU memory usage (in GB or MB of CPU memory) for this workload.

    • For GPUs charts - Click the GPU legend on the right-hand side of the chart, to activate or deactivate any of the GPU lines.

    • You can click the date picker to change the presented period
    • You can use your mouse to mark a sub-period in the graph for zooming in, and use Reset zoom to go back to the preset period
    • Changes in the period affect all graphs on this screen.

    Logs

    Workload events are listed in chronological order. The logs contain events from the workload’s lifecycle to help monitor and debug issues.

    Adding new workload

    Before starting, make sure you have created a project or have one created for you to work with workloads.

    To create a new workload:

    1. Click +NEW WORKLOAD
    2. Select a workload type - Follow the links below to view the step-by-step guide for each workload type:
      • Workspace. Used for data preparation and model-building tasks.
      • Training. Used for standard training tasks of all sorts
      • Distributed Training. Used for distributed tasks of all sorts
      • Inference. Used for inference and serving tasks
      • Job (legacy). This type is displayed only if enabled by your Administrator, under General settings → Workloads → Workload policies
    3. Click CREATE WORKLOAD

    Stopping a workload

    Stopping a workload kills the workload pods and releases the workload resources.

    1. Select the workload you want to stop
    2. Click STOP

    Running a workload

    Running a workload spins up new pods and resumes the workload after it was stopped.

    1. Select the workload you want to run again
    2. Click RUN

    Connecting to a workload

    To connect to an application running in the workload (for example, Jupyter Notebook):

    1. Select the workload you want to connect to
    2. Click CONNECT
    3. Select the tool from the drop-down list
    4. The selected tool is opened in a new tab on your browser

    Deleting a workload

    1. Select the workload you want to delete
    2. Click DELETE
    3. On the dialog, click DELETE to confirm the deletion

    Note

    Once a workload is deleted you can view it in the Deleted tab in the workloads view.
    This tab is displayed only if enabled by your Administrator, under General settings → Workloads → Deleted workloads

    Copy & Edit a workload

    1. Select the workload you want to copy and edit
    2. Click COPY & EDIT
    3. Update the workload and click CREATE WORKLOAD

    Using API

    Go to the Workloads API reference to view the available actions
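    For scripted access, the short Python example below sketches how listing workloads over the REST API might look. The base URL, endpoint path, token handling, and response shape are assumptions made for this illustration; confirm the exact routes and fields in the Workloads API reference for your platform version.

    ```python
    import os
    import requests

    BASE_URL = os.environ["RUNAI_BASE_URL"]   # your control-plane URL (assumed)
    TOKEN = os.environ["RUNAI_API_TOKEN"]     # bearer token for your application (assumed)

    def list_workloads() -> list[dict]:
        """Illustrative only: fetch workloads from an assumed REST endpoint.

        The '/api/v1/workloads' path is an assumption for this sketch; use the
        path documented in the Workloads API reference.
        """
        response = requests.get(
            f"{BASE_URL}/api/v1/workloads",
            headers={"Authorization": f"Bearer {TOKEN}"},
            timeout=30,
        )
        response.raise_for_status()
        return response.json().get("workloads", [])

    if __name__ == "__main__":
        for workload in list_workloads():
            print(workload.get("name"), workload.get("phase"))
    ```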

    Troubleshooting

    To understand the condition of a workload, review its status in the Workloads table. For more information, check the workload’s event history.

    Listed below are a number of known issues when working with workloads and how to fix them:

    Issue | Mediation
    Cluster connectivity issues ("there are issues with your connection to the cluster" error message) | Verify that you are on a network that has been granted access to the cluster. Reach out to your cluster admin for instructions on verifying this. If you are an admin, see the troubleshooting section in the cluster documentation
    Workload is in “Initializing” status for some time | Check that you have access to the container image registry. Check the statuses of the pods in the pods’ modal. Check the event history for more details
    Workload has been pending for some time | Check that you have the required quota. Check the project’s available quota in the project dialog. Check that all services needed to run are bound to the workload. Check the event history for more details
    PVCs created using the K8s API or kubectl are not visible or mountable in Run:ai | This is by design. Create a new data source of type PVC in the Run:ai UI; in the Data mount section, select Existing PVC; then select the PVC you created via the K8s API. You are now able to select and mount this PVC in your Run:ai submitted workloads (see the sketch after this table)
    Workload is not visible in the UI | Check that the workload hasn’t been deleted. See the “Deleted” tab in the workloads view
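    For the PVC row above: the PVC itself is still created with standard Kubernetes tooling and only then referenced as an Existing PVC data source in the Run:ai UI. The sketch below creates a simple PVC with the official Kubernetes Python client; the namespace, storage class, size, and access mode are placeholder assumptions to adjust for your environment.

    ```python
    from kubernetes import client, config

    def create_pvc(namespace: str = "runai-my-project",   # assumed project namespace
                   name: str = "my-dataset-pvc",
                   size: str = "100Gi",
                   storage_class: str = "standard") -> None:
        """Create a PVC that can later be selected as an 'Existing PVC' data source
        in the Run:ai UI (all values here are placeholders)."""
        config.load_kube_config()
        pvc = client.V1PersistentVolumeClaim(
            metadata=client.V1ObjectMeta(name=name),
            spec=client.V1PersistentVolumeClaimSpec(
                access_modes=["ReadWriteMany"],
                storage_class_name=storage_class,
                resources=client.V1ResourceRequirements(requests={"storage": size}),
            ),
        )
        client.CoreV1Api().create_namespaced_persistent_volume_claim(namespace, pvc)

    if __name__ == "__main__":
        create_pvc()
    ```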

    diff --git a/v2.20/Researcher/workloads/overviews/workload-types/index.html b/v2.20/Researcher/workloads/overviews/workload-types/index.html

    Workload Types - Run:ai Documentation Library

    Run:ai Workload Types

    In the world of machine learning (ML), the journey from raw data to actionable insights is a complex process that spans multiple stages. Each stage of the AI lifecycle requires different tools, resources, and frameworks to ensure optimal performance. Run:ai simplifies this process by offering specialized workload types tailored to each phase, facilitating a smooth transition across various stages of the ML workflows.

    The ML lifecycle usually begins with the experimental work on data and exploration of different modeling techniques to identify the best approach for accurate predictions. At this stage, resource consumption is usually moderate as experimentation is done on a smaller scale. As confidence grows in the model's potential and its accuracy, the demand for compute resources increases. This is especially true during the training phase, where vast amounts of data need to be processed, particularly with complex models such as large language models (LLMs), with their huge parameter sizes, that often require distributed training across multiple GPUs to handle the intensive computational load.

    Finally, once the model is ready, it moves to the inference stage, where it is deployed to make predictions on new, unseen data. Run:ai's workload types are designed to correspond with the natural stages of this lifecycle. They are structured to align with the specific resource and framework requirements of each phase, ensuring that AI researchers and data scientists can focus on advancing their models without worrying about infrastructure management.

    Run:ai offers three workload types that correspond to a specific phase of the researcher’s work:

    • Workspaces – For experimentation with data and models.
    • Training – For resource-intensive tasks such as model training and data preparation.
    • Inference – For deploying and serving the trained model.

    Workspaces: the experimentation phase

    The Workspace is where data scientists conduct initial research, experiment with different data sets, and test various algorithms. This is the most flexible stage in the ML lifecycle, where models and data are explored, tuned, and refined. The value of workspaces lies in the flexibility they offer, allowing the researcher to iterate quickly without being constrained by rigid infrastructure.

    • Framework flexibility

      Workspaces support a variety of machine learning frameworks, as researchers need to experiment with different tools and methods.

    • Resource requirements

      Workspaces are often lighter on resources compared to the training phase, but they still require significant computational power for data processing, analysis, and model iteration.

      Hence, by default, Run:ai schedules workspaces as non-preemptible workloads: once resources are allocated to a workspace, it cannot be preempted. However, this non-preemptible state does not allow the workspace to use resources beyond the project’s deserved quota.

    See Running workspaces to learn more about how to submit a workspace via the Run:ai platform. For quick starts, see Running Jupyter Notebook using workspaces.

    Training: scaling resources for model development

    As models mature and the need for more robust data processing and model training increases, Run:ai facilitates this shift through the Training workload. This phase is resource-intensive, often requiring distributed computing and high-performance clusters to process vast data sets and train models.

    • Training architecture

      For training workloads, Run:ai allows you to specify the architecture - standard or distributed. The distributed architecture is relevant for larger data sets and more complex models that require utilizing multiple nodes. For the distributed architecture, Run:ai allows you to specify different configurations for the master and workers and select which framework to use - PyTorch, XGBoost, MPI, and TensorFlow. In addition, as part of the distributed configuration, Run:ai enables researchers to schedule their distributed workloads on nodes within the same region, zone, placement group, or any other topology.

    • Resource requirements

      Training tasks demand high memory, compute power, and storage. Run:ai ensures that the allocated resources match the scale of the task and allows those workloads to utilize more compute resources than the project’s deserved quota. If you do not want your training workload to be preempted, make sure to request a number of GPUs that is within your project’s quota (a simple check is sketched below).

    See Standard training and Distributed training to learn more about how to submit a training workload via the Run:ai UI. For quick starts, see Run your first standard training and Run your first distributed training.
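    As a simple illustration of the quota guidance above (the numbers are placeholders; actual scheduling and preemption decisions are made by the Run:ai Scheduler):

    ```python
    def is_within_quota(requested_gpus: int, project_gpu_quota: int) -> bool:
        """Illustrative check: a training workload that requests no more GPUs than
        the project's deserved quota is not expected to be preempted, while
        requests above quota rely on over-quota (preemptible) resources."""
        return requested_gpus <= project_gpu_quota

    # Example: a project with a deserved quota of 8 GPUs.
    print(is_within_quota(requested_gpus=6, project_gpu_quota=8))   # True  -> within quota
    print(is_within_quota(requested_gpus=12, project_gpu_quota=8))  # False -> uses over-quota resources
    ```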

    Inference: deploying and serving models

    Once a model is trained and validated, it moves to the Inference stage, where it is deployed to make predictions (usually in a production environment). This phase is all about efficiency and responsiveness, as the model needs to serve real-time or batch predictions to end-users or other systems.

    • Inference-specific use cases

      Naturally, inference workloads must change and adapt to ever-changing demands in order to meet SLAs. For example, additional replicas may be deployed, manually or automatically, to increase compute resources as part of a horizontal scaling approach, or a new version of the deployment may need to be rolled out without affecting the running services.

    • Resource requirements

      Inference models differ in size and purpose, leading to varying computational requirements. For example, small OCR models can run efficiently on CPUs, whereas LLMs typically require significant GPU memory for deployment and serving. Inference workloads are considered production-critical and are given the highest priority to ensure compliance with SLAs. Additionally, Run:ai ensures that inference workloads cannot be preempted, maintaining consistent performance and reliability.

    See Deploy a custom inference workload to learn more about how to submit an inference workload via the Run:ai UI.

    \ No newline at end of file diff --git a/v2.20/Researcher/workloads/training/distributed-training/distributed-training/index.html b/v2.20/Researcher/workloads/training/distributed-training/distributed-training/index.html index 61c7f9505b..037b770d54 100644 --- a/v2.20/Researcher/workloads/training/distributed-training/distributed-training/index.html +++ b/v2.20/Researcher/workloads/training/distributed-training/distributed-training/index.html @@ -1,4 +1,4 @@ - Train Models Using a Distributed Training Workload - Run:ai Documentation Library

    Train models using a distributed training workload

    This article explains how to create a distributed training workload via the Run:ai UI.

    A training workload contains the setup and configuration needed for building your model, including the container, images, data sets, and resource requests, as well as the required tools for the research, all in a single place.

    The distributed training workload is assigned to a project and is affected by the project’s quota.

    To learn more about the distributed training workload type in Run:ai and determine that it is the most suitable workload type for your goals, see Workload types.

    Creating a distributed training workload

    Before you start, make sure you have a project.

    To add a new distributed training workload:

    1. Go to the Workload manager → Workloads
    2. Click +NEW WORKLOAD and select Training
      Within the new training form:
    3. Select under which cluster to create the training workload
    4. Select the project in which your training will run
    5. Set the training workload architecture as a distributed workload, which consists of multiple processes working together. These processes can run on different nodes. This workload type uses environments that support distributed training workloads only.

      • Set the framework for the distributed workload. Select from -

        • PyTorch
        • TensorFlow
        • XGBoost
        • MPI

        If one of the above frameworks is not enabled, see Distributed training prerequisites for details on how to enable it.

      • Set the distributed workload configuration that defines how distributed training workloads are divided across multiple machines or processes. Choose a configuration based on your training requirements and infrastructure -

        • Workers & master
        • Workers only
    6. Select a preconfigured template or select Start from scratch to launch a new training workload quickly

    7. Enter a unique name for the training workload (if the name already exists in the project, you will be requested to submit a different name)
    8. Click CONTINUE
      In the next step:
    9. Select the environment for your training workload
      • Select an environment or click +NEW ENVIRONMENT to add a new environment to the gallery.
        For a step-by-step guide on adding environments to the gallery, see Environments. Once created, the new environment will be automatically selected.
      • Set the connection for your tool(s). The tools are configured as part of the environment.
        • External URL
          • Custom URL
            • Set the URL
          • Optional: Modify who can access the tool:
            • All authenticated users (default)
              Everyone within the organization’s account
            • Specific group(s)
              • Click +GROUP
              • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the tool.
            • Specific user(s)
              • Click +USER
              • Enter a valid email address or username. If you remove yourself, you will lose access to the tool.
        • Node port
          • Custom port
            • Set the node port (enter a port between 30000 and 32767; if the node port is already in use, the workload will fail and display an error message)
      • Set the User ID (UID), Group ID (GID) and the supplementary groups that can run commands in the container
        • Enter UID
        • Enter GID
        • Add Supplementary groups (multiple groups can be added, separated by commas).
      • Optional: Set the command and arguments for the container running the workload
        If no command is added, the container will use the image’s default command (entry-point).
        • Modify the existing command or click +COMMAND & ARGUMENTS to add a new command.
        • Set multiple arguments separated by spaces, using the following format: --arg1=val1 (a sketch of how a container script can consume these arguments and environment variables follows this procedure).
      • Set the environment variable(s)
        • Modify the existing environment variable(s). The existing environment variables may include instructions to guide you with entering the correct values.
        • (Optional) Add new variables
          • Click +ENVIRONMENT VARIABLE
          • Enter a name
          • Select the source for the environment variable
            • Custom
              • Enter a value according to the provided instructions
            • Credentials - Select existing credentials as the environment variable
              • Select a credential name. To add new credentials to the credentials list, and for additional information, see Credentials.
              • Select a secret key
    10. Select the compute resource for your training workload

      • Set the number of workers for your workload
      • Select a compute resource or click +NEW COMPUTE RESOURCE to add a new compute resource to the gallery.
        For a step-by-step guide on adding compute resources to the gallery, see compute resources. Once created, the new compute resource will be automatically selected.
      • Optional: Set the order of priority for the node pools on which the scheduler tries to run the workload.
        When a workload is created, the scheduler will try to run it on the first node pool on the list. If the node pool doesn't have free resources, the scheduler will move on to the next one until it finds one that is available.
        • Drag and drop them to change the order, remove unwanted ones, or reset to the default order defined in the project.
        • Click +NODE POOL to add a new node pool from the list of node pools that were defined on the cluster.
          To configure a new node pool and for additional information, see node pools.
      • Select a node affinity to schedule the workload on a specific node type.
        If the administrator added a ‘node type (affinity)’ scheduling rule to the project/department, then this field is mandatory.
        Otherwise, entering a node type (affinity) is optional. Nodes must be tagged with a label that matches the node type key and value.
      • Optional: Set toleration(s) to allow the workload to be scheduled on a node with a matching taint

        Note

        Tolerations are disabled by default. If you cannot see Tolerations in the menu, they must be enabled by your Administrator under General settings → Workloads → Tolerations

        • Click +TOLERATION
        • Enter a key
        • Select the operator
          • Exists - If the key exists on the node, the effect will be applied.
          • Equals - If the key and the value set below match the value on the node, the effect will be applied.
            • Enter a value matching the value on the node
        • Select the effect for the toleration
          • NoExecute - Pods that do not tolerate this taint are evicted immediately.
          • NoSchedule - No new pods will be scheduled on the tainted node unless they have a matching toleration. Pods currently running on the node will not be evicted.
          • PreferNoSchedule - The control plane will try to avoid placing a pod that does not tolerate the taint on the node, but it is not guaranteed.
          • Any - All effects above match.
      • Optional: Set topology to let the workload be scheduled on nodes with a matching topology. Topology lets the workload be scheduled on nodes within the same region, zone, placement group, or any other topology you define.

        Note

        Setting topology is disabled by default. If you cannot see Topology in the menu, it must be enabled by your Administrator under General settings → Workloads → Topology

        • Click +TOPOLOGY
        • Enter a key
        • Select the operator
          • Required - If the scheduler can’t schedule all pods within the same topology, the workload will be pending.
          • Preferred - The scheduler will try to schedule all pods within the same topology but may schedule some pods on nodes that are not part of the same topology.
    11. Optional: Set the volume needed for your workload
      A volume allocates storage space to your workload that is persistent across restarts.

      • Click +VOLUME
      • Select the storage class
        • None - Proceed without defining a storage class.
        • Custom storage class - This option applies when selecting a storage class based on existing storage classes.
          To add new storage classes to the storage class list, and for additional information, see Kubernetes storage classes.
      • Select the access mode(s) (multiple modes can be selected)
        • Read-write by one node - The volume can be mounted as read-write by a single node.
        • Read-only by many nodes - The volume can be mounted as read-only by many nodes.
        • Read-write by many nodes - The volume can be mounted as read-write by many nodes.
      • Set the claim size and its units
      • Select the volume mode
        • File system (default) - This allows the volume to be mounted as a file system, enabling the usage of directories and files.
        • Block - This exposes the volume as a block storage, which can be formatted or used directly by applications without a file system.
      • Set the Container path with the volume target location
      • Set the volume persistency
        • Persistent - The volume and its data will be deleted only when the workload is deleted.
        • Ephemeral - The volume and its data will be deleted every time the workload’s status changes to “Stopped.”
    12. Optional: Select data sources for your training workload

      Select a data source or click +NEW DATA SOURCE to add a new data source to the gallery. If there are issues with the connectivity to the cluster, or issues while creating the data source, the data source won't be available for selection.
      For a step-by-step guide on adding data sources to the gallery, see data sources.
      Once created, the new data source will be automatically selected.

      • Optional: Modify the data target location for the selected data source(s).
    13. Optional - General settings:

      • Set the grace period for workload preemption. This is a buffer that allows a preempted workload to reach a safe checkpoint before it is forcibly preempted. Enter a timeframe between 0 sec and 5 min.
      • Set the backoff limit before workload failure. The backoff limit is the maximum number of retry attempts for failed workloads. After reaching the limit, the workload status will change to "Failed." Enter a value between 1 and 100.
      • Set the timeframe for auto-deletion after workload completion or failure. This is the time after which a completed or failed workload is deleted; if this field is set to 0 seconds, the workload is deleted immediately after it completes or fails.
      • Set annotation(s)
        Kubernetes annotations are key-value pairs attached to the workload. They are used for storing additional descriptive metadata to enable documentation, monitoring and automation.
        • Click +ANNOTATION
        • Enter a name
        • Enter a value
      • Set label(s)
        Kubernetes labels are key-value pairs attached to the workload. They are used for categorizing workloads to enable filtering and querying.
        • Enter a name
        • Enter a value
    14. Click CONTINUE
    15. Decide if you wish to define a different setup for the workers and the master via the toggle. When the toggle is disabled, the master inherits the workers’ setup.

      • If a different setup is required, repeat steps 9-13 above with the necessary changes.
    16. Click CREATE TRAINING
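
    As referenced in step 9, the sketch below illustrates how a container entrypoint might consume arguments passed in the --arg1=val1 format together with environment variables, including credentials-backed ones. Every argument and variable name here (epochs, lr, DATA_PATH, WANDB_API_KEY) is made up for the example.

      # Illustrative only: consumes "--arg1=val1" style arguments (step 9, command
      # and arguments) and environment variables, including one mapped from
      # credentials (step 9, environment variables). The names used here are
      # hypothetical, not values expected by Run:ai.
      import argparse
      import os

      def parse_args():
          parser = argparse.ArgumentParser(description="training entrypoint")
          parser.add_argument("--epochs", type=int, default=1)
          parser.add_argument("--lr", type=float, default=1e-3)
          return parser.parse_args()

      def main():
          args = parse_args()
          data_path = os.environ.get("DATA_PATH", "/data")   # plain variable
          api_key = os.environ.get("WANDB_API_KEY")          # credentials-backed secret
          print(f"training for {args.epochs} epochs, lr={args.lr}, data at {data_path}")
          if api_key is None:
              print("no API key configured; running without experiment tracking")

      if __name__ == "__main__":
          main()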

    Workload Policies

    When creating a new workload, fields and assets may have limitations or defaults. These rules and defaults are derived from a policy your administrator set.

    Policies allow you to control, standardize, and simplify the workload submission process. For additional information, see Workload Policies and Rules.

    The effects of the policy are reflected in the training creation form:

    • Defaults derived from the policy will be displayed automatically for specific fields.
    • Certain actions may be disabled, or values may be restricted to a certain range.
    • Rules and defaults for entire sections (such as environments, compute resources, or data sources) may prevent selection and will appear on the entire library card with an option for additional information via an external modal.

    Managing and monitoring

    After the training workload is created, it is added to the Workloads table, where it can be managed and monitored.

    Using CLI

    To view the available actions, see the distributed training workload commands in the CLI v2 reference or the CLI v1 reference.

    Using API

    To view the available actions, see the Distributed workload API reference.
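
    For orientation only, a request to submit a distributed training workload through the REST API could look roughly like the sketch below. The endpoint path, payload fields, and token handling are assumptions; the authoritative schema and authentication flow are in the Distributed workload API reference.

      # Hedged sketch of a REST submission. Endpoint path, field names and the
      # token are placeholders -- consult the Distributed workload API reference
      # for the exact schema supported by your control-plane version.
      import requests

      BASE_URL = "https://<company>.run.ai"   # placeholder control-plane URL
      TOKEN = "<api-token>"                   # obtained per the API documentation

      payload = {                             # illustrative field names only
          "name": "my-distributed-training",
          "projectId": "<project-id>",
          "clusterId": "<cluster-id>",
          "spec": {"image": "example.registry/train:latest", "numWorkers": 4},
      }

      response = requests.post(
          f"{BASE_URL}/api/v1/workloads/distributed",   # path is an assumption
          json=payload,
          headers={"Authorization": f"Bearer {TOKEN}"},
          timeout=30,
      )
      response.raise_for_status()
      print(response.json())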

    \ No newline at end of file diff --git a/v2.20/Researcher/workloads/training/distributed-training/quickstart-distributed-training/index.html b/v2.20/Researcher/workloads/training/distributed-training/quickstart-distributed-training/index.html index 7abec41b31..5445eca102 100644 --- a/v2.20/Researcher/workloads/training/distributed-training/quickstart-distributed-training/index.html +++ b/v2.20/Researcher/workloads/training/distributed-training/quickstart-distributed-training/index.html @@ -1,4 +1,4 @@ - Run your First Distributed Training - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/workloads/training/standard-training/quickstart-standard-training/index.html b/v2.20/Researcher/workloads/training/standard-training/quickstart-standard-training/index.html index 0733ac9551..e26bb02982 100644 --- a/v2.20/Researcher/workloads/training/standard-training/quickstart-standard-training/index.html +++ b/v2.20/Researcher/workloads/training/standard-training/quickstart-standard-training/index.html @@ -1,4 +1,4 @@ - Run your First Standard Training - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/workloads/training/standard-training/trainings-v2/index.html b/v2.20/Researcher/workloads/training/standard-training/trainings-v2/index.html index de8a754736..8d7f5894d8 100644 --- a/v2.20/Researcher/workloads/training/standard-training/trainings-v2/index.html +++ b/v2.20/Researcher/workloads/training/standard-training/trainings-v2/index.html @@ -1,4 +1,4 @@ - Train Models Using a Standard Training Workload - Run:ai Documentation Library

    Train models using a standard training workload

    This article explains how to create a standard training workload via the Run:ai UI.

    A training workload contains the setup and configuration needed for building your model, including the container, images, data sets, and resource requests, as well as the required tools for the research, all in a single place.

    The training workload is assigned to a project and is affected by the project’s quota.

    To learn more about the training workload type in Run:ai and determine that it is the most suitable workload type for your goals, see Workload types.

    Creating a standard training workload

    Before you start, make sure you have a project.

    To add a new training workload:

    1. Go to the Workload manager → Workloads
    2. Click +NEW WORKLOAD and select Training
      Within the new training form:
    3. Select under which cluster to create the training workload
    4. Select the project in which your training will run
    5. Set the training workload architecture as standard, which consists of a single main running process. This workload uses environments that support standard training workloads only.
    6. Select a preconfigured template or select Start from scratch to launch a new training workload quickly
    7. Enter a unique name for the training workload (if the name already exists in the project, you will be requested to submit a different name)
    8. Click CONTINUE
      In the next step:
    9. Select the environment for your training workload
      • Select an environment or click +NEW ENVIRONMENT to add a new environment to the gallery.
        For a step-by-step guide on adding environments to the gallery, see Environments. Once created, the new environment will be automatically selected.
      • Set the connection for your tool(s). The tools are configured as part of the environment.
        • External URL
          • Custom URL
            • Set the URL
          • Optional: Modify who can access the tool:
            • All authenticated users (default)
              Everyone within the organization’s account
            • Specific group(s)
              • Click +GROUP
              • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the tool.
            • Specific user(s)
              • Click +USER
              • Enter a valid email address or username. If you remove yourself, you will lose access to the tool.
        • Node port
          • Custom port
            • Set the node port (enter a port between 30000 and 32767; if the node port is already in use, the workload will fail and display an error message)
      • Set the User ID (UID), Group ID (GID) and the supplementary groups that can run commands in the container
        • Enter UID
        • Enter GID
        • Add Supplementary groups (multiple groups can be added, separated by commas).
      • Optional: Set the command and arguments for the container running the workload
        If no command is added, the container will use the image’s default command (entry-point).
        • Modify the existing command or click +COMMAND & ARGUMENTS to add a new command.
        • Set multiple arguments separated by spaces, using the following format: --arg1=val1.
      • Set the environment variable(s)
        • Modify the existing environment variable(s). The existing environment variables may include instructions to guide you with entering the correct values.
        • (Optional) Add new variables
          • Click +ENVIRONMENT VARIABLE
          • Enter a name
          • Select the source for the environment variable
            • Custom
              • Enter a value according to the provided instructions
            • Credentials - Select existing credentials as the environment variable
              • Select a credential name. To add new credentials to the credentials list, and for additional information, see Credentials.
              • Select a secret key
    10. Select the compute resource for your training workload

      • Select a compute resource or click +NEW COMPUTE RESOURCE to add a new compute resource to the gallery.
        For a step-by-step guide on adding compute resources to the gallery, see compute resources. Once created, the new compute resource will be automatically selected.
      • Optional: Set the order of priority for the node pools on which the scheduler tries to run the workload.
        When a workload is created, the scheduler will try to run it on the first node pool on the list. If the node pool doesn't have free resources, the scheduler will move on to the next one until it finds one that is available.
        • Drag and drop them to change the order, remove unwanted ones, or reset to the default order defined in the project.
        • Click +NODE POOL to add a new node pool from the list of node pools that were defined on the cluster.
          To configure a new node pool and for additional information, see node pools.
      • Select a node affinity to schedule the workload on a specific node type.
        If the administrator added a ‘node type (affinity)’ scheduling rule to the project/department, then this field is mandatory.
        Otherwise, entering a node type (affinity) is optional. Nodes must be tagged with a label that matches the node type key and value.
      • Optional: Set toleration(s) to allow the workload to be scheduled on a node with a matching taint

        Note

        Tolerations are disabled by default. If you cannot see Tolerations in the menu, they must be enabled by your Administrator under General settings → Workloads → Tolerations

        • Click +TOLERATION
        • Enter a key
        • Select the operator
          • Exists - If the key exists on the node, the effect will be applied.
          • Equals - If the key and the value set below match the value on the node, the effect will be applied.
            • Enter a value matching the value on the node
        • Select the effect for the toleration
          • NoExecute - Pods that do not tolerate this taint are evicted immediately.
          • NoSchedule - No new pods will be scheduled on the tainted node unless they have a matching toleration. Pods currently running on the node will not be evicted.
          • PreferNoSchedule - The control plane will try to avoid placing a pod that does not tolerate the taint on the node, but it is not guaranteed.
          • Any - All effects above match.
    11. Optional: Set the volume needed for your workload
      A volume allocates storage space to your workload that is persistent across restarts.

      • Click +VOLUME
      • Select the storage class
        • None - Proceed without defining a storage class.
        • Custom storage class - This option applies when selecting a storage class based on existing storage classes.
          To add new storage classes to the storage class list, and for additional information, see Kubernetes storage classes.
      • Select the access mode(s) (multiple modes can be selected)
        • Read-write by one node - The volume can be mounted as read-write by a single node.
        • Read-only by many nodes - The volume can be mounted as read-only by many nodes.
        • Read-write by many nodes - The volume can be mounted as read-write by many nodes.
      • Set the claim size and its units
      • Select the volume mode
        • File system (default) - This allows the volume to be mounted as a file system, enabling the usage of directories and files.
        • Block - This exposes the volume as a block storage, which can be formatted or used directly by applications without a file system.
      • Set the Container path with the volume target location
      • Set the volume persistency
        • Persistent - The volume and its data will be deleted only when the workload is deleted.
        • Ephemeral - The volume and its data will be deleted every time the workload’s status changes to “Stopped.”
    12. Optional: Select data sources for your training workload

      Select a data source or click +NEW DATA SOURCE to add a new data source to the gallery. If there are issues with the connectivity to the cluster, or issues while creating the data source, the data source won't be available for selection.
      For a step-by-step guide on adding data sources to the gallery, see data sources.
      Once created, the new data source will be automatically selected.

      • Optional: Modify the data target location for the selected data source(s).
    13. Optional - General settings:

      • Set the grace period for workload preemption. This is a buffer that allows a preempted workload to reach a safe checkpoint before it is forcibly preempted (a checkpointing sketch follows this procedure). Enter a timeframe between 0 sec and 5 min.
      • Set the number of runs the workload must finish to be considered complete. Multiple runs enhance the reliability and validity of the training results.
      • If the number of runs is above 1, enter a value under Parallelism to specify how many runs may be scheduled in parallel. The value must be less than or equal to the number of runs.
      • Set the backoff limit before workload failure. The backoff limit is the maximum number of retry attempts for failed workloads. After reaching the limit, the workload status will change to "Failed." Enter a value between 1 and 100.
      • Set the timeframe for auto-deletion after workload completion or failure. This is the time after which a completed or failed workload is deleted; if this field is set to 0 seconds, the workload is deleted immediately after it completes or fails.
      • Set annotation(s)
        Kubernetes annotations are key-value pairs attached to the workload. They are used for storing additional descriptive metadata to enable documentation, monitoring and automation.
        • Click +ANNOTATION
        • Enter a name
        • Enter a value
      • Set label(s)
        Kubernetes labels are key-value pairs attached to the workload. They are used for categorizing workloads to enable filtering and querying.
        • Enter a name
        • Enter a value
    14. Click CREATE TRAINING
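
    As referenced in the grace-period setting above, the sketch below shows one way a training script can cooperate with preemption: it writes periodic checkpoints to the volume or data source path configured in steps 11-12 and flushes a final one when it receives SIGTERM. The mount path, signal behavior, and checkpoint format are illustrative assumptions, not guarantees of how your cluster is configured.

      # Illustrative checkpointing sketch. Assumes the volume/data source from
      # steps 11-12 is mounted at CKPT_DIR and that the workload receives SIGTERM
      # when preempted, leaving the grace period from step 13 to save state.
      import os
      import signal
      import time

      CKPT_DIR = os.environ.get("CKPT_DIR", "/checkpoints")  # hypothetical container path
      stop_requested = False

      def handle_sigterm(signum, frame):
          global stop_requested
          stop_requested = True

      signal.signal(signal.SIGTERM, handle_sigterm)

      def save_checkpoint(step):
          os.makedirs(CKPT_DIR, exist_ok=True)
          with open(os.path.join(CKPT_DIR, "latest.txt"), "w") as f:
              f.write(str(step))

      def train():
          for step in range(1_000_000):
              time.sleep(0.01)                  # stand-in for one training step
              if step % 100 == 0 or stop_requested:
                  save_checkpoint(step)         # reach a safe point before preemption
              if stop_requested:
                  break

      if __name__ == "__main__":
          train()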

    Workload Policies

    When creating a new workload, fields and assets may have limitations or defaults. These rules and defaults are derived from a policy your administrator set.

    Policies allow you to control, standardize, and simplify the workload submission process. For additional information, see Workload Policies and Rules.

    The effects of the policy are reflected in the training creation form:

    • Defaults derived from the policy will be displayed automatically for specific fields.
    • Certain actions may be disabled, or values may be restricted to a certain range.
    • Rules and defaults for entire sections (such as environments, compute resources, or data sources) may prevent selection and will appear on the entire library card with an option for additional information via an external modal.

    Managing and monitoring

    After the training workload is created, it is added to the Workloads table, where it can be managed and monitored.

    Using CLI

    To view the available actions, see the training workload CLI v2 reference or the CLI v1 reference.

    Using API

    To view the available actions, see the Trainings workload API reference.

    \ No newline at end of file diff --git a/v2.20/Researcher/workloads/workspaces/quickstart-jupyter/index.html b/v2.20/Researcher/workloads/workspaces/quickstart-jupyter/index.html index 0b828ab376..81890a9b6d 100644 --- a/v2.20/Researcher/workloads/workspaces/quickstart-jupyter/index.html +++ b/v2.20/Researcher/workloads/workspaces/quickstart-jupyter/index.html @@ -1,4 +1,4 @@ - Running Jupyter Notebook Using Workspaces - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/Researcher/workloads/workspaces/workspace-v2/index.html b/v2.20/Researcher/workloads/workspaces/workspace-v2/index.html index ef0063d7de..3b537bd147 100644 --- a/v2.20/Researcher/workloads/workspaces/workspace-v2/index.html +++ b/v2.20/Researcher/workloads/workspaces/workspace-v2/index.html @@ -1,4 +1,4 @@ - Running Workspaces - Run:ai Documentation Library

    Running Workspaces

    This article explains how to create a workspace via the Run:ai UI.

    A workspace contains the setup and configuration needed for building your model, including the container, images, data sets, and resource requests, as well as the required tools for the research, all in a single place.

    To learn more about the workspace workload type in Run:ai and determine whether it is the most suitable workload type for your goals, see Workload types.

    Creating a new Workspace

    Before you start, make sure you have a project.

    To add a new workspace:

    1. Go to the Workload manager → Workloads
    2. Click +NEW WORKLOAD and select Workspace
      Within the new workspace form:
    3. Select under which cluster to create the workload
    4. Select the project in which your workspace will run
    5. Select a preconfigured template or select Start from scratch to launch a new workspace quickly
    6. Enter a unique name for the workspace (if the name already exists in the project, you will be requested to submit a different name)
    7. Click CONTINUE
      In the next step:
    8. Select the environment for your workspace

      • Select an environment or click +NEW ENVIRONMENT to add a new environment to the gallery.
        For a step-by-step guide on adding environments to the gallery, see Environments. Once created, the new environment will be automatically selected.
      • Set the connection for your tool(s). The tools are configured as part of the environment.
        • External URL
          • Custom URL
            • Set the URL
          • Optional: Modify who can access the tool:
            • All authenticated users (default)
              Everyone within the organization’s account
            • Specific group(s)
              • Click +GROUP
              • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the tool.
            • Specific user(s)
              • Click +USER
              • Enter a valid email address or username. If you remove yourself, you will lose access to the tool.
        • Node port
          • Custom port
            • Set the node port (enter a port between 30000 and 32767; if the node port is already in use, the workload will fail and display an error message)
      • Set the User ID (UID), Group ID (GID) and the supplementary groups that can run commands in the container
        • Enter UID
        • Enter GID
        • Add Supplementary groups (multiple groups can be added, separated by commas).
      • Optional: Set the command and arguments for the container running the workload.
        If no command is added, the container will use the image’s default command (entry-point).
        • Modify the existing command or click +COMMAND & ARGUMENTS to add a new command.
        • Set multiple arguments separated by spaces, using the following format: --arg1=val1.
      • Set the environment variable(s)
        • Modify the existing environment variable(s). The existing environment variables may include instructions to guide you in entering the correct values.
        • (Optional) Add new variables
        • Click +ENVIRONMENT VARIABLE
          • Enter a name
          • Select the source for the environment variable

            • Custom
              • Enter a value according to the provided instructions
            • Credentials - Select existing credentials as the environment variable
              • Select a credential name. To add new credentials to the credentials list, and for additional information, see Credentials.
              • Select a secret key
    9. Select the compute resource for your workspace

      • Select a compute resource or click +NEW COMPUTE RESOURCE to add a new compute resource to the gallery.
        For a step-by-step guide on adding compute resources to the gallery, see compute resources. Once created, the new compute resource will be automatically selected.
      • Optional: Set the order of priority for the node pools on which the scheduler tries to run the workload.
        When a workload is created, the scheduler will try to run it on the first node pool on the list. If the node pool doesn't have free resources, the scheduler will move on to the next one until it finds one that is available.
        • Drag and drop them to change the order, remove unwanted ones, or reset to the default order defined in the project.
        • Click +NODE POOL to add a new node pool from the list of node pools that were defined on the cluster.
          To configure a new node pool and for additional information, see node pools.
      • Select a node affinity to schedule the workload on a specific node type.
        If the administrator added a ‘node type (affinity)’ scheduling rule to the project/department, then this field is mandatory.
        Otherwise, entering a node type (affinity) is optional. Nodes must be tagged with a label that matches the node type key and value.
      • Optional: Set toleration(s) to allow the workload to be scheduled on a node with a matching taint (a toleration sketch follows this numbered list)

        Note

        Tolerations are disabled by default. If you cannot see Tolerations in the menu, ask your administrator to enable them under General settings → Workloads → Tolerations.

        • Click +TOLERATION
        • Enter a key
        • Select the operator
          • Exists - If the key exists on the node, the effect will be applied.
          • Equals - If the key and the value set below match the key and value on the node, the effect will be applied.
            • Enter a value matching the value on the node
        • Select the effect for the toleration
          • NoExecute - Pods that do not tolerate this taint are evicted immediately.
          • NoSchedule - No new pods will be scheduled on the tainted node unless they have a matching toleration. Pods currently running on the node will not be evicted.
          • PreferNoSchedule - The control plane will try to avoid placing a pod that does not tolerate the taint on the node, but it is not guaranteed.
          • Any - The toleration matches taints with any of the effects above.
    10. Optional: Set the volume needed for your workload
      A volume allocates storage space to your workload that is persistent across restarts.

      • Click +VOLUME
      • Select the storage class
        • None - Proceed without defining a storage class.
        • Custom storage class - This option applies when selecting a storage class based on existing storage classes.
          To add new storage classes to the storage class list, and for additional information, see Kubernetes storage classes
      • Select the access mode(s) (multiple modes can be selected)
        • Read-write by one node - The volume can be mounted as read-write by a single node.
        • Read-only by many nodes - The volume can be mounted as read-only by many nodes.
        • Read-write by many nodes - The volume can be mounted as read-write by many nodes.
      • Set the claim size and its units
      • Select the volume mode
        • File system (default) - This allows the volume to be mounted as a file system, enabling the usage of directories and files.
        • Block - This exposes the volume as a block storage, which can be formatted or used directly by applications without a file system.
      • Set the Container path with the volume target location
      • Set the volume persistency
        • Persistent - The volume and its data will be deleted only when the workload is deleted.
        • Ephemeral - The volume and its data will be deleted every time the workload’s status changes to “Stopped.”
    11. Optional: Select data sources for your workspace
      Select a data source or click +NEW DATA SOURCE to add a new data source to the gallery. If there are issues with the connectivity to the cluster, or issues while creating the data source, the data source won't be available for selection.
      For a step-by-step guide on adding data sources to the gallery, see data sources.
      Once created, the new data source will be automatically selected.
      • Optional: Modify the data target location for the selected data source(s).
    12. Optional - General settings:
      • Allow the workload to exceed the project quota. Workloads running over quota may be preempted and stopped at any time.
      • Set the backoff limit before workload failure. The backoff limit is the maximum number of retry attempts for failed workloads. After reaching the limit, the workload status will change to "Failed." Enter a value between 1 and 100.
      • Set the timeframe for auto-deletion after workload completion or failure. This is the time after which a completed or failed workload is deleted; if this field is set to 0 seconds, the workload is deleted automatically as soon as it completes or fails.
      • Set annotation(s)
        Kubernetes annotations are key-value pairs attached to the workload. They are used for storing additional descriptive metadata to enable documentation, monitoring and automation.
        • Click +ANNOTATION
        • Enter a name
        • Enter a value
      • Set label(s)
        Kubernetes labels are key-value pairs attached to the workload. They are used for categorizing to enable querying.
        • Enter a name
        • Enter a value
    13. Click CREATE WORKSPACE
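
    The toleration fields set in step 9 above correspond to a standard Kubernetes toleration, which the Run:ai UI generates for you. The following sketch, using the official kubernetes Python client, only illustrates the operator and effect semantics; the taint key and value are hypothetical.

    # Minimal sketch of the Kubernetes toleration the form fields map to.
    from kubernetes import client

    # "Equals" in the form corresponds to the Kubernetes operator "Equal":
    # the toleration matches a taint whose key AND value are identical.
    gpu_toleration = client.V1Toleration(
        key="dedicated",
        operator="Equal",        # use "Exists" to match on the key alone
        value="gpu-workloads",
        effect="NoSchedule",     # or "NoExecute" / "PreferNoSchedule"; omit to match any effect
    )

    # A pod spec carrying this toleration may be scheduled on nodes tainted with
    # dedicated=gpu-workloads:NoSchedule, which untolerated pods cannot use.
    pod_spec = client.V1PodSpec(
        containers=[client.V1Container(name="demo", image="busybox")],
        tolerations=[gpu_toleration],
    )
    print(pod_spec.tolerations)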

    Workload Policies

    When creating a new workload, fields and assets may have limitations or defaults. These rules and defaults are derived from a policy your administrator set.

    Policies allow you to control, standardize, and simplify the workload submission process. For additional information, see Workload Policies and Rules.

    The effects of the policy are reflected in the workspace creation form:

    • Defaults derived from the policy will be displayed automatically for specific fields.
    • Actions may be disabled, or values may be restricted to a certain range.
    • Rules and defaults for entire sections (such as environments, compute resources, or data sources) may prevent selection and are displayed on the entire library card, with an option to view additional information in an external dialog.

    Managing and monitoring

    After the workspace is created, it is added to the Workloads table, where it can be managed and monitored.

    Using CLI

    To view the available actions on workspaces, see the Workspaces CLI v2 reference or the CLI v1 reference.

    Using API

    To view the available actions on workspaces, see the Workspaces API reference.
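
    As a complement to the API reference, here is a hedged sketch of querying workspaces over REST with Python's requests library. The base URL, endpoint path, filter syntax, and response fields are assumptions for illustration only; the authoritative calls are in the Workspaces API reference.

    # Hedged sketch of listing workspace workloads through the REST API.
    # Endpoint path, query parameters, and response fields are assumptions.
    import requests

    BASE_URL = "https://myorg.run.ai"   # hypothetical control-plane URL
    TOKEN = "<api-token>"

    response = requests.get(
        f"{BASE_URL}/api/v1/workloads",             # assumed endpoint path
        headers={"Authorization": f"Bearer {TOKEN}"},
        params={"filterBy": "type==Workspace"},     # assumed filter syntax
        timeout=30,
    )
    response.raise_for_status()
    for workload in response.json().get("workloads", []):
        print(workload.get("name"), workload.get("phase"))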

    \ No newline at end of file diff --git a/v2.20/admin/authentication/accessrules/index.html b/v2.20/admin/authentication/accessrules/index.html index 6e83d48fca..790218b33f 100644 --- a/v2.20/admin/authentication/accessrules/index.html +++ b/v2.20/admin/authentication/accessrules/index.html @@ -1,4 +1,4 @@ - Access Rules - Run:ai Documentation Library

    Access Rules

    This article explains the procedure to manage Access rules.

    Access rules provide users, groups, or applications privileges to system entities.

    An access rule is the assignment of a role to a subject in a scope: <Subject> is a <Role> in a <Scope>.

    For example, user user@domain.com is a department admin in department A.

    Access rules table

    The Access rules table can be found under Access in the Run:ai platform.

    The Access rules table provides a list of all the access rules defined in the platform and allows you to manage them.

    Note

    Flexible management

    It is also possible to manage access rules directly for a specific user, application, project, or department.

    The Access rules table consists of the following columns:

    Column - Description
    Type - The type of subject assigned to the access rule (user, SSO group, or application).
    Subject - The user, SSO group, or application assigned with the role
    Role - The role assigned to the subject
    Scope - The scope to which the subject has access. Click the name of the scope to see the scope and its subordinates
    Authorized by - The user who granted the access rule
    Creation time - The timestamp for when the rule was created
    Last updated - The last time the access rule was updated

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.

    Adding new access rules

    To add a new access rule:

    1. Click +NEW ACCESS RULE
    2. Select the subject type: User, SSO Group, or Application
    3. Select or enter the subject identifier:
      • User email for a local user created in Run:ai, or for an SSO user as recognized by the IDP
      • Group name as recognized by the IDP
      • Application name as created in Run:ai
    4. Select a role
    5. Select a scope
    6. Click SAVE RULE

    Note

    An access rule consists of a single subject with a single role in a single scope. To assign multiple roles or multiple scopes to the same subject, multiple access rules must be added.

    Editing an access rule

    Access rules cannot be edited. To change an access rule, you must delete the rule, and then create a new rule to replace it.

    Deleting an access rule

    1. Select the access rule you want to delete
    2. Click DELETE
    3. On the dialog, click DELETE to confirm the deletion

    Using API

    Go to the Access rules API reference to view the available actions.
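
    As a complement to the API reference, the sketch below shows how an access rule (<Subject> is a <Role> in a <Scope>) might be created over REST with Python's requests library. The endpoint path and field names are assumptions for illustration; the authoritative schema is in the Access rules API reference.

    # Hedged sketch of creating an access rule through the REST API.
    # Endpoint path and field names are assumptions for illustration.
    import requests

    BASE_URL = "https://myorg.run.ai"   # hypothetical control-plane URL
    TOKEN = "<api-token>"

    access_rule = {
        "subjectType": "user",          # user, group, or app
        "subjectId": "user@domain.com",
        "roleId": 3,                    # e.g. the ID of the Department admin role
        "scopeType": "department",
        "scopeId": "<department-id>",
    }

    response = requests.post(
        f"{BASE_URL}/api/v1/authorization/access-rules",   # assumed endpoint path
        headers={"Authorization": f"Bearer {TOKEN}"},
        json=access_rule,
        timeout=30,
    )
    response.raise_for_status()
    print(response.json())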

    \ No newline at end of file diff --git a/v2.20/admin/authentication/applications/index.html b/v2.20/admin/authentication/applications/index.html index 3911e46b0a..55a043bf99 100644 --- a/v2.20/admin/authentication/applications/index.html +++ b/v2.20/admin/authentication/applications/index.html @@ -1,4 +1,4 @@ - Applications - Run:ai Documentation Library

    Applications

    This article explains the procedure to manage your organization's applications.

    Applications are used for API integrations with Run:ai. An application contains a client ID and a client secret. With the client credentials, you can obtain a token as detailed in API authentication and use it within subsequent API calls.
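
    As an illustration of the credentials flow described above, the following Python sketch exchanges an application's client ID and secret for a token and attaches it to subsequent calls. The token endpoint, grant type, and response field names are assumptions for illustration; follow the API authentication guide for the exact flow.

    # Hedged sketch of obtaining an API token with application credentials.
    # Endpoint, grant type, and response field names are assumptions.
    import requests

    BASE_URL = "https://myorg.run.ai"   # hypothetical control-plane URL

    token_response = requests.post(
        f"{BASE_URL}/api/v1/token",     # assumed endpoint path
        json={
            "grantType": "app_token",   # assumed grant type for applications
            "AppId": "<client-id>",
            "AppSecret": "<client-secret>",
        },
        timeout=30,
    )
    token_response.raise_for_status()
    access_token = token_response.json()["accessToken"]   # assumed response field

    # The token is then sent as a bearer token on subsequent API calls.
    headers = {"Authorization": f"Bearer {access_token}"}
    print(headers)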

    Applications are assigned Access Rules to manage permissions. For example, the application ci-pipeline-prod is assigned the Researcher role in Cluster: A.

    Applications table

    The Applications table can be found under Access in the Run:ai platform.

    The Applications table provides a list of all the applications defined in the platform, and allows you to manage them.

    The Applications table consists of the following columns:

    Column - Description
    Application - The name of the application
    Client ID - The client ID of the application
    Access rule(s) - The access rules assigned to the application
    Last login - The timestamp for the last time the application signed in
    Created by - The user who created the application
    Creation time - The timestamp for when the application was created
    Last updated - The last time the application was updated

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.

    Creating an application

    To create an application:

    1. Click +NEW APPLICATION
    2. Enter the application’s name
    3. Click CREATE
    4. Copy the Client ID and Client secret and store them securely
    5. Click DONE

    Note

    The client secret is visible only at the time of creation. It cannot be recovered but can be regenerated.

    Adding an access rule to an application

    To create an access rule:

    1. Select the application you want to add an access rule for
    2. Click ACCESS RULES
    3. Click +ACCESS RULE
    4. Select a role
    5. Select a scope
    6. Click SAVE RULE
    7. Click CLOSE

    Deleting an access rule from an application

    To delete an access rule:

    1. Select the application you want to remove an access rule from
    2. Click ACCESS RULES
    3. Find the access rule you would like to delete
    4. Click on the trash icon
    5. Click CLOSE

    Regenerating client secret

    To regenerate a client secret:

    1. Locate the application whose client secret you want to regenerate
    2. Click REGENERATE CLIENT SECRET
    3. Click REGENERATE
    4. Copy the New client secret and store it securely
    5. Click DONE

    Warning

    Regenerating a client secret revokes the previous one.

    Deleting an application

    1. Select the application you want to delete
    2. Click DELETE
    3. On the dialog, click DELETE to confirm

    Using API

    Go to the Applications and Access rules API references to view the available actions.

    \ No newline at end of file diff --git a/v2.20/admin/authentication/authentication-overview/index.html b/v2.20/admin/authentication/authentication-overview/index.html index 00a4573f1a..119c3364e9 100644 --- a/v2.20/admin/authentication/authentication-overview/index.html +++ b/v2.20/admin/authentication/authentication-overview/index.html @@ -1,4 +1,4 @@ - Overview - Run:ai Documentation Library

    Authentication & Authorization

    Run:ai Authentication & Authorization enables a streamlined experience for the user with precise controls covering the data each user can see and the actions each user can perform in the Run:ai platform.

    Authentication verifies user identity during login, and Authorization assigns the user with specific permissions according to the assigned access rules.

    Authenticated access is required to use all aspects of the Run:ai interfaces, including the Run:ai platform, the Run:ai Command Line Interface (CLI) and APIs.

    Authentication

    There are multiple methods to authenticate and access Run:ai.

    Single Sign-On (SSO)

    Single Sign-On (SSO) is the authentication method preferred by large organizations, as it avoids the need to manage duplicate sets of user identities.

    Run:ai offers SSO integration, enabling users to utilize existing organizational credentials to access Run:ai without requiring dedicated credentials.

    Run:ai supports three methods to set up SSO:

    When using SSO, it is highly recommended to manage at least one local user, as a breakglass account (an emergency account), in case access to SSO is not possible.

    Username and password

    Username and password access can be used when SSO integration is not possible.

    Secret key (for Application programmatic access)

    A Secret is the authentication method for Applications. Applications use the Run:ai APIs to perform automated tasks including scripts and pipelines based on their assigned access rules.

    Authorization

    The Run:ai platform uses Role-Based Access Control (RBAC) to manage authorization.

    Once a user or an application is authenticated, they can perform actions according to their assigned access rules.

    Role Based Access Control (RBAC) in Run:ai

    While Kubernetes RBAC is limited to a single cluster, Run:ai expands the scope of Kubernetes RBAC, making it easy for administrators to manage access rules across multiple clusters.

    RBAC at Run:ai is configured using access rules.

    An access rule is the assignment of a role to a subject in a scope: <Subject> is a <Role> in a <Scope>.

    • Subject
      • A user, a group, or an application assigned with the role
    • Role
      • A set of permissions that can be assigned to subjects
      • A permission is a set of actions (view, edit, create and delete) over a Run:ai entity (e.g. projects, workloads, users)
        • For example, a role might allow a user to create and read Projects, but not update or delete them
        • Roles at Run:ai are system defined and cannot be created, edited or deleted
    • Scope
      • A scope is part of an organization in which a set of permissions (roles) is effective. Scopes include Projects, Departments, Clusters, Account (all clusters).

    Below is an example of an access rule: username@company.com is a Department admin in Department: A

    \ No newline at end of file diff --git a/v2.20/admin/authentication/non-root-containers/index.html b/v2.20/admin/authentication/non-root-containers/index.html index f70dc9988d..e16ca6a1e1 100644 --- a/v2.20/admin/authentication/non-root-containers/index.html +++ b/v2.20/admin/authentication/non-root-containers/index.html @@ -1,4 +1,4 @@ - User Identity in Container - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/authentication/researcher-authentication/index.html b/v2.20/admin/authentication/researcher-authentication/index.html index 4367acddc2..513133db83 100644 --- a/v2.20/admin/authentication/researcher-authentication/index.html +++ b/v2.20/admin/authentication/researcher-authentication/index.html @@ -1,4 +1,4 @@ - Researcher Authentication - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/authentication/roles/index.html b/v2.20/admin/authentication/roles/index.html index 97eabc7372..39652c350a 100644 --- a/v2.20/admin/authentication/roles/index.html +++ b/v2.20/admin/authentication/roles/index.html @@ -1,4 +1,4 @@ - Roles - Run:ai Documentation Library

    Roles

    This article explains the available roles in the Run:ai platform.

    A role is a set of permissions that can be assigned to a subject in a scope.

    A permission is a set of actions (View, Edit, Create and Delete) over a Run:ai entity (e.g. projects, workloads, users).

    Roles table

    The Roles table can be found under Access in the Run:ai platform.

    The Roles table displays a list of predefined roles available to users in the Run:ai platform. It is not possible to create additional roles, or to edit or delete existing roles.

    The Roles table consists of the following columns:

    Column - Description
    Role - The name of the role
    Created by - The name of the role creator
    Creation time - The timestamp when the role was created

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.

    Reviewing a role

    1. To review a role, click the role name in the table
    2. In the role form review the following:
      • Role name
        The name of the role
      • Entity
        A system-managed object that can be viewed, edited, created or deleted by a user based on their assigned role and scope
      • Actions
        The actions that the role assignee is authorized to perform for each entity
        • View - If checked, an assigned user with this role can view instances of this type of entity within their defined scope
        • Edit - If checked, an assigned user with this role can change the settings of an instance of this type of entity within their defined scope
        • Create - If checked, an assigned user with this role can create new instances of this type of entity within their defined scope
        • Delete - If checked, an assigned user with this role can delete instances of this type of entity within their defined scope

    Roles in Run:ai

    Run:ai supports the following roles and their permissions:
    Under each role is a detailed list of the actions that the role assignee is authorized to perform for each entity.

    Compute resource administrator

    Data source administrator

    Data volume administrator

    Department administrator

    Department viewer

    Editor

    Environment administrator

    L1 researcher

    L2 researcher

    ML engineer

    Research manager

    System administrator

    Template administrator

    Viewer

    Notes

    Keep the following in mind when upgrading from versions 2.13 or earlier:

    • Admin becomes System Admin with full access to all managed objects and scopes
    • Research Manager is not automatically assigned to all projects, but to projects set by the relevant Admin when assigning this role to a user, group or app
    • To preserve backwards compatibility, users with the role of Research Manager are assigned to all current projects, but not to new projects
    • To allow the Department Admin to assign a Researcher role to a user, group or app, the Department Admin must have view, edit, create, and delete (VECD) permissions for jobs and workspaces. This creates a broader span of managed objects
    • To preserve backwards compatibility, users with the role of Editor are assigned to the same scope they had before the upgrade. However, with new user assignments, the Admin can limit the scope to only part of the organizational scope.

    Permitted workloads

    When assigning a role with any combination of the View, Edit, Create, and Delete permissions for workloads, the subject can manage not only Run:ai native workloads (Workspace, Training, Inference), but also the following third-party workloads:

    Using API

    Go to the Roles API reference to view the available actions.

    \ No newline at end of file diff --git a/v2.20/admin/authentication/sso/openidconnect/index.html b/v2.20/admin/authentication/sso/openidconnect/index.html index 1bc71b67b2..dcf5874299 100644 --- a/v2.20/admin/authentication/sso/openidconnect/index.html +++ b/v2.20/admin/authentication/sso/openidconnect/index.html @@ -1,4 +1,4 @@ - Setup SSO with OpenID Connect - Run:ai Documentation Library

    Setup SSO with OpenID Connect

    Single Sign-On (SSO) is an authentication scheme that allows users to log in to multiple, independent software systems with a single set of credentials.

    This article explains the procedure to configure single sign-on to Run:ai using the OpenID Connect protocol.

    Prerequisites

    Before starting, make sure you have the following available from your identity provider:

    • Discovery URL - the URL at which the OpenID server publishes its discovery information (see the verification sketch after this list).
    • Client ID - the ID used to identify the client with the Authorization Server.
    • Client Secret - a secret password that only the Client and Authorization server know.
    • Optional: Scopes - a set of user attributes to be used during authentication to authorize access to a user's details.
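
    Before entering these values in Run:ai, you can sanity-check the Discovery URL with the short Python sketch below (the verification sketch referenced above). Every OpenID Connect provider publishes a discovery document, typically at .../.well-known/openid-configuration; the URL shown is a placeholder for your identity provider.

    # Fetch and inspect the OpenID Connect discovery document.
    import requests

    discovery_url = "https://idp.example.com/.well-known/openid-configuration"  # placeholder

    doc = requests.get(discovery_url, timeout=10)
    doc.raise_for_status()
    config = doc.json()

    # Endpoints and supported scopes the SSO integration will rely on:
    print("issuer:                ", config["issuer"])
    print("authorization_endpoint:", config["authorization_endpoint"])
    print("token_endpoint:        ", config["token_endpoint"])
    print("supported scopes:      ", config.get("scopes_supported", []))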

    Setup

    Follow the steps below to set up SSO with OpenID Connect.

    Adding the identity provider

    1. Go to General settings
    2. Open the Security section and click +IDENTITY PROVIDER
    3. Select Custom OpenID Connect
    4. Enter the Discovery URL, Client ID, and Client Secret
    5. Copy the Redirect URL to be used in your identity provider
    6. Optional: Add the OIDC scopes
    7. Optional: Enter the user attributes and their value in the identity provider (see the user attributes table below)
    8. Click SAVE
      User attributes
    Attribute - Default value in Run:ai - Description
    User role groups - GROUPS - If it exists in the IDP, it allows you to assign Run:ai role groups via the IDP. The IDP attribute must be a list of strings.
    Linux User ID - UID - If it exists in the IDP, it allows Researcher containers to start with the Linux User UID. Used to map access to network resources such as file systems to users. The IDP attribute must be of type integer.
    Linux Group ID - GID - If it exists in the IDP, it allows Researcher containers to start with the Linux Group GID. The IDP attribute must be of type integer.
    Supplementary Groups - SUPPLEMENTARYGROUPS - If it exists in the IDP, it allows Researcher containers to start with the relevant Linux supplementary groups. The IDP attribute must be a list of integers.
    Email - email - Defines the user attribute in the IDP holding the user's email address, which is the user identifier in Run:ai
    User first name - firstName - Used as the user’s first name appearing in the Run:ai user interface
    User last name - lastName - Used as the user’s last name appearing in the Run:ai user interface

    Testing the setup

    1. Log in to the Run:ai platform as an admin
    2. Add Access Rules to an SSO user defined in the IDP
    3. Open the Run:ai platform in an incognito browser tab
    4. On the sign-in page, click CONTINUE WITH SSO
      You are redirected to the identity provider sign-in page
    5. On the identity provider sign-in page, log in with the SSO user to whom you granted access rules
    6. If you cannot sign in to the identity provider, follow the Troubleshooting section below

    Editing the identity provider

    You can view the identity provider details and edit its configuration:

    1. Go to General settings
    2. Open the Security section
    3. On the identity provider box, click Edit identity provider
    4. You can edit either the Discovery URL, Client ID, Client Secret, OIDC scopes, or the User attributes

    Removing the identity provider

    You can remove the identity provider configuration:

    1. Go to General settings
    2. Open the Security section
    3. On the identity provider card, click Remove identity provider
    4. In the dialog, click REMOVE to confirm the action

    Note

    To avoid losing access, removing the identity provider must be carried out by a local user.

    Troubleshooting

    If testing the setup was unsuccessful, try the different troubleshooting scenarios according to the error you received.

    Troubleshooting scenarios

    403 - Sorry, we can’t let you see this page. Something about permissions…

    Description: The authenticated user is missing permissions

    Mitigation:

    1. Validate that either the user or its related group(s) are assigned access rules
    2. Validate that the groups attribute is available in the configured OIDC scopes
    3. Validate that the user’s groups attribute is mapped correctly

    Advanced:

    1. Open the Chrome DevTools: right-click the page → Inspect → Console tab
    2. Run the following command to retrieve the user’s token: localStorage.token;
    3. Paste the token in https://jwt.io (or decode it locally with the sketch after the troubleshooting scenarios)
    4. Under the Payload section, validate the values of the user’s attributes
    401 - We’re having trouble identifying your account because your email is incorrect or can’t be found.

    Description: Authentication failed because the email attribute was not found.

    Mitigation:

    1. Validate that the email attribute is available in the configured OIDC scopes
    2. Validate that the user’s email attribute is mapped correctly
    Unexpected error when authenticating with identity provider

    Description: User authentication failed

    Mitigation:

    1. Validate that the configured OIDC Scopes exist and match the Identity Provider’s available scopes

    Advanced:

    1. Look for the specific error message in the URL address
    Unexpected error when authenticating with identity provider (SSO sign-in is not available)

    Description: User authentication failed

    Mitigation:

    1. Validate that the configured OIDC scope exists in the Identity Provider
    2. Validate that the configured Client Secret matches the Client Secret in the Identity Provider

    Advanced:

    1. Look for the specific error message in the URL address
    Client not found

    Description: OIDC Client ID was not found in the Identity Provider

    Mitigation:

    1. Validate that the configured Client ID matches the Identity Provider Client ID
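
    For the troubleshooting steps above that inspect the user's token, the following Python sketch decodes the JWT payload locally instead of pasting it into https://jwt.io. It does not verify the signature and is for inspection only; paste the value of localStorage.token into the placeholder. The attribute names checked are the Run:ai defaults from the user attributes table above.

    # Decode a JWT payload locally (no signature verification) to inspect
    # the email, GROUPS, UID, GID and SUPPLEMENTARYGROUPS claims.
    import base64
    import json

    def jwt_payload(token: str) -> dict:
        """Return the decoded payload segment of a JWT without verifying it."""
        payload_segment = token.split(".")[1]
        padding = "=" * (-len(payload_segment) % 4)   # restore base64url padding
        decoded = base64.urlsafe_b64decode(payload_segment + padding)
        return json.loads(decoded)

    token = "<paste the value of localStorage.token here>"   # placeholder
    claims = jwt_payload(token)
    for attribute in ("email", "GROUPS", "UID", "GID", "SUPPLEMENTARYGROUPS"):
        print(attribute, "=", claims.get(attribute))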

    \ No newline at end of file diff --git a/v2.20/admin/authentication/sso/openshift/index.html b/v2.20/admin/authentication/sso/openshift/index.html index 2ca19f81c3..53c3fa60c0 100644 --- a/v2.20/admin/authentication/sso/openshift/index.html +++ b/v2.20/admin/authentication/sso/openshift/index.html @@ -1,4 +1,4 @@ - Setup SSO with OpenShift - Run:ai Documentation Library

    Setup SSO with OpenShift

    Single Sign-On (SSO) is an authentication scheme, allowing users to log in with a single pair of credentials to multiple, independent software systems.

    This article explains the procedure to configure single sign-on to Run:ai using the OpenID Connect protocol in OpenShift V4.

    Prerequisites

    Before starting, make sure you have the following available from your OpenShift cluster:

    • An OpenShift OAuth client - including its Base URL, Client ID, and Client Secret

    Setup

    Follow the steps below to setup SSO with OpenShift.

    Adding the identity provider

    1. Go to General settings
    2. Open the Security section and click +IDENTITY PROVIDER
    3. Select OpenShift V4
    4. Enter the Base URL, Client ID, and Client Secret from your OpenShift OAuth client.
    5. Copy the Redirect URL to be used in your OpenShift OAuth client
    6. Optional: Enter the user attributes and their value in the identity provider (see the user attributes table below)
    7. Click SAVE
      User attributes
    Attribute Default value in Run:ai Description
    User role groups GROUPS If it exists in the IDP, it allows you to assign Run:ai role groups via the IDP. The IDP attribute must be a list of strings.
    Linux User ID UID If it exists in the IDP, it allows researcher containers to start with the Linux User UID. Used to map access to network resources such as file systems to users. The IDP attribute must be of type integer.
    Linux Group ID GID If it exists in the IDP, it allows researcher containers to start with the Linux Group GID. The IDP attribute must be of type integer.
    Supplementary Groups SUPPLEMENTARYGROUPS If it exists in the IDP, it allows researcher containers to start with the relevant Linux supplementary groups. The IDP attribute must be a list of integers.
    Email email Defines the user attribute in the IDP holding the user's email address, which is the user identifier in Run:ai
    User first name firstName Used as the user’s first name appearing in the Run:ai platform
    User last name lastName Used as the user’s last name appearing in the Run:ai platform

    Testing the setup

    1. Open the Run:ai platform as an admin
    2. Add Access Rules to an SSO user defined in the IDP
    3. Open the Run:ai platform in an incognito browser tab
    4. On the sign-in page click CONTINUE WITH SSO
      You are redirected to the OpenShift IDP sign-in page
    5. In the identity provider sign-in page, log in with the SSO user to whom you granted access rules
    6. If you are unsuccessful signing in to the identity provider, follow the Troubleshooting section below

    Editing the identity provider

    You can view the identity provider details and edit its configuration:

    1. Go to General settings
    2. Open the Security section
    3. On the identity provider box, click Edit identity provider
    4. You can edit either the Base URL, Client ID, Client Secret, or the User attributes

    Removing the identity provider

    You can remove the identity provider configuration:

    1. Go to General settings
    2. Open the Security section
    3. On the identity provider card, click Remove identity provider
    4. In the dialog, click REMOVE to confirm the action

    Note

    To avoid losing access, removing the identity provider must be carried out by a local user.

    Troubleshooting

    If testing the setup was unsuccessful, try the different troubleshooting scenarios according to the error you received.

    Troubleshooting scenarios

    403 - Sorry, we can’t let you see this page. Something about permissions…

    Description: The authenticated user is missing permissions

    Mitigation:

    1. Validate that the user or their related group(s) are assigned access rules
    2. Validate that the groups attribute is available in the configured OIDC scopes
    3. Validate that the user’s groups attribute is mapped correctly

    Advanced:

    1. Open the Chrome DevTools: right-click on the page → Inspect → Console tab
    2. Run the following command to retrieve and copy the user’s token: localStorage.token;
    3. Paste the token in https://jwt.io
    4. Under the Payload section, validate the values of the user’s attributes

    401 - We’re having trouble identifying your account because your email is incorrect or can’t be found.

    Description: Authentication failed because the email attribute was not found.

    Mitigation:

    1. Validate that the email attribute is available in the configured OIDC scopes
    2. Validate that the user’s email attribute is mapped correctly

    Unexpected error when authenticating with identity provider

    Description: User authentication failed

    Mitigation:

    1. Validate that the configured OIDC Scopes exist and match the Identity Provider’s available scopes

    Advanced:

    1. Look for the specific error message in the URL address

    Unexpected error when authenticating with identity provider (SSO sign-in is not available)

    Description: User authentication failed

    Mitigation:

    1. Validate that the configured OIDC scope exists in the Identity Provider
    2. Validate that the configured Client Secret matches the Client Secret value in the OAuthClient Kubernetes object.

    Advanced:

    1. Look for the specific error message in the URL address

    unauthorized_client

    Description: OIDC Client ID was not found in the OpenShift IDP

    Mitigation:

    1. Validate that the configured Client ID matches the value in the OAuthClient Kubernetes object.
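
    To compare these values, the OAuthClient object can be inspected directly with the OpenShift CLI. This is a minimal sketch; the client name runai is a placeholder for the OAuth client you created:

      # Show the OAuth client's secret and redirect URIs for comparison with the Run:ai configuration
      oc get oauthclient runai -o yaml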

    \ No newline at end of file diff --git a/v2.20/admin/authentication/sso/saml/index.html b/v2.20/admin/authentication/sso/saml/index.html index f82ad47b56..a2f8043e9a 100644 --- a/v2.20/admin/authentication/sso/saml/index.html +++ b/v2.20/admin/authentication/sso/saml/index.html @@ -1,4 +1,4 @@ - Setup SSO with SAML - Run:ai Documentation Library

    Setup SSO with SAML

    Single Sign-On (SSO) is an authentication scheme, allowing users to log in with a single pair of credentials to multiple, independent software systems.

    This article explains the procedure to configure SSO to Run:ai using the SAML 2.0 protocol.

    Prerequisites

    Before starting, ensure you have the following available from your identity provider:

    • SAML XML Metadata

    Setup

    Follow the steps below to setup SSO with SAML.

    Adding the identity provider

    1. Go to General settings
    2. Open the Security section and click +IDENTITY PROVIDER
    3. Select Custom SAML 2.0
    4. Select either From computer or From URL
      • From computer - click the Metadata XML file field, then select your file for upload
      • From URL - in the Metadata XML URL field, enter the URL to the XML Metadata file
    5. Copy the Redirect URL and Entity ID to be used in your identity provider
    6. Optional: Enter the user attributes and their value in the identity provider (see the user attributes table below)
    Attribute Default value in Run:ai Description
    User role groups GROUPS If it exists in the IDP, it allows you to assign Run:ai role groups via the IDP. The IDP attribute must be a list of strings.
    Linux User ID UID If it exists in the IDP, it allows Researcher containers to start with the Linux User UID. Used to map access to network resources such as file systems to users. The IDP attribute must be of type integer.
    Linux Group ID GID If it exists in the IDP, it allows Researcher containers to start with the Linux Group GID. The IDP attribute must be of type integer.
    Supplementary Groups SUPPLEMENTARYGROUPS If it exists in the IDP, it allows Researcher containers to start with the relevant Linux supplementary groups. The IDP attribute must be a list of integers.
    Email email Defines the user attribute in the IDP holding the user's email address, which is the user identifier in Run:ai.
    User first name firstName Used as the user’s first name appearing in the Run:ai platform.
    User last name lastName Used as the user’s last name appearing in the Run:ai platform.
    7. Click SAVE

    Testing the setup

    1. Open the Run:ai platform as an admin
    2. Add Access Rules to an SSO user defined in the IDP
    3. Open the Run:ai platform in an incognito browser tab
    4. On the sign-in page click CONTINUE WITH SSO.
      You are redirected to the identity provider sign in page
    5. In the identity provider sign-in page, log in with the SSO user to whom you granted access rules
    6. If you are unsuccessful signing in to the identity provider, follow the Troubleshooting section below

    Editing the identity provider

    You can view the identity provider details and edit its configuration:

    1. Go to General settings
    2. Open the Security section
    3. On the identity provider box, click Edit identity provider
    4. You can edit either the metadata file or the user attributes
    5. You can view the identity provider URL, identity provider entity ID, and the certificate expiration date

    Removing the identity provider

    You can remove the identity provider configuration:

    1. Go to General settings
    2. Open the Security section
    3. On the identity provider card, click Remove identity provider
    4. In the dialog, click REMOVE to confirm the action

    Note

    To avoid losing access, removing the identity provider must be carried out by a local user.

    Downloading the XML metadata file

    You can download the XML file to view the identity provider settings:

    1. Go to General settings
    2. Open the Security section
    3. On the identity provider card, click Download metadata XML file

    Troubleshooting

    If testing the setup was unsuccessful, try the different troubleshooting scenarios according to the error you received. If an error still occurs, check the advanced troubleshooting section.

    Troubleshooting scenarios

    Invalid signature in response from identity provider

    Description: After trying to log in, this message is received in the Run:ai log-in page.

    Mitigation:

    1. Go to General settings
    2. Open the Security section
    3. In the identity provider box, check for a “Certificate expired” error
    4. If it is expired, update the SAML metadata file to include a valid certificate

    401 - We’re having trouble identifying your account because your email is incorrect or can’t be found.

    Description: Authentication failed because email attribute was not found.

    Mitigation:

    1. Validate that the user’s email attribute is mapped correctly

    403 - Sorry, we can’t let you see this page. Something about permissions…

    Description: The authenticated user is missing permissions

    Mitigation:

    1. Validate that the user or their related group(s) are assigned access rules
    2. Validate that the user’s groups attribute is mapped correctly

    Advanced:

    1. Open the Chrome DevTools: right-click on the page → Inspect → Console tab
    2. Run the following command to retrieve and copy the user’s token: localStorage.token;
    3. Paste the token in https://jwt.io
    4. Under the Payload section, validate the values of the user’s attributes

    Advanced Troubleshooting

    Validating the SAML request

    The SAML login flow can be separated into two parts:

    • Run:ai redirects to the IDP for log-ins using a SAML Request
    • On successful log-in, the IDP redirects back to Run:ai with a SAML Response

    Validate the SAML Request to ensure the SAML flow works as expected:

    1. Go to the Run:ai login screen
    2. Open the Chrome Network inspector: Right-click → Inspect on the page → Network tab
    3. On the sign-in page click CONTINUE WITH SSO.
    4. Once redirected to the Identity Provider, search in the Chrome network inspector for an HTTP request showing the SAML Request. Depending on the IDP URL, this would be a request to the IDP domain name. For example, accounts.google.com/idp?1234.
    5. When found, go to the Payload tab and copy the value of the SAML Request
    6. Paste the value into a SAML decoder (e.g. https://www.samltool.com/decode.php)
    7. Validate the request:
      • The content of the <saml:Issuer> tag is the same as the Entity ID given when adding the identity provider
      • The content of the AssertionConsumerServiceURL is the same as the Redirect URL given when adding the identity provider
    8. Validate the response:
      • The user email under the <saml2:Subject> tag is the same as the logged-in user
      • Make sure that under the <saml2:AttributeStatement> tag, there is an Attribute named email (lowercase). This attribute is mandatory.
      • If other, optional user attributes (groups, firstName, lastName, uid, gid) are mapped make sure they also exist under <saml2:AttributeStatement> along with their respective values.

    \ No newline at end of file diff --git a/v2.20/admin/authentication/users/index.html b/v2.20/admin/authentication/users/index.html index 873767ab96..0858e72386 100644 --- a/v2.20/admin/authentication/users/index.html +++ b/v2.20/admin/authentication/users/index.html @@ -1,4 +1,4 @@ - Users - Run:ai Documentation Library

    Users

    This article explains the procedure to manage users and their permissions.

    Users can be managed locally or via the identity provider, and are assigned access rules to manage their permissions.

    For example, user user@domain.com is a department admin in department A.

    Users table

    The Users table can be found under Access in the Run:ai platform.

    The users table provides a list of all the users in the platform.
    You can manage local users and manage user permissions (access rules) for both local and SSO users.

    Note

    Single Sign-On users

    SSO users are managed by the identity provider and appear once they have signed in to Run:ai

    The Users table consists of the following columns:

    Column Description
    User The unique identity of the user (email address)
    Type The type of the user - SSO / local
    Last login The timestamp for the last time the user signed in
    Access rule(s) The access rules assigned to the user
    Created By The user who created the user
    Creation time The timestamp for when the user was created
    Last updated The last time the user was updated

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.

    Creating a local user

    To create a local user:

    1. Click +NEW LOCAL USER
    2. Enter the user’s Email address
    3. Click CREATE
    4. Review and copy the user’s credentials:
      • User Email
      • Temporary password to be used on first sign-in
    5. Click DONE

    Note

    The temporary password is visible only at the time of the user’s creation, and must be changed after the first sign-in

    Adding an access rule to a user

    To create an access rule:

    1. Select the user you want to add an access rule for
    2. Click ACCESS RULES
    3. Click +ACCESS RULE
    4. Select a role
    5. Select a scope
    6. Click SAVE RULE
    7. Click CLOSE

    Deleting a user’s access rule

    To delete an access rule:

    1. Select the user you want to remove an access rule from
    2. Click ACCESS RULES
    3. Find the access rule assigned to the user you would like to delete
    4. Click on the trash icon
    5. Click CLOSE

    Resetting a user password

    To reset a user’s password:

    1. Select the user whose password you want to reset
    2. Click RESET PASSWORD
    3. Click RESET
    4. Review and copy the user’s credentials:
      • User Email
      • Temporary password to be used on next sign-in
    5. Click DONE

    Deleting a user

    1. Select the user you want to delete
    2. Click DELETE
    3. In the dialog, click DELETE to confirm the deletion

    Note

    To ensure administrative operations are always available, at least one local user with System Administrator role should exist.

    Using API

    Go to the Users and Access rules API references to view the available actions

    \ No newline at end of file diff --git a/v2.20/admin/config/access-roles/index.html b/v2.20/admin/config/access-roles/index.html index 118609e2cb..dda57c49a6 100644 --- a/v2.20/admin/config/access-roles/index.html +++ b/v2.20/admin/config/access-roles/index.html @@ -1,4 +1,4 @@ - Review Kubernetes Access provided to Run:ai - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/config/admin-messages/index.html b/v2.20/admin/config/admin-messages/index.html index 7d0f787e37..a7a376984d 100644 --- a/v2.20/admin/config/admin-messages/index.html +++ b/v2.20/admin/config/admin-messages/index.html @@ -1,4 +1,4 @@ - Administrator Messages - Run:ai Documentation Library

    Administrator Messages

    System administrators can use Administrator messages to make announcements to users once they have logged in. These messages are typically used to keep users informed about different aspects of the platform.

    To configure an Administrator message:

    1. Press General settings.
    2. Expand the Message from administrator pane.
    3. Press Message.
    4. Enter your message in the text box. Use the formatting tools in the toolbar to add special formatting or links to the message.
    5. Enable the Display "Don't show this again" checkbox on message option to allow users to choose to see the message only once.
    6. Press Publish when complete.

    \ No newline at end of file diff --git a/v2.20/admin/config/advanced-cluster-config/index.html b/v2.20/admin/config/advanced-cluster-config/index.html index 6c54e39b91..d58a8ecee1 100644 --- a/v2.20/admin/config/advanced-cluster-config/index.html +++ b/v2.20/admin/config/advanced-cluster-config/index.html @@ -1,4 +1,4 @@ - Advanced Cluster Configuration - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/config/allow-external-access-to-containers/index.html b/v2.20/admin/config/allow-external-access-to-containers/index.html index 6a03882ca3..295391a7bc 100644 --- a/v2.20/admin/config/allow-external-access-to-containers/index.html +++ b/v2.20/admin/config/allow-external-access-to-containers/index.html @@ -1,4 +1,4 @@ - External access to Containers - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/config/cli-admin-install/index.html b/v2.20/admin/config/cli-admin-install/index.html index 5d430153f7..2eaceec585 100644 --- a/v2.20/admin/config/cli-admin-install/index.html +++ b/v2.20/admin/config/cli-admin-install/index.html @@ -1,4 +1,4 @@ - Install Administrator CLI - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/config/cluster-wide-pvc/index.html b/v2.20/admin/config/cluster-wide-pvc/index.html index 24c2547bae..657183ee81 100644 --- a/v2.20/admin/config/cluster-wide-pvc/index.html +++ b/v2.20/admin/config/cluster-wide-pvc/index.html @@ -1,4 +1,4 @@ - Setup cluster wide PVC - Run:ai Documentation Library

    Cluster wide PVCs

    A PersistentVolumeClaim (PVC) is a request for storage by a user. It is similar to a Pod. Pods consume node resources and PVCs consume PV resources. Pods can request specific levels of resources (CPU and Memory). Claims can request specific size and access modes. For more information about PVCs, see Persistent Volumes.

    PVCs are namespace-specific. If your PVC is relevant to all Run:ai Projects, do the following to propagate the PVC to all Projects:

    Create a PVC within the run:ai namespace, then run the following once to propagate the PVC to all run:ai Projects:

    kubectl label persistentvolumeclaims -n runai <PVC_NAME> runai/cluster-wide=true
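
    To confirm the command took effect, the label can be verified with kubectl; a minimal sketch, where the claim name is a placeholder:

      # Verify that the cluster-wide label was applied to the claim
      kubectl get pvc <PVC_NAME> -n runai --show-labels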
     

    To delete a PVC from all run:ai Projects, run:

    kubectl label persistentvolumeclaims -n runai <PVC_NAME> runai/cluster-wide-

    You can add a PVC to a job using the New job form.

    To add a PVC to a new job:

    1. On the New job form, press Storage.
    2. In Persistent Volume Claims press Add.
    3. Enable Existing PVC.
    4. Enter the name (claim name) of the PVC.
    5. Enter the storage class. (Optional)
    6. Enter the size.
    7. Enable / disable access modes.
    \ No newline at end of file diff --git a/v2.20/admin/config/clusters/index.html b/v2.20/admin/config/clusters/index.html index e523e7ea49..c0e5604fc1 100644 --- a/v2.20/admin/config/clusters/index.html +++ b/v2.20/admin/config/clusters/index.html @@ -1,4 +1,4 @@ - Clusters - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/config/create-k8s-assets-in-advance/index.html b/v2.20/admin/config/create-k8s-assets-in-advance/index.html index 411d7164e1..ce3511cf2f 100644 --- a/v2.20/admin/config/create-k8s-assets-in-advance/index.html +++ b/v2.20/admin/config/create-k8s-assets-in-advance/index.html @@ -1,4 +1,4 @@ - Mark Assets for Run:ai - Run:ai Documentation Library

    Creating Kubernetes Assets in Advance

    This article describes how to mark Kubernetes assets for use by Run:ai.

    Creating PVCs in advance

    Add PVCs in advance to be used when creating a PVC-type data source via the Run:ai UI.

    Follow the steps below for each required scope:

    Cluster scope

    1. Locate the PVC in the Run:ai namespace (runai)
    2. To authorize Run:ai to use the PVC, label it: run.ai/cluster-wide: "true" (see the kubectl sketch below)
      The PVC is now displayed for that scope in the list of existing PVCs.

    Department scope

    1. Locate the PVC in the Run:ai namespace (runai)
    2. To authorize Run:ai to use the PVC, label it: run.ai/department: "<department-id>"
      The PVC is now displayed for that scope in the list of existing PVCs.

    Project scope

    1. Locate the PVC in the project’s namespace
      The PVC is now displayed for that scope in the list of existing PVCs.
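
    As an illustration, the labels above can be applied with kubectl; a minimal sketch in which the PVC name and department ID are placeholders:

      # Cluster scope: authorize Run:ai to use a PVC located in the runai namespace
      kubectl label pvc <PVC_NAME> -n runai run.ai/cluster-wide=true
      # Department scope: authorize the PVC for a specific department
      kubectl label pvc <PVC_NAME> -n runai run.ai/department=<DEPARTMENT_ID>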

    Creating ConfigMaps in advance

    Add ConfigMaps in advance to be used when creating a ConfigMap-type data source via the Run:ai UI.

    Cluster scope

    1. Locate the ConfigMap in the Run:ai namespace (runai)
    2. To authorize Run:ai to use the ConfigMap, label it: run.ai/cluster-wide: "true" (see the kubectl sketch below)
    3. The ConfigMap must have a label of run.ai/resource: <resource-name>

      The ConfigMap is now displayed for that scope in the list of existing ConfigMaps.

    Department scope

    1. Locate the ConfigMap in the Run:ai namespace (runai)
    2. To authorize Run:ai to use the ConfigMap, label it: run.ai/department: "<department-id>"
    3. The ConfigMap must have a label of run.ai/resource: <resource-name>

      The ConfigMap is now displayed for that scope in the list of existing ConfigMaps.

    Project scope

    1. Locate the ConfigMap in the project’s namespace
    2. The ConfigMap must have a label of run.ai/resource: <resource-name>

      The ConfigMap is now displayed for that scope in the list of existing ConfigMaps.
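
    Similarly, a minimal kubectl sketch for the ConfigMap labels above; the ConfigMap, resource, and department names are placeholders:

      # Cluster scope: authorize Run:ai to use a ConfigMap located in the runai namespace
      kubectl label configmap <CONFIGMAP_NAME> -n runai run.ai/cluster-wide=true run.ai/resource=<RESOURCE_NAME>
      # Department scope: authorize the ConfigMap for a specific department
      kubectl label configmap <CONFIGMAP_NAME> -n runai run.ai/department=<DEPARTMENT_ID> run.ai/resource=<RESOURCE_NAME>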


    \ No newline at end of file diff --git a/v2.20/admin/config/default-scheduler/index.html b/v2.20/admin/config/default-scheduler/index.html index a14173b36c..82bf9ce705 100644 --- a/v2.20/admin/config/default-scheduler/index.html +++ b/v2.20/admin/config/default-scheduler/index.html @@ -1,4 +1,4 @@ - Set Default Scheduler - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/config/dr/index.html b/v2.20/admin/config/dr/index.html index fdefca133d..d2e02b900a 100644 --- a/v2.20/admin/config/dr/index.html +++ b/v2.20/admin/config/dr/index.html @@ -1,4 +1,4 @@ - Backup & Restore - Run:ai Documentation Library

    Backup & Restore

    Run:ai Cluster Restore

    This article explains how to restore a Run:ai cluster on a different Kubernetes environment.

    In the event of a critical Kubernetes failure, or if you want to migrate a Run:ai cluster to a new Kubernetes environment, simply reinstall the Run:ai cluster. Once you have reinstalled and reconnected the cluster, projects, workloads, and other cluster data are synced automatically.

    Run:ai cluster Advanced features and Customized deployment configurations are stored locally on the Kubernetes cluster. Backing them up and restoring them is optional and can be done separately.

    Backup

    As explained above, backing up this data is not required; the backup procedure is optional and relevant only for advanced deployments.

    Backup cluster configurations

    To backup Run:ai cluster configurations:

    1. Run the following command in your terminal:
      kubectl get runaiconfig runai -n runai -o yaml -o=jsonpath='{.spec}' > runaiconfig_backup.yaml
       
    2. Once the runaiconfig_backup.yaml backup file is created, save the file externally so that it can be retrieved later.

    Restore

    Follow the steps below to restore the Run:ai cluster on a new Kubernetes environment.

    Prerequisites

    Before restoring the Run:ai cluster, it is essential to validate that it is both disconnected and uninstalled.

    1. If the Kubernetes cluster is still available, uninstall the Run:ai cluster - make sure not to remove the cluster from the Control Plane
    2. Navigate to the Cluster page in the Run:ai platform
    3. Search for the cluster, and make sure its status is Disconnected

    Re-installing Run:ai Cluster

    1. Follow the Run:ai cluster installation instructions and ensure all prerequisites are met
    2. If you have a back-up of the cluster configurations, reload it once the installation is complete
      kubectl apply -f runaiconfig_backup.yaml -n runai
    3. Navigate to the Cluster page in the Run:ai platform
    4. Search for the cluster, and make sure its status is Connected

    Run:ai Control Plane

    The self-hosted variant of Run:ai also installs the control-plane at the customer site. As such, it becomes the responsibility of the IT organization to verify that the system is configured for proper backup and learn how to recover the data when needed.

    Database Storage

    Run:ai uses an internal PostgreSQL database. The database is stored on a Kubernetes Persistent Volume (PV). You must provide a backup solution for the database. Some options:

    • Backing up PostgreSQL itself, for example: kubectl -n runai-backend exec -it runai-backend-postgresql-0 -- env PGPASSWORD=password pg_dump -U postgres backend > cluster_name_db_backup.sql (a restore sketch is shown after this list).
    • Backing up the persistent volume holding the database storage.
    • Using third-party backup solutions.
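
    For completeness, a sketch of restoring from such a dump into the internal database; the pod name, credentials, database name, and file name mirror the backup example above and may differ in your deployment:

      # Restore a previously taken pg_dump into the internal PostgreSQL (all values are placeholders)
      kubectl -n runai-backend exec -i runai-backend-postgresql-0 -- \
        env PGPASSWORD=password psql -U postgres backend < cluster_name_db_backup.sql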

    Run:ai also supports an external PostgreSQL database. For details on how to configure an external database please contact Run:ai customer support.

    Metrics Storage

    Run:ai stores metric history using Thanos. Thanos is configured to store data on a persistent volume. The recommendation is to back up the PV.

    Backing up Control-Plane Configuration

    The installation of the Run:ai control plane can be customized. The configuration is provided as --set flags during the Helm installation. These changes are preserved on upgrade, but not on uninstall or upon damage to Kubernetes, so it is best to back up these customizations. To list the customizations used during the installation, run:

    helm get values runai-backend -n runai-backend
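
    To keep a copy of these values for later restoration, the output can be saved to a file; a minimal sketch, with a placeholder file name:

      helm get values runai-backend -n runai-backend -o yaml > runai-backend-values-backup.yaml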

    Recovery

    To recover Run:ai

    • Re-create the Kubernetes/OpenShift cluster.
    • Recover the persistent volumes for metrics and database.
    • Re-install the Run:ai control plane. Use the additional configuration previously saved and connect to the restored PostgreSQL PV. Connect Prometheus to the stored metrics PV.
    • Re-install the cluster. Add additional configuration post-install.
    • If the cluster is configured such that Projects do not create a namespace automatically, you will need to re-create namespaces and apply role bindings as discussed in Kubernetes or OpenShift.
    \ No newline at end of file diff --git a/v2.20/admin/config/ha/index.html b/v2.20/admin/config/ha/index.html index 38b26484b9..cd7250990f 100644 --- a/v2.20/admin/config/ha/index.html +++ b/v2.20/admin/config/ha/index.html @@ -1,4 +1,4 @@ - High Availability - Run:ai Documentation Library

    High Availability

    The purpose of this document is to explain how to configure Run:ai so that it continues to provide service even if parts of the system are down.

    A frequent failure scenario is a physical node in the system becoming non-responsive due to physical problems or lack of resources. In such a case, Kubernetes attempts to relocate the running pods, but the process may take time, during which Run:ai will be down.

    A different scenario is a high transaction load, leading to system overload. To address such a scenario, please review the article: scaling the Run:ai system.

    Run:ai Control Plane

    Run:ai system workers

    The Run:ai control plane allows the optional gathering of Run:ai pods into specific nodes. When this feature is used, it is important to set more than one node as a Run:ai system worker. Otherwise, the horizontal scaling described below will not span multiple nodes, and the system will remain with a single point of failure.

    Horizontal Scalability of Run:ai services

    Horizontal scalability is about instructing the system to create more pods to dynamically scale according to incoming load and downsize when the load subsides.

    The Run:ai control plane is running on a single Kubernetes namespace named runai-backend. The namespace contains various Kubernetes Deployments and StatefulSets. Each of these services can be scaled horizontally.

    Deployments

    Each of the Run:ai deployments can be set to scale up by adding a Helm setting on install/upgrade, for example --set frontend.autoscaling.enabled=true. For a full list of settings, please contact Run:ai customer support.
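
    For example, a minimal sketch of applying such a setting during a control plane upgrade; the chart reference is a placeholder, and --reuse-values keeps your existing installation values:

      helm upgrade runai-backend <CONTROL_PLANE_CHART> -n runai-backend --reuse-values \
        --set frontend.autoscaling.enabled=true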

    StatefulSets

    Run:ai uses three third-party components, which are managed as Kubernetes StatefulSets:

    • Keycloak—Stores the Run:ai authentication configuration as well as user identities. To scale Keycloak, use the Run:ai control-plane helm flag --set keycloakx.autoscaling.enabled=true. By default, Keycloak sets a minimum of 3 pods and will scale to more on transaction load.
    • PostgreSQL—It is not possible to configure an internal PostgreSQL to scale horizontally. If this is of importance, please contact Customer Support to understand how to connect Run:ai to an external PostgreSQL service which can be configured for high availability.
    • Thanos—To enable Thanos autoscaling, use the following Run:ai control-plane helm flags:
    --set thanos.query.autoscaling.enabled=true  
     --set thanos.query.autoscaling.maxReplicas=2
     --set thanos.query.autoscaling.minReplicas=2 

    Run:ai Cluster

    Run:ai system workers

    The Run:ai cluster allows the mandatory gathering of Run:ai pods into specific nodes. When this feature is used, it is important to set more than one node as a Run:ai system worker. Otherwise, the horizontal scaling described below may not span multiple nodes, and the system will remain with a single point of failure.

    Prometheus

    The default Prometheus installation uses a single pod replica. If the node running the pod is unresponsive, metrics will not be scraped from the cluster and will not be sent to the Run:ai control-plane.

    Prometheus supports high availability by allowing multiple instances to run. The tradeoff of this approach is that all instances scrape and send the same data. The Run:ai control plane identifies duplicate metric series and ignores them. This approach thus increases network, CPU, and memory consumption.

    To change the number of Prometheus instances, edit the runaiconfig as described under customizing the Run:ai cluster. Under prometheus.spec, set replicas to 2.
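
    A minimal sketch of such an edit using kubectl; the field path follows the description above, so verify it against your runaiconfig before applying:

      kubectl patch runaiconfig runai -n runai --type merge \
        -p '{"spec":{"prometheus":{"spec":{"replicas":2}}}}'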

    \ No newline at end of file diff --git a/v2.20/admin/config/large-clusters/index.html b/v2.20/admin/config/large-clusters/index.html index a7bd52b15b..5f4c7967c2 100644 --- a/v2.20/admin/config/large-clusters/index.html +++ b/v2.20/admin/config/large-clusters/index.html @@ -1,4 +1,4 @@ - Scaling - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/config/limit-to-node-group/index.html b/v2.20/admin/config/limit-to-node-group/index.html index 71d07c8fa1..6cdf314936 100644 --- a/v2.20/admin/config/limit-to-node-group/index.html +++ b/v2.20/admin/config/limit-to-node-group/index.html @@ -1,4 +1,4 @@ - Group Nodes - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/config/node-affinity-with-cloud-node-pools/index.html b/v2.20/admin/config/node-affinity-with-cloud-node-pools/index.html index 09f52b040a..fb65827524 100644 --- a/v2.20/admin/config/node-affinity-with-cloud-node-pools/index.html +++ b/v2.20/admin/config/node-affinity-with-cloud-node-pools/index.html @@ -1,4 +1,4 @@ - Node Affinity with Cloud Node Pools - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/config/node-roles/index.html b/v2.20/admin/config/node-roles/index.html index 7930c987ec..62b600445e 100644 --- a/v2.20/admin/config/node-roles/index.html +++ b/v2.20/admin/config/node-roles/index.html @@ -1,4 +1,4 @@ - Set Node Roles - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/config/notifications/index.html b/v2.20/admin/config/notifications/index.html index 88248d42c9..95cc284c32 100644 --- a/v2.20/admin/config/notifications/index.html +++ b/v2.20/admin/config/notifications/index.html @@ -1,4 +1,4 @@ - Notifications System - Run:ai Documentation Library

    Email and System Notifications

    Email Notifications for Data Scientists

    Managing numerous data science workloads requires monitoring various stages, including submission, scheduling, initialization, execution, and completion. Additionally, handling suspensions and failures is crucial for ensuring timely workload completion. Email Notifications address this need by sending alerts for critical workload life cycle changes. This empowers data scientists to take necessary actions and prevent delays.

    Setting Up Email Notifications

    Important

    The system administrator needs to enable and set up email notifications so that users are kept informed about different system statuses.

    To enable email notifications for the system:

    1. Press General settings, then select Notifications.

      Note

      For SaaS deployments, use the Enable email notifications toggle.

    2. In the SMTP Host field, enter the SMTP server address, and in the SMTP port field, enter the port number.

    3. Select an Authentication type (Plain or Login), then enter a username and password to be used for authentication.
    4. Enter the From email address and the Display name.
    5. Press Verify to ensure that the email configuration is working.
    6. Press Save when complete.

    System Notifications

    Administrators can set system-wide notifications for all users to alert them of important information. System notifications allow administrators to update users about events occurring within the Run:ai platform. The system notification appears at each login, or after the message has changed for users who are already logged in.

    To configure system notifications:

    1. Press General settings, then select Notifications.
    2. In the System notification pane, press +MESSAGE.
    3. Enter your message in the text box. Use the formatting tool bar to add special formats to your message text.
    4. Enable the "Don't show this again" option to allow users to opt out of seeing the message multiple times.
    5. When complete, press Save & Publish.

    \ No newline at end of file diff --git a/v2.20/admin/config/org-cert/index.html b/v2.20/admin/config/org-cert/index.html index 4b58e0113e..53df72ac7b 100644 --- a/v2.20/admin/config/org-cert/index.html +++ b/v2.20/admin/config/org-cert/index.html @@ -1,4 +1,4 @@ - Local Certificate Authority - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/config/overview/index.html b/v2.20/admin/config/overview/index.html index 3ce7deb40a..ebf96acec5 100644 --- a/v2.20/admin/config/overview/index.html +++ b/v2.20/admin/config/overview/index.html @@ -1,4 +1,4 @@ - Run:ai Configuration Articles - Run:ai Documentation Library

    Run:ai Configuration Articles

    This section provides a list of installation-related articles dealing with a wide range of subjects:

    Article Purpose
    Designating Specific Role Nodes Set one or more designated Run:ai system nodes or limit Run:ai monitoring and scheduling to specific nodes in the cluster.
    Create and Troubleshoot Clusters Create new clusters, view properties and status, and troubleshoot cluster connectivity related issues.
    Set Default Scheduler Set the default scheduler for a specific namespace
    Review Kubernetes Access provided to Run:ai In Restrictive Kubernetes environments such as when using OpenShift, understand and control what Kubernetes roles are provided to Run:ai
    External access to Containers Understand the available options for Researchers to access containers from the outside
    Install the Run:ai Administrator Command-line Interface The Administrator command-line is useful in a variety of flows such as cluster upgrade, node setup etc.
    Set Node affinity with cloud node pools Set node affinity when using a cloud provider for your cluster
    Local Certificate Authority For self-hosted Run:ai environments, specifically air-gapped installation, setup a local certificate authority to allow customers to safely connect to Run:ai
    Backup & Restore For self-hosted Run:ai environments, set up a scheduled backup of Run:ai data
    High Availability Configure Run:ai such that it will continue to provide service even if parts of the system are down.
    Scaling Scale the Run:ai cluster and the Run:ai control-plane to withstand large transaction loads
    Emails and system notification Configure e-mail notification

    \ No newline at end of file diff --git a/v2.20/admin/config/secure-cluster/index.html b/v2.20/admin/config/secure-cluster/index.html index c40d3ff46b..d685b85435 100644 --- a/v2.20/admin/config/secure-cluster/index.html +++ b/v2.20/admin/config/secure-cluster/index.html @@ -1,4 +1,4 @@ - Secure your Cluster - Run:ai Documentation Library

    Secure your cluster

    This article details the security considerations for deploying Run:ai. It is intended to help administrators and security officers understand the specific permissions required by Run:ai.

    Access to the Kubernetes cluster

    Run:ai integrates with Kubernetes clusters and requires specific permissions to operate successfully. These permissions are controlled with configuration flags that dictate how Run:ai interacts with cluster resources. Prior to installation, security teams can review the permissions and ensure they align with their organization’s policies.

    Run:ai provides various security-related permissions that can be customized to fit specific organizational needs. Below are brief descriptions of the key use cases for these customizations:

    Permission Use case
    Automatic Namespace creation Controls whether Run:ai automatically creates Kubernetes namespaces when new projects are created. Useful in environments where namespace creation must be strictly managed.
    Automatic user assignment Decides if users are automatically assigned to projects within Run:ai. Helps manage user access more tightly in certain compliance-driven environments.
    Secret propagation Determines whether Run:ai should propagate secrets across the cluster. Relevant for organizations with specific security protocols for managing sensitive data.
    Disabling Kubernetes limit range Chooses whether to disable the Kubernetes Limit Range feature. May be adjusted in environments with specific resource management needs.

    Note

    These security customizations allow organizations to tailor Run:ai to their specific needs. Settings should be changed cautiously, and only when necessary to meet particular security, compliance, or operational requirements.

    Secure installation

    Many organizations enforce IT compliance rules for Kubernetes, with strict access control for installing and running workloads. OpenShift uses Security Context Constraints (SCC) for this purpose. Run:ai fully supports SCC, ensuring integration with OpenShift's security requirements.

    Security vulnerabilities

    The platform is actively monitored for security vulnerabilities, with regular scans conducted to identify and address potential issues. Necessary fixes are applied to ensure that the software remains secure and resilient against emerging threats, providing a safe and reliable experience.

    \ No newline at end of file diff --git a/v2.20/admin/config/shared-storage/index.html b/v2.20/admin/config/shared-storage/index.html index a35f3182fd..d641cee656 100644 --- a/v2.20/admin/config/shared-storage/index.html +++ b/v2.20/admin/config/shared-storage/index.html @@ -1,4 +1,4 @@ - Shared Storage - Run:ai Documentation Library

    Shared Storage

    Shared storage is a critical component in AI and machine learning workflows, particularly in scenarios involving distributed training and shared datasets. In AI and ML environments, data must be readily accessible across multiple nodes, especially when training large models or working with vast datasets. Shared storage enables seamless access to data, ensuring that all nodes in a distributed training setup can read and write to the same datasets simultaneously. This setup not only enhances efficiency but is also crucial for maintaining consistency and speed in high-performance computing environments.

    While Run:ai Platform supports a variety of remote data sources, such as Git and S3, it is often more efficient to keep data close to the compute resources. This proximity is typically achieved through the use of shared storage, accessible to multiple nodes in your Kubernetes cluster.

    Shared storage

    When implementing shared storage in Kubernetes, there are two primary approaches: Kubernetes Storage Classes and direct NFS mounts, with Storage Classes being the recommended option.

    Run:ai Data Sources support both direct NFS mount and Kubernetes Storage Classes.

    Kubernetes storage classes

    Storage classes in Kubernetes define how storage is provisioned and managed. This allows you to select storage types optimized for AI workloads. For example, you can choose storage with high IOPS (Input/Output Operations Per Second) for rapid data access during intensive training sessions, or tiered storage options to balance cost and performance based on your organization’s requirements. This approach supports dynamic provisioning, enabling storage to be allocated on-demand as required by your applications.

    Run:ai data sources such as Persistent Volume Claims (PVC) and Data Volumes leverage storage classes to manage and allocate storage efficiently. This ensures that the most suitable storage option is always accessible, contributing to the efficiency and performance of AI workloads.
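
    For illustration, a minimal hedged sketch of a PVC requesting shared storage through a storage class (the storage class name, access mode, and size are placeholders; see the Kubernetes 1.23 limitation below):

    kubectl apply -f - <<EOF
    apiVersion: v1
    kind: PersistentVolumeClaim
    metadata:
      name: shared-datasets
    spec:
      storageClassName: <shared-storage-class>  # e.g. an NFS- or CSI-backed class
      accessModes:
        - ReadWriteMany                          # allows access from multiple nodes
      resources:
        requests:
          storage: 100Gi
    EOF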

    Note

    Run:ai lists all available storage classes in the Kubernetes cluster, making it easy for users to select the appropriate storage. Additionally, policies can be set to restrict or enforce the use of specific storage classes, to help maintain compliance with organizational standards and optimize resource utilization.

    Kubernetes 1.23 (old)

    When using Kubernetes 1.23, a Data Source of PVC type does not work with a Storage Class whose volumeBindingMode property is set to WaitForFirstConsumer.

    Direct NFS mount

    Direct NFS allows you to mount a shared file system directly across multiple nodes in your Kubernetes cluster. This method provides a straightforward way to share data among nodes and is often used for simple setups or when a dedicated NFS server is available.

    However, using NFS can present challenges related to security and control. Direct NFS setups might lack the fine-grained control and security features available with storage classes.

    \ No newline at end of file diff --git a/v2.20/admin/config/workload-ownership-protection/index.html b/v2.20/admin/config/workload-ownership-protection/index.html index baec057ef9..23f845350a 100644 --- a/v2.20/admin/config/workload-ownership-protection/index.html +++ b/v2.20/admin/config/workload-ownership-protection/index.html @@ -1,4 +1,4 @@ - Workload Deletion Protection - Run:ai Documentation Library

    Workload Deletion Protection

    Workload deletion protection in Run:ai ensures that only the user who created a workload can delete or modify it. This feature is designed to safeguard important jobs and configurations from accidental or unauthorized modifications by users who did not originally create the workload.

    By enforcing ownership rules, Run:ai helps maintain the integrity and security of your machine learning operations. This additional layer of security ensures that only users with the appropriate permissions can delete and suspend workloads.

    The protection feature is implemented at the cluster level.

    To enable deletion protection run the following command:

    kubectl patch -n runai runaiconfigs.run.ai/runai --type='merge' --patch '{"spec":{"global":{"enableWorkloadOwnershipProtection":true}}}'
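
    To turn the protection off again, the same command can be run with the flag set back to false (a minimal sketch based on the command above):

    kubectl patch -n runai runaiconfigs.run.ai/runai --type='merge' --patch '{"spec":{"global":{"enableWorkloadOwnershipProtection":false}}}'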
    \ No newline at end of file diff --git a/v2.20/admin/maintenance/alert-monitoring/index.html b/v2.20/admin/maintenance/alert-monitoring/index.html index 8c1dfaa30a..c6e24770f3 100644 --- a/v2.20/admin/maintenance/alert-monitoring/index.html +++ b/v2.20/admin/maintenance/alert-monitoring/index.html @@ -1,4 +1,4 @@ - System Monitoring - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/maintenance/audit-log/index.html b/v2.20/admin/maintenance/audit-log/index.html index 56234dd681..442def6a6a 100644 --- a/v2.20/admin/maintenance/audit-log/index.html +++ b/v2.20/admin/maintenance/audit-log/index.html @@ -1,4 +1,4 @@ - Audit Log - Run:ai Documentation Library

    Audit Log

    This article provides details about Run:ai’s Audit log.
    The Run:ai control plane provides the audit log API and the event history table in the Run:ai UI. Both reflect the same information regarding changes to business objects: clusters, projects, assets, etc.

    Events history table

    The Events history table can be found under Event history in the Run:ai UI.

    The Event history table consists of the following columns:

    Column Description
    Subject The name of the subject
    Subject type The user or application assigned with the role
    Source IP The IP address of the subject
    Date & time The exact timestamp at which the event occurred. Format dd/mm/yyyy for date and hh:mm am/pm for time.
    Event The type of the event. Possible values: Create, Update, Delete, Login
    Event ID Internal event ID, can be used for support purposes
    Status The outcome of the logged operation. Possible values: Succeeded, Failed
    Entity type The type of the logged business object.
    Entity name The name of the logged business object.
    Entity ID The system's internal ID of the logged business object.
    URL The endpoint or address that was accessed during the logged event.
    HTTP Method The HTTP operation method used for the request. Possible values include standard HTTP methods such as GET, POST, PUT, DELETE, indicating what kind of action was performed on the specified URL.

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click Download as CSV or Download as JSON

    Using the event history date selector

    The Event history table saves events for the last 90 days. However, the table itself presents up to the last 30 days of information due to the potentially very high number of operations that might be logged during this period.

    To view older events, or to refine your search for more specific results or fewer results, use the time selector and change the period you search for.
    You can also refine your search by clicking and using ADD FILTER accordingly.

    Using API

    Go to the Audit log API reference to view the available actions.
    Since the amount of data is not trivial, the API is based on paging. It retrieves a specified number of items for each API call. You can get more data by using subsequent calls.
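
    For example, a hedged sketch of paging through the API with curl (the endpoint path, paging parameter names, and authentication header are placeholders; see the Audit log API reference for the actual values):

    curl -s -H "Authorization: Bearer <API-TOKEN>" \
        "https://<control-plane-url>/<audit-log-endpoint>?offset=0&limit=100"
    # Increase the offset (0, 100, 200, ...) in subsequent calls to retrieve the next pages.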

    Limitations

    Submissions of workloads are not audited. As a result, the system does not track or log details of workload submissions, such as timestamps or user activity.

    \ No newline at end of file diff --git a/v2.20/admin/maintenance/node-downtime/index.html b/v2.20/admin/maintenance/node-downtime/index.html index b960df19ec..dfa767b6ab 100644 --- a/v2.20/admin/maintenance/node-downtime/index.html +++ b/v2.20/admin/maintenance/node-downtime/index.html @@ -1,4 +1,4 @@ - Node Maintenance - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/maintenance/overview/index.html b/v2.20/admin/maintenance/overview/index.html index 335c913cb5..b205c86da9 100644 --- a/v2.20/admin/maintenance/overview/index.html +++ b/v2.20/admin/maintenance/overview/index.html @@ -1,4 +1,4 @@ - Monitoring and maintenance Overview - Run:ai Documentation Library

    Monitoring and maintenance Overview

    Deploying Run:ai in mission-critical environments requires proper monitoring and maintenance of resources to ensure workloads run and are deployed as expected.

    Details on how to monitor different parts of the physical resources in your Kubernetes system, including clusters and nodes, can be found in the monitoring and maintenance section. Adjacent configuration and troubleshooting sections also cover high availability, restoring and securing clusters, collecting logs, and reviewing audit logs to meet compliance requirements.

    In addition to monitoring Run:ai resources, it is also highly recommended to monitor the environment that Run:ai runs on, namely Kubernetes, which manages containerized applications. In particular, focus on three main layers:

    Run:ai Control Plane and cluster services

    This is the highest layer and includes the Run:ai components, which run as pods in containers managed by Kubernetes.

    Kubernetes cluster

    This layer includes the main Kubernetes system that runs and manages Run:ai components. Important elements to monitor include:

    • The health of the cluster and nodes (machines in the cluster).
    • The status of key Kubernetes services, such as the API server. For detailed information on managing clusters, see the official Kubernetes documentation.

    Host infrastructure

    This is the base layer, representing the actual machines (virtual or physical) that make up the cluster. IT teams need to handle:

    • Managing CPU, memory, and storage
    • Keeping the operating system updated
    • Setting up the network and balancing the load

    Run:ai does not require any special configurations at this level.

    The articles below explain how to monitor these layers, maintain system security and compliance, and ensure the reliable operation of Run:ai in critical environments.

    \ No newline at end of file diff --git a/v2.20/admin/overview-administrator/index.html b/v2.20/admin/overview-administrator/index.html index 83eeb8c470..207d3635cd 100644 --- a/v2.20/admin/overview-administrator/index.html +++ b/v2.20/admin/overview-administrator/index.html @@ -1,4 +1,4 @@ - Overview - Run:ai Documentation Library

    Overview: Infrastructure Administrator

    The Infrastructure Administrator is an IT person responsible for the installation, setup, and IT maintenance of the Run:ai product.

    As part of the Infrastructure Administrator documentation you will find:

    \ No newline at end of file diff --git a/v2.20/admin/researcher-setup/cli-install/index.html b/v2.20/admin/researcher-setup/cli-install/index.html index afa710aa0a..945314d2e4 100644 --- a/v2.20/admin/researcher-setup/cli-install/index.html +++ b/v2.20/admin/researcher-setup/cli-install/index.html @@ -1,4 +1,4 @@ - Install the V1 CLI - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/researcher-setup/docker-to-runai/index.html b/v2.20/admin/researcher-setup/docker-to-runai/index.html index 63f67494c6..e61fb25906 100644 --- a/v2.20/admin/researcher-setup/docker-to-runai/index.html +++ b/v2.20/admin/researcher-setup/docker-to-runai/index.html @@ -1,4 +1,4 @@ - From Docker to Run:ai - Run:ai Documentation Library

    From Docker to Run:ai

    Dockers, Images, and Kubernetes

    Researchers are typically proficient in working with Docker. Docker is an isolation level above the operating system that lets you create your own bundle of the operating system plus a deep learning environment and package it within a single file. The file is called a Docker image.

    You create a container by starting a docker image on a machine.

    Run:ai is based on Kubernetes. At its core, Kubernetes is an orchestration software above Docker: Among other things, it allows location abstraction as to where the actual container is running. This calls for some adaptation to the Researcher's workflow as follows.

    Image Repository

    If your Kubernetes cluster contains a single GPU node (machine), then your image can reside on the node itself (in which case, when submitting workloads with runai submit, the Researcher must use the --local-image flag).

    If your Kubernetes cluster contains more than a single node, then, to enable location abstraction, the image can no longer reside on the node itself. It must be relocated to an image repository. There are quite a few repository-as-a-service offerings, most notably Docker Hub. Alternatively, the organization can install a private repository on-premises.

    Day-to-day work with the image located remotely is almost identical to local work. The image name now contains its location. For example, nvcr.io/nvidia/pytorch:19.12-py3 is a PyTorch image located in nvcr.io, the NVIDIA image repository on the web.
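
    For example, a hedged sketch of relocating a locally built image to a private registry (the registry hostname and repository path are placeholders):

    docker tag my-image:latest registry.example.com/my-team/my-image:latest
    docker push registry.example.com/my-team/my-image:latest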

    Data

    Deep learning is about data. It can be your code, the training data, saved checkpoints, etc.

    If your Kubernetes cluster contains a single GPU node (machine), then your data can reside on the node itself.

    If your Kubernetes cluster contains more than a single node, then, to enable location abstraction, the data must sit outside the machine, typically on network storage. The storage must be uniformly mapped to your container when it starts (using the -v flag).

    Working with Containers

    Starting a container using docker usually involves a single command-line with multiple flags. A typical example:

    docker run --runtime=nvidia --shm-size 16G -it --rm -e HOSTNAME='hostname' \
        -v /raid/public/my_datasets:/root/dataset:ro -i nvcr.io/nvidia/pytorch:19.12-py3

    The docker command docker run should be replaced with a Run:ai command runai submit. The flags are usually the same but some adaptation is required. A complete list of flags can be found here: runai submit.
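
    For example, a rough, hedged equivalent of the Docker command shown above (exact flag names and defaults may differ between CLI versions; check runai submit --help and the reference linked above):

    runai submit my-job -i nvcr.io/nvidia/pytorch:19.12-py3 -g 1 \
        -v /raid/public/my_datasets:/root/dataset --interactive --large-shm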

    There are similar commands to get a shell into the container (runai bash), get the container logs (runai logs), and more. For a complete list see the Run:ai CLI reference.

    Schedule an Onboarding Session

    It is highly recommended to schedule an onboarding session for Researchers with a Run:ai customer success professional. Run:ai can help with the above transition, but adding to that, we at Run:ai have also acquired a large body of knowledge on data science best practices which can help streamline Researchers' work as well as save money for the organization.

    \ No newline at end of file diff --git a/v2.20/admin/researcher-setup/new-cli-install/index.html b/v2.20/admin/researcher-setup/new-cli-install/index.html index 03f1d5df62..102cb27c0c 100644 --- a/v2.20/admin/researcher-setup/new-cli-install/index.html +++ b/v2.20/admin/researcher-setup/new-cli-install/index.html @@ -1,4 +1,4 @@ - Install the V2 CLI - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/researcher-setup/researcher-setup-intro/index.html b/v2.20/admin/researcher-setup/researcher-setup-intro/index.html index 3c9a7d0e38..79e9dddcae 100644 --- a/v2.20/admin/researcher-setup/researcher-setup-intro/index.html +++ b/v2.20/admin/researcher-setup/researcher-setup-intro/index.html @@ -1,4 +1,4 @@ - Researcher Setup Overview - Run:ai Documentation Library

    Introduction

    Following is a step-by-step guide for getting a new Researcher up to speed with Run:ai and Kubernetes.

    Change of Paradigms: from Docker to Kubernetes

    As part of Run:ai, the organization is typically moving from Docker-based workflows to Kubernetes. This document is an attempt to help the Researcher with this paradigm shift. It explains the basic concepts and provides links for further information about the Run:ai CLI.

    Setup the Run:ai Command-Line Interface

    The Run:ai CLI needs to be installed on the Researcher's machine. This document provides step-by-step instructions.

    Provide the Researcher with a GPU Quota

    To submit workloads with Run:ai, the Researcher must be provided with a Project that contains a GPU quota. Please see the Working with Projects document on how to create Projects and set a quota.

    Provide access to the Run:ai User Interface

    See Setting up users for further information on how to provide access to users.

    Schedule an Onboarding Session

    It is highly recommended to schedule an onboarding session for Researchers with a Run:ai customer success professional. Run:ai can help with the above transition, but adding to that, we at Run:ai have also acquired a large body of knowledge on data science best practices which can help streamline the Researchers' work as well as save money for the organization.

    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/cluster-setup/cluster-delete/index.html b/v2.20/admin/runai-setup/cluster-setup/cluster-delete/index.html index 6a130bf395..145aa8ac41 100644 --- a/v2.20/admin/runai-setup/cluster-setup/cluster-delete/index.html +++ b/v2.20/admin/runai-setup/cluster-setup/cluster-delete/index.html @@ -1,4 +1,4 @@ - Cluster Uninstall - Run:ai Documentation Library

    Cluster Uninstall

    This article explains how to uninstall Run:ai Cluster installation from the Kubernetes cluster.

    Uninstall Run:ai cluster

    Uninstalling the Run:ai cluster from the Kubernetes cluster does not delete existing projects, departments, or workloads submitted by users.

    To uninstall the Run:ai cluster, run the following helm command in your terminal:

    helm uninstall runai-cluster -n runai

    To delete the Run:ai cluster from the Run:ai Platform, see Removing a cluster.

    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/cluster-setup/cluster-install/index.html b/v2.20/admin/runai-setup/cluster-setup/cluster-install/index.html index ab24f04c41..4a82d11a8a 100644 --- a/v2.20/admin/runai-setup/cluster-setup/cluster-install/index.html +++ b/v2.20/admin/runai-setup/cluster-setup/cluster-install/index.html @@ -1,4 +1,4 @@ - Cluster Install - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/cluster-setup/cluster-prerequisites/index.html b/v2.20/admin/runai-setup/cluster-setup/cluster-prerequisites/index.html index 189b1898cc..3986edf8b5 100644 --- a/v2.20/admin/runai-setup/cluster-setup/cluster-prerequisites/index.html +++ b/v2.20/admin/runai-setup/cluster-setup/cluster-prerequisites/index.html @@ -1,4 +1,4 @@ - System Requirements - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/cluster-setup/cluster-setup-intro/index.html b/v2.20/admin/runai-setup/cluster-setup/cluster-setup-intro/index.html index 71aec46401..8ec5d07c4c 100644 --- a/v2.20/admin/runai-setup/cluster-setup/cluster-setup-intro/index.html +++ b/v2.20/admin/runai-setup/cluster-setup/cluster-setup-intro/index.html @@ -1,4 +1,4 @@ - SaaS Cluster Setup Introduction - Run:ai Documentation Library

    Introduction

    This section is a step-by-step guide for setting up a Run:ai cluster.

    • A Run:ai cluster is a Kubernetes application installed on top of a Kubernetes cluster.
    • A Run:ai cluster connects to the Run:ai control plane on the cloud. The control plane provides a control point as well as a monitoring and control user interface for Administrators and Researchers.
    • A customer may have multiple Run:ai Clusters, all connecting to a single control plane.

    For additional details see the Run:ai system components

    Documents

    Customization

    For a list of optional customizations see Customize Installation

    Additional Configuration

    For a list of advanced configuration scenarios, such as configuring researcher authentication, Single Sign-On, limiting the installation to specific nodes, and more, see the Configuration Articles section.

    Next Steps

    After setting up the cluster, you may want to start setting up Researchers. See: Researcher Setup.

    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/cluster-setup/cluster-upgrade/index.html b/v2.20/admin/runai-setup/cluster-setup/cluster-upgrade/index.html index 5c856c0d97..bb5fdc792a 100644 --- a/v2.20/admin/runai-setup/cluster-setup/cluster-upgrade/index.html +++ b/v2.20/admin/runai-setup/cluster-setup/cluster-upgrade/index.html @@ -1,4 +1,4 @@ - Cluster Upgrade - Run:ai Documentation Library

    Cluster Upgrade

    This article explains how to upgrade the Run:ai cluster version.

    Before upgrade

    There are a number of matters to consider prior to upgrading the Run:ai cluster version.

    System and network requirements

    Before upgrading the Run:ai cluster, validate that the latest system requirements and network requirements are met, as they can change from time to time.

    Important

    It is highly recommended to upgrade the Kubernetes version together with the Run:ai cluster version, to ensure compatibility with the latest supported version of your Kubernetes distribution.

    Helm

    The latest releases of the Run:ai cluster require Helm 3.14 or above.

    Upgrade

    Follow the instructions to upgrade using Helm. The Helm commands to upgrade the Run:ai cluster version may differ between versions. The steps below describe how to get the instructions from the Run:ai UI.

    Getting the installation instructions

    Follow the Setup and Installation instructions steps below to get the instructions for upgrading the Run:ai cluster.

    Setup

    1. In the Run:ai UI, go to Clusters
    2. Select the cluster you want to upgrade
    3. Click INSTALLATION INSTRUCTIONS
    4. Optional: Select the Run:ai cluster version (latest, by default)
    5. Click CONTINUE

    Installation instructions

    1. Follow the installation instructions and run the Helm commands provided on your Kubernetes cluster (see the additional instructions below when upgrading to v2.13, and the troubleshooting section below if installation fails)
    2. Click DONE
    3. Once installation is complete, validate that the cluster is Connected and listed with the new cluster version (see the cluster troubleshooting scenarios). Once you have done this, the cluster is upgraded to the latest version.

    Note

    To upgrade to a specific version, modify the --version flag by specifying the desired <version-number>. You can find all available versions by using the helm search repo command.
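
    For example, a hedged sketch (the Helm repository and chart names are assumptions; use the names provided in your installation instructions):

    helm search repo runai
    helm upgrade -i runai-cluster runai/runai-cluster -n runai --version <version-number>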

    Upgrade to Run:ai cluster version 2.13 (old release)

    Run:ai cluster version 2.13 (old release) does not support migration of the configured Helm values. If you have customized configurations you want to migrate, follow the additional steps below:

    1. Download the Run:ai Helm values file by running the command provided in your terminal
    2. Run the following command to save existing cluster Helm values into old-values.yaml
    helm get values runai-cluster -n runai > old-values.yaml
     
    3. Identify configured custom values that you want to migrate
    4. Manually merge the values from old-values.yaml into the new values file

    Troubleshooting

    If you encounter an issue with the cluster upgrade, use the troubleshooting scenario below.

    Installation fails

    If the Run:ai cluster upgrade fails, check the installation logs to identify the issue.

    Run the following script to print the installation logs:

    curl -fsSL https://raw.githubusercontent.com/run-ai/public/main/installation/get-installation-logs.sh
    Cluster status

    If the Run:ai cluster upgrade completes, but the cluster status does not show as Connected, refer to the cluster troubleshooting scenarios.

    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/cluster-setup/customize-cluster-install/index.html b/v2.20/admin/runai-setup/cluster-setup/customize-cluster-install/index.html index cb9ed00995..f53a589ad0 100644 --- a/v2.20/admin/runai-setup/cluster-setup/customize-cluster-install/index.html +++ b/v2.20/admin/runai-setup/cluster-setup/customize-cluster-install/index.html @@ -1,4 +1,4 @@ - Customize Installation - Run:ai Documentation Library

    Customize Installation

    This article explains the available configurations for customizing the Run:ai cluster installation.

    Helm chart values

    The Run:ai cluster installation can be customized to support your environment via Helm values files or Helm install flags.

    These configurations are saved in the runaiconfig Kubernetes object and can be edited post-installation as needed. For more information, see Advanced Cluster Configurations.

    Values

    The following table lists the available Helm chart values that can be configured to customize the Run:ai cluster installation.

    Key Description Default
    global.image.registry (string) Global Docker image registry Default: ""
    global.additionalImagePullSecrets (list) List of image pull secrets references Default: []
    spec.researcherService.ingress.tlsSecret (string) Existing secret key where cluster TLS Certificates are stored (non-OpenShift) Default: runai-cluster-domain-tls-secret
    spec.researcherService.route.tlsSecret (string) Existing secret key where cluster TLS Certificates are stored (OpenShift only) Default: ""
    spec.prometheus.spec.image (string) Due to a known issue in the Prometheus Helm chart, the imageRegistry setting is ignored. To pull the image from a different registry, you can manually specify the Prometheus image reference. Default: quay.io/prometheus/prometheus
    spec.prometheus.spec.imagePullSecrets (string) List of image pull secrets references in the runai namespace to use for pulling Prometheus images (relevant for air-gapped installations). Default: []
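
    For example, a hedged sketch of setting one of these values at install or upgrade time (the Helm repository and chart names are assumptions; use the ones from your installation instructions):

    helm upgrade -i runai-cluster runai/runai-cluster -n runai \
        --set global.image.registry=registry.example.com \
        -f custom-values.yaml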

    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/cluster-setup/dgx-bundle/index.html b/v2.20/admin/runai-setup/cluster-setup/dgx-bundle/index.html index a014ba8abd..370110e225 100644 --- a/v2.20/admin/runai-setup/cluster-setup/dgx-bundle/index.html +++ b/v2.20/admin/runai-setup/cluster-setup/dgx-bundle/index.html @@ -1,4 +1,4 @@ - Install using Base Command Manager - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/cluster-setup/network-req/index.html b/v2.20/admin/runai-setup/cluster-setup/network-req/index.html index 00b2a110a2..9fa80b35f7 100644 --- a/v2.20/admin/runai-setup/cluster-setup/network-req/index.html +++ b/v2.20/admin/runai-setup/cluster-setup/network-req/index.html @@ -1,4 +1,4 @@ - Network Requirements - Run:ai Documentation Library

    Network Requirements

    The following network requirements are for the Run:ai cluster installation and usage.

    External access

    Set out below are the domains to whitelist and ports to open for installation, upgrade, and usage of the application and its management.

    Ensure the inbound and outbound rules are correctly applied to your firewall.

    Inbound rules

    To allow your organization’s Run:ai users to interact with the cluster using the Run:ai Command-line interface, or access specific UI features, certain inbound ports need to be open.

    Name Description Source Destination Port
    Run:ai cluster Run:ai cluster HTTPS entrypoint 0.0.0.0 all k8s nodes 443

    Outbound rules

    For the Run:ai cluster installation and usage, certain outbound ports must be open.

    Name Description Source Destination Port
    Run:ai Platform Run:ai cloud instance Run:ai system nodes app.run.ai 443
    Grafana Run:ai cloud metrics store Run:ai system nodes prometheus-us-central1.grafana.net and runailabs.com 443
    Google Container Registry Run:ai image repository All K8S nodes gcr.io/run-ai-prod 443
    JFrog Artifactory Run:ai Helm repository Helm client machine runai.jfrog.io 443

    The Run:ai installation has software requirements that call for additional components to be installed on the cluster. This article includes optional, simple installation examples, which require the following cluster outbound ports to be open:

    Name Description Source Destination Port
    Kubernetes Registry Ingress Nginx image repository All K8S nodes registry.k8s.io 443
    Google Container Registry GPU Operator, and Knative image repository All K8S nodes gcr.io 443
    Red Hat Container Registry Prometheus Operator image repository All K8S nodes quay.io 443
    Docker Hub Registry Training Operator image repository All K8S nodes docker.io 443

    Note

    If you are using an HTTP proxy, contact Run:ai support for further instructions.

    Internal network

    Ensure that all Kubernetes nodes can communicate with each other across all necessary ports. Kubernetes assumes full interconnectivity between nodes, so you must configure your network to allow this seamless communication. Specific port requirements may vary depending on your network setup.

    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/cluster-setup/project-management/index.html b/v2.20/admin/runai-setup/cluster-setup/project-management/index.html index 374421260a..7420069de4 100644 --- a/v2.20/admin/runai-setup/cluster-setup/project-management/index.html +++ b/v2.20/admin/runai-setup/cluster-setup/project-management/index.html @@ -1,4 +1,4 @@ - Manually Create Projects - Run:ai Documentation Library

    Manually Create Projects

    Manual Creation of Namespaces for Projects

    Introduction

    The Administrator creates Run:ai Projects via the Run:ai user interface. When enabling Researcher Authentication you also assign users to Projects.

    Run:ai Projects are implemented as Kubernetes namespaces. When creating a new Run:ai Project, Run:ai does the following automatically:

    1. Creates a namespace by the name of runai-<PROJECT-NAME>.
    2. Labels the namespace as managed by Run:ai.
    3. Provides access to the namespace for Run:ai services.
    4. Associates users with the namespace.

This process may need to be altered if:

• Researchers already have existing Kubernetes namespaces.
• The organization's Kubernetes namespace naming convention does not allow the runai- prefix.
• The organization's policy does not allow the automatic creation of namespaces.

    Process

    Run:ai allows the association of a Run:ai Project with any existing Kubernetes namespace:

• Disable namespace creation by setting the cluster flag createNamespaces to false. For more information see Advanced Cluster Configuration.
• Using the Run:ai User Interface, create a new Project <PROJECT-NAME>. A namespace will not be created.
• Associate an existing namespace <NAMESPACE> with the Run:ai project by running:
kubectl label ns <NAMESPACE> runai/queue=<PROJECT_NAME>
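
To confirm the association, you can inspect the namespace labels, for example:

# The runai/queue=<PROJECT_NAME> label should appear in the LABELS column
kubectl get ns <NAMESPACE> --show-labels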

    Caution

    Setting the createNamespaces flag to false moves the responsibility of creating namespaces to match Run:ai Projects to the administrator.

    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/installation-types/index.html b/v2.20/admin/runai-setup/installation-types/index.html index f39ccf748c..4d92c387cc 100644 --- a/v2.20/admin/runai-setup/installation-types/index.html +++ b/v2.20/admin/runai-setup/installation-types/index.html @@ -1,4 +1,4 @@ - Installation Types - Run:ai Documentation Library

    Installation Types

    Run:ai consists of two components:

    • The Run:ai Cluster. One or more data-science GPU clusters hosted by the customer (on-prem or cloud).
    • The Run:ai Control plane. A single entity that monitors clusters, sets priorities, and business policies.

    There are two main installation options:

Installation Type | Description
Classic (SaaS) | Run:ai is installed on the customer's data science GPU clusters. The cluster connects to the Run:ai control plane on the cloud (https://<tenant-name>.run.ai). With this installation, the cluster requires an outbound connection to the Run:ai cloud.
Self-hosted | The Run:ai control plane is also installed in the customer's data center.

    The self-hosted option is for organizations that cannot use a SaaS solution due to data leakage concerns. The self-hosted installation is priced differently. For further information please talk to Run:ai sales.

(Diagram: installation-types)

    Self-hosted Installation

    Run:ai self-hosting comes with two variants:

Self-hosting Type | Description
Connected | The organization can freely download from the internet (though upload is not allowed)
Air-gapped | The organization has no connection to the internet

    Self-hosting with Kubernetes vs OpenShift

Kubernetes has many Certified Kubernetes Providers. Run:ai has been certified with several of them (see the Kubernetes distribution section). The OpenShift installation is different from the rest. As such, the Run:ai self-hosted installation instructions are divided into two separate sections: one for Kubernetes and one for OpenShift.

    Secure Installation

    In many organizations, Kubernetes is governed by IT compliance rules. In this scenario, there are strict access control rules during the installation and running of workloads:

    • OpenShift is secured using Security Context Constraints (SCC). The Run:ai installation supports SCC.
    • Run:ai provides limited support for Kubernetes Pod Security Admission (PSA). For more information see Kubernetes prerequisites.

    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/self-hosted/k8s/additional-clusters/index.html b/v2.20/admin/runai-setup/self-hosted/k8s/additional-clusters/index.html index 72de816de2..97a4dcf013 100644 --- a/v2.20/admin/runai-setup/self-hosted/k8s/additional-clusters/index.html +++ b/v2.20/admin/runai-setup/self-hosted/k8s/additional-clusters/index.html @@ -1,4 +1,4 @@ - Install additional Clusters - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/self-hosted/k8s/backend/index.html b/v2.20/admin/runai-setup/self-hosted/k8s/backend/index.html index 2548f33d70..fd3b8a8e6a 100644 --- a/v2.20/admin/runai-setup/self-hosted/k8s/backend/index.html +++ b/v2.20/admin/runai-setup/self-hosted/k8s/backend/index.html @@ -1,4 +1,4 @@ - Install Control Plane - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/self-hosted/k8s/cluster/index.html b/v2.20/admin/runai-setup/self-hosted/k8s/cluster/index.html index a2ea0f2cca..5dd3d0cb24 100644 --- a/v2.20/admin/runai-setup/self-hosted/k8s/cluster/index.html +++ b/v2.20/admin/runai-setup/self-hosted/k8s/cluster/index.html @@ -1,4 +1,4 @@ - Self Hosted installation over Kubernetes - Cluster Setup - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/self-hosted/k8s/next-steps/index.html b/v2.20/admin/runai-setup/self-hosted/k8s/next-steps/index.html index dd1fe30efc..b57f8a4163 100644 --- a/v2.20/admin/runai-setup/self-hosted/k8s/next-steps/index.html +++ b/v2.20/admin/runai-setup/self-hosted/k8s/next-steps/index.html @@ -1,4 +1,4 @@ - Self Hosted installation over Kubernetes - Next Steps - Run:ai Documentation Library

    Next Steps

    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/self-hosted/k8s/preparations/index.html b/v2.20/admin/runai-setup/self-hosted/k8s/preparations/index.html index 8302c74ea5..3d5257156a 100644 --- a/v2.20/admin/runai-setup/self-hosted/k8s/preparations/index.html +++ b/v2.20/admin/runai-setup/self-hosted/k8s/preparations/index.html @@ -1,4 +1,4 @@ - Self Hosted installation over Kubernetes - preparations - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/self-hosted/k8s/prerequisites/index.html b/v2.20/admin/runai-setup/self-hosted/k8s/prerequisites/index.html index 98dec3a19d..d4ee26024e 100644 --- a/v2.20/admin/runai-setup/self-hosted/k8s/prerequisites/index.html +++ b/v2.20/admin/runai-setup/self-hosted/k8s/prerequisites/index.html @@ -1,4 +1,4 @@ - Prerequisites - Run:ai Documentation Library

    Self-Hosted installation over Kubernetes - Prerequisites

    Before proceeding with this document, please review the installation types documentation to understand the difference between air-gapped and connected installations.

    Run:ai Components

    As part of the installation process you will install:

    • A control-plane managing cluster
    • One or more clusters

Both the control plane and clusters require Kubernetes. Typically, the control plane and the first cluster are installed on the same Kubernetes cluster, but this is not mandatory.

    Installer machine

    The machine running the installation script (typically the Kubernetes master) must have:

    • At least 50GB of free space.
    • Docker installed.
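
A quick way to check both requirements on the installer machine, using standard tooling:

# At least 50GB should be free on the filesystem used for the installation
df -h /
# Confirms Docker is installed and on the PATH
docker --version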

    Helm

Run:ai requires Helm 3.14 or later. To install Helm, see Installing Helm. If you are installing an air-gapped version of Run:ai, the Run:ai tar file contains the Helm binary.
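
You can confirm that the installed Helm version meets this requirement, for example:

# Prints the Helm client version (must be 3.14 or later)
helm version --short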

    Cluster hardware requirements

    The Run:ai control plane services require the following resources:

Component | Required Capacity
CPU | 10 cores
Memory | 12GB
Disk space | 110GB

If the Run:ai cluster is planned to be installed on the same Kubernetes cluster as the Run:ai control plane, ensure that the control plane requirements above are in addition to the Run:ai cluster hardware requirements.
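
To see whether the designated nodes have this capacity available, you can list each node's allocatable CPU and memory; a minimal sketch, assuming kubectl access (disk space must be checked on the nodes themselves):

# List allocatable CPU and memory per node
kubectl get nodes -o custom-columns=NAME:.metadata.name,CPU:.status.allocatable.cpu,MEMORY:.status.allocatable.memory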

    ARM Limitation

The control plane does not support CPU nodes with ARM64 architecture. To schedule the Run:ai control plane services on supported nodes, use the global.affinity configuration parameter as detailed in Additional Run:ai configurations.
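
For illustration only, the sketch below writes a Helm values file that uses global.affinity to keep the control plane services on amd64 nodes. It assumes that global.affinity accepts the standard Kubernetes node-affinity structure; see Additional Run:ai configurations for the authoritative format and for how to pass such values to the control plane installation:

# Hypothetical values file; assumes global.affinity takes standard Kubernetes nodeAffinity syntax
cat > runai-affinity-values.yaml <<'EOF'
global:
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: kubernetes.io/arch
                operator: In
                values:
                  - amd64
EOF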

    Run:ai software requirements

    Cluster Nodes

    See Run:ai Cluster prerequisites operating system requirements.

Nodes must be time-synchronized using NTP (Network Time Protocol) for proper system functionality.
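
On systemd-based nodes, one way to check time synchronization is, for example:

# Shows whether the system clock is synchronized and whether an NTP service is active
timedatectl status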

    Kubernetes

    See Run:ai Cluster prerequisites Kubernetes distribution requirements.

    The Run:ai control plane operating system prerequisites are identical.

    The Run:ai control-plane requires a default storage class to create persistent volume claims for Run:ai storage. The storage class, as per Kubernetes standards, controls the reclaim behavior: whether the Run:ai persistent data is saved or deleted when the Run:ai control plane is deleted.

    Note

For a simple (non-production) storage class example, see Kubernetes Local Storage Class. This storage class sets the directory /opt/local-path-provisioner on each node as the path for provisioning persistent volumes.

    Then set the new storage class as default:

    kubectl patch storageclass local-path -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
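
You can then confirm which storage class is the default, for example:

# The default storage class is marked with "(default)" next to its name
kubectl get storageclass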

    Install prerequisites

    Ingress Controller

    The Run:ai control plane installation assumes an existing installation of NGINX as the ingress controller. You can follow the Run:ai Cluster prerequisites Kubernetes ingress controller installation.
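
For reference, a minimal sketch of the generic upstream NGINX ingress controller installation with Helm; the Run:ai cluster prerequisites may specify additional parameters for your environment:

# Add the upstream ingress-nginx chart repository and install it into its own namespace
helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
helm repo update
helm install ingress-nginx ingress-nginx/ingress-nginx --namespace ingress-nginx --create-namespace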

    NVIDIA GPU Operator

    See Run:ai Cluster prerequisites NVIDIA GPU operator requirements.

    The Run:ai control plane, when installed without a Run:ai cluster, does not require the NVIDIA prerequisites.

    Prometheus

    See Run:ai Cluster prerequisites Prometheus requirements.

    The Run:ai control plane, when installed without a Run:ai cluster, does not require the Prometheus prerequisites.

    Inference (optional)

    See Run:ai Cluster prerequisites Inference requirements.

    The Run:ai control plane, when installed without a Run:ai cluster, does not require the Inference prerequisites.

    External Postgres database (optional)

    The Run:ai control plane installation includes a default PostgreSQL database. However, you may opt to use an existing PostgreSQL database if you have specific requirements or preferences. Please ensure that your PostgreSQL database is version 16 or higher.
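
If you plan to use an external database, you can check its version before installing; a minimal sketch, assuming the psql client is available and using placeholder connection details (<HOST>, <USER>):

# Prints the PostgreSQL server version string (must be 16 or higher)
psql -h <HOST> -U <USER> -d postgres -c 'SELECT version();'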

    Next steps

Continue to Preparing for a Run:ai Kubernetes Installation.

    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/self-hosted/k8s/project-management/index.html b/v2.20/admin/runai-setup/self-hosted/k8s/project-management/index.html index 3d21558d57..a58219a69f 100644 --- a/v2.20/admin/runai-setup/self-hosted/k8s/project-management/index.html +++ b/v2.20/admin/runai-setup/self-hosted/k8s/project-management/index.html @@ -1,4 +1,4 @@ - Self Hosted installation over Kubernetes - Create Projects - Run:ai Documentation Library

    Manually Create Projects

    Introduction

    The Administrator creates Run:ai Projects via the Run:ai user interface. When enabling Researcher Authentication you also assign users to Projects.

    Run:ai Projects are implemented as Kubernetes namespaces. When creating a new Run:ai Project, Run:ai does the following automatically:

    1. Creates a namespace by the name of runai-<PROJECT-NAME>.
    2. Labels the namespace as managed by Run:ai.
    3. Provides access to the namespace for Run:ai services.
    4. Associates users with the namespace.

This process may need to be altered if:

• Researchers already have existing Kubernetes namespaces.
• The organization's Kubernetes namespace naming convention does not allow the runai- prefix.
• The organization's policy does not allow the automatic creation of namespaces.

    Process

    Run:ai allows the association of a Run:ai Project with any existing Kubernetes namespace:

• When setting up a Run:ai cluster, disable namespace creation by setting the cluster flag createNamespaces to false.
• Using the Run:ai User Interface, create a new Project <PROJECT-NAME>. A namespace will not be created.
• Associate an existing namespace <NAMESPACE> with the Run:ai project by running:
kubectl label ns <NAMESPACE> runai/queue=<PROJECT_NAME>

    Caution

    Setting the createNamespaces flag to false moves the responsibility of creating namespaces to match Run:ai Projects to the administrator.

    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/self-hosted/k8s/uninstall/index.html b/v2.20/admin/runai-setup/self-hosted/k8s/uninstall/index.html index 66bc064ac0..33392879ec 100644 --- a/v2.20/admin/runai-setup/self-hosted/k8s/uninstall/index.html +++ b/v2.20/admin/runai-setup/self-hosted/k8s/uninstall/index.html @@ -1,4 +1,4 @@ - Uninstall self-hosted Kubernetes installation - Run:ai Documentation Library

    Uninstall Run:ai

    Uninstall a Run:ai Cluster

To uninstall the cluster, see cluster delete.

    Uninstall the Run:ai Control Plane

    To delete the control plane, run:

    helm uninstall runai-backend -n runai-backend
    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/self-hosted/k8s/upgrade/index.html b/v2.20/admin/runai-setup/self-hosted/k8s/upgrade/index.html index 1b27d689a3..c35f51d4cd 100644 --- a/v2.20/admin/runai-setup/self-hosted/k8s/upgrade/index.html +++ b/v2.20/admin/runai-setup/self-hosted/k8s/upgrade/index.html @@ -1,4 +1,4 @@ - Upgrade self-hosted Kubernetes installation - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/self-hosted/ocp/additional-clusters/index.html b/v2.20/admin/runai-setup/self-hosted/ocp/additional-clusters/index.html index dfb4d371c8..d38ea7b18c 100644 --- a/v2.20/admin/runai-setup/self-hosted/ocp/additional-clusters/index.html +++ b/v2.20/admin/runai-setup/self-hosted/ocp/additional-clusters/index.html @@ -1,4 +1,4 @@ - Install additional Clusters - Run:ai Documentation Library

    Installing additional clusters

    The first Run:ai cluster is typically installed on the same OpenShift cluster as the Run:ai control plane. Run:ai supports multiple clusters per single control plane. This document is about installing additional clusters on different OpenShift clusters.

    Additional cluster installation

    Create a new cluster, then:

• Select OpenShift as the target platform.
• Select Remote to Control Plane as the cluster location.
• You must enter a specific cluster URL in the format https://runai.apps.<BASE_DOMAIN>. To get the base domain, run oc get dns cluster -oyaml | grep baseDomain (see the example after this list).
• Ignore the instructions for creating a secret.
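
A minimal sketch for composing the cluster URL from the cluster's base domain, using standard oc tooling:

# Read the base domain from the cluster DNS configuration
BASE_DOMAIN=$(oc get dns cluster -o jsonpath='{.spec.baseDomain}')
# Print the cluster URL in the format expected above
echo "https://runai.apps.${BASE_DOMAIN}"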

    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/self-hosted/ocp/backend/index.html b/v2.20/admin/runai-setup/self-hosted/ocp/backend/index.html index cf0187698d..db29d5936c 100644 --- a/v2.20/admin/runai-setup/self-hosted/ocp/backend/index.html +++ b/v2.20/admin/runai-setup/self-hosted/ocp/backend/index.html @@ -1,4 +1,4 @@ - Install Control Plane - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/self-hosted/ocp/cluster/index.html b/v2.20/admin/runai-setup/self-hosted/ocp/cluster/index.html index 8df65842b6..7747d89d31 100644 --- a/v2.20/admin/runai-setup/self-hosted/ocp/cluster/index.html +++ b/v2.20/admin/runai-setup/self-hosted/ocp/cluster/index.html @@ -1,4 +1,4 @@ - Self-Hosted installation over OpenShift - Cluster Setup - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/self-hosted/ocp/next-steps/index.html b/v2.20/admin/runai-setup/self-hosted/ocp/next-steps/index.html index 5b46322f4e..23247d2258 100644 --- a/v2.20/admin/runai-setup/self-hosted/ocp/next-steps/index.html +++ b/v2.20/admin/runai-setup/self-hosted/ocp/next-steps/index.html @@ -1,4 +1,4 @@ - Self Hosted installation over OpenShift - Next Steps - Run:ai Documentation Library

    Next Steps

    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/self-hosted/ocp/preparations/index.html b/v2.20/admin/runai-setup/self-hosted/ocp/preparations/index.html index cfdea0ff2d..9c2748777f 100644 --- a/v2.20/admin/runai-setup/self-hosted/ocp/preparations/index.html +++ b/v2.20/admin/runai-setup/self-hosted/ocp/preparations/index.html @@ -1,4 +1,4 @@ - Self Hosted installation over OpenShift - Preparations - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/self-hosted/ocp/prerequisites/index.html b/v2.20/admin/runai-setup/self-hosted/ocp/prerequisites/index.html index b950766d81..f78b1343aa 100644 --- a/v2.20/admin/runai-setup/self-hosted/ocp/prerequisites/index.html +++ b/v2.20/admin/runai-setup/self-hosted/ocp/prerequisites/index.html @@ -1,4 +1,4 @@ - Prerequisites - Run:ai Documentation Library

Self-Hosted installation over OpenShift - Prerequisites

    Before proceeding with this document, please review the installation types documentation to understand the difference between air-gapped and connected installations.

    Run:ai components

    As part of the installation process you will install:

    • A control-plane managing cluster
    • One or more clusters

Both the control plane and clusters require Kubernetes. Typically, the control plane and the first cluster are installed on the same Kubernetes cluster, but this is not mandatory.

    Important

    In OpenShift environments, adding a cluster connecting to a remote control plane currently requires the assistance of customer support.

    Installer machine

    The machine running the installation script (typically the Kubernetes master) must have:

    • At least 50GB of free space.
    • Docker installed.

    Helm

Run:ai requires Helm 3.14 or later. To install Helm, see Installing Helm. If you are installing an air-gapped version of Run:ai, the Run:ai tar file contains the Helm binary.

    Cluster hardware requirements

    The Run:ai control plane services require the following resources:

Component | Required Capacity
CPU | 10 cores
Memory | 12GB
Disk space | 110GB

If the Run:ai cluster is planned to be installed on the same Kubernetes cluster as the Run:ai control plane, ensure that the control plane requirements above are in addition to the Run:ai cluster hardware requirements.

    Run:ai software requirements

    Cluster Nodes

Nodes must be time-synchronized using NTP (Network Time Protocol) for proper system functionality.

    OpenShift

Run:ai supports OpenShift. Supported OpenShift versions are detailed in the Kubernetes distribution section.

• OpenShift must be configured with a trusted certificate. The Run:ai installation relies on OpenShift to create certificates for subdomains.
• OpenShift must have a configured identity provider (IdP).
• If your network is air-gapped, you will need to provide the Run:ai control plane and cluster with information about the local certificate authority.

    Install prerequisites

    NVIDIA GPU Operator

    See Run:ai Cluster prerequisites installing NVIDIA dependencies in OpenShift.

    The Run:ai control plane, when installed without a Run:ai cluster, does not require the NVIDIA prerequisites.

Information on how to download the GPU Operator for air-gapped installation can be found in the NVIDIA GPU Operator prerequisites.

    Inference (optional)

    See Run:ai Cluster prerequisites Inference requirements.

    The Run:ai control plane, when installed without a Run:ai cluster, does not require the Inference prerequisites.

    External PostgreSQL database (optional)

    The Run:ai control plane installation includes a default PostgreSQL database. However, you may opt to use an existing PostgreSQL database if you have specific requirements or preferences. Please ensure that your PostgreSQL database is version 16 or higher.

    Next steps

Continue to Preparing for a Run:ai OpenShift Installation.

    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/self-hosted/ocp/project-management/index.html b/v2.20/admin/runai-setup/self-hosted/ocp/project-management/index.html index a3c853a5ae..de7935372a 100644 --- a/v2.20/admin/runai-setup/self-hosted/ocp/project-management/index.html +++ b/v2.20/admin/runai-setup/self-hosted/ocp/project-management/index.html @@ -1,4 +1,4 @@ - Self Hosted installation over OpenShift - Create Projects - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/self-hosted/ocp/uninstall/index.html b/v2.20/admin/runai-setup/self-hosted/ocp/uninstall/index.html index 4d37319c00..3fe61e5c2e 100644 --- a/v2.20/admin/runai-setup/self-hosted/ocp/uninstall/index.html +++ b/v2.20/admin/runai-setup/self-hosted/ocp/uninstall/index.html @@ -1,4 +1,4 @@ - Uninstall self-hosted OpenShift installation - Run:ai Documentation Library

    Uninstall Run:ai

See the uninstall section here.

    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/self-hosted/ocp/upgrade/index.html b/v2.20/admin/runai-setup/self-hosted/ocp/upgrade/index.html index dbeb3d8553..04bfc7b405 100644 --- a/v2.20/admin/runai-setup/self-hosted/ocp/upgrade/index.html +++ b/v2.20/admin/runai-setup/self-hosted/ocp/upgrade/index.html @@ -1,4 +1,4 @@ - Upgrade self-hosted OpenShift installation - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/runai-setup/self-hosted/overview/index.html b/v2.20/admin/runai-setup/self-hosted/overview/index.html index 3da5210523..9cd93c350b 100644 --- a/v2.20/admin/runai-setup/self-hosted/overview/index.html +++ b/v2.20/admin/runai-setup/self-hosted/overview/index.html @@ -1,4 +1,4 @@ - Self Hosted Run:ai Installation Overview - Run:ai Documentation Library

    Self Hosted Run:ai Installation

    The self-hosted option is for organizations that cannot use a SaaS solution due to data leakage concerns.

    Run:ai self-hosting comes with two variants:

Self-hosting Type | Description
Connected | The organization can freely download from the internet (though upload is not allowed)
Air-gapped | The organization has no connection to the internet

    The self-hosted installation is priced differently. For further information please talk to Run:ai sales.

    Self-hosting with Kubernetes vs OpenShift

Run:ai has been certified with a specified set of Kubernetes distributions. The OpenShift installation is different from the rest. As such, the Run:ai self-hosted installation instructions are divided into two separate sections: one for Kubernetes and one for OpenShift.

    \ No newline at end of file diff --git a/v2.20/admin/troubleshooting/diagnostics/index.html b/v2.20/admin/troubleshooting/diagnostics/index.html index a7af388bfd..f9f21a01bd 100644 --- a/v2.20/admin/troubleshooting/diagnostics/index.html +++ b/v2.20/admin/troubleshooting/diagnostics/index.html @@ -1,4 +1,4 @@ - Diagnostics - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/troubleshooting/logs-collection/index.html b/v2.20/admin/troubleshooting/logs-collection/index.html index 4d29a21404..29683fb5c0 100644 --- a/v2.20/admin/troubleshooting/logs-collection/index.html +++ b/v2.20/admin/troubleshooting/logs-collection/index.html @@ -1,4 +1,4 @@ - Logs Collection - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/admin/troubleshooting/troubleshooting/index.html b/v2.20/admin/troubleshooting/troubleshooting/index.html index c80dbe4a5d..24a6b8a9d5 100644 --- a/v2.20/admin/troubleshooting/troubleshooting/index.html +++ b/v2.20/admin/troubleshooting/troubleshooting/index.html @@ -1,4 +1,4 @@ - Troubleshooting - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/assets/javascripts/bundle.5090c770.min.js b/v2.20/assets/javascripts/bundle.f13b1293.min.js similarity index 85% rename from v2.20/assets/javascripts/bundle.5090c770.min.js rename to v2.20/assets/javascripts/bundle.f13b1293.min.js index 9c639ed97c..7fbc8fc8e2 100644 --- a/v2.20/assets/javascripts/bundle.5090c770.min.js +++ b/v2.20/assets/javascripts/bundle.f13b1293.min.js @@ -11,6 +11,6 @@ * Licensed MIT © Zeno Rocha */(function(t,r){typeof It=="object"&&typeof Yr=="object"?Yr.exports=r():typeof define=="function"&&define.amd?define([],r):typeof It=="object"?It.ClipboardJS=r():t.ClipboardJS=r()})(It,function(){return function(){var e={686:function(o,n,i){"use strict";i.d(n,{default:function(){return Ui}});var a=i(279),s=i.n(a),p=i(370),c=i.n(p),l=i(817),f=i.n(l);function u(V){try{return document.execCommand(V)}catch(A){return!1}}var d=function(A){var M=f()(A);return u("cut"),M},y=d;function L(V){var A=document.documentElement.getAttribute("dir")==="rtl",M=document.createElement("textarea");M.style.fontSize="12pt",M.style.border="0",M.style.padding="0",M.style.margin="0",M.style.position="absolute",M.style[A?"right":"left"]="-9999px";var F=window.pageYOffset||document.documentElement.scrollTop;return M.style.top="".concat(F,"px"),M.setAttribute("readonly",""),M.value=V,M}var X=function(A,M){var F=L(A);M.container.appendChild(F);var D=f()(F);return u("copy"),F.remove(),D},ee=function(A){var M=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body},F="";return typeof A=="string"?F=X(A,M):A instanceof HTMLInputElement&&!["text","search","url","tel","password"].includes(A==null?void 0:A.type)?F=X(A.value,M):(F=f()(A),u("copy")),F},J=ee;function k(V){"@babel/helpers - typeof";return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?k=function(M){return typeof M}:k=function(M){return M&&typeof Symbol=="function"&&M.constructor===Symbol&&M!==Symbol.prototype?"symbol":typeof M},k(V)}var ft=function(){var A=arguments.length>0&&arguments[0]!==void 0?arguments[0]:{},M=A.action,F=M===void 0?"copy":M,D=A.container,Y=A.target,$e=A.text;if(F!=="copy"&&F!=="cut")throw new Error('Invalid "action" value, use either "copy" or "cut"');if(Y!==void 0)if(Y&&k(Y)==="object"&&Y.nodeType===1){if(F==="copy"&&Y.hasAttribute("disabled"))throw new Error('Invalid "target" attribute. Please use "readonly" instead of "disabled" attribute');if(F==="cut"&&(Y.hasAttribute("readonly")||Y.hasAttribute("disabled")))throw new Error(`Invalid "target" attribute. 
You can't cut text from elements with "readonly" or "disabled" attributes`)}else throw new Error('Invalid "target" value, use a valid Element');if($e)return J($e,{container:D});if(Y)return F==="cut"?y(Y):J(Y,{container:D})},qe=ft;function Fe(V){"@babel/helpers - typeof";return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?Fe=function(M){return typeof M}:Fe=function(M){return M&&typeof Symbol=="function"&&M.constructor===Symbol&&M!==Symbol.prototype?"symbol":typeof M},Fe(V)}function ki(V,A){if(!(V instanceof A))throw new TypeError("Cannot call a class as a function")}function no(V,A){for(var M=0;M0&&arguments[0]!==void 0?arguments[0]:{};this.action=typeof D.action=="function"?D.action:this.defaultAction,this.target=typeof D.target=="function"?D.target:this.defaultTarget,this.text=typeof D.text=="function"?D.text:this.defaultText,this.container=Fe(D.container)==="object"?D.container:document.body}},{key:"listenClick",value:function(D){var Y=this;this.listener=c()(D,"click",function($e){return Y.onClick($e)})}},{key:"onClick",value:function(D){var Y=D.delegateTarget||D.currentTarget,$e=this.action(Y)||"copy",Dt=qe({action:$e,container:this.container,target:this.target(Y),text:this.text(Y)});this.emit(Dt?"success":"error",{action:$e,text:Dt,trigger:Y,clearSelection:function(){Y&&Y.focus(),window.getSelection().removeAllRanges()}})}},{key:"defaultAction",value:function(D){return vr("action",D)}},{key:"defaultTarget",value:function(D){var Y=vr("target",D);if(Y)return document.querySelector(Y)}},{key:"defaultText",value:function(D){return vr("text",D)}},{key:"destroy",value:function(){this.listener.destroy()}}],[{key:"copy",value:function(D){var Y=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body};return J(D,Y)}},{key:"cut",value:function(D){return y(D)}},{key:"isSupported",value:function(){var D=arguments.length>0&&arguments[0]!==void 0?arguments[0]:["copy","cut"],Y=typeof D=="string"?[D]:D,$e=!!document.queryCommandSupported;return Y.forEach(function(Dt){$e=$e&&!!document.queryCommandSupported(Dt)}),$e}}]),M}(s()),Ui=Fi},828:function(o){var n=9;if(typeof Element!="undefined"&&!Element.prototype.matches){var i=Element.prototype;i.matches=i.matchesSelector||i.mozMatchesSelector||i.msMatchesSelector||i.oMatchesSelector||i.webkitMatchesSelector}function a(s,p){for(;s&&s.nodeType!==n;){if(typeof s.matches=="function"&&s.matches(p))return s;s=s.parentNode}}o.exports=a},438:function(o,n,i){var a=i(828);function s(l,f,u,d,y){var L=c.apply(this,arguments);return l.addEventListener(u,L,y),{destroy:function(){l.removeEventListener(u,L,y)}}}function p(l,f,u,d,y){return typeof l.addEventListener=="function"?s.apply(null,arguments):typeof u=="function"?s.bind(null,document).apply(null,arguments):(typeof l=="string"&&(l=document.querySelectorAll(l)),Array.prototype.map.call(l,function(L){return s(L,f,u,d,y)}))}function c(l,f,u,d){return function(y){y.delegateTarget=a(y.target,f),y.delegateTarget&&d.call(l,y)}}o.exports=p},879:function(o,n){n.node=function(i){return i!==void 0&&i instanceof HTMLElement&&i.nodeType===1},n.nodeList=function(i){var a=Object.prototype.toString.call(i);return i!==void 0&&(a==="[object NodeList]"||a==="[object HTMLCollection]")&&"length"in i&&(i.length===0||n.node(i[0]))},n.string=function(i){return typeof i=="string"||i instanceof String},n.fn=function(i){var a=Object.prototype.toString.call(i);return a==="[object Function]"}},370:function(o,n,i){var a=i(879),s=i(438);function p(u,d,y){if(!u&&!d&&!y)throw new Error("Missing required 
arguments");if(!a.string(d))throw new TypeError("Second argument must be a String");if(!a.fn(y))throw new TypeError("Third argument must be a Function");if(a.node(u))return c(u,d,y);if(a.nodeList(u))return l(u,d,y);if(a.string(u))return f(u,d,y);throw new TypeError("First argument must be a String, HTMLElement, HTMLCollection, or NodeList")}function c(u,d,y){return u.addEventListener(d,y),{destroy:function(){u.removeEventListener(d,y)}}}function l(u,d,y){return Array.prototype.forEach.call(u,function(L){L.addEventListener(d,y)}),{destroy:function(){Array.prototype.forEach.call(u,function(L){L.removeEventListener(d,y)})}}}function f(u,d,y){return s(document.body,u,d,y)}o.exports=p},817:function(o){function n(i){var a;if(i.nodeName==="SELECT")i.focus(),a=i.value;else if(i.nodeName==="INPUT"||i.nodeName==="TEXTAREA"){var s=i.hasAttribute("readonly");s||i.setAttribute("readonly",""),i.select(),i.setSelectionRange(0,i.value.length),s||i.removeAttribute("readonly"),a=i.value}else{i.hasAttribute("contenteditable")&&i.focus();var p=window.getSelection(),c=document.createRange();c.selectNodeContents(i),p.removeAllRanges(),p.addRange(c),a=p.toString()}return a}o.exports=n},279:function(o){function n(){}n.prototype={on:function(i,a,s){var p=this.e||(this.e={});return(p[i]||(p[i]=[])).push({fn:a,ctx:s}),this},once:function(i,a,s){var p=this;function c(){p.off(i,c),a.apply(s,arguments)}return c._=a,this.on(i,c,s)},emit:function(i){var a=[].slice.call(arguments,1),s=((this.e||(this.e={}))[i]||[]).slice(),p=0,c=s.length;for(p;p0&&i[i.length-1])&&(c[0]===6||c[0]===2)){r=0;continue}if(c[0]===3&&(!i||c[1]>i[0]&&c[1]=e.length&&(e=void 0),{value:e&&e[o++],done:!e}}};throw new TypeError(t?"Object is not iterable.":"Symbol.iterator is not defined.")}function N(e,t){var r=typeof Symbol=="function"&&e[Symbol.iterator];if(!r)return e;var o=r.call(e),n,i=[],a;try{for(;(t===void 0||t-- >0)&&!(n=o.next()).done;)i.push(n.value)}catch(s){a={error:s}}finally{try{n&&!n.done&&(r=o.return)&&r.call(o)}finally{if(a)throw a.error}}return i}function q(e,t,r){if(r||arguments.length===2)for(var o=0,n=t.length,i;o1||p(d,L)})},y&&(n[d]=y(n[d])))}function p(d,y){try{c(o[d](y))}catch(L){u(i[0][3],L)}}function c(d){d.value instanceof nt?Promise.resolve(d.value.v).then(l,f):u(i[0][2],d)}function l(d){p("next",d)}function f(d){p("throw",d)}function u(d,y){d(y),i.shift(),i.length&&p(i[0][0],i[0][1])}}function uo(e){if(!Symbol.asyncIterator)throw new TypeError("Symbol.asyncIterator is not defined.");var t=e[Symbol.asyncIterator],r;return t?t.call(e):(e=typeof he=="function"?he(e):e[Symbol.iterator](),r={},o("next"),o("throw"),o("return"),r[Symbol.asyncIterator]=function(){return this},r);function o(i){r[i]=e[i]&&function(a){return new Promise(function(s,p){a=e[i](a),n(s,p,a.done,a.value)})}}function n(i,a,s,p){Promise.resolve(p).then(function(c){i({value:c,done:s})},a)}}function H(e){return typeof e=="function"}function ut(e){var t=function(o){Error.call(o),o.stack=new Error().stack},r=e(t);return r.prototype=Object.create(Error.prototype),r.prototype.constructor=r,r}var zt=ut(function(e){return function(r){e(this),this.message=r?r.length+` errors occurred during unsubscription: `+r.map(function(o,n){return n+1+") "+o.toString()}).join(` - `):"",this.name="UnsubscriptionError",this.errors=r}});function Qe(e,t){if(e){var r=e.indexOf(t);0<=r&&e.splice(r,1)}}var Ue=function(){function e(t){this.initialTeardown=t,this.closed=!1,this._parentage=null,this._finalizers=null}return e.prototype.unsubscribe=function(){var 
t,r,o,n,i;if(!this.closed){this.closed=!0;var a=this._parentage;if(a)if(this._parentage=null,Array.isArray(a))try{for(var s=he(a),p=s.next();!p.done;p=s.next()){var c=p.value;c.remove(this)}}catch(L){t={error:L}}finally{try{p&&!p.done&&(r=s.return)&&r.call(s)}finally{if(t)throw t.error}}else a.remove(this);var l=this.initialTeardown;if(H(l))try{l()}catch(L){i=L instanceof zt?L.errors:[L]}var f=this._finalizers;if(f){this._finalizers=null;try{for(var u=he(f),d=u.next();!d.done;d=u.next()){var y=d.value;try{ho(y)}catch(L){i=i!=null?i:[],L instanceof zt?i=q(q([],N(i)),N(L.errors)):i.push(L)}}}catch(L){o={error:L}}finally{try{d&&!d.done&&(n=u.return)&&n.call(u)}finally{if(o)throw o.error}}}if(i)throw new zt(i)}},e.prototype.add=function(t){var r;if(t&&t!==this)if(this.closed)ho(t);else{if(t instanceof e){if(t.closed||t._hasParent(this))return;t._addParent(this)}(this._finalizers=(r=this._finalizers)!==null&&r!==void 0?r:[]).push(t)}},e.prototype._hasParent=function(t){var r=this._parentage;return r===t||Array.isArray(r)&&r.includes(t)},e.prototype._addParent=function(t){var r=this._parentage;this._parentage=Array.isArray(r)?(r.push(t),r):r?[r,t]:t},e.prototype._removeParent=function(t){var r=this._parentage;r===t?this._parentage=null:Array.isArray(r)&&Qe(r,t)},e.prototype.remove=function(t){var r=this._finalizers;r&&Qe(r,t),t instanceof e&&t._removeParent(this)},e.EMPTY=function(){var t=new e;return t.closed=!0,t}(),e}();var Tr=Ue.EMPTY;function qt(e){return e instanceof Ue||e&&"closed"in e&&H(e.remove)&&H(e.add)&&H(e.unsubscribe)}function ho(e){H(e)?e():e.unsubscribe()}var Pe={onUnhandledError:null,onStoppedNotification:null,Promise:void 0,useDeprecatedSynchronousErrorHandling:!1,useDeprecatedNextContext:!1};var dt={setTimeout:function(e,t){for(var r=[],o=2;o0},enumerable:!1,configurable:!0}),t.prototype._trySubscribe=function(r){return this._throwIfClosed(),e.prototype._trySubscribe.call(this,r)},t.prototype._subscribe=function(r){return this._throwIfClosed(),this._checkFinalizedStatuses(r),this._innerSubscribe(r)},t.prototype._innerSubscribe=function(r){var o=this,n=this,i=n.hasError,a=n.isStopped,s=n.observers;return i||a?Tr:(this.currentObservers=null,s.push(r),new Ue(function(){o.currentObservers=null,Qe(s,r)}))},t.prototype._checkFinalizedStatuses=function(r){var o=this,n=o.hasError,i=o.thrownError,a=o.isStopped;n?r.error(i):a&&r.complete()},t.prototype.asObservable=function(){var r=new j;return r.source=this,r},t.create=function(r,o){return new To(r,o)},t}(j);var To=function(e){oe(t,e);function t(r,o){var n=e.call(this)||this;return n.destination=r,n.source=o,n}return t.prototype.next=function(r){var o,n;(n=(o=this.destination)===null||o===void 0?void 0:o.next)===null||n===void 0||n.call(o,r)},t.prototype.error=function(r){var o,n;(n=(o=this.destination)===null||o===void 0?void 0:o.error)===null||n===void 0||n.call(o,r)},t.prototype.complete=function(){var r,o;(o=(r=this.destination)===null||r===void 0?void 0:r.complete)===null||o===void 0||o.call(r)},t.prototype._subscribe=function(r){var o,n;return(n=(o=this.source)===null||o===void 0?void 0:o.subscribe(r))!==null&&n!==void 0?n:Tr},t}(g);var _r=function(e){oe(t,e);function t(r){var o=e.call(this)||this;return o._value=r,o}return Object.defineProperty(t.prototype,"value",{get:function(){return this.getValue()},enumerable:!1,configurable:!0}),t.prototype._subscribe=function(r){var o=e.prototype._subscribe.call(this,r);return!o.closed&&r.next(this._value),o},t.prototype.getValue=function(){var 
r=this,o=r.hasError,n=r.thrownError,i=r._value;if(o)throw n;return this._throwIfClosed(),i},t.prototype.next=function(r){e.prototype.next.call(this,this._value=r)},t}(g);var At={now:function(){return(At.delegate||Date).now()},delegate:void 0};var Ct=function(e){oe(t,e);function t(r,o,n){r===void 0&&(r=1/0),o===void 0&&(o=1/0),n===void 0&&(n=At);var i=e.call(this)||this;return i._bufferSize=r,i._windowTime=o,i._timestampProvider=n,i._buffer=[],i._infiniteTimeWindow=!0,i._infiniteTimeWindow=o===1/0,i._bufferSize=Math.max(1,r),i._windowTime=Math.max(1,o),i}return t.prototype.next=function(r){var o=this,n=o.isStopped,i=o._buffer,a=o._infiniteTimeWindow,s=o._timestampProvider,p=o._windowTime;n||(i.push(r),!a&&i.push(s.now()+p)),this._trimBuffer(),e.prototype.next.call(this,r)},t.prototype._subscribe=function(r){this._throwIfClosed(),this._trimBuffer();for(var o=this._innerSubscribe(r),n=this,i=n._infiniteTimeWindow,a=n._buffer,s=a.slice(),p=0;p0?e.prototype.schedule.call(this,r,o):(this.delay=o,this.state=r,this.scheduler.flush(this),this)},t.prototype.execute=function(r,o){return o>0||this.closed?e.prototype.execute.call(this,r,o):this._execute(r,o)},t.prototype.requestAsyncId=function(r,o,n){return n===void 0&&(n=0),n!=null&&n>0||n==null&&this.delay>0?e.prototype.requestAsyncId.call(this,r,o,n):(r.flush(this),0)},t}(gt);var Lo=function(e){oe(t,e);function t(){return e!==null&&e.apply(this,arguments)||this}return t}(yt);var kr=new Lo(Oo);var Mo=function(e){oe(t,e);function t(r,o){var n=e.call(this,r,o)||this;return n.scheduler=r,n.work=o,n}return t.prototype.requestAsyncId=function(r,o,n){return n===void 0&&(n=0),n!==null&&n>0?e.prototype.requestAsyncId.call(this,r,o,n):(r.actions.push(this),r._scheduled||(r._scheduled=vt.requestAnimationFrame(function(){return r.flush(void 0)})))},t.prototype.recycleAsyncId=function(r,o,n){var i;if(n===void 0&&(n=0),n!=null?n>0:this.delay>0)return e.prototype.recycleAsyncId.call(this,r,o,n);var a=r.actions;o!=null&&((i=a[a.length-1])===null||i===void 0?void 0:i.id)!==o&&(vt.cancelAnimationFrame(o),r._scheduled=void 0)},t}(gt);var _o=function(e){oe(t,e);function t(){return e!==null&&e.apply(this,arguments)||this}return t.prototype.flush=function(r){this._active=!0;var o=this._scheduled;this._scheduled=void 0;var n=this.actions,i;r=r||n.shift();do if(i=r.execute(r.state,r.delay))break;while((r=n[0])&&r.id===o&&n.shift());if(this._active=!1,i){for(;(r=n[0])&&r.id===o&&n.shift();)r.unsubscribe();throw i}},t}(yt);var me=new _o(Mo);var S=new j(function(e){return e.complete()});function Yt(e){return e&&H(e.schedule)}function Hr(e){return e[e.length-1]}function Xe(e){return H(Hr(e))?e.pop():void 0}function ke(e){return Yt(Hr(e))?e.pop():void 0}function Bt(e,t){return typeof Hr(e)=="number"?e.pop():t}var xt=function(e){return e&&typeof e.length=="number"&&typeof e!="function"};function Gt(e){return H(e==null?void 0:e.then)}function Jt(e){return H(e[bt])}function Xt(e){return Symbol.asyncIterator&&H(e==null?void 0:e[Symbol.asyncIterator])}function Zt(e){return new TypeError("You provided "+(e!==null&&typeof e=="object"?"an invalid object":"'"+e+"'")+" where a stream was expected. 
You can provide an Observable, Promise, ReadableStream, Array, AsyncIterable, or Iterable.")}function Zi(){return typeof Symbol!="function"||!Symbol.iterator?"@@iterator":Symbol.iterator}var er=Zi();function tr(e){return H(e==null?void 0:e[er])}function rr(e){return fo(this,arguments,function(){var r,o,n,i;return Nt(this,function(a){switch(a.label){case 0:r=e.getReader(),a.label=1;case 1:a.trys.push([1,,9,10]),a.label=2;case 2:return[4,nt(r.read())];case 3:return o=a.sent(),n=o.value,i=o.done,i?[4,nt(void 0)]:[3,5];case 4:return[2,a.sent()];case 5:return[4,nt(n)];case 6:return[4,a.sent()];case 7:return a.sent(),[3,2];case 8:return[3,10];case 9:return r.releaseLock(),[7];case 10:return[2]}})})}function or(e){return H(e==null?void 0:e.getReader)}function U(e){if(e instanceof j)return e;if(e!=null){if(Jt(e))return ea(e);if(xt(e))return ta(e);if(Gt(e))return ra(e);if(Xt(e))return Ao(e);if(tr(e))return oa(e);if(or(e))return na(e)}throw Zt(e)}function ea(e){return new j(function(t){var r=e[bt]();if(H(r.subscribe))return r.subscribe(t);throw new TypeError("Provided object does not correctly implement Symbol.observable")})}function ta(e){return new j(function(t){for(var r=0;r=2;return function(o){return o.pipe(e?b(function(n,i){return e(n,i,o)}):le,Te(1),r?De(t):Qo(function(){return new ir}))}}function jr(e){return e<=0?function(){return S}:E(function(t,r){var o=[];t.subscribe(T(r,function(n){o.push(n),e=2,!0))}function pe(e){e===void 0&&(e={});var t=e.connector,r=t===void 0?function(){return new g}:t,o=e.resetOnError,n=o===void 0?!0:o,i=e.resetOnComplete,a=i===void 0?!0:i,s=e.resetOnRefCountZero,p=s===void 0?!0:s;return function(c){var l,f,u,d=0,y=!1,L=!1,X=function(){f==null||f.unsubscribe(),f=void 0},ee=function(){X(),l=u=void 0,y=L=!1},J=function(){var k=l;ee(),k==null||k.unsubscribe()};return E(function(k,ft){d++,!L&&!y&&X();var qe=u=u!=null?u:r();ft.add(function(){d--,d===0&&!L&&!y&&(f=Ur(J,p))}),qe.subscribe(ft),!l&&d>0&&(l=new at({next:function(Fe){return qe.next(Fe)},error:function(Fe){L=!0,X(),f=Ur(ee,n,Fe),qe.error(Fe)},complete:function(){y=!0,X(),f=Ur(ee,a),qe.complete()}}),U(k).subscribe(l))})(c)}}function Ur(e,t){for(var r=[],o=2;oe.next(document)),e}function P(e,t=document){return Array.from(t.querySelectorAll(e))}function R(e,t=document){let r=fe(e,t);if(typeof r=="undefined")throw new ReferenceError(`Missing element: expected "${e}" to be present`);return r}function fe(e,t=document){return t.querySelector(e)||void 0}function Ie(){var e,t,r,o;return(o=(r=(t=(e=document.activeElement)==null?void 0:e.shadowRoot)==null?void 0:t.activeElement)!=null?r:document.activeElement)!=null?o:void 0}var wa=O(h(document.body,"focusin"),h(document.body,"focusout")).pipe(_e(1),Q(void 0),m(()=>Ie()||document.body),G(1));function et(e){return wa.pipe(m(t=>e.contains(t)),K())}function $t(e,t){return C(()=>O(h(e,"mouseenter").pipe(m(()=>!0)),h(e,"mouseleave").pipe(m(()=>!1))).pipe(t?Ht(r=>Le(+!r*t)):le,Q(e.matches(":hover"))))}function Jo(e,t){if(typeof t=="string"||typeof t=="number")e.innerHTML+=t.toString();else if(t instanceof Node)e.appendChild(t);else if(Array.isArray(t))for(let r of t)Jo(e,r)}function x(e,t,...r){let o=document.createElement(e);if(t)for(let n of Object.keys(t))typeof t[n]!="undefined"&&(typeof t[n]!="boolean"?o.setAttribute(n,t[n]):o.setAttribute(n,""));for(let n of r)Jo(o,n);return o}function sr(e){if(e>999){let t=+((e-950)%1e3>99);return`${((e+1e-6)/1e3).toFixed(t)}k`}else return e.toString()}function Tt(e){let t=x("script",{src:e});return 
C(()=>(document.head.appendChild(t),O(h(t,"load"),h(t,"error").pipe(v(()=>$r(()=>new ReferenceError(`Invalid script: ${e}`))))).pipe(m(()=>{}),_(()=>document.head.removeChild(t)),Te(1))))}var Xo=new g,Ta=C(()=>typeof ResizeObserver=="undefined"?Tt("https://unpkg.com/resize-observer-polyfill"):I(void 0)).pipe(m(()=>new ResizeObserver(e=>e.forEach(t=>Xo.next(t)))),v(e=>O(Ye,I(e)).pipe(_(()=>e.disconnect()))),G(1));function ce(e){return{width:e.offsetWidth,height:e.offsetHeight}}function ge(e){let t=e;for(;t.clientWidth===0&&t.parentElement;)t=t.parentElement;return Ta.pipe(w(r=>r.observe(t)),v(r=>Xo.pipe(b(o=>o.target===t),_(()=>r.unobserve(t)))),m(()=>ce(e)),Q(ce(e)))}function St(e){return{width:e.scrollWidth,height:e.scrollHeight}}function cr(e){let t=e.parentElement;for(;t&&(e.scrollWidth<=t.scrollWidth&&e.scrollHeight<=t.scrollHeight);)t=(e=t).parentElement;return t?e:void 0}function Zo(e){let t=[],r=e.parentElement;for(;r;)(e.clientWidth>r.clientWidth||e.clientHeight>r.clientHeight)&&t.push(r),r=(e=r).parentElement;return t.length===0&&t.push(document.documentElement),t}function Ve(e){return{x:e.offsetLeft,y:e.offsetTop}}function en(e){let t=e.getBoundingClientRect();return{x:t.x+window.scrollX,y:t.y+window.scrollY}}function tn(e){return O(h(window,"load"),h(window,"resize")).pipe(Me(0,me),m(()=>Ve(e)),Q(Ve(e)))}function pr(e){return{x:e.scrollLeft,y:e.scrollTop}}function Ne(e){return O(h(e,"scroll"),h(window,"scroll"),h(window,"resize")).pipe(Me(0,me),m(()=>pr(e)),Q(pr(e)))}var rn=new g,Sa=C(()=>I(new IntersectionObserver(e=>{for(let t of e)rn.next(t)},{threshold:0}))).pipe(v(e=>O(Ye,I(e)).pipe(_(()=>e.disconnect()))),G(1));function tt(e){return Sa.pipe(w(t=>t.observe(e)),v(t=>rn.pipe(b(({target:r})=>r===e),_(()=>t.unobserve(e)),m(({isIntersecting:r})=>r))))}function on(e,t=16){return Ne(e).pipe(m(({y:r})=>{let o=ce(e),n=St(e);return r>=n.height-o.height-t}),K())}var lr={drawer:R("[data-md-toggle=drawer]"),search:R("[data-md-toggle=search]")};function nn(e){return lr[e].checked}function Je(e,t){lr[e].checked!==t&&lr[e].click()}function ze(e){let t=lr[e];return h(t,"change").pipe(m(()=>t.checked),Q(t.checked))}function Oa(e,t){switch(e.constructor){case HTMLInputElement:return e.type==="radio"?/^Arrow/.test(t):!0;case HTMLSelectElement:case HTMLTextAreaElement:return!0;default:return e.isContentEditable}}function La(){return O(h(window,"compositionstart").pipe(m(()=>!0)),h(window,"compositionend").pipe(m(()=>!1))).pipe(Q(!1))}function an(){let e=h(window,"keydown").pipe(b(t=>!(t.metaKey||t.ctrlKey)),m(t=>({mode:nn("search")?"search":"global",type:t.key,claim(){t.preventDefault(),t.stopPropagation()}})),b(({mode:t,type:r})=>{if(t==="global"){let o=Ie();if(typeof o!="undefined")return!Oa(o,r)}return!0}),pe());return La().pipe(v(t=>t?S:e))}function ye(){return new URL(location.href)}function lt(e,t=!1){if(B("navigation.instant")&&!t){let r=x("a",{href:e.href});document.body.appendChild(r),r.click(),r.remove()}else location.href=e.href}function sn(){return new g}function cn(){return location.hash.slice(1)}function pn(e){let t=x("a",{href:e});t.addEventListener("click",r=>r.stopPropagation()),t.click()}function Ma(e){return O(h(window,"hashchange"),e).pipe(m(cn),Q(cn()),b(t=>t.length>0),G(1))}function ln(e){return Ma(e).pipe(m(t=>fe(`[id="${t}"]`)),b(t=>typeof t!="undefined"))}function Pt(e){let t=matchMedia(e);return ar(r=>t.addListener(()=>r(t.matches))).pipe(Q(t.matches))}function mn(){let e=matchMedia("print");return 
O(h(window,"beforeprint").pipe(m(()=>!0)),h(window,"afterprint").pipe(m(()=>!1))).pipe(Q(e.matches))}function Nr(e,t){return e.pipe(v(r=>r?t():S))}function zr(e,t){return new j(r=>{let o=new XMLHttpRequest;return o.open("GET",`${e}`),o.responseType="blob",o.addEventListener("load",()=>{o.status>=200&&o.status<300?(r.next(o.response),r.complete()):r.error(new Error(o.statusText))}),o.addEventListener("error",()=>{r.error(new Error("Network error"))}),o.addEventListener("abort",()=>{r.complete()}),typeof(t==null?void 0:t.progress$)!="undefined"&&(o.addEventListener("progress",n=>{var i;if(n.lengthComputable)t.progress$.next(n.loaded/n.total*100);else{let a=(i=o.getResponseHeader("Content-Length"))!=null?i:0;t.progress$.next(n.loaded/+a*100)}}),t.progress$.next(5)),o.send(),()=>o.abort()})}function je(e,t){return zr(e,t).pipe(v(r=>r.text()),m(r=>JSON.parse(r)),G(1))}function fn(e,t){let r=new DOMParser;return zr(e,t).pipe(v(o=>o.text()),m(o=>r.parseFromString(o,"text/html")),G(1))}function un(e,t){let r=new DOMParser;return zr(e,t).pipe(v(o=>o.text()),m(o=>r.parseFromString(o,"text/xml")),G(1))}function dn(){return{x:Math.max(0,scrollX),y:Math.max(0,scrollY)}}function hn(){return O(h(window,"scroll",{passive:!0}),h(window,"resize",{passive:!0})).pipe(m(dn),Q(dn()))}function bn(){return{width:innerWidth,height:innerHeight}}function vn(){return h(window,"resize",{passive:!0}).pipe(m(bn),Q(bn()))}function gn(){return z([hn(),vn()]).pipe(m(([e,t])=>({offset:e,size:t})),G(1))}function mr(e,{viewport$:t,header$:r}){let o=t.pipe(te("size")),n=z([o,r]).pipe(m(()=>Ve(e)));return z([r,t,n]).pipe(m(([{height:i},{offset:a,size:s},{x:p,y:c}])=>({offset:{x:a.x-p,y:a.y-c+i},size:s})))}function _a(e){return h(e,"message",t=>t.data)}function Aa(e){let t=new g;return t.subscribe(r=>e.postMessage(r)),t}function yn(e,t=new Worker(e)){let r=_a(t),o=Aa(t),n=new g;n.subscribe(o);let i=o.pipe(Z(),ie(!0));return n.pipe(Z(),Re(r.pipe(W(i))),pe())}var Ca=R("#__config"),Ot=JSON.parse(Ca.textContent);Ot.base=`${new URL(Ot.base,ye())}`;function xe(){return Ot}function B(e){return Ot.features.includes(e)}function Ee(e,t){return typeof t!="undefined"?Ot.translations[e].replace("#",t.toString()):Ot.translations[e]}function Se(e,t=document){return R(`[data-md-component=${e}]`,t)}function ae(e,t=document){return P(`[data-md-component=${e}]`,t)}function ka(e){let t=R(".md-typeset > :first-child",e);return h(t,"click",{once:!0}).pipe(m(()=>R(".md-typeset",e)),m(r=>({hash:__md_hash(r.innerHTML)})))}function xn(e){if(!B("announce.dismiss")||!e.childElementCount)return S;if(!e.hidden){let t=R(".md-typeset",e);__md_hash(t.innerHTML)===__md_get("__announce")&&(e.hidden=!0)}return C(()=>{let t=new g;return t.subscribe(({hash:r})=>{e.hidden=!0,__md_set("__announce",r)}),ka(e).pipe(w(r=>t.next(r)),_(()=>t.complete()),m(r=>$({ref:e},r)))})}function Ha(e,{target$:t}){return t.pipe(m(r=>({hidden:r!==e})))}function En(e,t){let r=new g;return r.subscribe(({hidden:o})=>{e.hidden=o}),Ha(e,t).pipe(w(o=>r.next(o)),_(()=>r.complete()),m(o=>$({ref:e},o)))}function Rt(e,t){return t==="inline"?x("div",{class:"md-tooltip md-tooltip--inline",id:e,role:"tooltip"},x("div",{class:"md-tooltip__inner md-typeset"})):x("div",{class:"md-tooltip",id:e,role:"tooltip"},x("div",{class:"md-tooltip__inner md-typeset"}))}function wn(...e){return x("div",{class:"md-tooltip2",role:"tooltip"},x("div",{class:"md-tooltip2__inner md-typeset"},e))}function Tn(e,t){if(t=t?`${t}_annotation_${e}`:void 0,t){let r=t?`#${t}`:void 0;return 
x("aside",{class:"md-annotation",tabIndex:0},Rt(t),x("a",{href:r,class:"md-annotation__index",tabIndex:-1},x("span",{"data-md-annotation-id":e})))}else return x("aside",{class:"md-annotation",tabIndex:0},Rt(t),x("span",{class:"md-annotation__index",tabIndex:-1},x("span",{"data-md-annotation-id":e})))}function Sn(e){return x("button",{class:"md-clipboard md-icon",title:Ee("clipboard.copy"),"data-clipboard-target":`#${e} > code`})}var Ln=Mt(qr());function Qr(e,t){let r=t&2,o=t&1,n=Object.keys(e.terms).filter(p=>!e.terms[p]).reduce((p,c)=>[...p,x("del",null,(0,Ln.default)(c))," "],[]).slice(0,-1),i=xe(),a=new URL(e.location,i.base);B("search.highlight")&&a.searchParams.set("h",Object.entries(e.terms).filter(([,p])=>p).reduce((p,[c])=>`${p} ${c}`.trim(),""));let{tags:s}=xe();return x("a",{href:`${a}`,class:"md-search-result__link",tabIndex:-1},x("article",{class:"md-search-result__article md-typeset","data-md-score":e.score.toFixed(2)},r>0&&x("div",{class:"md-search-result__icon md-icon"}),r>0&&x("h1",null,e.title),r<=0&&x("h2",null,e.title),o>0&&e.text.length>0&&e.text,e.tags&&x("nav",{class:"md-tags"},e.tags.map(p=>{let c=s?p in s?`md-tag-icon md-tag--${s[p]}`:"md-tag-icon":"";return x("span",{class:`md-tag ${c}`},p)})),o>0&&n.length>0&&x("p",{class:"md-search-result__terms"},Ee("search.result.term.missing"),": ",...n)))}function Mn(e){let t=e[0].score,r=[...e],o=xe(),n=r.findIndex(l=>!`${new URL(l.location,o.base)}`.includes("#")),[i]=r.splice(n,1),a=r.findIndex(l=>l.scoreQr(l,1)),...p.length?[x("details",{class:"md-search-result__more"},x("summary",{tabIndex:-1},x("div",null,p.length>0&&p.length===1?Ee("search.result.more.one"):Ee("search.result.more.other",p.length))),...p.map(l=>Qr(l,1)))]:[]];return x("li",{class:"md-search-result__item"},c)}function _n(e){return x("ul",{class:"md-source__facts"},Object.entries(e).map(([t,r])=>x("li",{class:`md-source__fact md-source__fact--${t}`},typeof r=="number"?sr(r):r)))}function Kr(e){let t=`tabbed-control tabbed-control--${e}`;return x("div",{class:t,hidden:!0},x("button",{class:"tabbed-button",tabIndex:-1,"aria-hidden":"true"}))}function An(e){return x("div",{class:"md-typeset__scrollwrap"},x("div",{class:"md-typeset__table"},e))}function Ra(e){var o;let t=xe(),r=new URL(`../${e.version}/`,t.base);return x("li",{class:"md-version__item"},x("a",{href:`${r}`,class:"md-version__link"},e.title,((o=t.version)==null?void 0:o.alias)&&e.aliases.length>0&&x("span",{class:"md-version__alias"},e.aliases[0])))}function Cn(e,t){var o;let r=xe();return e=e.filter(n=>{var i;return!((i=n.properties)!=null&&i.hidden)}),x("div",{class:"md-version"},x("button",{class:"md-version__current","aria-label":Ee("select.version")},t.title,((o=r.version)==null?void 0:o.alias)&&t.aliases.length>0&&x("span",{class:"md-version__alias"},t.aliases[0])),x("ul",{class:"md-version__list"},e.map(Ra)))}var Ia=0;function ja(e){let t=z([et(e),$t(e)]).pipe(m(([o,n])=>o||n),K()),r=C(()=>Zo(e)).pipe(ne(Ne),pt(1),He(t),m(()=>en(e)));return t.pipe(Ae(o=>o),v(()=>z([t,r])),m(([o,n])=>({active:o,offset:n})),pe())}function Fa(e,t){let{content$:r,viewport$:o}=t,n=`__tooltip2_${Ia++}`;return C(()=>{let i=new g,a=new _r(!1);i.pipe(Z(),ie(!1)).subscribe(a);let s=a.pipe(Ht(c=>Le(+!c*250,kr)),K(),v(c=>c?r:S),w(c=>c.id=n),pe());z([i.pipe(m(({active:c})=>c)),s.pipe(v(c=>$t(c,250)),Q(!1))]).pipe(m(c=>c.some(l=>l))).subscribe(a);let p=a.pipe(b(c=>c),re(s,o),m(([c,l,{size:f}])=>{let 
u=e.getBoundingClientRect(),d=u.width/2;if(l.role==="tooltip")return{x:d,y:8+u.height};if(u.y>=f.height/2){let{height:y}=ce(l);return{x:d,y:-16-y}}else return{x:d,y:16+u.height}}));return z([s,i,p]).subscribe(([c,{offset:l},f])=>{c.style.setProperty("--md-tooltip-host-x",`${l.x}px`),c.style.setProperty("--md-tooltip-host-y",`${l.y}px`),c.style.setProperty("--md-tooltip-x",`${f.x}px`),c.style.setProperty("--md-tooltip-y",`${f.y}px`),c.classList.toggle("md-tooltip2--top",f.y<0),c.classList.toggle("md-tooltip2--bottom",f.y>=0)}),a.pipe(b(c=>c),re(s,(c,l)=>l),b(c=>c.role==="tooltip")).subscribe(c=>{let l=ce(R(":scope > *",c));c.style.setProperty("--md-tooltip-width",`${l.width}px`),c.style.setProperty("--md-tooltip-tail","0px")}),a.pipe(K(),ve(me),re(s)).subscribe(([c,l])=>{l.classList.toggle("md-tooltip2--active",c)}),z([a.pipe(b(c=>c)),s]).subscribe(([c,l])=>{l.role==="dialog"?(e.setAttribute("aria-controls",n),e.setAttribute("aria-haspopup","dialog")):e.setAttribute("aria-describedby",n)}),a.pipe(b(c=>!c)).subscribe(()=>{e.removeAttribute("aria-controls"),e.removeAttribute("aria-describedby"),e.removeAttribute("aria-haspopup")}),ja(e).pipe(w(c=>i.next(c)),_(()=>i.complete()),m(c=>$({ref:e},c)))})}function mt(e,{viewport$:t},r=document.body){return Fa(e,{content$:new j(o=>{let n=e.title,i=wn(n);return o.next(i),e.removeAttribute("title"),r.append(i),()=>{i.remove(),e.setAttribute("title",n)}}),viewport$:t})}function Ua(e,t){let r=C(()=>z([tn(e),Ne(t)])).pipe(m(([{x:o,y:n},i])=>{let{width:a,height:s}=ce(e);return{x:o-i.x+a/2,y:n-i.y+s/2}}));return et(e).pipe(v(o=>r.pipe(m(n=>({active:o,offset:n})),Te(+!o||1/0))))}function kn(e,t,{target$:r}){let[o,n]=Array.from(e.children);return C(()=>{let i=new g,a=i.pipe(Z(),ie(!0));return i.subscribe({next({offset:s}){e.style.setProperty("--md-tooltip-x",`${s.x}px`),e.style.setProperty("--md-tooltip-y",`${s.y}px`)},complete(){e.style.removeProperty("--md-tooltip-x"),e.style.removeProperty("--md-tooltip-y")}}),tt(e).pipe(W(a)).subscribe(s=>{e.toggleAttribute("data-md-visible",s)}),O(i.pipe(b(({active:s})=>s)),i.pipe(_e(250),b(({active:s})=>!s))).subscribe({next({active:s}){s?e.prepend(o):o.remove()},complete(){e.prepend(o)}}),i.pipe(Me(16,me)).subscribe(({active:s})=>{o.classList.toggle("md-tooltip--active",s)}),i.pipe(pt(125,me),b(()=>!!e.offsetParent),m(()=>e.offsetParent.getBoundingClientRect()),m(({x:s})=>s)).subscribe({next(s){s?e.style.setProperty("--md-tooltip-0",`${-s}px`):e.style.removeProperty("--md-tooltip-0")},complete(){e.style.removeProperty("--md-tooltip-0")}}),h(n,"click").pipe(W(a),b(s=>!(s.metaKey||s.ctrlKey))).subscribe(s=>{s.stopPropagation(),s.preventDefault()}),h(n,"mousedown").pipe(W(a),re(i)).subscribe(([s,{active:p}])=>{var c;if(s.button!==0||s.metaKey||s.ctrlKey)s.preventDefault();else if(p){s.preventDefault();let l=e.parentElement.closest(".md-annotation");l instanceof HTMLElement?l.focus():(c=Ie())==null||c.blur()}}),r.pipe(W(a),b(s=>s===o),Ge(125)).subscribe(()=>e.focus()),Ua(e,t).pipe(w(s=>i.next(s)),_(()=>i.complete()),m(s=>$({ref:e},s)))})}function Wa(e){return e.tagName==="CODE"?P(".c, .c1, .cm",e):[e]}function Da(e){let t=[];for(let r of Wa(e)){let o=[],n=document.createNodeIterator(r,NodeFilter.SHOW_TEXT);for(let i=n.nextNode();i;i=n.nextNode())o.push(i);for(let i of o){let a;for(;a=/(\(\d+\))(!)?/.exec(i.textContent);){let[,s,p]=a;if(typeof p=="undefined"){let c=i.splitText(a.index);i=c.splitText(s.length),t.push(c)}else{i.textContent=s,t.push(i);break}}}}return t}function 
Hn(e,t){t.append(...Array.from(e.childNodes))}function fr(e,t,{target$:r,print$:o}){let n=t.closest("[id]"),i=n==null?void 0:n.id,a=new Map;for(let s of Da(t)){let[,p]=s.textContent.match(/\((\d+)\)/);fe(`:scope > li:nth-child(${p})`,e)&&(a.set(p,Tn(p,i)),s.replaceWith(a.get(p)))}return a.size===0?S:C(()=>{let s=new g,p=s.pipe(Z(),ie(!0)),c=[];for(let[l,f]of a)c.push([R(".md-typeset",f),R(`:scope > li:nth-child(${l})`,e)]);return o.pipe(W(p)).subscribe(l=>{e.hidden=!l,e.classList.toggle("md-annotation-list",l);for(let[f,u]of c)l?Hn(f,u):Hn(u,f)}),O(...[...a].map(([,l])=>kn(l,t,{target$:r}))).pipe(_(()=>s.complete()),pe())})}function $n(e){if(e.nextElementSibling){let t=e.nextElementSibling;if(t.tagName==="OL")return t;if(t.tagName==="P"&&!t.children.length)return $n(t)}}function Pn(e,t){return C(()=>{let r=$n(e);return typeof r!="undefined"?fr(r,e,t):S})}var Rn=Mt(Br());var Va=0;function In(e){if(e.nextElementSibling){let t=e.nextElementSibling;if(t.tagName==="OL")return t;if(t.tagName==="P"&&!t.children.length)return In(t)}}function Na(e){return ge(e).pipe(m(({width:t})=>({scrollable:St(e).width>t})),te("scrollable"))}function jn(e,t){let{matches:r}=matchMedia("(hover)"),o=C(()=>{let n=new g,i=n.pipe(jr(1));n.subscribe(({scrollable:c})=>{c&&r?e.setAttribute("tabindex","0"):e.removeAttribute("tabindex")});let a=[];if(Rn.default.isSupported()&&(e.closest(".copy")||B("content.code.copy")&&!e.closest(".no-copy"))){let c=e.closest("pre");c.id=`__code_${Va++}`;let l=Sn(c.id);c.insertBefore(l,e),B("content.tooltips")&&a.push(mt(l,{viewport$}))}let s=e.closest(".highlight");if(s instanceof HTMLElement){let c=In(s);if(typeof c!="undefined"&&(s.classList.contains("annotate")||B("content.code.annotate"))){let l=fr(c,e,t);a.push(ge(s).pipe(W(i),m(({width:f,height:u})=>f&&u),K(),v(f=>f?l:S)))}}return P(":scope > span[id]",e).length&&e.classList.add("md-code__content"),Na(e).pipe(w(c=>n.next(c)),_(()=>n.complete()),m(c=>$({ref:e},c)),Re(...a))});return B("content.lazy")?tt(e).pipe(b(n=>n),Te(1),v(()=>o)):o}function za(e,{target$:t,print$:r}){let o=!0;return O(t.pipe(m(n=>n.closest("details:not([open])")),b(n=>e===n),m(()=>({action:"open",reveal:!0}))),r.pipe(b(n=>n||!o),w(()=>o=e.open),m(n=>({action:n?"open":"close"}))))}function Fn(e,t){return C(()=>{let r=new g;return r.subscribe(({action:o,reveal:n})=>{e.toggleAttribute("open",o==="open"),n&&e.scrollIntoView()}),za(e,t).pipe(w(o=>r.next(o)),_(()=>r.complete()),m(o=>$({ref:e},o)))})}var Un=".node circle,.node ellipse,.node path,.node polygon,.node rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}marker{fill:var(--md-mermaid-edge-color)!important}.edgeLabel .label rect{fill:#0000}.flowchartTitleText{fill:var(--md-mermaid-label-fg-color)}.label{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.label foreignObject{line-height:normal;overflow:visible}.label div .edgeLabel{color:var(--md-mermaid-label-fg-color)}.edgeLabel,.edgeLabel p,.label div .edgeLabel{background-color:var(--md-mermaid-label-bg-color)}.edgeLabel,.edgeLabel p{fill:var(--md-mermaid-label-bg-color);color:var(--md-mermaid-edge-color)}.edgePath .path,.flowchart-link{stroke:var(--md-mermaid-edge-color);stroke-width:.05rem}.edgePath .arrowheadPath{fill:var(--md-mermaid-edge-color);stroke:none}.cluster rect{fill:var(--md-default-fg-color--lightest);stroke:var(--md-default-fg-color--lighter)}.cluster span{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}g #flowchart-circleEnd,g 
#flowchart-circleStart,g #flowchart-crossEnd,g #flowchart-crossStart,g #flowchart-pointEnd,g #flowchart-pointStart{stroke:none}.classDiagramTitleText{fill:var(--md-mermaid-label-fg-color)}g.classGroup line,g.classGroup rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}g.classGroup text{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.classLabel .box{fill:var(--md-mermaid-label-bg-color);background-color:var(--md-mermaid-label-bg-color);opacity:1}.classLabel .label{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.node .divider{stroke:var(--md-mermaid-node-fg-color)}.relation{stroke:var(--md-mermaid-edge-color)}.cardinality{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.cardinality text{fill:inherit!important}defs #classDiagram-compositionEnd,defs #classDiagram-compositionStart,defs #classDiagram-dependencyEnd,defs #classDiagram-dependencyStart,defs #classDiagram-extensionEnd,defs #classDiagram-extensionStart{fill:var(--md-mermaid-edge-color)!important;stroke:var(--md-mermaid-edge-color)!important}defs #classDiagram-aggregationEnd,defs #classDiagram-aggregationStart{fill:var(--md-mermaid-label-bg-color)!important;stroke:var(--md-mermaid-edge-color)!important}.statediagramTitleText{fill:var(--md-mermaid-label-fg-color)}g.stateGroup rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}g.stateGroup .state-title{fill:var(--md-mermaid-label-fg-color)!important;font-family:var(--md-mermaid-font-family)}g.stateGroup .composit{fill:var(--md-mermaid-label-bg-color)}.nodeLabel,.nodeLabel p{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}a .nodeLabel{text-decoration:underline}.node circle.state-end,.node circle.state-start,.start-state{fill:var(--md-mermaid-edge-color);stroke:none}.end-state-inner,.end-state-outer{fill:var(--md-mermaid-edge-color)}.end-state-inner,.node circle.state-end{stroke:var(--md-mermaid-label-bg-color)}.transition{stroke:var(--md-mermaid-edge-color)}[id^=state-fork] rect,[id^=state-join] rect{fill:var(--md-mermaid-edge-color)!important;stroke:none!important}.statediagram-cluster.statediagram-cluster .inner{fill:var(--md-default-bg-color)}.statediagram-cluster rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}.statediagram-state rect.divider{fill:var(--md-default-fg-color--lightest);stroke:var(--md-default-fg-color--lighter)}defs #statediagram-barbEnd{stroke:var(--md-mermaid-edge-color)}.entityTitleText{fill:var(--md-mermaid-label-fg-color)}.attributeBoxEven,.attributeBoxOdd{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}.entityBox{fill:var(--md-mermaid-label-bg-color);stroke:var(--md-mermaid-node-fg-color)}.entityLabel{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.relationshipLabelBox{fill:var(--md-mermaid-label-bg-color);fill-opacity:1;background-color:var(--md-mermaid-label-bg-color);opacity:1}.relationshipLabel{fill:var(--md-mermaid-label-fg-color)}.relationshipLine{stroke:var(--md-mermaid-edge-color)}defs #ONE_OR_MORE_END *,defs #ONE_OR_MORE_START *,defs #ONLY_ONE_END *,defs #ONLY_ONE_START *,defs #ZERO_OR_MORE_END *,defs #ZERO_OR_MORE_START *,defs #ZERO_OR_ONE_END *,defs #ZERO_OR_ONE_START *{stroke:var(--md-mermaid-edge-color)!important}defs #ZERO_OR_MORE_END circle,defs #ZERO_OR_MORE_START 
circle{fill:var(--md-mermaid-label-bg-color)}text:not([class]):last-child{fill:var(--md-mermaid-label-fg-color)}.actor{fill:var(--md-mermaid-sequence-actor-bg-color);stroke:var(--md-mermaid-sequence-actor-border-color)}text.actor>tspan{fill:var(--md-mermaid-sequence-actor-fg-color);font-family:var(--md-mermaid-font-family)}line{stroke:var(--md-mermaid-sequence-actor-line-color)}.actor-man circle,.actor-man line{fill:var(--md-mermaid-sequence-actorman-bg-color);stroke:var(--md-mermaid-sequence-actorman-line-color)}.messageLine0,.messageLine1{stroke:var(--md-mermaid-sequence-message-line-color)}.note{fill:var(--md-mermaid-sequence-note-bg-color);stroke:var(--md-mermaid-sequence-note-border-color)}.loopText,.loopText>tspan,.messageText,.noteText>tspan{stroke:none;font-family:var(--md-mermaid-font-family)!important}.messageText{fill:var(--md-mermaid-sequence-message-fg-color)}.loopText,.loopText>tspan{fill:var(--md-mermaid-sequence-loop-fg-color)}.noteText>tspan{fill:var(--md-mermaid-sequence-note-fg-color)}#arrowhead path{fill:var(--md-mermaid-sequence-message-line-color);stroke:none}.loopLine{fill:var(--md-mermaid-sequence-loop-bg-color);stroke:var(--md-mermaid-sequence-loop-border-color)}.labelBox{fill:var(--md-mermaid-sequence-label-bg-color);stroke:none}.labelText,.labelText>span{fill:var(--md-mermaid-sequence-label-fg-color);font-family:var(--md-mermaid-font-family)}.sequenceNumber{fill:var(--md-mermaid-sequence-number-fg-color)}rect.rect{fill:var(--md-mermaid-sequence-box-bg-color);stroke:none}rect.rect+text.text{fill:var(--md-mermaid-sequence-box-fg-color)}defs #sequencenumber{fill:var(--md-mermaid-sequence-number-bg-color)!important}";var Gr,Qa=0;function Ka(){return typeof mermaid=="undefined"||mermaid instanceof Element?Tt("https://unpkg.com/mermaid@11/dist/mermaid.min.js"):I(void 0)}function Wn(e){return e.classList.remove("mermaid"),Gr||(Gr=Ka().pipe(w(()=>mermaid.initialize({startOnLoad:!1,themeCSS:Un,sequence:{actorFontSize:"16px",messageFontSize:"16px",noteFontSize:"16px"}})),m(()=>{}),G(1))),Gr.subscribe(()=>co(this,null,function*(){e.classList.add("mermaid");let t=`__mermaid_${Qa++}`,r=x("div",{class:"mermaid"}),o=e.textContent,{svg:n,fn:i}=yield mermaid.render(t,o),a=r.attachShadow({mode:"closed"});a.innerHTML=n,e.replaceWith(r),i==null||i(a)})),Gr.pipe(m(()=>({ref:e})))}var Dn=x("table");function Vn(e){return e.replaceWith(Dn),Dn.replaceWith(An(e)),I({ref:e})}function Ya(e){let t=e.find(r=>r.checked)||e[0];return O(...e.map(r=>h(r,"change").pipe(m(()=>R(`label[for="${r.id}"]`))))).pipe(Q(R(`label[for="${t.id}"]`)),m(r=>({active:r})))}function Nn(e,{viewport$:t,target$:r}){let o=R(".tabbed-labels",e),n=P(":scope > input",e),i=Kr("prev");e.append(i);let a=Kr("next");return e.append(a),C(()=>{let s=new g,p=s.pipe(Z(),ie(!0));z([s,ge(e),tt(e)]).pipe(W(p),Me(1,me)).subscribe({next([{active:c},l]){let f=Ve(c),{width:u}=ce(c);e.style.setProperty("--md-indicator-x",`${f.x}px`),e.style.setProperty("--md-indicator-width",`${u}px`);let d=pr(o);(f.xd.x+l.width)&&o.scrollTo({left:Math.max(0,f.x-16),behavior:"smooth"})},complete(){e.style.removeProperty("--md-indicator-x"),e.style.removeProperty("--md-indicator-width")}}),z([Ne(o),ge(o)]).pipe(W(p)).subscribe(([c,l])=>{let f=St(o);i.hidden=c.x<16,a.hidden=c.x>f.width-l.width-16}),O(h(i,"click").pipe(m(()=>-1)),h(a,"click").pipe(m(()=>1))).pipe(W(p)).subscribe(c=>{let{width:l}=ce(o);o.scrollBy({left:l*c,behavior:"smooth"})}),r.pipe(W(p),b(c=>n.includes(c))).subscribe(c=>c.click()),o.classList.add("tabbed-labels--linked");for(let c of 
n){let l=R(`label[for="${c.id}"]`);l.replaceChildren(x("a",{href:`#${l.htmlFor}`,tabIndex:-1},...Array.from(l.childNodes))),h(l.firstElementChild,"click").pipe(W(p),b(f=>!(f.metaKey||f.ctrlKey)),w(f=>{f.preventDefault(),f.stopPropagation()})).subscribe(()=>{history.replaceState({},"",`#${l.htmlFor}`),l.click()})}return B("content.tabs.link")&&s.pipe(Ce(1),re(t)).subscribe(([{active:c},{offset:l}])=>{let f=c.innerText.trim();if(c.hasAttribute("data-md-switching"))c.removeAttribute("data-md-switching");else{let u=e.offsetTop-l.y;for(let y of P("[data-tabs]"))for(let L of P(":scope > input",y)){let X=R(`label[for="${L.id}"]`);if(X!==c&&X.innerText.trim()===f){X.setAttribute("data-md-switching",""),L.click();break}}window.scrollTo({top:e.offsetTop-u});let d=__md_get("__tabs")||[];__md_set("__tabs",[...new Set([f,...d])])}}),s.pipe(W(p)).subscribe(()=>{for(let c of P("audio, video",e))c.pause()}),Ya(n).pipe(w(c=>s.next(c)),_(()=>s.complete()),m(c=>$({ref:e},c)))}).pipe(Ke(se))}function zn(e,{viewport$:t,target$:r,print$:o}){return O(...P(".annotate:not(.highlight)",e).map(n=>Pn(n,{target$:r,print$:o})),...P("pre:not(.mermaid) > code",e).map(n=>jn(n,{target$:r,print$:o})),...P("pre.mermaid",e).map(n=>Wn(n)),...P("table:not([class])",e).map(n=>Vn(n)),...P("details",e).map(n=>Fn(n,{target$:r,print$:o})),...P("[data-tabs]",e).map(n=>Nn(n,{viewport$:t,target$:r})),...P("[title]",e).filter(()=>B("content.tooltips")).map(n=>mt(n,{viewport$:t})))}function Ba(e,{alert$:t}){return t.pipe(v(r=>O(I(!0),I(!1).pipe(Ge(2e3))).pipe(m(o=>({message:r,active:o})))))}function qn(e,t){let r=R(".md-typeset",e);return C(()=>{let o=new g;return o.subscribe(({message:n,active:i})=>{e.classList.toggle("md-dialog--active",i),r.textContent=n}),Ba(e,t).pipe(w(n=>o.next(n)),_(()=>o.complete()),m(n=>$({ref:e},n)))})}var Ga=0;function Ja(e,t){document.body.append(e);let{width:r}=ce(e);e.style.setProperty("--md-tooltip-width",`${r}px`),e.remove();let o=cr(t),n=typeof o!="undefined"?Ne(o):I({x:0,y:0}),i=O(et(t),$t(t)).pipe(K());return z([i,n]).pipe(m(([a,s])=>{let{x:p,y:c}=Ve(t),l=ce(t),f=t.closest("table");return f&&t.parentElement&&(p+=f.offsetLeft+t.parentElement.offsetLeft,c+=f.offsetTop+t.parentElement.offsetTop),{active:a,offset:{x:p-s.x+l.width/2-r/2,y:c-s.y+l.height+8}}}))}function Qn(e){let t=e.title;if(!t.length)return S;let r=`__tooltip_${Ga++}`,o=Rt(r,"inline"),n=R(".md-typeset",o);return n.innerHTML=t,C(()=>{let i=new g;return i.subscribe({next({offset:a}){o.style.setProperty("--md-tooltip-x",`${a.x}px`),o.style.setProperty("--md-tooltip-y",`${a.y}px`)},complete(){o.style.removeProperty("--md-tooltip-x"),o.style.removeProperty("--md-tooltip-y")}}),O(i.pipe(b(({active:a})=>a)),i.pipe(_e(250),b(({active:a})=>!a))).subscribe({next({active:a}){a?(e.insertAdjacentElement("afterend",o),e.setAttribute("aria-describedby",r),e.removeAttribute("title")):(o.remove(),e.removeAttribute("aria-describedby"),e.setAttribute("title",t))},complete(){o.remove(),e.removeAttribute("aria-describedby"),e.setAttribute("title",t)}}),i.pipe(Me(16,me)).subscribe(({active:a})=>{o.classList.toggle("md-tooltip--active",a)}),i.pipe(pt(125,me),b(()=>!!e.offsetParent),m(()=>e.offsetParent.getBoundingClientRect()),m(({x:a})=>a)).subscribe({next(a){a?o.style.setProperty("--md-tooltip-0",`${-a}px`):o.style.removeProperty("--md-tooltip-0")},complete(){o.style.removeProperty("--md-tooltip-0")}}),Ja(o,e).pipe(w(a=>i.next(a)),_(()=>i.complete()),m(a=>$({ref:e},a)))}).pipe(Ke(se))}function Xa({viewport$:e}){if(!B("header.autohide"))return I(!1);let 
t=e.pipe(m(({offset:{y:n}})=>n),Be(2,1),m(([n,i])=>[nMath.abs(i-n.y)>100),m(([,[n]])=>n),K()),o=ze("search");return z([e,o]).pipe(m(([{offset:n},i])=>n.y>400&&!i),K(),v(n=>n?r:I(!1)),Q(!1))}function Kn(e,t){return C(()=>z([ge(e),Xa(t)])).pipe(m(([{height:r},o])=>({height:r,hidden:o})),K((r,o)=>r.height===o.height&&r.hidden===o.hidden),G(1))}function Yn(e,{header$:t,main$:r}){return C(()=>{let o=new g,n=o.pipe(Z(),ie(!0));o.pipe(te("active"),He(t)).subscribe(([{active:a},{hidden:s}])=>{e.classList.toggle("md-header--shadow",a&&!s),e.hidden=s});let i=ue(P("[title]",e)).pipe(b(()=>B("content.tooltips")),ne(a=>Qn(a)));return r.subscribe(o),t.pipe(W(n),m(a=>$({ref:e},a)),Re(i.pipe(W(n))))})}function Za(e,{viewport$:t,header$:r}){return mr(e,{viewport$:t,header$:r}).pipe(m(({offset:{y:o}})=>{let{height:n}=ce(e);return{active:o>=n}}),te("active"))}function Bn(e,t){return C(()=>{let r=new g;r.subscribe({next({active:n}){e.classList.toggle("md-header__title--active",n)},complete(){e.classList.remove("md-header__title--active")}});let o=fe(".md-content h1");return typeof o=="undefined"?S:Za(o,t).pipe(w(n=>r.next(n)),_(()=>r.complete()),m(n=>$({ref:e},n)))})}function Gn(e,{viewport$:t,header$:r}){let o=r.pipe(m(({height:i})=>i),K()),n=o.pipe(v(()=>ge(e).pipe(m(({height:i})=>({top:e.offsetTop,bottom:e.offsetTop+i})),te("bottom"))));return z([o,n,t]).pipe(m(([i,{top:a,bottom:s},{offset:{y:p},size:{height:c}}])=>(c=Math.max(0,c-Math.max(0,a-p,i)-Math.max(0,c+p-s)),{offset:a-i,height:c,active:a-i<=p})),K((i,a)=>i.offset===a.offset&&i.height===a.height&&i.active===a.active))}function es(e){let t=__md_get("__palette")||{index:e.findIndex(o=>matchMedia(o.getAttribute("data-md-color-media")).matches)},r=Math.max(0,Math.min(t.index,e.length-1));return I(...e).pipe(ne(o=>h(o,"change").pipe(m(()=>o))),Q(e[r]),m(o=>({index:e.indexOf(o),color:{media:o.getAttribute("data-md-color-media"),scheme:o.getAttribute("data-md-color-scheme"),primary:o.getAttribute("data-md-color-primary"),accent:o.getAttribute("data-md-color-accent")}})),G(1))}function Jn(e){let t=P("input",e),r=x("meta",{name:"theme-color"});document.head.appendChild(r);let o=x("meta",{name:"color-scheme"});document.head.appendChild(o);let n=Pt("(prefers-color-scheme: light)");return C(()=>{let i=new g;return i.subscribe(a=>{if(document.body.setAttribute("data-md-color-switching",""),a.color.media==="(prefers-color-scheme)"){let s=matchMedia("(prefers-color-scheme: light)"),p=document.querySelector(s.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");a.color.scheme=p.getAttribute("data-md-color-scheme"),a.color.primary=p.getAttribute("data-md-color-primary"),a.color.accent=p.getAttribute("data-md-color-accent")}for(let[s,p]of Object.entries(a.color))document.body.setAttribute(`data-md-color-${s}`,p);for(let s=0;sa.key==="Enter"),re(i,(a,s)=>s)).subscribe(({index:a})=>{a=(a+1)%t.length,t[a].click(),t[a].focus()}),i.pipe(m(()=>{let a=Se("header"),s=window.getComputedStyle(a);return o.content=s.colorScheme,s.backgroundColor.match(/\d+/g).map(p=>(+p).toString(16).padStart(2,"0")).join("")})).subscribe(a=>r.content=`#${a}`),i.pipe(ve(se)).subscribe(()=>{document.body.removeAttribute("data-md-color-switching")}),es(t).pipe(W(n.pipe(Ce(1))),ct(),w(a=>i.next(a)),_(()=>i.complete()),m(a=>$({ref:e},a)))})}function Xn(e,{progress$:t}){return C(()=>{let r=new g;return 
r.subscribe(({value:o})=>{e.style.setProperty("--md-progress-value",`${o}`)}),t.pipe(w(o=>r.next({value:o})),_(()=>r.complete()),m(o=>({ref:e,value:o})))})}var Jr=Mt(Br());function ts(e){e.setAttribute("data-md-copying","");let t=e.closest("[data-copy]"),r=t?t.getAttribute("data-copy"):e.innerText;return e.removeAttribute("data-md-copying"),r.trimEnd()}function Zn({alert$:e}){Jr.default.isSupported()&&new j(t=>{new Jr.default("[data-clipboard-target], [data-clipboard-text]",{text:r=>r.getAttribute("data-clipboard-text")||ts(R(r.getAttribute("data-clipboard-target")))}).on("success",r=>t.next(r))}).pipe(w(t=>{t.trigger.focus()}),m(()=>Ee("clipboard.copied"))).subscribe(e)}function ei(e,t){return e.protocol=t.protocol,e.hostname=t.hostname,e}function rs(e,t){let r=new Map;for(let o of P("url",e)){let n=R("loc",o),i=[ei(new URL(n.textContent),t)];r.set(`${i[0]}`,i);for(let a of P("[rel=alternate]",o)){let s=a.getAttribute("href");s!=null&&i.push(ei(new URL(s),t))}}return r}function ur(e){return un(new URL("sitemap.xml",e)).pipe(m(t=>rs(t,new URL(e))),de(()=>I(new Map)))}function os(e,t){if(!(e.target instanceof Element))return S;let r=e.target.closest("a");if(r===null)return S;if(r.target||e.metaKey||e.ctrlKey)return S;let o=new URL(r.href);return o.search=o.hash="",t.has(`${o}`)?(e.preventDefault(),I(new URL(r.href))):S}function ti(e){let t=new Map;for(let r of P(":scope > *",e.head))t.set(r.outerHTML,r);return t}function ri(e){for(let t of P("[href], [src]",e))for(let r of["href","src"]){let o=t.getAttribute(r);if(o&&!/^(?:[a-z]+:)?\/\//i.test(o)){t[r]=t[r];break}}return I(e)}function ns(e){for(let o of["[data-md-component=announce]","[data-md-component=container]","[data-md-component=header-topic]","[data-md-component=outdated]","[data-md-component=logo]","[data-md-component=skip]",...B("navigation.tabs.sticky")?["[data-md-component=tabs]"]:[]]){let n=fe(o),i=fe(o,e);typeof n!="undefined"&&typeof i!="undefined"&&n.replaceWith(i)}let t=ti(document);for(let[o,n]of ti(e))t.has(o)?t.delete(o):document.head.appendChild(n);for(let o of t.values()){let n=o.getAttribute("name");n!=="theme-color"&&n!=="color-scheme"&&o.remove()}let r=Se("container");return We(P("script",r)).pipe(v(o=>{let n=e.createElement("script");if(o.src){for(let i of o.getAttributeNames())n.setAttribute(i,o.getAttribute(i));return o.replaceWith(n),new j(i=>{n.onload=()=>i.complete()})}else return n.textContent=o.textContent,o.replaceWith(n),S}),Z(),ie(document))}function oi({location$:e,viewport$:t,progress$:r}){let o=xe();if(location.protocol==="file:")return S;let n=ur(o.base);I(document).subscribe(ri);let i=h(document.body,"click").pipe(He(n),v(([p,c])=>os(p,c)),pe()),a=h(window,"popstate").pipe(m(ye),pe());i.pipe(re(t)).subscribe(([p,{offset:c}])=>{history.replaceState(c,""),history.pushState(null,"",p)}),O(i,a).subscribe(e);let s=e.pipe(te("pathname"),v(p=>fn(p,{progress$:r}).pipe(de(()=>(lt(p,!0),S)))),v(ri),v(ns),pe());return O(s.pipe(re(e,(p,c)=>c)),s.pipe(v(()=>e),te("hash")),e.pipe(K((p,c)=>p.pathname===c.pathname&&p.hash===c.hash),v(()=>i),w(()=>history.back()))).subscribe(p=>{var c,l;history.state!==null||!p.hash?window.scrollTo(0,(l=(c=history.state)==null?void 0:c.y)!=null?l:0):(history.scrollRestoration="auto",pn(p.hash),history.scrollRestoration="manual")}),e.subscribe(()=>{history.scrollRestoration="manual"}),h(window,"beforeunload").subscribe(()=>{history.scrollRestoration="auto"}),t.pipe(te("offset"),_e(100)).subscribe(({offset:p})=>{history.replaceState(p,"")}),s}var ni=Mt(qr());function ii(e){let 
t=e.separator.split("|").map(n=>n.replace(/(\(\?[!=<][^)]+\))/g,"").length===0?"\uFFFD":n).join("|"),r=new RegExp(t,"img"),o=(n,i,a)=>`${i}${a}`;return n=>{n=n.replace(/[\s*+\-:~^]+/g," ").trim();let i=new RegExp(`(^|${e.separator}|)(${n.replace(/[|\\{}()[\]^$+*?.-]/g,"\\$&").replace(r,"|")})`,"img");return a=>(0,ni.default)(a).replace(i,o).replace(/<\/mark>(\s+)]*>/img,"$1")}}function jt(e){return e.type===1}function dr(e){return e.type===3}function ai(e,t){let r=yn(e);return O(I(location.protocol!=="file:"),ze("search")).pipe(Ae(o=>o),v(()=>t)).subscribe(({config:o,docs:n})=>r.next({type:0,data:{config:o,docs:n,options:{suggest:B("search.suggest")}}})),r}function si(e){var l;let{selectedVersionSitemap:t,selectedVersionBaseURL:r,currentLocation:o,currentBaseURL:n}=e,i=(l=Xr(n))==null?void 0:l.pathname;if(i===void 0)return;let a=ss(o.pathname,i);if(a===void 0)return;let s=ps(t.keys());if(!t.has(s))return;let p=Xr(a,s);if(!p||!t.has(p.href))return;let c=Xr(a,r);if(c)return c.hash=o.hash,c.search=o.search,c}function Xr(e,t){try{return new URL(e,t)}catch(r){return}}function ss(e,t){if(e.startsWith(t))return e.slice(t.length)}function cs(e,t){let r=Math.min(e.length,t.length),o;for(o=0;oS)),o=r.pipe(m(n=>{let[,i]=t.base.match(/([^/]+)\/?$/);return n.find(({version:a,aliases:s})=>a===i||s.includes(i))||n[0]}));r.pipe(m(n=>new Map(n.map(i=>[`${new URL(`../${i.version}/`,t.base)}`,i]))),v(n=>h(document.body,"click").pipe(b(i=>!i.metaKey&&!i.ctrlKey),re(o),v(([i,a])=>{if(i.target instanceof Element){let s=i.target.closest("a");if(s&&!s.target&&n.has(s.href)){let p=s.href;return!i.target.closest(".md-version")&&n.get(p)===a?S:(i.preventDefault(),I(new URL(p)))}}return S}),v(i=>ur(i).pipe(m(a=>{var s;return(s=si({selectedVersionSitemap:a,selectedVersionBaseURL:i,currentLocation:ye(),currentBaseURL:t.base}))!=null?s:i})))))).subscribe(n=>lt(n,!0)),z([r,o]).subscribe(([n,i])=>{R(".md-header__topic").appendChild(Cn(n,i))}),e.pipe(v(()=>o)).subscribe(n=>{var a;let i=__md_get("__outdated",sessionStorage);if(i===null){i=!0;let s=((a=t.version)==null?void 0:a.default)||"latest";Array.isArray(s)||(s=[s]);e:for(let p of s)for(let c of n.aliases.concat(n.version))if(new RegExp(p,"i").test(c)){i=!1;break e}__md_set("__outdated",i,sessionStorage)}if(i)for(let s of ae("outdated"))s.hidden=!1})}function ls(e,{worker$:t}){let{searchParams:r}=ye();r.has("q")&&(Je("search",!0),e.value=r.get("q"),e.focus(),ze("search").pipe(Ae(i=>!i)).subscribe(()=>{let i=ye();i.searchParams.delete("q"),history.replaceState({},"",`${i}`)}));let o=et(e),n=O(t.pipe(Ae(jt)),h(e,"keyup"),o).pipe(m(()=>e.value),K());return z([n,o]).pipe(m(([i,a])=>({value:i,focus:a})),G(1))}function pi(e,{worker$:t}){let r=new g,o=r.pipe(Z(),ie(!0));z([t.pipe(Ae(jt)),r],(i,a)=>a).pipe(te("value")).subscribe(({value:i})=>t.next({type:2,data:i})),r.pipe(te("focus")).subscribe(({focus:i})=>{i&&Je("search",i)}),h(e.form,"reset").pipe(W(o)).subscribe(()=>e.focus());let n=R("header [for=__search]");return h(n,"click").subscribe(()=>e.focus()),ls(e,{worker$:t}).pipe(w(i=>r.next(i)),_(()=>r.complete()),m(i=>$({ref:e},i)),G(1))}function li(e,{worker$:t,query$:r}){let o=new g,n=on(e.parentElement).pipe(b(Boolean)),i=e.parentElement,a=R(":scope > :first-child",e),s=R(":scope > :last-child",e);ze("search").subscribe(l=>s.setAttribute("role",l?"list":"presentation")),o.pipe(re(r),Wr(t.pipe(Ae(jt)))).subscribe(([{items:l},{value:f}])=>{switch(l.length){case 0:a.textContent=f.length?Ee("search.result.none"):Ee("search.result.placeholder");break;case 
1:a.textContent=Ee("search.result.one");break;default:let u=sr(l.length);a.textContent=Ee("search.result.other",u)}});let p=o.pipe(w(()=>s.innerHTML=""),v(({items:l})=>O(I(...l.slice(0,10)),I(...l.slice(10)).pipe(Be(4),Vr(n),v(([f])=>f)))),m(Mn),pe());return p.subscribe(l=>s.appendChild(l)),p.pipe(ne(l=>{let f=fe("details",l);return typeof f=="undefined"?S:h(f,"toggle").pipe(W(o),m(()=>f))})).subscribe(l=>{l.open===!1&&l.offsetTop<=i.scrollTop&&i.scrollTo({top:l.offsetTop})}),t.pipe(b(dr),m(({data:l})=>l)).pipe(w(l=>o.next(l)),_(()=>o.complete()),m(l=>$({ref:e},l)))}function ms(e,{query$:t}){return t.pipe(m(({value:r})=>{let o=ye();return o.hash="",r=r.replace(/\s+/g,"+").replace(/&/g,"%26").replace(/=/g,"%3D"),o.search=`q=${r}`,{url:o}}))}function mi(e,t){let r=new g,o=r.pipe(Z(),ie(!0));return r.subscribe(({url:n})=>{e.setAttribute("data-clipboard-text",e.href),e.href=`${n}`}),h(e,"click").pipe(W(o)).subscribe(n=>n.preventDefault()),ms(e,t).pipe(w(n=>r.next(n)),_(()=>r.complete()),m(n=>$({ref:e},n)))}function fi(e,{worker$:t,keyboard$:r}){let o=new g,n=Se("search-query"),i=O(h(n,"keydown"),h(n,"focus")).pipe(ve(se),m(()=>n.value),K());return o.pipe(He(i),m(([{suggest:s},p])=>{let c=p.split(/([\s-]+)/);if(s!=null&&s.length&&c[c.length-1]){let l=s[s.length-1];l.startsWith(c[c.length-1])&&(c[c.length-1]=l)}else c.length=0;return c})).subscribe(s=>e.innerHTML=s.join("").replace(/\s/g," ")),r.pipe(b(({mode:s})=>s==="search")).subscribe(s=>{switch(s.type){case"ArrowRight":e.innerText.length&&n.selectionStart===n.value.length&&(n.value=e.innerText);break}}),t.pipe(b(dr),m(({data:s})=>s)).pipe(w(s=>o.next(s)),_(()=>o.complete()),m(()=>({ref:e})))}function ui(e,{index$:t,keyboard$:r}){let o=xe();try{let n=ai(o.search,t),i=Se("search-query",e),a=Se("search-result",e);h(e,"click").pipe(b(({target:p})=>p instanceof Element&&!!p.closest("a"))).subscribe(()=>Je("search",!1)),r.pipe(b(({mode:p})=>p==="search")).subscribe(p=>{let c=Ie();switch(p.type){case"Enter":if(c===i){let l=new Map;for(let f of P(":first-child [href]",a)){let u=f.firstElementChild;l.set(f,parseFloat(u.getAttribute("data-md-score")))}if(l.size){let[[f]]=[...l].sort(([,u],[,d])=>d-u);f.click()}p.claim()}break;case"Escape":case"Tab":Je("search",!1),i.blur();break;case"ArrowUp":case"ArrowDown":if(typeof c=="undefined")i.focus();else{let l=[i,...P(":not(details) > [href], summary, details[open] [href]",a)],f=Math.max(0,(Math.max(0,l.indexOf(c))+l.length+(p.type==="ArrowUp"?-1:1))%l.length);l[f].focus()}p.claim();break;default:i!==Ie()&&i.focus()}}),r.pipe(b(({mode:p})=>p==="global")).subscribe(p=>{switch(p.type){case"f":case"s":case"/":i.focus(),i.select(),p.claim();break}});let s=pi(i,{worker$:n});return O(s,li(a,{worker$:n,query$:s})).pipe(Re(...ae("search-share",e).map(p=>mi(p,{query$:s})),...ae("search-suggest",e).map(p=>fi(p,{worker$:n,keyboard$:r}))))}catch(n){return e.hidden=!0,Ye}}function di(e,{index$:t,location$:r}){return z([t,r.pipe(Q(ye()),b(o=>!!o.searchParams.get("h")))]).pipe(m(([o,n])=>ii(o.config)(n.searchParams.get("h"))),m(o=>{var a;let n=new Map,i=document.createNodeIterator(e,NodeFilter.SHOW_TEXT);for(let s=i.nextNode();s;s=i.nextNode())if((a=s.parentElement)!=null&&a.offsetHeight){let p=s.textContent,c=o(p);c.length>p.length&&n.set(s,c)}for(let[s,p]of n){let{childNodes:c}=x("span",null,p);s.replaceWith(...Array.from(c))}return{ref:e,nodes:n}}))}function fs(e,{viewport$:t,main$:r}){let o=e.closest(".md-grid"),n=o.offsetTop-o.parentElement.offsetTop;return 
z([r,t]).pipe(m(([{offset:i,height:a},{offset:{y:s}}])=>(a=a+Math.min(n,Math.max(0,s-i))-n,{height:a,locked:s>=i+n})),K((i,a)=>i.height===a.height&&i.locked===a.locked))}function Zr(e,o){var n=o,{header$:t}=n,r=so(n,["header$"]);let i=R(".md-sidebar__scrollwrap",e),{y:a}=Ve(i);return C(()=>{let s=new g,p=s.pipe(Z(),ie(!0)),c=s.pipe(Me(0,me));return c.pipe(re(t)).subscribe({next([{height:l},{height:f}]){i.style.height=`${l-2*a}px`,e.style.top=`${f}px`},complete(){i.style.height="",e.style.top=""}}),c.pipe(Ae()).subscribe(()=>{for(let l of P(".md-nav__link--active[href]",e)){if(!l.clientHeight)continue;let f=l.closest(".md-sidebar__scrollwrap");if(typeof f!="undefined"){let u=l.offsetTop-f.offsetTop,{height:d}=ce(f);f.scrollTo({top:u-d/2})}}}),ue(P("label[tabindex]",e)).pipe(ne(l=>h(l,"click").pipe(ve(se),m(()=>l),W(p)))).subscribe(l=>{let f=R(`[id="${l.htmlFor}"]`);R(`[aria-labelledby="${l.id}"]`).setAttribute("aria-expanded",`${f.checked}`)}),fs(e,r).pipe(w(l=>s.next(l)),_(()=>s.complete()),m(l=>$({ref:e},l)))})}function hi(e,t){if(typeof t!="undefined"){let r=`https://api.github.com/repos/${e}/${t}`;return st(je(`${r}/releases/latest`).pipe(de(()=>S),m(o=>({version:o.tag_name})),De({})),je(r).pipe(de(()=>S),m(o=>({stars:o.stargazers_count,forks:o.forks_count})),De({}))).pipe(m(([o,n])=>$($({},o),n)))}else{let r=`https://api.github.com/users/${e}`;return je(r).pipe(m(o=>({repositories:o.public_repos})),De({}))}}function bi(e,t){let r=`https://${e}/api/v4/projects/${encodeURIComponent(t)}`;return st(je(`${r}/releases/permalink/latest`).pipe(de(()=>S),m(({tag_name:o})=>({version:o})),De({})),je(r).pipe(de(()=>S),m(({star_count:o,forks_count:n})=>({stars:o,forks:n})),De({}))).pipe(m(([o,n])=>$($({},o),n)))}function vi(e){let t=e.match(/^.+github\.com\/([^/]+)\/?([^/]+)?/i);if(t){let[,r,o]=t;return hi(r,o)}if(t=e.match(/^.+?([^/]*gitlab[^/]+)\/(.+?)\/?$/i),t){let[,r,o]=t;return bi(r,o)}return S}var us;function ds(e){return us||(us=C(()=>{let t=__md_get("__source",sessionStorage);if(t)return I(t);if(ae("consent").length){let o=__md_get("__consent");if(!(o&&o.github))return S}return vi(e.href).pipe(w(o=>__md_set("__source",o,sessionStorage)))}).pipe(de(()=>S),b(t=>Object.keys(t).length>0),m(t=>({facts:t})),G(1)))}function gi(e){let t=R(":scope > :last-child",e);return C(()=>{let r=new g;return r.subscribe(({facts:o})=>{t.appendChild(_n(o)),t.classList.add("md-source__repository--active")}),ds(e).pipe(w(o=>r.next(o)),_(()=>r.complete()),m(o=>$({ref:e},o)))})}function hs(e,{viewport$:t,header$:r}){return ge(document.body).pipe(v(()=>mr(e,{header$:r,viewport$:t})),m(({offset:{y:o}})=>({hidden:o>=10})),te("hidden"))}function yi(e,t){return C(()=>{let r=new g;return r.subscribe({next({hidden:o}){e.hidden=o},complete(){e.hidden=!1}}),(B("navigation.tabs.sticky")?I({hidden:!1}):hs(e,t)).pipe(w(o=>r.next(o)),_(()=>r.complete()),m(o=>$({ref:e},o)))})}function bs(e,{viewport$:t,header$:r}){let o=new Map,n=P(".md-nav__link",e);for(let s of n){let p=decodeURIComponent(s.hash.substring(1)),c=fe(`[id="${p}"]`);typeof c!="undefined"&&o.set(s,c)}let i=r.pipe(te("height"),m(({height:s})=>{let p=Se("main"),c=R(":scope > :first-child",p);return s+.8*(c.offsetTop-p.offsetTop)}),pe());return ge(document.body).pipe(te("height"),v(s=>C(()=>{let p=[];return I([...o].reduce((c,[l,f])=>{for(;p.length&&o.get(p[p.length-1]).tagName>=f.tagName;)p.pop();let u=f.offsetTop;for(;!u&&f.parentElement;)f=f.parentElement,u=f.offsetTop;let d=f.offsetParent;for(;d;d=d.offsetParent)u+=d.offsetTop;return 
c.set([...p=[...p,l]].reverse(),u)},new Map))}).pipe(m(p=>new Map([...p].sort(([,c],[,l])=>c-l))),He(i),v(([p,c])=>t.pipe(Fr(([l,f],{offset:{y:u},size:d})=>{let y=u+d.height>=Math.floor(s.height);for(;f.length;){let[,L]=f[0];if(L-c=u&&!y)f=[l.pop(),...f];else break}return[l,f]},[[],[...p]]),K((l,f)=>l[0]===f[0]&&l[1]===f[1])))))).pipe(m(([s,p])=>({prev:s.map(([c])=>c),next:p.map(([c])=>c)})),Q({prev:[],next:[]}),Be(2,1),m(([s,p])=>s.prev.length{let i=new g,a=i.pipe(Z(),ie(!0));if(i.subscribe(({prev:s,next:p})=>{for(let[c]of p)c.classList.remove("md-nav__link--passed"),c.classList.remove("md-nav__link--active");for(let[c,[l]]of s.entries())l.classList.add("md-nav__link--passed"),l.classList.toggle("md-nav__link--active",c===s.length-1)}),B("toc.follow")){let s=O(t.pipe(_e(1),m(()=>{})),t.pipe(_e(250),m(()=>"smooth")));i.pipe(b(({prev:p})=>p.length>0),He(o.pipe(ve(se))),re(s)).subscribe(([[{prev:p}],c])=>{let[l]=p[p.length-1];if(l.offsetHeight){let f=cr(l);if(typeof f!="undefined"){let u=l.offsetTop-f.offsetTop,{height:d}=ce(f);f.scrollTo({top:u-d/2,behavior:c})}}})}return B("navigation.tracking")&&t.pipe(W(a),te("offset"),_e(250),Ce(1),W(n.pipe(Ce(1))),ct({delay:250}),re(i)).subscribe(([,{prev:s}])=>{let p=ye(),c=s[s.length-1];if(c&&c.length){let[l]=c,{hash:f}=new URL(l.href);p.hash!==f&&(p.hash=f,history.replaceState({},"",`${p}`))}else p.hash="",history.replaceState({},"",`${p}`)}),bs(e,{viewport$:t,header$:r}).pipe(w(s=>i.next(s)),_(()=>i.complete()),m(s=>$({ref:e},s)))})}function vs(e,{viewport$:t,main$:r,target$:o}){let n=t.pipe(m(({offset:{y:a}})=>a),Be(2,1),m(([a,s])=>a>s&&s>0),K()),i=r.pipe(m(({active:a})=>a));return z([i,n]).pipe(m(([a,s])=>!(a&&s)),K(),W(o.pipe(Ce(1))),ie(!0),ct({delay:250}),m(a=>({hidden:a})))}function Ei(e,{viewport$:t,header$:r,main$:o,target$:n}){let i=new g,a=i.pipe(Z(),ie(!0));return i.subscribe({next({hidden:s}){e.hidden=s,s?(e.setAttribute("tabindex","-1"),e.blur()):e.removeAttribute("tabindex")},complete(){e.style.top="",e.hidden=!0,e.removeAttribute("tabindex")}}),r.pipe(W(a),te("height")).subscribe(({height:s})=>{e.style.top=`${s+16}px`}),h(e,"click").subscribe(s=>{s.preventDefault(),window.scrollTo({top:0})}),vs(e,{viewport$:t,main$:o,target$:n}).pipe(w(s=>i.next(s)),_(()=>i.complete()),m(s=>$({ref:e},s)))}function wi({document$:e,viewport$:t}){e.pipe(v(()=>P(".md-ellipsis")),ne(r=>tt(r).pipe(W(e.pipe(Ce(1))),b(o=>o),m(()=>r),Te(1))),b(r=>r.offsetWidth{let o=r.innerText,n=r.closest("a")||r;return n.title=o,B("content.tooltips")?mt(n,{viewport$:t}).pipe(W(e.pipe(Ce(1))),_(()=>n.removeAttribute("title"))):S})).subscribe(),B("content.tooltips")&&e.pipe(v(()=>P(".md-status")),ne(r=>mt(r,{viewport$:t}))).subscribe()}function Ti({document$:e,tablet$:t}){e.pipe(v(()=>P(".md-toggle--indeterminate")),w(r=>{r.indeterminate=!0,r.checked=!1}),ne(r=>h(r,"change").pipe(Dr(()=>r.classList.contains("md-toggle--indeterminate")),m(()=>r))),re(t)).subscribe(([r,o])=>{r.classList.remove("md-toggle--indeterminate"),o&&(r.checked=!1)})}function gs(){return/(iPad|iPhone|iPod)/.test(navigator.userAgent)}function Si({document$:e}){e.pipe(v(()=>P("[data-md-scrollfix]")),w(t=>t.removeAttribute("data-md-scrollfix")),b(gs),ne(t=>h(t,"touchstart").pipe(m(()=>t)))).subscribe(t=>{let r=t.scrollTop;r===0?t.scrollTop=1:r+t.offsetHeight===t.scrollHeight&&(t.scrollTop=r-1)})}function 
Oi({viewport$:e,tablet$:t}){z([ze("search"),t]).pipe(m(([r,o])=>r&&!o),v(r=>I(r).pipe(Ge(r?400:100))),re(e)).subscribe(([r,{offset:{y:o}}])=>{if(r)document.body.setAttribute("data-md-scrolllock",""),document.body.style.top=`-${o}px`;else{let n=-1*parseInt(document.body.style.top,10);document.body.removeAttribute("data-md-scrolllock"),document.body.style.top="",n&&window.scrollTo(0,n)}})}Object.entries||(Object.entries=function(e){let t=[];for(let r of Object.keys(e))t.push([r,e[r]]);return t});Object.values||(Object.values=function(e){let t=[];for(let r of Object.keys(e))t.push(e[r]);return t});typeof Element!="undefined"&&(Element.prototype.scrollTo||(Element.prototype.scrollTo=function(e,t){typeof e=="object"?(this.scrollLeft=e.left,this.scrollTop=e.top):(this.scrollLeft=e,this.scrollTop=t)}),Element.prototype.replaceWith||(Element.prototype.replaceWith=function(...e){let t=this.parentNode;if(t){e.length===0&&t.removeChild(this);for(let r=e.length-1;r>=0;r--){let o=e[r];typeof o=="string"?o=document.createTextNode(o):o.parentNode&&o.parentNode.removeChild(o),r?t.insertBefore(this.previousSibling,o):t.replaceChild(o,this)}}}));function ys(){return location.protocol==="file:"?Tt(`${new URL("search/search_index.js",eo.base)}`).pipe(m(()=>__index),G(1)):je(new URL("search/search_index.json",eo.base))}document.documentElement.classList.remove("no-js");document.documentElement.classList.add("js");var ot=Go(),Ut=sn(),Lt=ln(Ut),to=an(),Oe=gn(),hr=Pt("(min-width: 960px)"),Mi=Pt("(min-width: 1220px)"),_i=mn(),eo=xe(),Ai=document.forms.namedItem("search")?ys():Ye,ro=new g;Zn({alert$:ro});var oo=new g;B("navigation.instant")&&oi({location$:Ut,viewport$:Oe,progress$:oo}).subscribe(ot);var Li;((Li=eo.version)==null?void 0:Li.provider)==="mike"&&ci({document$:ot});O(Ut,Lt).pipe(Ge(125)).subscribe(()=>{Je("drawer",!1),Je("search",!1)});to.pipe(b(({mode:e})=>e==="global")).subscribe(e=>{switch(e.type){case"p":case",":let t=fe("link[rel=prev]");typeof t!="undefined"&<(t);break;case"n":case".":let r=fe("link[rel=next]");typeof r!="undefined"&<(r);break;case"Enter":let o=Ie();o instanceof HTMLLabelElement&&o.click()}});wi({viewport$:Oe,document$:ot});Ti({document$:ot,tablet$:hr});Si({document$:ot});Oi({viewport$:Oe,tablet$:hr});var 
rt=Kn(Se("header"),{viewport$:Oe}),Ft=ot.pipe(m(()=>Se("main")),v(e=>Gn(e,{viewport$:Oe,header$:rt})),G(1)),xs=O(...ae("consent").map(e=>En(e,{target$:Lt})),...ae("dialog").map(e=>qn(e,{alert$:ro})),...ae("palette").map(e=>Jn(e)),...ae("progress").map(e=>Xn(e,{progress$:oo})),...ae("search").map(e=>ui(e,{index$:Ai,keyboard$:to})),...ae("source").map(e=>gi(e))),Es=C(()=>O(...ae("announce").map(e=>xn(e)),...ae("content").map(e=>zn(e,{viewport$:Oe,target$:Lt,print$:_i})),...ae("content").map(e=>B("search.highlight")?di(e,{index$:Ai,location$:Ut}):S),...ae("header").map(e=>Yn(e,{viewport$:Oe,header$:rt,main$:Ft})),...ae("header-title").map(e=>Bn(e,{viewport$:Oe,header$:rt})),...ae("sidebar").map(e=>e.getAttribute("data-md-type")==="navigation"?Nr(Mi,()=>Zr(e,{viewport$:Oe,header$:rt,main$:Ft})):Nr(hr,()=>Zr(e,{viewport$:Oe,header$:rt,main$:Ft}))),...ae("tabs").map(e=>yi(e,{viewport$:Oe,header$:rt})),...ae("toc").map(e=>xi(e,{viewport$:Oe,header$:rt,main$:Ft,target$:Lt})),...ae("top").map(e=>Ei(e,{viewport$:Oe,header$:rt,main$:Ft,target$:Lt})))),Ci=ot.pipe(v(()=>Es),Re(xs),G(1));Ci.subscribe();window.document$=ot;window.location$=Ut;window.target$=Lt;window.keyboard$=to;window.viewport$=Oe;window.tablet$=hr;window.screen$=Mi;window.print$=_i;window.alert$=ro;window.progress$=oo;window.component$=Ci;})(); -//# sourceMappingURL=bundle.5090c770.min.js.map + `):"",this.name="UnsubscriptionError",this.errors=r}});function Qe(e,t){if(e){var r=e.indexOf(t);0<=r&&e.splice(r,1)}}var Ue=function(){function e(t){this.initialTeardown=t,this.closed=!1,this._parentage=null,this._finalizers=null}return e.prototype.unsubscribe=function(){var t,r,o,n,i;if(!this.closed){this.closed=!0;var a=this._parentage;if(a)if(this._parentage=null,Array.isArray(a))try{for(var s=he(a),p=s.next();!p.done;p=s.next()){var c=p.value;c.remove(this)}}catch(L){t={error:L}}finally{try{p&&!p.done&&(r=s.return)&&r.call(s)}finally{if(t)throw t.error}}else a.remove(this);var l=this.initialTeardown;if(H(l))try{l()}catch(L){i=L instanceof zt?L.errors:[L]}var f=this._finalizers;if(f){this._finalizers=null;try{for(var u=he(f),d=u.next();!d.done;d=u.next()){var y=d.value;try{ho(y)}catch(L){i=i!=null?i:[],L instanceof zt?i=q(q([],N(i)),N(L.errors)):i.push(L)}}}catch(L){o={error:L}}finally{try{d&&!d.done&&(n=u.return)&&n.call(u)}finally{if(o)throw o.error}}}if(i)throw new zt(i)}},e.prototype.add=function(t){var r;if(t&&t!==this)if(this.closed)ho(t);else{if(t instanceof e){if(t.closed||t._hasParent(this))return;t._addParent(this)}(this._finalizers=(r=this._finalizers)!==null&&r!==void 0?r:[]).push(t)}},e.prototype._hasParent=function(t){var r=this._parentage;return r===t||Array.isArray(r)&&r.includes(t)},e.prototype._addParent=function(t){var r=this._parentage;this._parentage=Array.isArray(r)?(r.push(t),r):r?[r,t]:t},e.prototype._removeParent=function(t){var r=this._parentage;r===t?this._parentage=null:Array.isArray(r)&&Qe(r,t)},e.prototype.remove=function(t){var r=this._finalizers;r&&Qe(r,t),t instanceof e&&t._removeParent(this)},e.EMPTY=function(){var t=new e;return t.closed=!0,t}(),e}();var Tr=Ue.EMPTY;function qt(e){return e instanceof Ue||e&&"closed"in e&&H(e.remove)&&H(e.add)&&H(e.unsubscribe)}function ho(e){H(e)?e():e.unsubscribe()}var Pe={onUnhandledError:null,onStoppedNotification:null,Promise:void 0,useDeprecatedSynchronousErrorHandling:!1,useDeprecatedNextContext:!1};var dt={setTimeout:function(e,t){for(var r=[],o=2;o0},enumerable:!1,configurable:!0}),t.prototype._trySubscribe=function(r){return 
this._throwIfClosed(),e.prototype._trySubscribe.call(this,r)},t.prototype._subscribe=function(r){return this._throwIfClosed(),this._checkFinalizedStatuses(r),this._innerSubscribe(r)},t.prototype._innerSubscribe=function(r){var o=this,n=this,i=n.hasError,a=n.isStopped,s=n.observers;return i||a?Tr:(this.currentObservers=null,s.push(r),new Ue(function(){o.currentObservers=null,Qe(s,r)}))},t.prototype._checkFinalizedStatuses=function(r){var o=this,n=o.hasError,i=o.thrownError,a=o.isStopped;n?r.error(i):a&&r.complete()},t.prototype.asObservable=function(){var r=new j;return r.source=this,r},t.create=function(r,o){return new To(r,o)},t}(j);var To=function(e){oe(t,e);function t(r,o){var n=e.call(this)||this;return n.destination=r,n.source=o,n}return t.prototype.next=function(r){var o,n;(n=(o=this.destination)===null||o===void 0?void 0:o.next)===null||n===void 0||n.call(o,r)},t.prototype.error=function(r){var o,n;(n=(o=this.destination)===null||o===void 0?void 0:o.error)===null||n===void 0||n.call(o,r)},t.prototype.complete=function(){var r,o;(o=(r=this.destination)===null||r===void 0?void 0:r.complete)===null||o===void 0||o.call(r)},t.prototype._subscribe=function(r){var o,n;return(n=(o=this.source)===null||o===void 0?void 0:o.subscribe(r))!==null&&n!==void 0?n:Tr},t}(g);var _r=function(e){oe(t,e);function t(r){var o=e.call(this)||this;return o._value=r,o}return Object.defineProperty(t.prototype,"value",{get:function(){return this.getValue()},enumerable:!1,configurable:!0}),t.prototype._subscribe=function(r){var o=e.prototype._subscribe.call(this,r);return!o.closed&&r.next(this._value),o},t.prototype.getValue=function(){var r=this,o=r.hasError,n=r.thrownError,i=r._value;if(o)throw n;return this._throwIfClosed(),i},t.prototype.next=function(r){e.prototype.next.call(this,this._value=r)},t}(g);var At={now:function(){return(At.delegate||Date).now()},delegate:void 0};var Ct=function(e){oe(t,e);function t(r,o,n){r===void 0&&(r=1/0),o===void 0&&(o=1/0),n===void 0&&(n=At);var i=e.call(this)||this;return i._bufferSize=r,i._windowTime=o,i._timestampProvider=n,i._buffer=[],i._infiniteTimeWindow=!0,i._infiniteTimeWindow=o===1/0,i._bufferSize=Math.max(1,r),i._windowTime=Math.max(1,o),i}return t.prototype.next=function(r){var o=this,n=o.isStopped,i=o._buffer,a=o._infiniteTimeWindow,s=o._timestampProvider,p=o._windowTime;n||(i.push(r),!a&&i.push(s.now()+p)),this._trimBuffer(),e.prototype.next.call(this,r)},t.prototype._subscribe=function(r){this._throwIfClosed(),this._trimBuffer();for(var o=this._innerSubscribe(r),n=this,i=n._infiniteTimeWindow,a=n._buffer,s=a.slice(),p=0;p0?e.prototype.schedule.call(this,r,o):(this.delay=o,this.state=r,this.scheduler.flush(this),this)},t.prototype.execute=function(r,o){return o>0||this.closed?e.prototype.execute.call(this,r,o):this._execute(r,o)},t.prototype.requestAsyncId=function(r,o,n){return n===void 0&&(n=0),n!=null&&n>0||n==null&&this.delay>0?e.prototype.requestAsyncId.call(this,r,o,n):(r.flush(this),0)},t}(gt);var Lo=function(e){oe(t,e);function t(){return e!==null&&e.apply(this,arguments)||this}return t}(yt);var kr=new Lo(Oo);var Mo=function(e){oe(t,e);function t(r,o){var n=e.call(this,r,o)||this;return n.scheduler=r,n.work=o,n}return t.prototype.requestAsyncId=function(r,o,n){return n===void 0&&(n=0),n!==null&&n>0?e.prototype.requestAsyncId.call(this,r,o,n):(r.actions.push(this),r._scheduled||(r._scheduled=vt.requestAnimationFrame(function(){return r.flush(void 0)})))},t.prototype.recycleAsyncId=function(r,o,n){var i;if(n===void 
0&&(n=0),n!=null?n>0:this.delay>0)return e.prototype.recycleAsyncId.call(this,r,o,n);var a=r.actions;o!=null&&((i=a[a.length-1])===null||i===void 0?void 0:i.id)!==o&&(vt.cancelAnimationFrame(o),r._scheduled=void 0)},t}(gt);var _o=function(e){oe(t,e);function t(){return e!==null&&e.apply(this,arguments)||this}return t.prototype.flush=function(r){this._active=!0;var o=this._scheduled;this._scheduled=void 0;var n=this.actions,i;r=r||n.shift();do if(i=r.execute(r.state,r.delay))break;while((r=n[0])&&r.id===o&&n.shift());if(this._active=!1,i){for(;(r=n[0])&&r.id===o&&n.shift();)r.unsubscribe();throw i}},t}(yt);var me=new _o(Mo);var S=new j(function(e){return e.complete()});function Yt(e){return e&&H(e.schedule)}function Hr(e){return e[e.length-1]}function Xe(e){return H(Hr(e))?e.pop():void 0}function ke(e){return Yt(Hr(e))?e.pop():void 0}function Bt(e,t){return typeof Hr(e)=="number"?e.pop():t}var xt=function(e){return e&&typeof e.length=="number"&&typeof e!="function"};function Gt(e){return H(e==null?void 0:e.then)}function Jt(e){return H(e[bt])}function Xt(e){return Symbol.asyncIterator&&H(e==null?void 0:e[Symbol.asyncIterator])}function Zt(e){return new TypeError("You provided "+(e!==null&&typeof e=="object"?"an invalid object":"'"+e+"'")+" where a stream was expected. You can provide an Observable, Promise, ReadableStream, Array, AsyncIterable, or Iterable.")}function Zi(){return typeof Symbol!="function"||!Symbol.iterator?"@@iterator":Symbol.iterator}var er=Zi();function tr(e){return H(e==null?void 0:e[er])}function rr(e){return fo(this,arguments,function(){var r,o,n,i;return Nt(this,function(a){switch(a.label){case 0:r=e.getReader(),a.label=1;case 1:a.trys.push([1,,9,10]),a.label=2;case 2:return[4,nt(r.read())];case 3:return o=a.sent(),n=o.value,i=o.done,i?[4,nt(void 0)]:[3,5];case 4:return[2,a.sent()];case 5:return[4,nt(n)];case 6:return[4,a.sent()];case 7:return a.sent(),[3,2];case 8:return[3,10];case 9:return r.releaseLock(),[7];case 10:return[2]}})})}function or(e){return H(e==null?void 0:e.getReader)}function U(e){if(e instanceof j)return e;if(e!=null){if(Jt(e))return ea(e);if(xt(e))return ta(e);if(Gt(e))return ra(e);if(Xt(e))return Ao(e);if(tr(e))return oa(e);if(or(e))return na(e)}throw Zt(e)}function ea(e){return new j(function(t){var r=e[bt]();if(H(r.subscribe))return r.subscribe(t);throw new TypeError("Provided object does not correctly implement Symbol.observable")})}function ta(e){return new j(function(t){for(var r=0;r=2;return function(o){return o.pipe(e?b(function(n,i){return e(n,i,o)}):le,Te(1),r?De(t):Qo(function(){return new ir}))}}function jr(e){return e<=0?function(){return S}:E(function(t,r){var o=[];t.subscribe(T(r,function(n){o.push(n),e=2,!0))}function pe(e){e===void 0&&(e={});var t=e.connector,r=t===void 0?function(){return new g}:t,o=e.resetOnError,n=o===void 0?!0:o,i=e.resetOnComplete,a=i===void 0?!0:i,s=e.resetOnRefCountZero,p=s===void 0?!0:s;return function(c){var l,f,u,d=0,y=!1,L=!1,X=function(){f==null||f.unsubscribe(),f=void 0},ee=function(){X(),l=u=void 0,y=L=!1},J=function(){var k=l;ee(),k==null||k.unsubscribe()};return E(function(k,ft){d++,!L&&!y&&X();var qe=u=u!=null?u:r();ft.add(function(){d--,d===0&&!L&&!y&&(f=Ur(J,p))}),qe.subscribe(ft),!l&&d>0&&(l=new at({next:function(Fe){return qe.next(Fe)},error:function(Fe){L=!0,X(),f=Ur(ee,n,Fe),qe.error(Fe)},complete:function(){y=!0,X(),f=Ur(ee,a),qe.complete()}}),U(k).subscribe(l))})(c)}}function Ur(e,t){for(var r=[],o=2;oe.next(document)),e}function P(e,t=document){return 
Array.from(t.querySelectorAll(e))}function R(e,t=document){let r=fe(e,t);if(typeof r=="undefined")throw new ReferenceError(`Missing element: expected "${e}" to be present`);return r}function fe(e,t=document){return t.querySelector(e)||void 0}function Ie(){var e,t,r,o;return(o=(r=(t=(e=document.activeElement)==null?void 0:e.shadowRoot)==null?void 0:t.activeElement)!=null?r:document.activeElement)!=null?o:void 0}var wa=O(h(document.body,"focusin"),h(document.body,"focusout")).pipe(_e(1),Q(void 0),m(()=>Ie()||document.body),G(1));function et(e){return wa.pipe(m(t=>e.contains(t)),K())}function $t(e,t){return C(()=>O(h(e,"mouseenter").pipe(m(()=>!0)),h(e,"mouseleave").pipe(m(()=>!1))).pipe(t?Ht(r=>Le(+!r*t)):le,Q(e.matches(":hover"))))}function Jo(e,t){if(typeof t=="string"||typeof t=="number")e.innerHTML+=t.toString();else if(t instanceof Node)e.appendChild(t);else if(Array.isArray(t))for(let r of t)Jo(e,r)}function x(e,t,...r){let o=document.createElement(e);if(t)for(let n of Object.keys(t))typeof t[n]!="undefined"&&(typeof t[n]!="boolean"?o.setAttribute(n,t[n]):o.setAttribute(n,""));for(let n of r)Jo(o,n);return o}function sr(e){if(e>999){let t=+((e-950)%1e3>99);return`${((e+1e-6)/1e3).toFixed(t)}k`}else return e.toString()}function Tt(e){let t=x("script",{src:e});return C(()=>(document.head.appendChild(t),O(h(t,"load"),h(t,"error").pipe(v(()=>$r(()=>new ReferenceError(`Invalid script: ${e}`))))).pipe(m(()=>{}),_(()=>document.head.removeChild(t)),Te(1))))}var Xo=new g,Ta=C(()=>typeof ResizeObserver=="undefined"?Tt("https://unpkg.com/resize-observer-polyfill"):I(void 0)).pipe(m(()=>new ResizeObserver(e=>e.forEach(t=>Xo.next(t)))),v(e=>O(Ye,I(e)).pipe(_(()=>e.disconnect()))),G(1));function ce(e){return{width:e.offsetWidth,height:e.offsetHeight}}function ge(e){let t=e;for(;t.clientWidth===0&&t.parentElement;)t=t.parentElement;return Ta.pipe(w(r=>r.observe(t)),v(r=>Xo.pipe(b(o=>o.target===t),_(()=>r.unobserve(t)))),m(()=>ce(e)),Q(ce(e)))}function St(e){return{width:e.scrollWidth,height:e.scrollHeight}}function cr(e){let t=e.parentElement;for(;t&&(e.scrollWidth<=t.scrollWidth&&e.scrollHeight<=t.scrollHeight);)t=(e=t).parentElement;return t?e:void 0}function Zo(e){let t=[],r=e.parentElement;for(;r;)(e.clientWidth>r.clientWidth||e.clientHeight>r.clientHeight)&&t.push(r),r=(e=r).parentElement;return t.length===0&&t.push(document.documentElement),t}function Ve(e){return{x:e.offsetLeft,y:e.offsetTop}}function en(e){let t=e.getBoundingClientRect();return{x:t.x+window.scrollX,y:t.y+window.scrollY}}function tn(e){return O(h(window,"load"),h(window,"resize")).pipe(Me(0,me),m(()=>Ve(e)),Q(Ve(e)))}function pr(e){return{x:e.scrollLeft,y:e.scrollTop}}function Ne(e){return O(h(e,"scroll"),h(window,"scroll"),h(window,"resize")).pipe(Me(0,me),m(()=>pr(e)),Q(pr(e)))}var rn=new g,Sa=C(()=>I(new IntersectionObserver(e=>{for(let t of e)rn.next(t)},{threshold:0}))).pipe(v(e=>O(Ye,I(e)).pipe(_(()=>e.disconnect()))),G(1));function tt(e){return Sa.pipe(w(t=>t.observe(e)),v(t=>rn.pipe(b(({target:r})=>r===e),_(()=>t.unobserve(e)),m(({isIntersecting:r})=>r))))}function on(e,t=16){return Ne(e).pipe(m(({y:r})=>{let o=ce(e),n=St(e);return r>=n.height-o.height-t}),K())}var lr={drawer:R("[data-md-toggle=drawer]"),search:R("[data-md-toggle=search]")};function nn(e){return lr[e].checked}function Je(e,t){lr[e].checked!==t&&lr[e].click()}function ze(e){let t=lr[e];return h(t,"change").pipe(m(()=>t.checked),Q(t.checked))}function Oa(e,t){switch(e.constructor){case HTMLInputElement:return e.type==="radio"?/^Arrow/.test(t):!0;case 
HTMLSelectElement:case HTMLTextAreaElement:return!0;default:return e.isContentEditable}}function La(){return O(h(window,"compositionstart").pipe(m(()=>!0)),h(window,"compositionend").pipe(m(()=>!1))).pipe(Q(!1))}function an(){let e=h(window,"keydown").pipe(b(t=>!(t.metaKey||t.ctrlKey)),m(t=>({mode:nn("search")?"search":"global",type:t.key,claim(){t.preventDefault(),t.stopPropagation()}})),b(({mode:t,type:r})=>{if(t==="global"){let o=Ie();if(typeof o!="undefined")return!Oa(o,r)}return!0}),pe());return La().pipe(v(t=>t?S:e))}function ye(){return new URL(location.href)}function lt(e,t=!1){if(B("navigation.instant")&&!t){let r=x("a",{href:e.href});document.body.appendChild(r),r.click(),r.remove()}else location.href=e.href}function sn(){return new g}function cn(){return location.hash.slice(1)}function pn(e){let t=x("a",{href:e});t.addEventListener("click",r=>r.stopPropagation()),t.click()}function Ma(e){return O(h(window,"hashchange"),e).pipe(m(cn),Q(cn()),b(t=>t.length>0),G(1))}function ln(e){return Ma(e).pipe(m(t=>fe(`[id="${t}"]`)),b(t=>typeof t!="undefined"))}function Pt(e){let t=matchMedia(e);return ar(r=>t.addListener(()=>r(t.matches))).pipe(Q(t.matches))}function mn(){let e=matchMedia("print");return O(h(window,"beforeprint").pipe(m(()=>!0)),h(window,"afterprint").pipe(m(()=>!1))).pipe(Q(e.matches))}function Nr(e,t){return e.pipe(v(r=>r?t():S))}function zr(e,t){return new j(r=>{let o=new XMLHttpRequest;return o.open("GET",`${e}`),o.responseType="blob",o.addEventListener("load",()=>{o.status>=200&&o.status<300?(r.next(o.response),r.complete()):r.error(new Error(o.statusText))}),o.addEventListener("error",()=>{r.error(new Error("Network error"))}),o.addEventListener("abort",()=>{r.complete()}),typeof(t==null?void 0:t.progress$)!="undefined"&&(o.addEventListener("progress",n=>{var i;if(n.lengthComputable)t.progress$.next(n.loaded/n.total*100);else{let a=(i=o.getResponseHeader("Content-Length"))!=null?i:0;t.progress$.next(n.loaded/+a*100)}}),t.progress$.next(5)),o.send(),()=>o.abort()})}function je(e,t){return zr(e,t).pipe(v(r=>r.text()),m(r=>JSON.parse(r)),G(1))}function fn(e,t){let r=new DOMParser;return zr(e,t).pipe(v(o=>o.text()),m(o=>r.parseFromString(o,"text/html")),G(1))}function un(e,t){let r=new DOMParser;return zr(e,t).pipe(v(o=>o.text()),m(o=>r.parseFromString(o,"text/xml")),G(1))}function dn(){return{x:Math.max(0,scrollX),y:Math.max(0,scrollY)}}function hn(){return O(h(window,"scroll",{passive:!0}),h(window,"resize",{passive:!0})).pipe(m(dn),Q(dn()))}function bn(){return{width:innerWidth,height:innerHeight}}function vn(){return h(window,"resize",{passive:!0}).pipe(m(bn),Q(bn()))}function gn(){return z([hn(),vn()]).pipe(m(([e,t])=>({offset:e,size:t})),G(1))}function mr(e,{viewport$:t,header$:r}){let o=t.pipe(te("size")),n=z([o,r]).pipe(m(()=>Ve(e)));return z([r,t,n]).pipe(m(([{height:i},{offset:a,size:s},{x:p,y:c}])=>({offset:{x:a.x-p,y:a.y-c+i},size:s})))}function _a(e){return h(e,"message",t=>t.data)}function Aa(e){let t=new g;return t.subscribe(r=>e.postMessage(r)),t}function yn(e,t=new Worker(e)){let r=_a(t),o=Aa(t),n=new g;n.subscribe(o);let i=o.pipe(Z(),ie(!0));return n.pipe(Z(),Re(r.pipe(W(i))),pe())}var Ca=R("#__config"),Ot=JSON.parse(Ca.textContent);Ot.base=`${new URL(Ot.base,ye())}`;function xe(){return Ot}function B(e){return Ot.features.includes(e)}function Ee(e,t){return typeof t!="undefined"?Ot.translations[e].replace("#",t.toString()):Ot.translations[e]}function Se(e,t=document){return R(`[data-md-component=${e}]`,t)}function ae(e,t=document){return 
P(`[data-md-component=${e}]`,t)}function ka(e){let t=R(".md-typeset > :first-child",e);return h(t,"click",{once:!0}).pipe(m(()=>R(".md-typeset",e)),m(r=>({hash:__md_hash(r.innerHTML)})))}function xn(e){if(!B("announce.dismiss")||!e.childElementCount)return S;if(!e.hidden){let t=R(".md-typeset",e);__md_hash(t.innerHTML)===__md_get("__announce")&&(e.hidden=!0)}return C(()=>{let t=new g;return t.subscribe(({hash:r})=>{e.hidden=!0,__md_set("__announce",r)}),ka(e).pipe(w(r=>t.next(r)),_(()=>t.complete()),m(r=>$({ref:e},r)))})}function Ha(e,{target$:t}){return t.pipe(m(r=>({hidden:r!==e})))}function En(e,t){let r=new g;return r.subscribe(({hidden:o})=>{e.hidden=o}),Ha(e,t).pipe(w(o=>r.next(o)),_(()=>r.complete()),m(o=>$({ref:e},o)))}function Rt(e,t){return t==="inline"?x("div",{class:"md-tooltip md-tooltip--inline",id:e,role:"tooltip"},x("div",{class:"md-tooltip__inner md-typeset"})):x("div",{class:"md-tooltip",id:e,role:"tooltip"},x("div",{class:"md-tooltip__inner md-typeset"}))}function wn(...e){return x("div",{class:"md-tooltip2",role:"tooltip"},x("div",{class:"md-tooltip2__inner md-typeset"},e))}function Tn(e,t){if(t=t?`${t}_annotation_${e}`:void 0,t){let r=t?`#${t}`:void 0;return x("aside",{class:"md-annotation",tabIndex:0},Rt(t),x("a",{href:r,class:"md-annotation__index",tabIndex:-1},x("span",{"data-md-annotation-id":e})))}else return x("aside",{class:"md-annotation",tabIndex:0},Rt(t),x("span",{class:"md-annotation__index",tabIndex:-1},x("span",{"data-md-annotation-id":e})))}function Sn(e){return x("button",{class:"md-clipboard md-icon",title:Ee("clipboard.copy"),"data-clipboard-target":`#${e} > code`})}var Ln=Mt(qr());function Qr(e,t){let r=t&2,o=t&1,n=Object.keys(e.terms).filter(p=>!e.terms[p]).reduce((p,c)=>[...p,x("del",null,(0,Ln.default)(c))," "],[]).slice(0,-1),i=xe(),a=new URL(e.location,i.base);B("search.highlight")&&a.searchParams.set("h",Object.entries(e.terms).filter(([,p])=>p).reduce((p,[c])=>`${p} ${c}`.trim(),""));let{tags:s}=xe();return x("a",{href:`${a}`,class:"md-search-result__link",tabIndex:-1},x("article",{class:"md-search-result__article md-typeset","data-md-score":e.score.toFixed(2)},r>0&&x("div",{class:"md-search-result__icon md-icon"}),r>0&&x("h1",null,e.title),r<=0&&x("h2",null,e.title),o>0&&e.text.length>0&&e.text,e.tags&&x("nav",{class:"md-tags"},e.tags.map(p=>{let c=s?p in s?`md-tag-icon md-tag--${s[p]}`:"md-tag-icon":"";return x("span",{class:`md-tag ${c}`},p)})),o>0&&n.length>0&&x("p",{class:"md-search-result__terms"},Ee("search.result.term.missing"),": ",...n)))}function Mn(e){let t=e[0].score,r=[...e],o=xe(),n=r.findIndex(l=>!`${new URL(l.location,o.base)}`.includes("#")),[i]=r.splice(n,1),a=r.findIndex(l=>l.scoreQr(l,1)),...p.length?[x("details",{class:"md-search-result__more"},x("summary",{tabIndex:-1},x("div",null,p.length>0&&p.length===1?Ee("search.result.more.one"):Ee("search.result.more.other",p.length))),...p.map(l=>Qr(l,1)))]:[]];return x("li",{class:"md-search-result__item"},c)}function _n(e){return x("ul",{class:"md-source__facts"},Object.entries(e).map(([t,r])=>x("li",{class:`md-source__fact md-source__fact--${t}`},typeof r=="number"?sr(r):r)))}function Kr(e){let t=`tabbed-control tabbed-control--${e}`;return x("div",{class:t,hidden:!0},x("button",{class:"tabbed-button",tabIndex:-1,"aria-hidden":"true"}))}function An(e){return x("div",{class:"md-typeset__scrollwrap"},x("div",{class:"md-typeset__table"},e))}function Ra(e){var o;let t=xe(),r=new URL(`../${e.version}/`,t.base);return 
x("li",{class:"md-version__item"},x("a",{href:`${r}`,class:"md-version__link"},e.title,((o=t.version)==null?void 0:o.alias)&&e.aliases.length>0&&x("span",{class:"md-version__alias"},e.aliases[0])))}function Cn(e,t){var o;let r=xe();return e=e.filter(n=>{var i;return!((i=n.properties)!=null&&i.hidden)}),x("div",{class:"md-version"},x("button",{class:"md-version__current","aria-label":Ee("select.version")},t.title,((o=r.version)==null?void 0:o.alias)&&t.aliases.length>0&&x("span",{class:"md-version__alias"},t.aliases[0])),x("ul",{class:"md-version__list"},e.map(Ra)))}var Ia=0;function ja(e){let t=z([et(e),$t(e)]).pipe(m(([o,n])=>o||n),K()),r=C(()=>Zo(e)).pipe(ne(Ne),pt(1),He(t),m(()=>en(e)));return t.pipe(Ae(o=>o),v(()=>z([t,r])),m(([o,n])=>({active:o,offset:n})),pe())}function Fa(e,t){let{content$:r,viewport$:o}=t,n=`__tooltip2_${Ia++}`;return C(()=>{let i=new g,a=new _r(!1);i.pipe(Z(),ie(!1)).subscribe(a);let s=a.pipe(Ht(c=>Le(+!c*250,kr)),K(),v(c=>c?r:S),w(c=>c.id=n),pe());z([i.pipe(m(({active:c})=>c)),s.pipe(v(c=>$t(c,250)),Q(!1))]).pipe(m(c=>c.some(l=>l))).subscribe(a);let p=a.pipe(b(c=>c),re(s,o),m(([c,l,{size:f}])=>{let u=e.getBoundingClientRect(),d=u.width/2;if(l.role==="tooltip")return{x:d,y:8+u.height};if(u.y>=f.height/2){let{height:y}=ce(l);return{x:d,y:-16-y}}else return{x:d,y:16+u.height}}));return z([s,i,p]).subscribe(([c,{offset:l},f])=>{c.style.setProperty("--md-tooltip-host-x",`${l.x}px`),c.style.setProperty("--md-tooltip-host-y",`${l.y}px`),c.style.setProperty("--md-tooltip-x",`${f.x}px`),c.style.setProperty("--md-tooltip-y",`${f.y}px`),c.classList.toggle("md-tooltip2--top",f.y<0),c.classList.toggle("md-tooltip2--bottom",f.y>=0)}),a.pipe(b(c=>c),re(s,(c,l)=>l),b(c=>c.role==="tooltip")).subscribe(c=>{let l=ce(R(":scope > *",c));c.style.setProperty("--md-tooltip-width",`${l.width}px`),c.style.setProperty("--md-tooltip-tail","0px")}),a.pipe(K(),ve(me),re(s)).subscribe(([c,l])=>{l.classList.toggle("md-tooltip2--active",c)}),z([a.pipe(b(c=>c)),s]).subscribe(([c,l])=>{l.role==="dialog"?(e.setAttribute("aria-controls",n),e.setAttribute("aria-haspopup","dialog")):e.setAttribute("aria-describedby",n)}),a.pipe(b(c=>!c)).subscribe(()=>{e.removeAttribute("aria-controls"),e.removeAttribute("aria-describedby"),e.removeAttribute("aria-haspopup")}),ja(e).pipe(w(c=>i.next(c)),_(()=>i.complete()),m(c=>$({ref:e},c)))})}function mt(e,{viewport$:t},r=document.body){return Fa(e,{content$:new j(o=>{let n=e.title,i=wn(n);return o.next(i),e.removeAttribute("title"),r.append(i),()=>{i.remove(),e.setAttribute("title",n)}}),viewport$:t})}function Ua(e,t){let r=C(()=>z([tn(e),Ne(t)])).pipe(m(([{x:o,y:n},i])=>{let{width:a,height:s}=ce(e);return{x:o-i.x+a/2,y:n-i.y+s/2}}));return et(e).pipe(v(o=>r.pipe(m(n=>({active:o,offset:n})),Te(+!o||1/0))))}function kn(e,t,{target$:r}){let[o,n]=Array.from(e.children);return C(()=>{let i=new g,a=i.pipe(Z(),ie(!0));return 
i.subscribe({next({offset:s}){e.style.setProperty("--md-tooltip-x",`${s.x}px`),e.style.setProperty("--md-tooltip-y",`${s.y}px`)},complete(){e.style.removeProperty("--md-tooltip-x"),e.style.removeProperty("--md-tooltip-y")}}),tt(e).pipe(W(a)).subscribe(s=>{e.toggleAttribute("data-md-visible",s)}),O(i.pipe(b(({active:s})=>s)),i.pipe(_e(250),b(({active:s})=>!s))).subscribe({next({active:s}){s?e.prepend(o):o.remove()},complete(){e.prepend(o)}}),i.pipe(Me(16,me)).subscribe(({active:s})=>{o.classList.toggle("md-tooltip--active",s)}),i.pipe(pt(125,me),b(()=>!!e.offsetParent),m(()=>e.offsetParent.getBoundingClientRect()),m(({x:s})=>s)).subscribe({next(s){s?e.style.setProperty("--md-tooltip-0",`${-s}px`):e.style.removeProperty("--md-tooltip-0")},complete(){e.style.removeProperty("--md-tooltip-0")}}),h(n,"click").pipe(W(a),b(s=>!(s.metaKey||s.ctrlKey))).subscribe(s=>{s.stopPropagation(),s.preventDefault()}),h(n,"mousedown").pipe(W(a),re(i)).subscribe(([s,{active:p}])=>{var c;if(s.button!==0||s.metaKey||s.ctrlKey)s.preventDefault();else if(p){s.preventDefault();let l=e.parentElement.closest(".md-annotation");l instanceof HTMLElement?l.focus():(c=Ie())==null||c.blur()}}),r.pipe(W(a),b(s=>s===o),Ge(125)).subscribe(()=>e.focus()),Ua(e,t).pipe(w(s=>i.next(s)),_(()=>i.complete()),m(s=>$({ref:e},s)))})}function Wa(e){return e.tagName==="CODE"?P(".c, .c1, .cm",e):[e]}function Da(e){let t=[];for(let r of Wa(e)){let o=[],n=document.createNodeIterator(r,NodeFilter.SHOW_TEXT);for(let i=n.nextNode();i;i=n.nextNode())o.push(i);for(let i of o){let a;for(;a=/(\(\d+\))(!)?/.exec(i.textContent);){let[,s,p]=a;if(typeof p=="undefined"){let c=i.splitText(a.index);i=c.splitText(s.length),t.push(c)}else{i.textContent=s,t.push(i);break}}}}return t}function Hn(e,t){t.append(...Array.from(e.childNodes))}function fr(e,t,{target$:r,print$:o}){let n=t.closest("[id]"),i=n==null?void 0:n.id,a=new Map;for(let s of Da(t)){let[,p]=s.textContent.match(/\((\d+)\)/);fe(`:scope > li:nth-child(${p})`,e)&&(a.set(p,Tn(p,i)),s.replaceWith(a.get(p)))}return a.size===0?S:C(()=>{let s=new g,p=s.pipe(Z(),ie(!0)),c=[];for(let[l,f]of a)c.push([R(".md-typeset",f),R(`:scope > li:nth-child(${l})`,e)]);return o.pipe(W(p)).subscribe(l=>{e.hidden=!l,e.classList.toggle("md-annotation-list",l);for(let[f,u]of c)l?Hn(f,u):Hn(u,f)}),O(...[...a].map(([,l])=>kn(l,t,{target$:r}))).pipe(_(()=>s.complete()),pe())})}function $n(e){if(e.nextElementSibling){let t=e.nextElementSibling;if(t.tagName==="OL")return t;if(t.tagName==="P"&&!t.children.length)return $n(t)}}function Pn(e,t){return C(()=>{let r=$n(e);return typeof r!="undefined"?fr(r,e,t):S})}var Rn=Mt(Br());var Va=0;function In(e){if(e.nextElementSibling){let t=e.nextElementSibling;if(t.tagName==="OL")return t;if(t.tagName==="P"&&!t.children.length)return In(t)}}function Na(e){return ge(e).pipe(m(({width:t})=>({scrollable:St(e).width>t})),te("scrollable"))}function jn(e,t){let{matches:r}=matchMedia("(hover)"),o=C(()=>{let n=new g,i=n.pipe(jr(1));n.subscribe(({scrollable:c})=>{c&&r?e.setAttribute("tabindex","0"):e.removeAttribute("tabindex")});let a=[];if(Rn.default.isSupported()&&(e.closest(".copy")||B("content.code.copy")&&!e.closest(".no-copy"))){let c=e.closest("pre");c.id=`__code_${Va++}`;let l=Sn(c.id);c.insertBefore(l,e),B("content.tooltips")&&a.push(mt(l,{viewport$}))}let s=e.closest(".highlight");if(s instanceof HTMLElement){let c=In(s);if(typeof c!="undefined"&&(s.classList.contains("annotate")||B("content.code.annotate"))){let 
l=fr(c,e,t);a.push(ge(s).pipe(W(i),m(({width:f,height:u})=>f&&u),K(),v(f=>f?l:S)))}}return P(":scope > span[id]",e).length&&e.classList.add("md-code__content"),Na(e).pipe(w(c=>n.next(c)),_(()=>n.complete()),m(c=>$({ref:e},c)),Re(...a))});return B("content.lazy")?tt(e).pipe(b(n=>n),Te(1),v(()=>o)):o}function za(e,{target$:t,print$:r}){let o=!0;return O(t.pipe(m(n=>n.closest("details:not([open])")),b(n=>e===n),m(()=>({action:"open",reveal:!0}))),r.pipe(b(n=>n||!o),w(()=>o=e.open),m(n=>({action:n?"open":"close"}))))}function Fn(e,t){return C(()=>{let r=new g;return r.subscribe(({action:o,reveal:n})=>{e.toggleAttribute("open",o==="open"),n&&e.scrollIntoView()}),za(e,t).pipe(w(o=>r.next(o)),_(()=>r.complete()),m(o=>$({ref:e},o)))})}var Un=".node circle,.node ellipse,.node path,.node polygon,.node rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}marker{fill:var(--md-mermaid-edge-color)!important}.edgeLabel .label rect{fill:#0000}.flowchartTitleText{fill:var(--md-mermaid-label-fg-color)}.label{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.label foreignObject{line-height:normal;overflow:visible}.label div .edgeLabel{color:var(--md-mermaid-label-fg-color)}.edgeLabel,.edgeLabel p,.label div .edgeLabel{background-color:var(--md-mermaid-label-bg-color)}.edgeLabel,.edgeLabel p{fill:var(--md-mermaid-label-bg-color);color:var(--md-mermaid-edge-color)}.edgePath .path,.flowchart-link{stroke:var(--md-mermaid-edge-color);stroke-width:.05rem}.edgePath .arrowheadPath{fill:var(--md-mermaid-edge-color);stroke:none}.cluster rect{fill:var(--md-default-fg-color--lightest);stroke:var(--md-default-fg-color--lighter)}.cluster span{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}g #flowchart-circleEnd,g #flowchart-circleStart,g #flowchart-crossEnd,g #flowchart-crossStart,g #flowchart-pointEnd,g #flowchart-pointStart{stroke:none}.classDiagramTitleText{fill:var(--md-mermaid-label-fg-color)}g.classGroup line,g.classGroup rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}g.classGroup text{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.classLabel .box{fill:var(--md-mermaid-label-bg-color);background-color:var(--md-mermaid-label-bg-color);opacity:1}.classLabel .label{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.node .divider{stroke:var(--md-mermaid-node-fg-color)}.relation{stroke:var(--md-mermaid-edge-color)}.cardinality{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.cardinality text{fill:inherit!important}defs #classDiagram-compositionEnd,defs #classDiagram-compositionStart,defs #classDiagram-dependencyEnd,defs #classDiagram-dependencyStart,defs #classDiagram-extensionEnd,defs #classDiagram-extensionStart{fill:var(--md-mermaid-edge-color)!important;stroke:var(--md-mermaid-edge-color)!important}defs #classDiagram-aggregationEnd,defs #classDiagram-aggregationStart{fill:var(--md-mermaid-label-bg-color)!important;stroke:var(--md-mermaid-edge-color)!important}.statediagramTitleText{fill:var(--md-mermaid-label-fg-color)}g.stateGroup rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}g.stateGroup .state-title{fill:var(--md-mermaid-label-fg-color)!important;font-family:var(--md-mermaid-font-family)}g.stateGroup .composit{fill:var(--md-mermaid-label-bg-color)}.nodeLabel,.nodeLabel p{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}a 
.nodeLabel{text-decoration:underline}.node circle.state-end,.node circle.state-start,.start-state{fill:var(--md-mermaid-edge-color);stroke:none}.end-state-inner,.end-state-outer{fill:var(--md-mermaid-edge-color)}.end-state-inner,.node circle.state-end{stroke:var(--md-mermaid-label-bg-color)}.transition{stroke:var(--md-mermaid-edge-color)}[id^=state-fork] rect,[id^=state-join] rect{fill:var(--md-mermaid-edge-color)!important;stroke:none!important}.statediagram-cluster.statediagram-cluster .inner{fill:var(--md-default-bg-color)}.statediagram-cluster rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}.statediagram-state rect.divider{fill:var(--md-default-fg-color--lightest);stroke:var(--md-default-fg-color--lighter)}defs #statediagram-barbEnd{stroke:var(--md-mermaid-edge-color)}.entityTitleText{fill:var(--md-mermaid-label-fg-color)}.attributeBoxEven,.attributeBoxOdd{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}.entityBox{fill:var(--md-mermaid-label-bg-color);stroke:var(--md-mermaid-node-fg-color)}.entityLabel{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.relationshipLabelBox{fill:var(--md-mermaid-label-bg-color);fill-opacity:1;background-color:var(--md-mermaid-label-bg-color);opacity:1}.relationshipLabel{fill:var(--md-mermaid-label-fg-color)}.relationshipLine{stroke:var(--md-mermaid-edge-color)}defs #ONE_OR_MORE_END *,defs #ONE_OR_MORE_START *,defs #ONLY_ONE_END *,defs #ONLY_ONE_START *,defs #ZERO_OR_MORE_END *,defs #ZERO_OR_MORE_START *,defs #ZERO_OR_ONE_END *,defs #ZERO_OR_ONE_START *{stroke:var(--md-mermaid-edge-color)!important}defs #ZERO_OR_MORE_END circle,defs #ZERO_OR_MORE_START circle{fill:var(--md-mermaid-label-bg-color)}text:not([class]):last-child{fill:var(--md-mermaid-label-fg-color)}.actor{fill:var(--md-mermaid-sequence-actor-bg-color);stroke:var(--md-mermaid-sequence-actor-border-color)}text.actor>tspan{fill:var(--md-mermaid-sequence-actor-fg-color);font-family:var(--md-mermaid-font-family)}line{stroke:var(--md-mermaid-sequence-actor-line-color)}.actor-man circle,.actor-man line{fill:var(--md-mermaid-sequence-actorman-bg-color);stroke:var(--md-mermaid-sequence-actorman-line-color)}.messageLine0,.messageLine1{stroke:var(--md-mermaid-sequence-message-line-color)}.note{fill:var(--md-mermaid-sequence-note-bg-color);stroke:var(--md-mermaid-sequence-note-border-color)}.loopText,.loopText>tspan,.messageText,.noteText>tspan{stroke:none;font-family:var(--md-mermaid-font-family)!important}.messageText{fill:var(--md-mermaid-sequence-message-fg-color)}.loopText,.loopText>tspan{fill:var(--md-mermaid-sequence-loop-fg-color)}.noteText>tspan{fill:var(--md-mermaid-sequence-note-fg-color)}#arrowhead path{fill:var(--md-mermaid-sequence-message-line-color);stroke:none}.loopLine{fill:var(--md-mermaid-sequence-loop-bg-color);stroke:var(--md-mermaid-sequence-loop-border-color)}.labelBox{fill:var(--md-mermaid-sequence-label-bg-color);stroke:none}.labelText,.labelText>span{fill:var(--md-mermaid-sequence-label-fg-color);font-family:var(--md-mermaid-font-family)}.sequenceNumber{fill:var(--md-mermaid-sequence-number-fg-color)}rect.rect{fill:var(--md-mermaid-sequence-box-bg-color);stroke:none}rect.rect+text.text{fill:var(--md-mermaid-sequence-box-fg-color)}defs #sequencenumber{fill:var(--md-mermaid-sequence-number-bg-color)!important}";var Gr,Qa=0;function Ka(){return typeof mermaid=="undefined"||mermaid instanceof Element?Tt("https://unpkg.com/mermaid@11/dist/mermaid.min.js"):I(void 0)}function Wn(e){return 
e.classList.remove("mermaid"),Gr||(Gr=Ka().pipe(w(()=>mermaid.initialize({startOnLoad:!1,themeCSS:Un,sequence:{actorFontSize:"16px",messageFontSize:"16px",noteFontSize:"16px"}})),m(()=>{}),G(1))),Gr.subscribe(()=>co(this,null,function*(){e.classList.add("mermaid");let t=`__mermaid_${Qa++}`,r=x("div",{class:"mermaid"}),o=e.textContent,{svg:n,fn:i}=yield mermaid.render(t,o),a=r.attachShadow({mode:"closed"});a.innerHTML=n,e.replaceWith(r),i==null||i(a)})),Gr.pipe(m(()=>({ref:e})))}var Dn=x("table");function Vn(e){return e.replaceWith(Dn),Dn.replaceWith(An(e)),I({ref:e})}function Ya(e){let t=e.find(r=>r.checked)||e[0];return O(...e.map(r=>h(r,"change").pipe(m(()=>R(`label[for="${r.id}"]`))))).pipe(Q(R(`label[for="${t.id}"]`)),m(r=>({active:r})))}function Nn(e,{viewport$:t,target$:r}){let o=R(".tabbed-labels",e),n=P(":scope > input",e),i=Kr("prev");e.append(i);let a=Kr("next");return e.append(a),C(()=>{let s=new g,p=s.pipe(Z(),ie(!0));z([s,ge(e),tt(e)]).pipe(W(p),Me(1,me)).subscribe({next([{active:c},l]){let f=Ve(c),{width:u}=ce(c);e.style.setProperty("--md-indicator-x",`${f.x}px`),e.style.setProperty("--md-indicator-width",`${u}px`);let d=pr(o);(f.xd.x+l.width)&&o.scrollTo({left:Math.max(0,f.x-16),behavior:"smooth"})},complete(){e.style.removeProperty("--md-indicator-x"),e.style.removeProperty("--md-indicator-width")}}),z([Ne(o),ge(o)]).pipe(W(p)).subscribe(([c,l])=>{let f=St(o);i.hidden=c.x<16,a.hidden=c.x>f.width-l.width-16}),O(h(i,"click").pipe(m(()=>-1)),h(a,"click").pipe(m(()=>1))).pipe(W(p)).subscribe(c=>{let{width:l}=ce(o);o.scrollBy({left:l*c,behavior:"smooth"})}),r.pipe(W(p),b(c=>n.includes(c))).subscribe(c=>c.click()),o.classList.add("tabbed-labels--linked");for(let c of n){let l=R(`label[for="${c.id}"]`);l.replaceChildren(x("a",{href:`#${l.htmlFor}`,tabIndex:-1},...Array.from(l.childNodes))),h(l.firstElementChild,"click").pipe(W(p),b(f=>!(f.metaKey||f.ctrlKey)),w(f=>{f.preventDefault(),f.stopPropagation()})).subscribe(()=>{history.replaceState({},"",`#${l.htmlFor}`),l.click()})}return B("content.tabs.link")&&s.pipe(Ce(1),re(t)).subscribe(([{active:c},{offset:l}])=>{let f=c.innerText.trim();if(c.hasAttribute("data-md-switching"))c.removeAttribute("data-md-switching");else{let u=e.offsetTop-l.y;for(let y of P("[data-tabs]"))for(let L of P(":scope > input",y)){let X=R(`label[for="${L.id}"]`);if(X!==c&&X.innerText.trim()===f){X.setAttribute("data-md-switching",""),L.click();break}}window.scrollTo({top:e.offsetTop-u});let d=__md_get("__tabs")||[];__md_set("__tabs",[...new Set([f,...d])])}}),s.pipe(W(p)).subscribe(()=>{for(let c of P("audio, video",e))c.pause()}),Ya(n).pipe(w(c=>s.next(c)),_(()=>s.complete()),m(c=>$({ref:e},c)))}).pipe(Ke(se))}function zn(e,{viewport$:t,target$:r,print$:o}){return O(...P(".annotate:not(.highlight)",e).map(n=>Pn(n,{target$:r,print$:o})),...P("pre:not(.mermaid) > code",e).map(n=>jn(n,{target$:r,print$:o})),...P("pre.mermaid",e).map(n=>Wn(n)),...P("table:not([class])",e).map(n=>Vn(n)),...P("details",e).map(n=>Fn(n,{target$:r,print$:o})),...P("[data-tabs]",e).map(n=>Nn(n,{viewport$:t,target$:r})),...P("[title]",e).filter(()=>B("content.tooltips")).map(n=>mt(n,{viewport$:t})))}function Ba(e,{alert$:t}){return t.pipe(v(r=>O(I(!0),I(!1).pipe(Ge(2e3))).pipe(m(o=>({message:r,active:o})))))}function qn(e,t){let r=R(".md-typeset",e);return C(()=>{let o=new g;return o.subscribe(({message:n,active:i})=>{e.classList.toggle("md-dialog--active",i),r.textContent=n}),Ba(e,t).pipe(w(n=>o.next(n)),_(()=>o.complete()),m(n=>$({ref:e},n)))})}var Ga=0;function 
Ja(e,t){document.body.append(e);let{width:r}=ce(e);e.style.setProperty("--md-tooltip-width",`${r}px`),e.remove();let o=cr(t),n=typeof o!="undefined"?Ne(o):I({x:0,y:0}),i=O(et(t),$t(t)).pipe(K());return z([i,n]).pipe(m(([a,s])=>{let{x:p,y:c}=Ve(t),l=ce(t),f=t.closest("table");return f&&t.parentElement&&(p+=f.offsetLeft+t.parentElement.offsetLeft,c+=f.offsetTop+t.parentElement.offsetTop),{active:a,offset:{x:p-s.x+l.width/2-r/2,y:c-s.y+l.height+8}}}))}function Qn(e){let t=e.title;if(!t.length)return S;let r=`__tooltip_${Ga++}`,o=Rt(r,"inline"),n=R(".md-typeset",o);return n.innerHTML=t,C(()=>{let i=new g;return i.subscribe({next({offset:a}){o.style.setProperty("--md-tooltip-x",`${a.x}px`),o.style.setProperty("--md-tooltip-y",`${a.y}px`)},complete(){o.style.removeProperty("--md-tooltip-x"),o.style.removeProperty("--md-tooltip-y")}}),O(i.pipe(b(({active:a})=>a)),i.pipe(_e(250),b(({active:a})=>!a))).subscribe({next({active:a}){a?(e.insertAdjacentElement("afterend",o),e.setAttribute("aria-describedby",r),e.removeAttribute("title")):(o.remove(),e.removeAttribute("aria-describedby"),e.setAttribute("title",t))},complete(){o.remove(),e.removeAttribute("aria-describedby"),e.setAttribute("title",t)}}),i.pipe(Me(16,me)).subscribe(({active:a})=>{o.classList.toggle("md-tooltip--active",a)}),i.pipe(pt(125,me),b(()=>!!e.offsetParent),m(()=>e.offsetParent.getBoundingClientRect()),m(({x:a})=>a)).subscribe({next(a){a?o.style.setProperty("--md-tooltip-0",`${-a}px`):o.style.removeProperty("--md-tooltip-0")},complete(){o.style.removeProperty("--md-tooltip-0")}}),Ja(o,e).pipe(w(a=>i.next(a)),_(()=>i.complete()),m(a=>$({ref:e},a)))}).pipe(Ke(se))}function Xa({viewport$:e}){if(!B("header.autohide"))return I(!1);let t=e.pipe(m(({offset:{y:n}})=>n),Be(2,1),m(([n,i])=>[nMath.abs(i-n.y)>100),m(([,[n]])=>n),K()),o=ze("search");return z([e,o]).pipe(m(([{offset:n},i])=>n.y>400&&!i),K(),v(n=>n?r:I(!1)),Q(!1))}function Kn(e,t){return C(()=>z([ge(e),Xa(t)])).pipe(m(([{height:r},o])=>({height:r,hidden:o})),K((r,o)=>r.height===o.height&&r.hidden===o.hidden),G(1))}function Yn(e,{header$:t,main$:r}){return C(()=>{let o=new g,n=o.pipe(Z(),ie(!0));o.pipe(te("active"),He(t)).subscribe(([{active:a},{hidden:s}])=>{e.classList.toggle("md-header--shadow",a&&!s),e.hidden=s});let i=ue(P("[title]",e)).pipe(b(()=>B("content.tooltips")),ne(a=>Qn(a)));return r.subscribe(o),t.pipe(W(n),m(a=>$({ref:e},a)),Re(i.pipe(W(n))))})}function Za(e,{viewport$:t,header$:r}){return mr(e,{viewport$:t,header$:r}).pipe(m(({offset:{y:o}})=>{let{height:n}=ce(e);return{active:o>=n}}),te("active"))}function Bn(e,t){return C(()=>{let r=new g;r.subscribe({next({active:n}){e.classList.toggle("md-header__title--active",n)},complete(){e.classList.remove("md-header__title--active")}});let o=fe(".md-content h1");return typeof o=="undefined"?S:Za(o,t).pipe(w(n=>r.next(n)),_(()=>r.complete()),m(n=>$({ref:e},n)))})}function Gn(e,{viewport$:t,header$:r}){let o=r.pipe(m(({height:i})=>i),K()),n=o.pipe(v(()=>ge(e).pipe(m(({height:i})=>({top:e.offsetTop,bottom:e.offsetTop+i})),te("bottom"))));return z([o,n,t]).pipe(m(([i,{top:a,bottom:s},{offset:{y:p},size:{height:c}}])=>(c=Math.max(0,c-Math.max(0,a-p,i)-Math.max(0,c+p-s)),{offset:a-i,height:c,active:a-i<=p})),K((i,a)=>i.offset===a.offset&&i.height===a.height&&i.active===a.active))}function es(e){let t=__md_get("__palette")||{index:e.findIndex(o=>matchMedia(o.getAttribute("data-md-color-media")).matches)},r=Math.max(0,Math.min(t.index,e.length-1));return 
I(...e).pipe(ne(o=>h(o,"change").pipe(m(()=>o))),Q(e[r]),m(o=>({index:e.indexOf(o),color:{media:o.getAttribute("data-md-color-media"),scheme:o.getAttribute("data-md-color-scheme"),primary:o.getAttribute("data-md-color-primary"),accent:o.getAttribute("data-md-color-accent")}})),G(1))}function Jn(e){let t=P("input",e),r=x("meta",{name:"theme-color"});document.head.appendChild(r);let o=x("meta",{name:"color-scheme"});document.head.appendChild(o);let n=Pt("(prefers-color-scheme: light)");return C(()=>{let i=new g;return i.subscribe(a=>{if(document.body.setAttribute("data-md-color-switching",""),a.color.media==="(prefers-color-scheme)"){let s=matchMedia("(prefers-color-scheme: light)"),p=document.querySelector(s.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");a.color.scheme=p.getAttribute("data-md-color-scheme"),a.color.primary=p.getAttribute("data-md-color-primary"),a.color.accent=p.getAttribute("data-md-color-accent")}for(let[s,p]of Object.entries(a.color))document.body.setAttribute(`data-md-color-${s}`,p);for(let s=0;sa.key==="Enter"),re(i,(a,s)=>s)).subscribe(({index:a})=>{a=(a+1)%t.length,t[a].click(),t[a].focus()}),i.pipe(m(()=>{let a=Se("header"),s=window.getComputedStyle(a);return o.content=s.colorScheme,s.backgroundColor.match(/\d+/g).map(p=>(+p).toString(16).padStart(2,"0")).join("")})).subscribe(a=>r.content=`#${a}`),i.pipe(ve(se)).subscribe(()=>{document.body.removeAttribute("data-md-color-switching")}),es(t).pipe(W(n.pipe(Ce(1))),ct(),w(a=>i.next(a)),_(()=>i.complete()),m(a=>$({ref:e},a)))})}function Xn(e,{progress$:t}){return C(()=>{let r=new g;return r.subscribe(({value:o})=>{e.style.setProperty("--md-progress-value",`${o}`)}),t.pipe(w(o=>r.next({value:o})),_(()=>r.complete()),m(o=>({ref:e,value:o})))})}var Jr=Mt(Br());function ts(e){e.setAttribute("data-md-copying","");let t=e.closest("[data-copy]"),r=t?t.getAttribute("data-copy"):e.innerText;return e.removeAttribute("data-md-copying"),r.trimEnd()}function Zn({alert$:e}){Jr.default.isSupported()&&new j(t=>{new Jr.default("[data-clipboard-target], [data-clipboard-text]",{text:r=>r.getAttribute("data-clipboard-text")||ts(R(r.getAttribute("data-clipboard-target")))}).on("success",r=>t.next(r))}).pipe(w(t=>{t.trigger.focus()}),m(()=>Ee("clipboard.copied"))).subscribe(e)}function ei(e,t){return e.protocol=t.protocol,e.hostname=t.hostname,e}function rs(e,t){let r=new Map;for(let o of P("url",e)){let n=R("loc",o),i=[ei(new URL(n.textContent),t)];r.set(`${i[0]}`,i);for(let a of P("[rel=alternate]",o)){let s=a.getAttribute("href");s!=null&&i.push(ei(new URL(s),t))}}return r}function ur(e){return un(new URL("sitemap.xml",e)).pipe(m(t=>rs(t,new URL(e))),de(()=>I(new Map)))}function os(e,t){if(!(e.target instanceof Element))return S;let r=e.target.closest("a");if(r===null)return S;if(r.target||e.metaKey||e.ctrlKey)return S;let o=new URL(r.href);return o.search=o.hash="",t.has(`${o}`)?(e.preventDefault(),I(new URL(r.href))):S}function ti(e){let t=new Map;for(let r of P(":scope > *",e.head))t.set(r.outerHTML,r);return t}function ri(e){for(let t of P("[href], [src]",e))for(let r of["href","src"]){let o=t.getAttribute(r);if(o&&!/^(?:[a-z]+:)?\/\//i.test(o)){t[r]=t[r];break}}return I(e)}function ns(e){for(let o of["[data-md-component=announce]","[data-md-component=container]","[data-md-component=header-topic]","[data-md-component=outdated]","[data-md-component=logo]","[data-md-component=skip]",...B("navigation.tabs.sticky")?["[data-md-component=tabs]"]:[]]){let 
n=fe(o),i=fe(o,e);typeof n!="undefined"&&typeof i!="undefined"&&n.replaceWith(i)}let t=ti(document);for(let[o,n]of ti(e))t.has(o)?t.delete(o):document.head.appendChild(n);for(let o of t.values()){let n=o.getAttribute("name");n!=="theme-color"&&n!=="color-scheme"&&o.remove()}let r=Se("container");return We(P("script",r)).pipe(v(o=>{let n=e.createElement("script");if(o.src){for(let i of o.getAttributeNames())n.setAttribute(i,o.getAttribute(i));return o.replaceWith(n),new j(i=>{n.onload=()=>i.complete()})}else return n.textContent=o.textContent,o.replaceWith(n),S}),Z(),ie(document))}function oi({location$:e,viewport$:t,progress$:r}){let o=xe();if(location.protocol==="file:")return S;let n=ur(o.base);I(document).subscribe(ri);let i=h(document.body,"click").pipe(He(n),v(([p,c])=>os(p,c)),pe()),a=h(window,"popstate").pipe(m(ye),pe());i.pipe(re(t)).subscribe(([p,{offset:c}])=>{history.replaceState(c,""),history.pushState(null,"",p)}),O(i,a).subscribe(e);let s=e.pipe(te("pathname"),v(p=>fn(p,{progress$:r}).pipe(de(()=>(lt(p,!0),S)))),v(ri),v(ns),pe());return O(s.pipe(re(e,(p,c)=>c)),s.pipe(v(()=>e),te("hash")),e.pipe(K((p,c)=>p.pathname===c.pathname&&p.hash===c.hash),v(()=>i),w(()=>history.back()))).subscribe(p=>{var c,l;history.state!==null||!p.hash?window.scrollTo(0,(l=(c=history.state)==null?void 0:c.y)!=null?l:0):(history.scrollRestoration="auto",pn(p.hash),history.scrollRestoration="manual")}),e.subscribe(()=>{history.scrollRestoration="manual"}),h(window,"beforeunload").subscribe(()=>{history.scrollRestoration="auto"}),t.pipe(te("offset"),_e(100)).subscribe(({offset:p})=>{history.replaceState(p,"")}),s}var ni=Mt(qr());function ii(e){let t=e.separator.split("|").map(n=>n.replace(/(\(\?[!=<][^)]+\))/g,"").length===0?"\uFFFD":n).join("|"),r=new RegExp(t,"img"),o=(n,i,a)=>`${i}${a}`;return n=>{n=n.replace(/[\s*+\-:~^]+/g," ").trim();let i=new RegExp(`(^|${e.separator}|)(${n.replace(/[|\\{}()[\]^$+*?.-]/g,"\\$&").replace(r,"|")})`,"img");return a=>(0,ni.default)(a).replace(i,o).replace(/<\/mark>(\s+)]*>/img,"$1")}}function jt(e){return e.type===1}function dr(e){return e.type===3}function ai(e,t){let r=yn(e);return O(I(location.protocol!=="file:"),ze("search")).pipe(Ae(o=>o),v(()=>t)).subscribe(({config:o,docs:n})=>r.next({type:0,data:{config:o,docs:n,options:{suggest:B("search.suggest")}}})),r}function si(e){var l;let{selectedVersionSitemap:t,selectedVersionBaseURL:r,currentLocation:o,currentBaseURL:n}=e,i=(l=Xr(n))==null?void 0:l.pathname;if(i===void 0)return;let a=ss(o.pathname,i);if(a===void 0)return;let s=ps(t.keys());if(!t.has(s))return;let p=Xr(a,s);if(!p||!t.has(p.href))return;let c=Xr(a,r);if(c)return c.hash=o.hash,c.search=o.search,c}function Xr(e,t){try{return new URL(e,t)}catch(r){return}}function ss(e,t){if(e.startsWith(t))return e.slice(t.length)}function cs(e,t){let r=Math.min(e.length,t.length),o;for(o=0;oS)),o=r.pipe(m(n=>{let[,i]=t.base.match(/([^/]+)\/?$/);return n.find(({version:a,aliases:s})=>a===i||s.includes(i))||n[0]}));r.pipe(m(n=>new Map(n.map(i=>[`${new URL(`../${i.version}/`,t.base)}`,i]))),v(n=>h(document.body,"click").pipe(b(i=>!i.metaKey&&!i.ctrlKey),re(o),v(([i,a])=>{if(i.target instanceof Element){let s=i.target.closest("a");if(s&&!s.target&&n.has(s.href)){let p=s.href;return!i.target.closest(".md-version")&&n.get(p)===a?S:(i.preventDefault(),I(new URL(p)))}}return S}),v(i=>ur(i).pipe(m(a=>{var 
s;return(s=si({selectedVersionSitemap:a,selectedVersionBaseURL:i,currentLocation:ye(),currentBaseURL:t.base}))!=null?s:i})))))).subscribe(n=>lt(n,!0)),z([r,o]).subscribe(([n,i])=>{R(".md-header__topic").appendChild(Cn(n,i))}),e.pipe(v(()=>o)).subscribe(n=>{var s;let i=new URL(t.base),a=__md_get("__outdated",sessionStorage,i);if(a===null){a=!0;let p=((s=t.version)==null?void 0:s.default)||"latest";Array.isArray(p)||(p=[p]);e:for(let c of p)for(let l of n.aliases.concat(n.version))if(new RegExp(c,"i").test(l)){a=!1;break e}__md_set("__outdated",a,sessionStorage,i)}if(a)for(let p of ae("outdated"))p.hidden=!1})}function ls(e,{worker$:t}){let{searchParams:r}=ye();r.has("q")&&(Je("search",!0),e.value=r.get("q"),e.focus(),ze("search").pipe(Ae(i=>!i)).subscribe(()=>{let i=ye();i.searchParams.delete("q"),history.replaceState({},"",`${i}`)}));let o=et(e),n=O(t.pipe(Ae(jt)),h(e,"keyup"),o).pipe(m(()=>e.value),K());return z([n,o]).pipe(m(([i,a])=>({value:i,focus:a})),G(1))}function pi(e,{worker$:t}){let r=new g,o=r.pipe(Z(),ie(!0));z([t.pipe(Ae(jt)),r],(i,a)=>a).pipe(te("value")).subscribe(({value:i})=>t.next({type:2,data:i})),r.pipe(te("focus")).subscribe(({focus:i})=>{i&&Je("search",i)}),h(e.form,"reset").pipe(W(o)).subscribe(()=>e.focus());let n=R("header [for=__search]");return h(n,"click").subscribe(()=>e.focus()),ls(e,{worker$:t}).pipe(w(i=>r.next(i)),_(()=>r.complete()),m(i=>$({ref:e},i)),G(1))}function li(e,{worker$:t,query$:r}){let o=new g,n=on(e.parentElement).pipe(b(Boolean)),i=e.parentElement,a=R(":scope > :first-child",e),s=R(":scope > :last-child",e);ze("search").subscribe(l=>s.setAttribute("role",l?"list":"presentation")),o.pipe(re(r),Wr(t.pipe(Ae(jt)))).subscribe(([{items:l},{value:f}])=>{switch(l.length){case 0:a.textContent=f.length?Ee("search.result.none"):Ee("search.result.placeholder");break;case 1:a.textContent=Ee("search.result.one");break;default:let u=sr(l.length);a.textContent=Ee("search.result.other",u)}});let p=o.pipe(w(()=>s.innerHTML=""),v(({items:l})=>O(I(...l.slice(0,10)),I(...l.slice(10)).pipe(Be(4),Vr(n),v(([f])=>f)))),m(Mn),pe());return p.subscribe(l=>s.appendChild(l)),p.pipe(ne(l=>{let f=fe("details",l);return typeof f=="undefined"?S:h(f,"toggle").pipe(W(o),m(()=>f))})).subscribe(l=>{l.open===!1&&l.offsetTop<=i.scrollTop&&i.scrollTo({top:l.offsetTop})}),t.pipe(b(dr),m(({data:l})=>l)).pipe(w(l=>o.next(l)),_(()=>o.complete()),m(l=>$({ref:e},l)))}function ms(e,{query$:t}){return t.pipe(m(({value:r})=>{let o=ye();return o.hash="",r=r.replace(/\s+/g,"+").replace(/&/g,"%26").replace(/=/g,"%3D"),o.search=`q=${r}`,{url:o}}))}function mi(e,t){let r=new g,o=r.pipe(Z(),ie(!0));return r.subscribe(({url:n})=>{e.setAttribute("data-clipboard-text",e.href),e.href=`${n}`}),h(e,"click").pipe(W(o)).subscribe(n=>n.preventDefault()),ms(e,t).pipe(w(n=>r.next(n)),_(()=>r.complete()),m(n=>$({ref:e},n)))}function fi(e,{worker$:t,keyboard$:r}){let o=new g,n=Se("search-query"),i=O(h(n,"keydown"),h(n,"focus")).pipe(ve(se),m(()=>n.value),K());return o.pipe(He(i),m(([{suggest:s},p])=>{let c=p.split(/([\s-]+)/);if(s!=null&&s.length&&c[c.length-1]){let l=s[s.length-1];l.startsWith(c[c.length-1])&&(c[c.length-1]=l)}else c.length=0;return c})).subscribe(s=>e.innerHTML=s.join("").replace(/\s/g," ")),r.pipe(b(({mode:s})=>s==="search")).subscribe(s=>{switch(s.type){case"ArrowRight":e.innerText.length&&n.selectionStart===n.value.length&&(n.value=e.innerText);break}}),t.pipe(b(dr),m(({data:s})=>s)).pipe(w(s=>o.next(s)),_(()=>o.complete()),m(()=>({ref:e})))}function ui(e,{index$:t,keyboard$:r}){let 
o=xe();try{let n=ai(o.search,t),i=Se("search-query",e),a=Se("search-result",e);h(e,"click").pipe(b(({target:p})=>p instanceof Element&&!!p.closest("a"))).subscribe(()=>Je("search",!1)),r.pipe(b(({mode:p})=>p==="search")).subscribe(p=>{let c=Ie();switch(p.type){case"Enter":if(c===i){let l=new Map;for(let f of P(":first-child [href]",a)){let u=f.firstElementChild;l.set(f,parseFloat(u.getAttribute("data-md-score")))}if(l.size){let[[f]]=[...l].sort(([,u],[,d])=>d-u);f.click()}p.claim()}break;case"Escape":case"Tab":Je("search",!1),i.blur();break;case"ArrowUp":case"ArrowDown":if(typeof c=="undefined")i.focus();else{let l=[i,...P(":not(details) > [href], summary, details[open] [href]",a)],f=Math.max(0,(Math.max(0,l.indexOf(c))+l.length+(p.type==="ArrowUp"?-1:1))%l.length);l[f].focus()}p.claim();break;default:i!==Ie()&&i.focus()}}),r.pipe(b(({mode:p})=>p==="global")).subscribe(p=>{switch(p.type){case"f":case"s":case"/":i.focus(),i.select(),p.claim();break}});let s=pi(i,{worker$:n});return O(s,li(a,{worker$:n,query$:s})).pipe(Re(...ae("search-share",e).map(p=>mi(p,{query$:s})),...ae("search-suggest",e).map(p=>fi(p,{worker$:n,keyboard$:r}))))}catch(n){return e.hidden=!0,Ye}}function di(e,{index$:t,location$:r}){return z([t,r.pipe(Q(ye()),b(o=>!!o.searchParams.get("h")))]).pipe(m(([o,n])=>ii(o.config)(n.searchParams.get("h"))),m(o=>{var a;let n=new Map,i=document.createNodeIterator(e,NodeFilter.SHOW_TEXT);for(let s=i.nextNode();s;s=i.nextNode())if((a=s.parentElement)!=null&&a.offsetHeight){let p=s.textContent,c=o(p);c.length>p.length&&n.set(s,c)}for(let[s,p]of n){let{childNodes:c}=x("span",null,p);s.replaceWith(...Array.from(c))}return{ref:e,nodes:n}}))}function fs(e,{viewport$:t,main$:r}){let o=e.closest(".md-grid"),n=o.offsetTop-o.parentElement.offsetTop;return z([r,t]).pipe(m(([{offset:i,height:a},{offset:{y:s}}])=>(a=a+Math.min(n,Math.max(0,s-i))-n,{height:a,locked:s>=i+n})),K((i,a)=>i.height===a.height&&i.locked===a.locked))}function Zr(e,o){var n=o,{header$:t}=n,r=so(n,["header$"]);let i=R(".md-sidebar__scrollwrap",e),{y:a}=Ve(i);return C(()=>{let s=new g,p=s.pipe(Z(),ie(!0)),c=s.pipe(Me(0,me));return c.pipe(re(t)).subscribe({next([{height:l},{height:f}]){i.style.height=`${l-2*a}px`,e.style.top=`${f}px`},complete(){i.style.height="",e.style.top=""}}),c.pipe(Ae()).subscribe(()=>{for(let l of P(".md-nav__link--active[href]",e)){if(!l.clientHeight)continue;let f=l.closest(".md-sidebar__scrollwrap");if(typeof f!="undefined"){let u=l.offsetTop-f.offsetTop,{height:d}=ce(f);f.scrollTo({top:u-d/2})}}}),ue(P("label[tabindex]",e)).pipe(ne(l=>h(l,"click").pipe(ve(se),m(()=>l),W(p)))).subscribe(l=>{let f=R(`[id="${l.htmlFor}"]`);R(`[aria-labelledby="${l.id}"]`).setAttribute("aria-expanded",`${f.checked}`)}),fs(e,r).pipe(w(l=>s.next(l)),_(()=>s.complete()),m(l=>$({ref:e},l)))})}function hi(e,t){if(typeof t!="undefined"){let r=`https://api.github.com/repos/${e}/${t}`;return st(je(`${r}/releases/latest`).pipe(de(()=>S),m(o=>({version:o.tag_name})),De({})),je(r).pipe(de(()=>S),m(o=>({stars:o.stargazers_count,forks:o.forks_count})),De({}))).pipe(m(([o,n])=>$($({},o),n)))}else{let r=`https://api.github.com/users/${e}`;return je(r).pipe(m(o=>({repositories:o.public_repos})),De({}))}}function bi(e,t){let r=`https://${e}/api/v4/projects/${encodeURIComponent(t)}`;return st(je(`${r}/releases/permalink/latest`).pipe(de(()=>S),m(({tag_name:o})=>({version:o})),De({})),je(r).pipe(de(()=>S),m(({star_count:o,forks_count:n})=>({stars:o,forks:n})),De({}))).pipe(m(([o,n])=>$($({},o),n)))}function vi(e){let 
t=e.match(/^.+github\.com\/([^/]+)\/?([^/]+)?/i);if(t){let[,r,o]=t;return hi(r,o)}if(t=e.match(/^.+?([^/]*gitlab[^/]+)\/(.+?)\/?$/i),t){let[,r,o]=t;return bi(r,o)}return S}var us;function ds(e){return us||(us=C(()=>{let t=__md_get("__source",sessionStorage);if(t)return I(t);if(ae("consent").length){let o=__md_get("__consent");if(!(o&&o.github))return S}return vi(e.href).pipe(w(o=>__md_set("__source",o,sessionStorage)))}).pipe(de(()=>S),b(t=>Object.keys(t).length>0),m(t=>({facts:t})),G(1)))}function gi(e){let t=R(":scope > :last-child",e);return C(()=>{let r=new g;return r.subscribe(({facts:o})=>{t.appendChild(_n(o)),t.classList.add("md-source__repository--active")}),ds(e).pipe(w(o=>r.next(o)),_(()=>r.complete()),m(o=>$({ref:e},o)))})}function hs(e,{viewport$:t,header$:r}){return ge(document.body).pipe(v(()=>mr(e,{header$:r,viewport$:t})),m(({offset:{y:o}})=>({hidden:o>=10})),te("hidden"))}function yi(e,t){return C(()=>{let r=new g;return r.subscribe({next({hidden:o}){e.hidden=o},complete(){e.hidden=!1}}),(B("navigation.tabs.sticky")?I({hidden:!1}):hs(e,t)).pipe(w(o=>r.next(o)),_(()=>r.complete()),m(o=>$({ref:e},o)))})}function bs(e,{viewport$:t,header$:r}){let o=new Map,n=P(".md-nav__link",e);for(let s of n){let p=decodeURIComponent(s.hash.substring(1)),c=fe(`[id="${p}"]`);typeof c!="undefined"&&o.set(s,c)}let i=r.pipe(te("height"),m(({height:s})=>{let p=Se("main"),c=R(":scope > :first-child",p);return s+.8*(c.offsetTop-p.offsetTop)}),pe());return ge(document.body).pipe(te("height"),v(s=>C(()=>{let p=[];return I([...o].reduce((c,[l,f])=>{for(;p.length&&o.get(p[p.length-1]).tagName>=f.tagName;)p.pop();let u=f.offsetTop;for(;!u&&f.parentElement;)f=f.parentElement,u=f.offsetTop;let d=f.offsetParent;for(;d;d=d.offsetParent)u+=d.offsetTop;return c.set([...p=[...p,l]].reverse(),u)},new Map))}).pipe(m(p=>new Map([...p].sort(([,c],[,l])=>c-l))),He(i),v(([p,c])=>t.pipe(Fr(([l,f],{offset:{y:u},size:d})=>{let y=u+d.height>=Math.floor(s.height);for(;f.length;){let[,L]=f[0];if(L-c=u&&!y)f=[l.pop(),...f];else break}return[l,f]},[[],[...p]]),K((l,f)=>l[0]===f[0]&&l[1]===f[1])))))).pipe(m(([s,p])=>({prev:s.map(([c])=>c),next:p.map(([c])=>c)})),Q({prev:[],next:[]}),Be(2,1),m(([s,p])=>s.prev.length{let i=new g,a=i.pipe(Z(),ie(!0));if(i.subscribe(({prev:s,next:p})=>{for(let[c]of p)c.classList.remove("md-nav__link--passed"),c.classList.remove("md-nav__link--active");for(let[c,[l]]of s.entries())l.classList.add("md-nav__link--passed"),l.classList.toggle("md-nav__link--active",c===s.length-1)}),B("toc.follow")){let s=O(t.pipe(_e(1),m(()=>{})),t.pipe(_e(250),m(()=>"smooth")));i.pipe(b(({prev:p})=>p.length>0),He(o.pipe(ve(se))),re(s)).subscribe(([[{prev:p}],c])=>{let[l]=p[p.length-1];if(l.offsetHeight){let f=cr(l);if(typeof f!="undefined"){let u=l.offsetTop-f.offsetTop,{height:d}=ce(f);f.scrollTo({top:u-d/2,behavior:c})}}})}return B("navigation.tracking")&&t.pipe(W(a),te("offset"),_e(250),Ce(1),W(n.pipe(Ce(1))),ct({delay:250}),re(i)).subscribe(([,{prev:s}])=>{let p=ye(),c=s[s.length-1];if(c&&c.length){let[l]=c,{hash:f}=new URL(l.href);p.hash!==f&&(p.hash=f,history.replaceState({},"",`${p}`))}else p.hash="",history.replaceState({},"",`${p}`)}),bs(e,{viewport$:t,header$:r}).pipe(w(s=>i.next(s)),_(()=>i.complete()),m(s=>$({ref:e},s)))})}function vs(e,{viewport$:t,main$:r,target$:o}){let n=t.pipe(m(({offset:{y:a}})=>a),Be(2,1),m(([a,s])=>a>s&&s>0),K()),i=r.pipe(m(({active:a})=>a));return z([i,n]).pipe(m(([a,s])=>!(a&&s)),K(),W(o.pipe(Ce(1))),ie(!0),ct({delay:250}),m(a=>({hidden:a})))}function 
Ei(e,{viewport$:t,header$:r,main$:o,target$:n}){let i=new g,a=i.pipe(Z(),ie(!0));return i.subscribe({next({hidden:s}){e.hidden=s,s?(e.setAttribute("tabindex","-1"),e.blur()):e.removeAttribute("tabindex")},complete(){e.style.top="",e.hidden=!0,e.removeAttribute("tabindex")}}),r.pipe(W(a),te("height")).subscribe(({height:s})=>{e.style.top=`${s+16}px`}),h(e,"click").subscribe(s=>{s.preventDefault(),window.scrollTo({top:0})}),vs(e,{viewport$:t,main$:o,target$:n}).pipe(w(s=>i.next(s)),_(()=>i.complete()),m(s=>$({ref:e},s)))}function wi({document$:e,viewport$:t}){e.pipe(v(()=>P(".md-ellipsis")),ne(r=>tt(r).pipe(W(e.pipe(Ce(1))),b(o=>o),m(()=>r),Te(1))),b(r=>r.offsetWidth{let o=r.innerText,n=r.closest("a")||r;return n.title=o,B("content.tooltips")?mt(n,{viewport$:t}).pipe(W(e.pipe(Ce(1))),_(()=>n.removeAttribute("title"))):S})).subscribe(),B("content.tooltips")&&e.pipe(v(()=>P(".md-status")),ne(r=>mt(r,{viewport$:t}))).subscribe()}function Ti({document$:e,tablet$:t}){e.pipe(v(()=>P(".md-toggle--indeterminate")),w(r=>{r.indeterminate=!0,r.checked=!1}),ne(r=>h(r,"change").pipe(Dr(()=>r.classList.contains("md-toggle--indeterminate")),m(()=>r))),re(t)).subscribe(([r,o])=>{r.classList.remove("md-toggle--indeterminate"),o&&(r.checked=!1)})}function gs(){return/(iPad|iPhone|iPod)/.test(navigator.userAgent)}function Si({document$:e}){e.pipe(v(()=>P("[data-md-scrollfix]")),w(t=>t.removeAttribute("data-md-scrollfix")),b(gs),ne(t=>h(t,"touchstart").pipe(m(()=>t)))).subscribe(t=>{let r=t.scrollTop;r===0?t.scrollTop=1:r+t.offsetHeight===t.scrollHeight&&(t.scrollTop=r-1)})}function Oi({viewport$:e,tablet$:t}){z([ze("search"),t]).pipe(m(([r,o])=>r&&!o),v(r=>I(r).pipe(Ge(r?400:100))),re(e)).subscribe(([r,{offset:{y:o}}])=>{if(r)document.body.setAttribute("data-md-scrolllock",""),document.body.style.top=`-${o}px`;else{let n=-1*parseInt(document.body.style.top,10);document.body.removeAttribute("data-md-scrolllock"),document.body.style.top="",n&&window.scrollTo(0,n)}})}Object.entries||(Object.entries=function(e){let t=[];for(let r of Object.keys(e))t.push([r,e[r]]);return t});Object.values||(Object.values=function(e){let t=[];for(let r of Object.keys(e))t.push(e[r]);return t});typeof Element!="undefined"&&(Element.prototype.scrollTo||(Element.prototype.scrollTo=function(e,t){typeof e=="object"?(this.scrollLeft=e.left,this.scrollTop=e.top):(this.scrollLeft=e,this.scrollTop=t)}),Element.prototype.replaceWith||(Element.prototype.replaceWith=function(...e){let t=this.parentNode;if(t){e.length===0&&t.removeChild(this);for(let r=e.length-1;r>=0;r--){let o=e[r];typeof o=="string"?o=document.createTextNode(o):o.parentNode&&o.parentNode.removeChild(o),r?t.insertBefore(this.previousSibling,o):t.replaceChild(o,this)}}}));function ys(){return location.protocol==="file:"?Tt(`${new URL("search/search_index.js",eo.base)}`).pipe(m(()=>__index),G(1)):je(new URL("search/search_index.json",eo.base))}document.documentElement.classList.remove("no-js");document.documentElement.classList.add("js");var ot=Go(),Ut=sn(),Lt=ln(Ut),to=an(),Oe=gn(),hr=Pt("(min-width: 960px)"),Mi=Pt("(min-width: 1220px)"),_i=mn(),eo=xe(),Ai=document.forms.namedItem("search")?ys():Ye,ro=new g;Zn({alert$:ro});var oo=new g;B("navigation.instant")&&oi({location$:Ut,viewport$:Oe,progress$:oo}).subscribe(ot);var Li;((Li=eo.version)==null?void 0:Li.provider)==="mike"&&ci({document$:ot});O(Ut,Lt).pipe(Ge(125)).subscribe(()=>{Je("drawer",!1),Je("search",!1)});to.pipe(b(({mode:e})=>e==="global")).subscribe(e=>{switch(e.type){case"p":case",":let 
t=fe("link[rel=prev]");typeof t!="undefined"&<(t);break;case"n":case".":let r=fe("link[rel=next]");typeof r!="undefined"&<(r);break;case"Enter":let o=Ie();o instanceof HTMLLabelElement&&o.click()}});wi({viewport$:Oe,document$:ot});Ti({document$:ot,tablet$:hr});Si({document$:ot});Oi({viewport$:Oe,tablet$:hr});var rt=Kn(Se("header"),{viewport$:Oe}),Ft=ot.pipe(m(()=>Se("main")),v(e=>Gn(e,{viewport$:Oe,header$:rt})),G(1)),xs=O(...ae("consent").map(e=>En(e,{target$:Lt})),...ae("dialog").map(e=>qn(e,{alert$:ro})),...ae("palette").map(e=>Jn(e)),...ae("progress").map(e=>Xn(e,{progress$:oo})),...ae("search").map(e=>ui(e,{index$:Ai,keyboard$:to})),...ae("source").map(e=>gi(e))),Es=C(()=>O(...ae("announce").map(e=>xn(e)),...ae("content").map(e=>zn(e,{viewport$:Oe,target$:Lt,print$:_i})),...ae("content").map(e=>B("search.highlight")?di(e,{index$:Ai,location$:Ut}):S),...ae("header").map(e=>Yn(e,{viewport$:Oe,header$:rt,main$:Ft})),...ae("header-title").map(e=>Bn(e,{viewport$:Oe,header$:rt})),...ae("sidebar").map(e=>e.getAttribute("data-md-type")==="navigation"?Nr(Mi,()=>Zr(e,{viewport$:Oe,header$:rt,main$:Ft})):Nr(hr,()=>Zr(e,{viewport$:Oe,header$:rt,main$:Ft}))),...ae("tabs").map(e=>yi(e,{viewport$:Oe,header$:rt})),...ae("toc").map(e=>xi(e,{viewport$:Oe,header$:rt,main$:Ft,target$:Lt})),...ae("top").map(e=>Ei(e,{viewport$:Oe,header$:rt,main$:Ft,target$:Lt})))),Ci=ot.pipe(v(()=>Es),Re(xs),G(1));Ci.subscribe();window.document$=ot;window.location$=Ut;window.target$=Lt;window.keyboard$=to;window.viewport$=Oe;window.tablet$=hr;window.screen$=Mi;window.print$=_i;window.alert$=ro;window.progress$=oo;window.component$=Ci;})(); +//# sourceMappingURL=bundle.f13b1293.min.js.map diff --git a/v2.20/assets/javascripts/bundle.5090c770.min.js.map b/v2.20/assets/javascripts/bundle.f13b1293.min.js.map similarity index 88% rename from v2.20/assets/javascripts/bundle.5090c770.min.js.map rename to v2.20/assets/javascripts/bundle.f13b1293.min.js.map index c29855d852..8bc6fb8d1e 100644 --- a/v2.20/assets/javascripts/bundle.5090c770.min.js.map +++ b/v2.20/assets/javascripts/bundle.f13b1293.min.js.map @@ -1,7 +1,7 @@ { "version": 3, "sources": ["node_modules/focus-visible/dist/focus-visible.js", "node_modules/escape-html/index.js", "node_modules/clipboard/dist/clipboard.js", "src/templates/assets/javascripts/bundle.ts", "node_modules/tslib/tslib.es6.mjs", "node_modules/rxjs/src/internal/util/isFunction.ts", "node_modules/rxjs/src/internal/util/createErrorClass.ts", "node_modules/rxjs/src/internal/util/UnsubscriptionError.ts", "node_modules/rxjs/src/internal/util/arrRemove.ts", "node_modules/rxjs/src/internal/Subscription.ts", "node_modules/rxjs/src/internal/config.ts", "node_modules/rxjs/src/internal/scheduler/timeoutProvider.ts", "node_modules/rxjs/src/internal/util/reportUnhandledError.ts", "node_modules/rxjs/src/internal/util/noop.ts", "node_modules/rxjs/src/internal/NotificationFactories.ts", "node_modules/rxjs/src/internal/util/errorContext.ts", "node_modules/rxjs/src/internal/Subscriber.ts", "node_modules/rxjs/src/internal/symbol/observable.ts", "node_modules/rxjs/src/internal/util/identity.ts", "node_modules/rxjs/src/internal/util/pipe.ts", "node_modules/rxjs/src/internal/Observable.ts", "node_modules/rxjs/src/internal/util/lift.ts", "node_modules/rxjs/src/internal/operators/OperatorSubscriber.ts", "node_modules/rxjs/src/internal/scheduler/animationFrameProvider.ts", "node_modules/rxjs/src/internal/util/ObjectUnsubscribedError.ts", "node_modules/rxjs/src/internal/Subject.ts", 
"node_modules/rxjs/src/internal/BehaviorSubject.ts", "node_modules/rxjs/src/internal/scheduler/dateTimestampProvider.ts", "node_modules/rxjs/src/internal/ReplaySubject.ts", "node_modules/rxjs/src/internal/scheduler/Action.ts", "node_modules/rxjs/src/internal/scheduler/intervalProvider.ts", "node_modules/rxjs/src/internal/scheduler/AsyncAction.ts", "node_modules/rxjs/src/internal/Scheduler.ts", "node_modules/rxjs/src/internal/scheduler/AsyncScheduler.ts", "node_modules/rxjs/src/internal/scheduler/async.ts", "node_modules/rxjs/src/internal/scheduler/QueueAction.ts", "node_modules/rxjs/src/internal/scheduler/QueueScheduler.ts", "node_modules/rxjs/src/internal/scheduler/queue.ts", "node_modules/rxjs/src/internal/scheduler/AnimationFrameAction.ts", "node_modules/rxjs/src/internal/scheduler/AnimationFrameScheduler.ts", "node_modules/rxjs/src/internal/scheduler/animationFrame.ts", "node_modules/rxjs/src/internal/observable/empty.ts", "node_modules/rxjs/src/internal/util/isScheduler.ts", "node_modules/rxjs/src/internal/util/args.ts", "node_modules/rxjs/src/internal/util/isArrayLike.ts", "node_modules/rxjs/src/internal/util/isPromise.ts", "node_modules/rxjs/src/internal/util/isInteropObservable.ts", "node_modules/rxjs/src/internal/util/isAsyncIterable.ts", "node_modules/rxjs/src/internal/util/throwUnobservableError.ts", "node_modules/rxjs/src/internal/symbol/iterator.ts", "node_modules/rxjs/src/internal/util/isIterable.ts", "node_modules/rxjs/src/internal/util/isReadableStreamLike.ts", "node_modules/rxjs/src/internal/observable/innerFrom.ts", "node_modules/rxjs/src/internal/util/executeSchedule.ts", "node_modules/rxjs/src/internal/operators/observeOn.ts", "node_modules/rxjs/src/internal/operators/subscribeOn.ts", "node_modules/rxjs/src/internal/scheduled/scheduleObservable.ts", "node_modules/rxjs/src/internal/scheduled/schedulePromise.ts", "node_modules/rxjs/src/internal/scheduled/scheduleArray.ts", "node_modules/rxjs/src/internal/scheduled/scheduleIterable.ts", "node_modules/rxjs/src/internal/scheduled/scheduleAsyncIterable.ts", "node_modules/rxjs/src/internal/scheduled/scheduleReadableStreamLike.ts", "node_modules/rxjs/src/internal/scheduled/scheduled.ts", "node_modules/rxjs/src/internal/observable/from.ts", "node_modules/rxjs/src/internal/observable/of.ts", "node_modules/rxjs/src/internal/observable/throwError.ts", "node_modules/rxjs/src/internal/util/EmptyError.ts", "node_modules/rxjs/src/internal/util/isDate.ts", "node_modules/rxjs/src/internal/operators/map.ts", "node_modules/rxjs/src/internal/util/mapOneOrManyArgs.ts", "node_modules/rxjs/src/internal/util/argsArgArrayOrObject.ts", "node_modules/rxjs/src/internal/util/createObject.ts", "node_modules/rxjs/src/internal/observable/combineLatest.ts", "node_modules/rxjs/src/internal/operators/mergeInternals.ts", "node_modules/rxjs/src/internal/operators/mergeMap.ts", "node_modules/rxjs/src/internal/operators/mergeAll.ts", "node_modules/rxjs/src/internal/operators/concatAll.ts", "node_modules/rxjs/src/internal/observable/concat.ts", "node_modules/rxjs/src/internal/observable/defer.ts", "node_modules/rxjs/src/internal/observable/fromEvent.ts", "node_modules/rxjs/src/internal/observable/fromEventPattern.ts", "node_modules/rxjs/src/internal/observable/timer.ts", "node_modules/rxjs/src/internal/observable/merge.ts", "node_modules/rxjs/src/internal/observable/never.ts", "node_modules/rxjs/src/internal/util/argsOrArgArray.ts", "node_modules/rxjs/src/internal/operators/filter.ts", "node_modules/rxjs/src/internal/observable/zip.ts", 
"node_modules/rxjs/src/internal/operators/audit.ts", "node_modules/rxjs/src/internal/operators/auditTime.ts", "node_modules/rxjs/src/internal/operators/bufferCount.ts", "node_modules/rxjs/src/internal/operators/catchError.ts", "node_modules/rxjs/src/internal/operators/scanInternals.ts", "node_modules/rxjs/src/internal/operators/combineLatest.ts", "node_modules/rxjs/src/internal/operators/combineLatestWith.ts", "node_modules/rxjs/src/internal/operators/debounce.ts", "node_modules/rxjs/src/internal/operators/debounceTime.ts", "node_modules/rxjs/src/internal/operators/defaultIfEmpty.ts", "node_modules/rxjs/src/internal/operators/take.ts", "node_modules/rxjs/src/internal/operators/ignoreElements.ts", "node_modules/rxjs/src/internal/operators/mapTo.ts", "node_modules/rxjs/src/internal/operators/delayWhen.ts", "node_modules/rxjs/src/internal/operators/delay.ts", "node_modules/rxjs/src/internal/operators/distinctUntilChanged.ts", "node_modules/rxjs/src/internal/operators/distinctUntilKeyChanged.ts", "node_modules/rxjs/src/internal/operators/throwIfEmpty.ts", "node_modules/rxjs/src/internal/operators/endWith.ts", "node_modules/rxjs/src/internal/operators/finalize.ts", "node_modules/rxjs/src/internal/operators/first.ts", "node_modules/rxjs/src/internal/operators/takeLast.ts", "node_modules/rxjs/src/internal/operators/merge.ts", "node_modules/rxjs/src/internal/operators/mergeWith.ts", "node_modules/rxjs/src/internal/operators/repeat.ts", "node_modules/rxjs/src/internal/operators/scan.ts", "node_modules/rxjs/src/internal/operators/share.ts", "node_modules/rxjs/src/internal/operators/shareReplay.ts", "node_modules/rxjs/src/internal/operators/skip.ts", "node_modules/rxjs/src/internal/operators/skipUntil.ts", "node_modules/rxjs/src/internal/operators/startWith.ts", "node_modules/rxjs/src/internal/operators/switchMap.ts", "node_modules/rxjs/src/internal/operators/takeUntil.ts", "node_modules/rxjs/src/internal/operators/takeWhile.ts", "node_modules/rxjs/src/internal/operators/tap.ts", "node_modules/rxjs/src/internal/operators/throttle.ts", "node_modules/rxjs/src/internal/operators/throttleTime.ts", "node_modules/rxjs/src/internal/operators/withLatestFrom.ts", "node_modules/rxjs/src/internal/operators/zip.ts", "node_modules/rxjs/src/internal/operators/zipWith.ts", "src/templates/assets/javascripts/browser/document/index.ts", "src/templates/assets/javascripts/browser/element/_/index.ts", "src/templates/assets/javascripts/browser/element/focus/index.ts", "src/templates/assets/javascripts/browser/element/hover/index.ts", "src/templates/assets/javascripts/utilities/h/index.ts", "src/templates/assets/javascripts/utilities/round/index.ts", "src/templates/assets/javascripts/browser/script/index.ts", "src/templates/assets/javascripts/browser/element/size/_/index.ts", "src/templates/assets/javascripts/browser/element/size/content/index.ts", "src/templates/assets/javascripts/browser/element/offset/_/index.ts", "src/templates/assets/javascripts/browser/element/offset/content/index.ts", "src/templates/assets/javascripts/browser/element/visibility/index.ts", "src/templates/assets/javascripts/browser/toggle/index.ts", "src/templates/assets/javascripts/browser/keyboard/index.ts", "src/templates/assets/javascripts/browser/location/_/index.ts", "src/templates/assets/javascripts/browser/location/hash/index.ts", "src/templates/assets/javascripts/browser/media/index.ts", "src/templates/assets/javascripts/browser/request/index.ts", "src/templates/assets/javascripts/browser/viewport/offset/index.ts", 
"src/templates/assets/javascripts/browser/viewport/size/index.ts", "src/templates/assets/javascripts/browser/viewport/_/index.ts", "src/templates/assets/javascripts/browser/viewport/at/index.ts", "src/templates/assets/javascripts/browser/worker/index.ts", "src/templates/assets/javascripts/_/index.ts", "src/templates/assets/javascripts/components/_/index.ts", "src/templates/assets/javascripts/components/announce/index.ts", "src/templates/assets/javascripts/components/consent/index.ts", "src/templates/assets/javascripts/templates/tooltip/index.tsx", "src/templates/assets/javascripts/templates/annotation/index.tsx", "src/templates/assets/javascripts/templates/clipboard/index.tsx", "src/templates/assets/javascripts/templates/search/index.tsx", "src/templates/assets/javascripts/templates/source/index.tsx", "src/templates/assets/javascripts/templates/tabbed/index.tsx", "src/templates/assets/javascripts/templates/table/index.tsx", "src/templates/assets/javascripts/templates/version/index.tsx", "src/templates/assets/javascripts/components/tooltip2/index.ts", "src/templates/assets/javascripts/components/content/annotation/_/index.ts", "src/templates/assets/javascripts/components/content/annotation/list/index.ts", "src/templates/assets/javascripts/components/content/annotation/block/index.ts", "src/templates/assets/javascripts/components/content/code/_/index.ts", "src/templates/assets/javascripts/components/content/details/index.ts", "src/templates/assets/javascripts/components/content/mermaid/index.css", "src/templates/assets/javascripts/components/content/mermaid/index.ts", "src/templates/assets/javascripts/components/content/table/index.ts", "src/templates/assets/javascripts/components/content/tabs/index.ts", "src/templates/assets/javascripts/components/content/_/index.ts", "src/templates/assets/javascripts/components/dialog/index.ts", "src/templates/assets/javascripts/components/tooltip/index.ts", "src/templates/assets/javascripts/components/header/_/index.ts", "src/templates/assets/javascripts/components/header/title/index.ts", "src/templates/assets/javascripts/components/main/index.ts", "src/templates/assets/javascripts/components/palette/index.ts", "src/templates/assets/javascripts/components/progress/index.ts", "src/templates/assets/javascripts/integrations/clipboard/index.ts", "src/templates/assets/javascripts/integrations/sitemap/index.ts", "src/templates/assets/javascripts/integrations/instant/index.ts", "src/templates/assets/javascripts/integrations/search/highlighter/index.ts", "src/templates/assets/javascripts/integrations/search/worker/message/index.ts", "src/templates/assets/javascripts/integrations/search/worker/_/index.ts", "src/templates/assets/javascripts/integrations/version/findurl/index.ts", "src/templates/assets/javascripts/integrations/version/index.ts", "src/templates/assets/javascripts/components/search/query/index.ts", "src/templates/assets/javascripts/components/search/result/index.ts", "src/templates/assets/javascripts/components/search/share/index.ts", "src/templates/assets/javascripts/components/search/suggest/index.ts", "src/templates/assets/javascripts/components/search/_/index.ts", "src/templates/assets/javascripts/components/search/highlight/index.ts", "src/templates/assets/javascripts/components/sidebar/index.ts", "src/templates/assets/javascripts/components/source/facts/github/index.ts", "src/templates/assets/javascripts/components/source/facts/gitlab/index.ts", "src/templates/assets/javascripts/components/source/facts/_/index.ts", 
"src/templates/assets/javascripts/components/source/_/index.ts", "src/templates/assets/javascripts/components/tabs/index.ts", "src/templates/assets/javascripts/components/toc/index.ts", "src/templates/assets/javascripts/components/top/index.ts", "src/templates/assets/javascripts/patches/ellipsis/index.ts", "src/templates/assets/javascripts/patches/indeterminate/index.ts", "src/templates/assets/javascripts/patches/scrollfix/index.ts", "src/templates/assets/javascripts/patches/scrolllock/index.ts", "src/templates/assets/javascripts/polyfills/index.ts"], - "sourcesContent": ["(function (global, factory) {\n typeof exports === 'object' && typeof module !== 'undefined' ? factory() :\n typeof define === 'function' && define.amd ? define(factory) :\n (factory());\n}(this, (function () { 'use strict';\n\n /**\n * Applies the :focus-visible polyfill at the given scope.\n * A scope in this case is either the top-level Document or a Shadow Root.\n *\n * @param {(Document|ShadowRoot)} scope\n * @see https://github.com/WICG/focus-visible\n */\n function applyFocusVisiblePolyfill(scope) {\n var hadKeyboardEvent = true;\n var hadFocusVisibleRecently = false;\n var hadFocusVisibleRecentlyTimeout = null;\n\n var inputTypesAllowlist = {\n text: true,\n search: true,\n url: true,\n tel: true,\n email: true,\n password: true,\n number: true,\n date: true,\n month: true,\n week: true,\n time: true,\n datetime: true,\n 'datetime-local': true\n };\n\n /**\n * Helper function for legacy browsers and iframes which sometimes focus\n * elements like document, body, and non-interactive SVG.\n * @param {Element} el\n */\n function isValidFocusTarget(el) {\n if (\n el &&\n el !== document &&\n el.nodeName !== 'HTML' &&\n el.nodeName !== 'BODY' &&\n 'classList' in el &&\n 'contains' in el.classList\n ) {\n return true;\n }\n return false;\n }\n\n /**\n * Computes whether the given element should automatically trigger the\n * `focus-visible` class being added, i.e. whether it should always match\n * `:focus-visible` when focused.\n * @param {Element} el\n * @return {boolean}\n */\n function focusTriggersKeyboardModality(el) {\n var type = el.type;\n var tagName = el.tagName;\n\n if (tagName === 'INPUT' && inputTypesAllowlist[type] && !el.readOnly) {\n return true;\n }\n\n if (tagName === 'TEXTAREA' && !el.readOnly) {\n return true;\n }\n\n if (el.isContentEditable) {\n return true;\n }\n\n return false;\n }\n\n /**\n * Add the `focus-visible` class to the given element if it was not added by\n * the author.\n * @param {Element} el\n */\n function addFocusVisibleClass(el) {\n if (el.classList.contains('focus-visible')) {\n return;\n }\n el.classList.add('focus-visible');\n el.setAttribute('data-focus-visible-added', '');\n }\n\n /**\n * Remove the `focus-visible` class from the given element if it was not\n * originally added by the author.\n * @param {Element} el\n */\n function removeFocusVisibleClass(el) {\n if (!el.hasAttribute('data-focus-visible-added')) {\n return;\n }\n el.classList.remove('focus-visible');\n el.removeAttribute('data-focus-visible-added');\n }\n\n /**\n * If the most recent user interaction was via the keyboard;\n * and the key press did not include a meta, alt/option, or control key;\n * then the modality is keyboard. 
Otherwise, the modality is not keyboard.\n * Apply `focus-visible` to any current active element and keep track\n * of our keyboard modality state with `hadKeyboardEvent`.\n * @param {KeyboardEvent} e\n */\n function onKeyDown(e) {\n if (e.metaKey || e.altKey || e.ctrlKey) {\n return;\n }\n\n if (isValidFocusTarget(scope.activeElement)) {\n addFocusVisibleClass(scope.activeElement);\n }\n\n hadKeyboardEvent = true;\n }\n\n /**\n * If at any point a user clicks with a pointing device, ensure that we change\n * the modality away from keyboard.\n * This avoids the situation where a user presses a key on an already focused\n * element, and then clicks on a different element, focusing it with a\n * pointing device, while we still think we're in keyboard modality.\n * @param {Event} e\n */\n function onPointerDown(e) {\n hadKeyboardEvent = false;\n }\n\n /**\n * On `focus`, add the `focus-visible` class to the target if:\n * - the target received focus as a result of keyboard navigation, or\n * - the event target is an element that will likely require interaction\n * via the keyboard (e.g. a text box)\n * @param {Event} e\n */\n function onFocus(e) {\n // Prevent IE from focusing the document or HTML element.\n if (!isValidFocusTarget(e.target)) {\n return;\n }\n\n if (hadKeyboardEvent || focusTriggersKeyboardModality(e.target)) {\n addFocusVisibleClass(e.target);\n }\n }\n\n /**\n * On `blur`, remove the `focus-visible` class from the target.\n * @param {Event} e\n */\n function onBlur(e) {\n if (!isValidFocusTarget(e.target)) {\n return;\n }\n\n if (\n e.target.classList.contains('focus-visible') ||\n e.target.hasAttribute('data-focus-visible-added')\n ) {\n // To detect a tab/window switch, we look for a blur event followed\n // rapidly by a visibility change.\n // If we don't see a visibility change within 100ms, it's probably a\n // regular focus change.\n hadFocusVisibleRecently = true;\n window.clearTimeout(hadFocusVisibleRecentlyTimeout);\n hadFocusVisibleRecentlyTimeout = window.setTimeout(function() {\n hadFocusVisibleRecently = false;\n }, 100);\n removeFocusVisibleClass(e.target);\n }\n }\n\n /**\n * If the user changes tabs, keep track of whether or not the previously\n * focused element had .focus-visible.\n * @param {Event} e\n */\n function onVisibilityChange(e) {\n if (document.visibilityState === 'hidden') {\n // If the tab becomes active again, the browser will handle calling focus\n // on the element (Safari actually calls it twice).\n // If this tab change caused a blur on an element with focus-visible,\n // re-apply the class when the user switches back to the tab.\n if (hadFocusVisibleRecently) {\n hadKeyboardEvent = true;\n }\n addInitialPointerMoveListeners();\n }\n }\n\n /**\n * Add a group of listeners to detect usage of any pointing devices.\n * These listeners will be added when the polyfill first loads, and anytime\n * the window is blurred, so that they are active when the window regains\n * focus.\n */\n function addInitialPointerMoveListeners() {\n document.addEventListener('mousemove', onInitialPointerMove);\n document.addEventListener('mousedown', onInitialPointerMove);\n document.addEventListener('mouseup', onInitialPointerMove);\n document.addEventListener('pointermove', onInitialPointerMove);\n document.addEventListener('pointerdown', onInitialPointerMove);\n document.addEventListener('pointerup', onInitialPointerMove);\n document.addEventListener('touchmove', onInitialPointerMove);\n document.addEventListener('touchstart', onInitialPointerMove);\n 
document.addEventListener('touchend', onInitialPointerMove);\n }\n\n function removeInitialPointerMoveListeners() {\n document.removeEventListener('mousemove', onInitialPointerMove);\n document.removeEventListener('mousedown', onInitialPointerMove);\n document.removeEventListener('mouseup', onInitialPointerMove);\n document.removeEventListener('pointermove', onInitialPointerMove);\n document.removeEventListener('pointerdown', onInitialPointerMove);\n document.removeEventListener('pointerup', onInitialPointerMove);\n document.removeEventListener('touchmove', onInitialPointerMove);\n document.removeEventListener('touchstart', onInitialPointerMove);\n document.removeEventListener('touchend', onInitialPointerMove);\n }\n\n /**\n * When the polfyill first loads, assume the user is in keyboard modality.\n * If any event is received from a pointing device (e.g. mouse, pointer,\n * touch), turn off keyboard modality.\n * This accounts for situations where focus enters the page from the URL bar.\n * @param {Event} e\n */\n function onInitialPointerMove(e) {\n // Work around a Safari quirk that fires a mousemove on whenever the\n // window blurs, even if you're tabbing out of the page. \u00AF\\_(\u30C4)_/\u00AF\n if (e.target.nodeName && e.target.nodeName.toLowerCase() === 'html') {\n return;\n }\n\n hadKeyboardEvent = false;\n removeInitialPointerMoveListeners();\n }\n\n // For some kinds of state, we are interested in changes at the global scope\n // only. For example, global pointer input, global key presses and global\n // visibility change should affect the state at every scope:\n document.addEventListener('keydown', onKeyDown, true);\n document.addEventListener('mousedown', onPointerDown, true);\n document.addEventListener('pointerdown', onPointerDown, true);\n document.addEventListener('touchstart', onPointerDown, true);\n document.addEventListener('visibilitychange', onVisibilityChange, true);\n\n addInitialPointerMoveListeners();\n\n // For focus and blur, we specifically care about state changes in the local\n // scope. This is because focus / blur events that originate from within a\n // shadow root are not re-dispatched from the host element if it was already\n // the active element in its own scope:\n scope.addEventListener('focus', onFocus, true);\n scope.addEventListener('blur', onBlur, true);\n\n // We detect that a node is a ShadowRoot by ensuring that it is a\n // DocumentFragment and also has a host property. This check covers native\n // implementation and polyfill implementation transparently. If we only cared\n // about the native implementation, we could just check if the scope was\n // an instance of a ShadowRoot.\n if (scope.nodeType === Node.DOCUMENT_FRAGMENT_NODE && scope.host) {\n // Since a ShadowRoot is a special kind of DocumentFragment, it does not\n // have a root element to add a class to. So, we add this attribute to the\n // host element instead:\n scope.host.setAttribute('data-js-focus-visible', '');\n } else if (scope.nodeType === Node.DOCUMENT_NODE) {\n document.documentElement.classList.add('js-focus-visible');\n document.documentElement.setAttribute('data-js-focus-visible', '');\n }\n }\n\n // It is important to wrap all references to global window and document in\n // these checks to support server-side rendering use cases\n // @see https://github.com/WICG/focus-visible/issues/199\n if (typeof window !== 'undefined' && typeof document !== 'undefined') {\n // Make the polyfill helper globally available. 
This can be used as a signal\n // to interested libraries that wish to coordinate with the polyfill for e.g.,\n // applying the polyfill to a shadow root:\n window.applyFocusVisiblePolyfill = applyFocusVisiblePolyfill;\n\n // Notify interested libraries of the polyfill's presence, in case the\n // polyfill was loaded lazily:\n var event;\n\n try {\n event = new CustomEvent('focus-visible-polyfill-ready');\n } catch (error) {\n // IE11 does not support using CustomEvent as a constructor directly:\n event = document.createEvent('CustomEvent');\n event.initCustomEvent('focus-visible-polyfill-ready', false, false, {});\n }\n\n window.dispatchEvent(event);\n }\n\n if (typeof document !== 'undefined') {\n // Apply the polyfill to the global document, so that no JavaScript\n // coordination is required to use the polyfill in the top-level document:\n applyFocusVisiblePolyfill(document);\n }\n\n})));\n", "/*!\n * escape-html\n * Copyright(c) 2012-2013 TJ Holowaychuk\n * Copyright(c) 2015 Andreas Lubbe\n * Copyright(c) 2015 Tiancheng \"Timothy\" Gu\n * MIT Licensed\n */\n\n'use strict';\n\n/**\n * Module variables.\n * @private\n */\n\nvar matchHtmlRegExp = /[\"'&<>]/;\n\n/**\n * Module exports.\n * @public\n */\n\nmodule.exports = escapeHtml;\n\n/**\n * Escape special characters in the given string of html.\n *\n * @param {string} string The string to escape for inserting into HTML\n * @return {string}\n * @public\n */\n\nfunction escapeHtml(string) {\n var str = '' + string;\n var match = matchHtmlRegExp.exec(str);\n\n if (!match) {\n return str;\n }\n\n var escape;\n var html = '';\n var index = 0;\n var lastIndex = 0;\n\n for (index = match.index; index < str.length; index++) {\n switch (str.charCodeAt(index)) {\n case 34: // \"\n escape = '"';\n break;\n case 38: // &\n escape = '&';\n break;\n case 39: // '\n escape = ''';\n break;\n case 60: // <\n escape = '<';\n break;\n case 62: // >\n escape = '>';\n break;\n default:\n continue;\n }\n\n if (lastIndex !== index) {\n html += str.substring(lastIndex, index);\n }\n\n lastIndex = index + 1;\n html += escape;\n }\n\n return lastIndex !== index\n ? 
html + str.substring(lastIndex, index)\n : html;\n}\n", "/*!\n * clipboard.js v2.0.11\n * https://clipboardjs.com/\n *\n * Licensed MIT \u00A9 Zeno Rocha\n */\n(function webpackUniversalModuleDefinition(root, factory) {\n\tif(typeof exports === 'object' && typeof module === 'object')\n\t\tmodule.exports = factory();\n\telse if(typeof define === 'function' && define.amd)\n\t\tdefine([], factory);\n\telse if(typeof exports === 'object')\n\t\texports[\"ClipboardJS\"] = factory();\n\telse\n\t\troot[\"ClipboardJS\"] = factory();\n})(this, function() {\nreturn /******/ (function() { // webpackBootstrap\n/******/ \tvar __webpack_modules__ = ({\n\n/***/ 686:\n/***/ (function(__unused_webpack_module, __webpack_exports__, __webpack_require__) {\n\n\"use strict\";\n\n// EXPORTS\n__webpack_require__.d(__webpack_exports__, {\n \"default\": function() { return /* binding */ clipboard; }\n});\n\n// EXTERNAL MODULE: ./node_modules/tiny-emitter/index.js\nvar tiny_emitter = __webpack_require__(279);\nvar tiny_emitter_default = /*#__PURE__*/__webpack_require__.n(tiny_emitter);\n// EXTERNAL MODULE: ./node_modules/good-listener/src/listen.js\nvar listen = __webpack_require__(370);\nvar listen_default = /*#__PURE__*/__webpack_require__.n(listen);\n// EXTERNAL MODULE: ./node_modules/select/src/select.js\nvar src_select = __webpack_require__(817);\nvar select_default = /*#__PURE__*/__webpack_require__.n(src_select);\n;// CONCATENATED MODULE: ./src/common/command.js\n/**\n * Executes a given operation type.\n * @param {String} type\n * @return {Boolean}\n */\nfunction command(type) {\n try {\n return document.execCommand(type);\n } catch (err) {\n return false;\n }\n}\n;// CONCATENATED MODULE: ./src/actions/cut.js\n\n\n/**\n * Cut action wrapper.\n * @param {String|HTMLElement} target\n * @return {String}\n */\n\nvar ClipboardActionCut = function ClipboardActionCut(target) {\n var selectedText = select_default()(target);\n command('cut');\n return selectedText;\n};\n\n/* harmony default export */ var actions_cut = (ClipboardActionCut);\n;// CONCATENATED MODULE: ./src/common/create-fake-element.js\n/**\n * Creates a fake textarea element with a value.\n * @param {String} value\n * @return {HTMLElement}\n */\nfunction createFakeElement(value) {\n var isRTL = document.documentElement.getAttribute('dir') === 'rtl';\n var fakeElement = document.createElement('textarea'); // Prevent zooming on iOS\n\n fakeElement.style.fontSize = '12pt'; // Reset box model\n\n fakeElement.style.border = '0';\n fakeElement.style.padding = '0';\n fakeElement.style.margin = '0'; // Move element out of screen horizontally\n\n fakeElement.style.position = 'absolute';\n fakeElement.style[isRTL ? 
'right' : 'left'] = '-9999px'; // Move element to the same position vertically\n\n var yPosition = window.pageYOffset || document.documentElement.scrollTop;\n fakeElement.style.top = \"\".concat(yPosition, \"px\");\n fakeElement.setAttribute('readonly', '');\n fakeElement.value = value;\n return fakeElement;\n}\n;// CONCATENATED MODULE: ./src/actions/copy.js\n\n\n\n/**\n * Create fake copy action wrapper using a fake element.\n * @param {String} target\n * @param {Object} options\n * @return {String}\n */\n\nvar fakeCopyAction = function fakeCopyAction(value, options) {\n var fakeElement = createFakeElement(value);\n options.container.appendChild(fakeElement);\n var selectedText = select_default()(fakeElement);\n command('copy');\n fakeElement.remove();\n return selectedText;\n};\n/**\n * Copy action wrapper.\n * @param {String|HTMLElement} target\n * @param {Object} options\n * @return {String}\n */\n\n\nvar ClipboardActionCopy = function ClipboardActionCopy(target) {\n var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {\n container: document.body\n };\n var selectedText = '';\n\n if (typeof target === 'string') {\n selectedText = fakeCopyAction(target, options);\n } else if (target instanceof HTMLInputElement && !['text', 'search', 'url', 'tel', 'password'].includes(target === null || target === void 0 ? void 0 : target.type)) {\n // If input type doesn't support `setSelectionRange`. Simulate it. https://developer.mozilla.org/en-US/docs/Web/API/HTMLInputElement/setSelectionRange\n selectedText = fakeCopyAction(target.value, options);\n } else {\n selectedText = select_default()(target);\n command('copy');\n }\n\n return selectedText;\n};\n\n/* harmony default export */ var actions_copy = (ClipboardActionCopy);\n;// CONCATENATED MODULE: ./src/actions/default.js\nfunction _typeof(obj) { \"@babel/helpers - typeof\"; if (typeof Symbol === \"function\" && typeof Symbol.iterator === \"symbol\") { _typeof = function _typeof(obj) { return typeof obj; }; } else { _typeof = function _typeof(obj) { return obj && typeof Symbol === \"function\" && obj.constructor === Symbol && obj !== Symbol.prototype ? \"symbol\" : typeof obj; }; } return _typeof(obj); }\n\n\n\n/**\n * Inner function which performs selection from either `text` or `target`\n * properties and then executes copy or cut operations.\n * @param {Object} options\n */\n\nvar ClipboardActionDefault = function ClipboardActionDefault() {\n var options = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : {};\n // Defines base properties passed from constructor.\n var _options$action = options.action,\n action = _options$action === void 0 ? 'copy' : _options$action,\n container = options.container,\n target = options.target,\n text = options.text; // Sets the `action` to be performed which can be either 'copy' or 'cut'.\n\n if (action !== 'copy' && action !== 'cut') {\n throw new Error('Invalid \"action\" value, use either \"copy\" or \"cut\"');\n } // Sets the `target` property using an element that will be have its content copied.\n\n\n if (target !== undefined) {\n if (target && _typeof(target) === 'object' && target.nodeType === 1) {\n if (action === 'copy' && target.hasAttribute('disabled')) {\n throw new Error('Invalid \"target\" attribute. Please use \"readonly\" instead of \"disabled\" attribute');\n }\n\n if (action === 'cut' && (target.hasAttribute('readonly') || target.hasAttribute('disabled'))) {\n throw new Error('Invalid \"target\" attribute. 
You can\\'t cut text from elements with \"readonly\" or \"disabled\" attributes');\n }\n } else {\n throw new Error('Invalid \"target\" value, use a valid Element');\n }\n } // Define selection strategy based on `text` property.\n\n\n if (text) {\n return actions_copy(text, {\n container: container\n });\n } // Defines which selection strategy based on `target` property.\n\n\n if (target) {\n return action === 'cut' ? actions_cut(target) : actions_copy(target, {\n container: container\n });\n }\n};\n\n/* harmony default export */ var actions_default = (ClipboardActionDefault);\n;// CONCATENATED MODULE: ./src/clipboard.js\nfunction clipboard_typeof(obj) { \"@babel/helpers - typeof\"; if (typeof Symbol === \"function\" && typeof Symbol.iterator === \"symbol\") { clipboard_typeof = function _typeof(obj) { return typeof obj; }; } else { clipboard_typeof = function _typeof(obj) { return obj && typeof Symbol === \"function\" && obj.constructor === Symbol && obj !== Symbol.prototype ? \"symbol\" : typeof obj; }; } return clipboard_typeof(obj); }\n\nfunction _classCallCheck(instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError(\"Cannot call a class as a function\"); } }\n\nfunction _defineProperties(target, props) { for (var i = 0; i < props.length; i++) { var descriptor = props[i]; descriptor.enumerable = descriptor.enumerable || false; descriptor.configurable = true; if (\"value\" in descriptor) descriptor.writable = true; Object.defineProperty(target, descriptor.key, descriptor); } }\n\nfunction _createClass(Constructor, protoProps, staticProps) { if (protoProps) _defineProperties(Constructor.prototype, protoProps); if (staticProps) _defineProperties(Constructor, staticProps); return Constructor; }\n\nfunction _inherits(subClass, superClass) { if (typeof superClass !== \"function\" && superClass !== null) { throw new TypeError(\"Super expression must either be null or a function\"); } subClass.prototype = Object.create(superClass && superClass.prototype, { constructor: { value: subClass, writable: true, configurable: true } }); if (superClass) _setPrototypeOf(subClass, superClass); }\n\nfunction _setPrototypeOf(o, p) { _setPrototypeOf = Object.setPrototypeOf || function _setPrototypeOf(o, p) { o.__proto__ = p; return o; }; return _setPrototypeOf(o, p); }\n\nfunction _createSuper(Derived) { var hasNativeReflectConstruct = _isNativeReflectConstruct(); return function _createSuperInternal() { var Super = _getPrototypeOf(Derived), result; if (hasNativeReflectConstruct) { var NewTarget = _getPrototypeOf(this).constructor; result = Reflect.construct(Super, arguments, NewTarget); } else { result = Super.apply(this, arguments); } return _possibleConstructorReturn(this, result); }; }\n\nfunction _possibleConstructorReturn(self, call) { if (call && (clipboard_typeof(call) === \"object\" || typeof call === \"function\")) { return call; } return _assertThisInitialized(self); }\n\nfunction _assertThisInitialized(self) { if (self === void 0) { throw new ReferenceError(\"this hasn't been initialised - super() hasn't been called\"); } return self; }\n\nfunction _isNativeReflectConstruct() { if (typeof Reflect === \"undefined\" || !Reflect.construct) return false; if (Reflect.construct.sham) return false; if (typeof Proxy === \"function\") return true; try { Date.prototype.toString.call(Reflect.construct(Date, [], function () {})); return true; } catch (e) { return false; } }\n\nfunction _getPrototypeOf(o) { _getPrototypeOf = Object.setPrototypeOf ? 
Object.getPrototypeOf : function _getPrototypeOf(o) { return o.__proto__ || Object.getPrototypeOf(o); }; return _getPrototypeOf(o); }\n\n\n\n\n\n\n/**\n * Helper function to retrieve attribute value.\n * @param {String} suffix\n * @param {Element} element\n */\n\nfunction getAttributeValue(suffix, element) {\n var attribute = \"data-clipboard-\".concat(suffix);\n\n if (!element.hasAttribute(attribute)) {\n return;\n }\n\n return element.getAttribute(attribute);\n}\n/**\n * Base class which takes one or more elements, adds event listeners to them,\n * and instantiates a new `ClipboardAction` on each click.\n */\n\n\nvar Clipboard = /*#__PURE__*/function (_Emitter) {\n _inherits(Clipboard, _Emitter);\n\n var _super = _createSuper(Clipboard);\n\n /**\n * @param {String|HTMLElement|HTMLCollection|NodeList} trigger\n * @param {Object} options\n */\n function Clipboard(trigger, options) {\n var _this;\n\n _classCallCheck(this, Clipboard);\n\n _this = _super.call(this);\n\n _this.resolveOptions(options);\n\n _this.listenClick(trigger);\n\n return _this;\n }\n /**\n * Defines if attributes would be resolved using internal setter functions\n * or custom functions that were passed in the constructor.\n * @param {Object} options\n */\n\n\n _createClass(Clipboard, [{\n key: \"resolveOptions\",\n value: function resolveOptions() {\n var options = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : {};\n this.action = typeof options.action === 'function' ? options.action : this.defaultAction;\n this.target = typeof options.target === 'function' ? options.target : this.defaultTarget;\n this.text = typeof options.text === 'function' ? options.text : this.defaultText;\n this.container = clipboard_typeof(options.container) === 'object' ? options.container : document.body;\n }\n /**\n * Adds a click event listener to the passed trigger.\n * @param {String|HTMLElement|HTMLCollection|NodeList} trigger\n */\n\n }, {\n key: \"listenClick\",\n value: function listenClick(trigger) {\n var _this2 = this;\n\n this.listener = listen_default()(trigger, 'click', function (e) {\n return _this2.onClick(e);\n });\n }\n /**\n * Defines a new `ClipboardAction` on each click event.\n * @param {Event} e\n */\n\n }, {\n key: \"onClick\",\n value: function onClick(e) {\n var trigger = e.delegateTarget || e.currentTarget;\n var action = this.action(trigger) || 'copy';\n var text = actions_default({\n action: action,\n container: this.container,\n target: this.target(trigger),\n text: this.text(trigger)\n }); // Fires an event based on the copy operation result.\n\n this.emit(text ? 
'success' : 'error', {\n action: action,\n text: text,\n trigger: trigger,\n clearSelection: function clearSelection() {\n if (trigger) {\n trigger.focus();\n }\n\n window.getSelection().removeAllRanges();\n }\n });\n }\n /**\n * Default `action` lookup function.\n * @param {Element} trigger\n */\n\n }, {\n key: \"defaultAction\",\n value: function defaultAction(trigger) {\n return getAttributeValue('action', trigger);\n }\n /**\n * Default `target` lookup function.\n * @param {Element} trigger\n */\n\n }, {\n key: \"defaultTarget\",\n value: function defaultTarget(trigger) {\n var selector = getAttributeValue('target', trigger);\n\n if (selector) {\n return document.querySelector(selector);\n }\n }\n /**\n * Allow fire programmatically a copy action\n * @param {String|HTMLElement} target\n * @param {Object} options\n * @returns Text copied.\n */\n\n }, {\n key: \"defaultText\",\n\n /**\n * Default `text` lookup function.\n * @param {Element} trigger\n */\n value: function defaultText(trigger) {\n return getAttributeValue('text', trigger);\n }\n /**\n * Destroy lifecycle.\n */\n\n }, {\n key: \"destroy\",\n value: function destroy() {\n this.listener.destroy();\n }\n }], [{\n key: \"copy\",\n value: function copy(target) {\n var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {\n container: document.body\n };\n return actions_copy(target, options);\n }\n /**\n * Allow fire programmatically a cut action\n * @param {String|HTMLElement} target\n * @returns Text cutted.\n */\n\n }, {\n key: \"cut\",\n value: function cut(target) {\n return actions_cut(target);\n }\n /**\n * Returns the support of the given action, or all actions if no action is\n * given.\n * @param {String} [action]\n */\n\n }, {\n key: \"isSupported\",\n value: function isSupported() {\n var action = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : ['copy', 'cut'];\n var actions = typeof action === 'string' ? 
[action] : action;\n var support = !!document.queryCommandSupported;\n actions.forEach(function (action) {\n support = support && !!document.queryCommandSupported(action);\n });\n return support;\n }\n }]);\n\n return Clipboard;\n}((tiny_emitter_default()));\n\n/* harmony default export */ var clipboard = (Clipboard);\n\n/***/ }),\n\n/***/ 828:\n/***/ (function(module) {\n\nvar DOCUMENT_NODE_TYPE = 9;\n\n/**\n * A polyfill for Element.matches()\n */\nif (typeof Element !== 'undefined' && !Element.prototype.matches) {\n var proto = Element.prototype;\n\n proto.matches = proto.matchesSelector ||\n proto.mozMatchesSelector ||\n proto.msMatchesSelector ||\n proto.oMatchesSelector ||\n proto.webkitMatchesSelector;\n}\n\n/**\n * Finds the closest parent that matches a selector.\n *\n * @param {Element} element\n * @param {String} selector\n * @return {Function}\n */\nfunction closest (element, selector) {\n while (element && element.nodeType !== DOCUMENT_NODE_TYPE) {\n if (typeof element.matches === 'function' &&\n element.matches(selector)) {\n return element;\n }\n element = element.parentNode;\n }\n}\n\nmodule.exports = closest;\n\n\n/***/ }),\n\n/***/ 438:\n/***/ (function(module, __unused_webpack_exports, __webpack_require__) {\n\nvar closest = __webpack_require__(828);\n\n/**\n * Delegates event to a selector.\n *\n * @param {Element} element\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @param {Boolean} useCapture\n * @return {Object}\n */\nfunction _delegate(element, selector, type, callback, useCapture) {\n var listenerFn = listener.apply(this, arguments);\n\n element.addEventListener(type, listenerFn, useCapture);\n\n return {\n destroy: function() {\n element.removeEventListener(type, listenerFn, useCapture);\n }\n }\n}\n\n/**\n * Delegates event to a selector.\n *\n * @param {Element|String|Array} [elements]\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @param {Boolean} useCapture\n * @return {Object}\n */\nfunction delegate(elements, selector, type, callback, useCapture) {\n // Handle the regular Element usage\n if (typeof elements.addEventListener === 'function') {\n return _delegate.apply(null, arguments);\n }\n\n // Handle Element-less usage, it defaults to global delegation\n if (typeof type === 'function') {\n // Use `document` as the first parameter, then apply arguments\n // This is a short way to .unshift `arguments` without running into deoptimizations\n return _delegate.bind(null, document).apply(null, arguments);\n }\n\n // Handle Selector-based usage\n if (typeof elements === 'string') {\n elements = document.querySelectorAll(elements);\n }\n\n // Handle Array-like based usage\n return Array.prototype.map.call(elements, function (element) {\n return _delegate(element, selector, type, callback, useCapture);\n });\n}\n\n/**\n * Finds closest match and invokes callback.\n *\n * @param {Element} element\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @return {Function}\n */\nfunction listener(element, selector, type, callback) {\n return function(e) {\n e.delegateTarget = closest(e.target, selector);\n\n if (e.delegateTarget) {\n callback.call(element, e);\n }\n }\n}\n\nmodule.exports = delegate;\n\n\n/***/ }),\n\n/***/ 879:\n/***/ (function(__unused_webpack_module, exports) {\n\n/**\n * Check if argument is a HTML element.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.node = function(value) {\n return value !== undefined\n && 
value instanceof HTMLElement\n && value.nodeType === 1;\n};\n\n/**\n * Check if argument is a list of HTML elements.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.nodeList = function(value) {\n var type = Object.prototype.toString.call(value);\n\n return value !== undefined\n && (type === '[object NodeList]' || type === '[object HTMLCollection]')\n && ('length' in value)\n && (value.length === 0 || exports.node(value[0]));\n};\n\n/**\n * Check if argument is a string.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.string = function(value) {\n return typeof value === 'string'\n || value instanceof String;\n};\n\n/**\n * Check if argument is a function.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.fn = function(value) {\n var type = Object.prototype.toString.call(value);\n\n return type === '[object Function]';\n};\n\n\n/***/ }),\n\n/***/ 370:\n/***/ (function(module, __unused_webpack_exports, __webpack_require__) {\n\nvar is = __webpack_require__(879);\nvar delegate = __webpack_require__(438);\n\n/**\n * Validates all params and calls the right\n * listener function based on its target type.\n *\n * @param {String|HTMLElement|HTMLCollection|NodeList} target\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listen(target, type, callback) {\n if (!target && !type && !callback) {\n throw new Error('Missing required arguments');\n }\n\n if (!is.string(type)) {\n throw new TypeError('Second argument must be a String');\n }\n\n if (!is.fn(callback)) {\n throw new TypeError('Third argument must be a Function');\n }\n\n if (is.node(target)) {\n return listenNode(target, type, callback);\n }\n else if (is.nodeList(target)) {\n return listenNodeList(target, type, callback);\n }\n else if (is.string(target)) {\n return listenSelector(target, type, callback);\n }\n else {\n throw new TypeError('First argument must be a String, HTMLElement, HTMLCollection, or NodeList');\n }\n}\n\n/**\n * Adds an event listener to a HTML element\n * and returns a remove listener function.\n *\n * @param {HTMLElement} node\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listenNode(node, type, callback) {\n node.addEventListener(type, callback);\n\n return {\n destroy: function() {\n node.removeEventListener(type, callback);\n }\n }\n}\n\n/**\n * Add an event listener to a list of HTML elements\n * and returns a remove listener function.\n *\n * @param {NodeList|HTMLCollection} nodeList\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listenNodeList(nodeList, type, callback) {\n Array.prototype.forEach.call(nodeList, function(node) {\n node.addEventListener(type, callback);\n });\n\n return {\n destroy: function() {\n Array.prototype.forEach.call(nodeList, function(node) {\n node.removeEventListener(type, callback);\n });\n }\n }\n}\n\n/**\n * Add an event listener to a selector\n * and returns a remove listener function.\n *\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listenSelector(selector, type, callback) {\n return delegate(document.body, selector, type, callback);\n}\n\nmodule.exports = listen;\n\n\n/***/ }),\n\n/***/ 817:\n/***/ (function(module) {\n\nfunction select(element) {\n var selectedText;\n\n if (element.nodeName === 'SELECT') {\n element.focus();\n\n selectedText = element.value;\n }\n else if (element.nodeName === 'INPUT' || element.nodeName 
=== 'TEXTAREA') {\n var isReadOnly = element.hasAttribute('readonly');\n\n if (!isReadOnly) {\n element.setAttribute('readonly', '');\n }\n\n element.select();\n element.setSelectionRange(0, element.value.length);\n\n if (!isReadOnly) {\n element.removeAttribute('readonly');\n }\n\n selectedText = element.value;\n }\n else {\n if (element.hasAttribute('contenteditable')) {\n element.focus();\n }\n\n var selection = window.getSelection();\n var range = document.createRange();\n\n range.selectNodeContents(element);\n selection.removeAllRanges();\n selection.addRange(range);\n\n selectedText = selection.toString();\n }\n\n return selectedText;\n}\n\nmodule.exports = select;\n\n\n/***/ }),\n\n/***/ 279:\n/***/ (function(module) {\n\nfunction E () {\n // Keep this empty so it's easier to inherit from\n // (via https://github.com/lipsmack from https://github.com/scottcorgan/tiny-emitter/issues/3)\n}\n\nE.prototype = {\n on: function (name, callback, ctx) {\n var e = this.e || (this.e = {});\n\n (e[name] || (e[name] = [])).push({\n fn: callback,\n ctx: ctx\n });\n\n return this;\n },\n\n once: function (name, callback, ctx) {\n var self = this;\n function listener () {\n self.off(name, listener);\n callback.apply(ctx, arguments);\n };\n\n listener._ = callback\n return this.on(name, listener, ctx);\n },\n\n emit: function (name) {\n var data = [].slice.call(arguments, 1);\n var evtArr = ((this.e || (this.e = {}))[name] || []).slice();\n var i = 0;\n var len = evtArr.length;\n\n for (i; i < len; i++) {\n evtArr[i].fn.apply(evtArr[i].ctx, data);\n }\n\n return this;\n },\n\n off: function (name, callback) {\n var e = this.e || (this.e = {});\n var evts = e[name];\n var liveEvents = [];\n\n if (evts && callback) {\n for (var i = 0, len = evts.length; i < len; i++) {\n if (evts[i].fn !== callback && evts[i].fn._ !== callback)\n liveEvents.push(evts[i]);\n }\n }\n\n // Remove event from queue to prevent memory leak\n // Suggested by https://github.com/lazd\n // Ref: https://github.com/scottcorgan/tiny-emitter/commit/c6ebfaa9bc973b33d110a84a307742b7cf94c953#commitcomment-5024910\n\n (liveEvents.length)\n ? 
e[name] = liveEvents\n : delete e[name];\n\n return this;\n }\n};\n\nmodule.exports = E;\nmodule.exports.TinyEmitter = E;\n\n\n/***/ })\n\n/******/ \t});\n/************************************************************************/\n/******/ \t// The module cache\n/******/ \tvar __webpack_module_cache__ = {};\n/******/ \t\n/******/ \t// The require function\n/******/ \tfunction __webpack_require__(moduleId) {\n/******/ \t\t// Check if module is in cache\n/******/ \t\tif(__webpack_module_cache__[moduleId]) {\n/******/ \t\t\treturn __webpack_module_cache__[moduleId].exports;\n/******/ \t\t}\n/******/ \t\t// Create a new module (and put it into the cache)\n/******/ \t\tvar module = __webpack_module_cache__[moduleId] = {\n/******/ \t\t\t// no module.id needed\n/******/ \t\t\t// no module.loaded needed\n/******/ \t\t\texports: {}\n/******/ \t\t};\n/******/ \t\n/******/ \t\t// Execute the module function\n/******/ \t\t__webpack_modules__[moduleId](module, module.exports, __webpack_require__);\n/******/ \t\n/******/ \t\t// Return the exports of the module\n/******/ \t\treturn module.exports;\n/******/ \t}\n/******/ \t\n/************************************************************************/\n/******/ \t/* webpack/runtime/compat get default export */\n/******/ \t!function() {\n/******/ \t\t// getDefaultExport function for compatibility with non-harmony modules\n/******/ \t\t__webpack_require__.n = function(module) {\n/******/ \t\t\tvar getter = module && module.__esModule ?\n/******/ \t\t\t\tfunction() { return module['default']; } :\n/******/ \t\t\t\tfunction() { return module; };\n/******/ \t\t\t__webpack_require__.d(getter, { a: getter });\n/******/ \t\t\treturn getter;\n/******/ \t\t};\n/******/ \t}();\n/******/ \t\n/******/ \t/* webpack/runtime/define property getters */\n/******/ \t!function() {\n/******/ \t\t// define getter functions for harmony exports\n/******/ \t\t__webpack_require__.d = function(exports, definition) {\n/******/ \t\t\tfor(var key in definition) {\n/******/ \t\t\t\tif(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n/******/ \t\t\t\t\tObject.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n/******/ \t\t\t\t}\n/******/ \t\t\t}\n/******/ \t\t};\n/******/ \t}();\n/******/ \t\n/******/ \t/* webpack/runtime/hasOwnProperty shorthand */\n/******/ \t!function() {\n/******/ \t\t__webpack_require__.o = function(obj, prop) { return Object.prototype.hasOwnProperty.call(obj, prop); }\n/******/ \t}();\n/******/ \t\n/************************************************************************/\n/******/ \t// module exports must be returned from runtime so entry inlining is disabled\n/******/ \t// startup\n/******/ \t// Load entry module and return exports\n/******/ \treturn __webpack_require__(686);\n/******/ })()\n.default;\n});", "/*\n * Copyright (c) 2016-2025 Martin Donath \n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF 
ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport \"focus-visible\"\n\nimport {\n EMPTY,\n NEVER,\n Observable,\n Subject,\n defer,\n delay,\n filter,\n map,\n merge,\n mergeWith,\n shareReplay,\n switchMap\n} from \"rxjs\"\n\nimport { configuration, feature } from \"./_\"\nimport {\n at,\n getActiveElement,\n getOptionalElement,\n requestJSON,\n setLocation,\n setToggle,\n watchDocument,\n watchKeyboard,\n watchLocation,\n watchLocationTarget,\n watchMedia,\n watchPrint,\n watchScript,\n watchViewport\n} from \"./browser\"\nimport {\n getComponentElement,\n getComponentElements,\n mountAnnounce,\n mountBackToTop,\n mountConsent,\n mountContent,\n mountDialog,\n mountHeader,\n mountHeaderTitle,\n mountPalette,\n mountProgress,\n mountSearch,\n mountSearchHiglight,\n mountSidebar,\n mountSource,\n mountTableOfContents,\n mountTabs,\n watchHeader,\n watchMain\n} from \"./components\"\nimport {\n SearchIndex,\n setupClipboardJS,\n setupInstantNavigation,\n setupVersionSelector\n} from \"./integrations\"\nimport {\n patchEllipsis,\n patchIndeterminate,\n patchScrollfix,\n patchScrolllock\n} from \"./patches\"\nimport \"./polyfills\"\n\n/* ----------------------------------------------------------------------------\n * Functions - @todo refactor\n * ------------------------------------------------------------------------- */\n\n/**\n * Fetch search index\n *\n * @returns Search index observable\n */\nfunction fetchSearchIndex(): Observable {\n if (location.protocol === \"file:\") {\n return watchScript(\n `${new URL(\"search/search_index.js\", config.base)}`\n )\n .pipe(\n // @ts-ignore - @todo fix typings\n map(() => __index),\n shareReplay(1)\n )\n } else {\n return requestJSON(\n new URL(\"search/search_index.json\", config.base)\n )\n }\n}\n\n/* ----------------------------------------------------------------------------\n * Application\n * ------------------------------------------------------------------------- */\n\n/* Yay, JavaScript is available */\ndocument.documentElement.classList.remove(\"no-js\")\ndocument.documentElement.classList.add(\"js\")\n\n/* Set up navigation observables and subjects */\nconst document$ = watchDocument()\nconst location$ = watchLocation()\nconst target$ = watchLocationTarget(location$)\nconst keyboard$ = watchKeyboard()\n\n/* Set up media observables */\nconst viewport$ = watchViewport()\nconst tablet$ = watchMedia(\"(min-width: 960px)\")\nconst screen$ = watchMedia(\"(min-width: 1220px)\")\nconst print$ = watchPrint()\n\n/* Retrieve search index, if search is enabled */\nconst config = configuration()\nconst index$ = document.forms.namedItem(\"search\")\n ? 
fetchSearchIndex()\n : NEVER\n\n/* Set up Clipboard.js integration */\nconst alert$ = new Subject()\nsetupClipboardJS({ alert$ })\n\n/* Set up progress indicator */\nconst progress$ = new Subject()\n\n/* Set up instant navigation, if enabled */\nif (feature(\"navigation.instant\"))\n setupInstantNavigation({ location$, viewport$, progress$ })\n .subscribe(document$)\n\n/* Set up version selector */\nif (config.version?.provider === \"mike\")\n setupVersionSelector({ document$ })\n\n/* Always close drawer and search on navigation */\nmerge(location$, target$)\n .pipe(\n delay(125)\n )\n .subscribe(() => {\n setToggle(\"drawer\", false)\n setToggle(\"search\", false)\n })\n\n/* Set up global keyboard handlers */\nkeyboard$\n .pipe(\n filter(({ mode }) => mode === \"global\")\n )\n .subscribe(key => {\n switch (key.type) {\n\n /* Go to previous page */\n case \"p\":\n case \",\":\n const prev = getOptionalElement(\"link[rel=prev]\")\n if (typeof prev !== \"undefined\")\n setLocation(prev)\n break\n\n /* Go to next page */\n case \"n\":\n case \".\":\n const next = getOptionalElement(\"link[rel=next]\")\n if (typeof next !== \"undefined\")\n setLocation(next)\n break\n\n /* Expand navigation, see https://bit.ly/3ZjG5io */\n case \"Enter\":\n const active = getActiveElement()\n if (active instanceof HTMLLabelElement)\n active.click()\n }\n })\n\n/* Set up patches */\npatchEllipsis({ viewport$, document$ })\npatchIndeterminate({ document$, tablet$ })\npatchScrollfix({ document$ })\npatchScrolllock({ viewport$, tablet$ })\n\n/* Set up header and main area observable */\nconst header$ = watchHeader(getComponentElement(\"header\"), { viewport$ })\nconst main$ = document$\n .pipe(\n map(() => getComponentElement(\"main\")),\n switchMap(el => watchMain(el, { viewport$, header$ })),\n shareReplay(1)\n )\n\n/* Set up control component observables */\nconst control$ = merge(\n\n /* Consent */\n ...getComponentElements(\"consent\")\n .map(el => mountConsent(el, { target$ })),\n\n /* Dialog */\n ...getComponentElements(\"dialog\")\n .map(el => mountDialog(el, { alert$ })),\n\n /* Color palette */\n ...getComponentElements(\"palette\")\n .map(el => mountPalette(el)),\n\n /* Progress bar */\n ...getComponentElements(\"progress\")\n .map(el => mountProgress(el, { progress$ })),\n\n /* Search */\n ...getComponentElements(\"search\")\n .map(el => mountSearch(el, { index$, keyboard$ })),\n\n /* Repository information */\n ...getComponentElements(\"source\")\n .map(el => mountSource(el))\n)\n\n/* Set up content component observables */\nconst content$ = defer(() => merge(\n\n /* Announcement bar */\n ...getComponentElements(\"announce\")\n .map(el => mountAnnounce(el)),\n\n /* Content */\n ...getComponentElements(\"content\")\n .map(el => mountContent(el, { viewport$, target$, print$ })),\n\n /* Search highlighting */\n ...getComponentElements(\"content\")\n .map(el => feature(\"search.highlight\")\n ? mountSearchHiglight(el, { index$, location$ })\n : EMPTY\n ),\n\n /* Header */\n ...getComponentElements(\"header\")\n .map(el => mountHeader(el, { viewport$, header$, main$ })),\n\n /* Header title */\n ...getComponentElements(\"header-title\")\n .map(el => mountHeaderTitle(el, { viewport$, header$ })),\n\n /* Sidebar */\n ...getComponentElements(\"sidebar\")\n .map(el => el.getAttribute(\"data-md-type\") === \"navigation\"\n ? 
at(screen$, () => mountSidebar(el, { viewport$, header$, main$ }))\n : at(tablet$, () => mountSidebar(el, { viewport$, header$, main$ }))\n ),\n\n /* Navigation tabs */\n ...getComponentElements(\"tabs\")\n .map(el => mountTabs(el, { viewport$, header$ })),\n\n /* Table of contents */\n ...getComponentElements(\"toc\")\n .map(el => mountTableOfContents(el, {\n viewport$, header$, main$, target$\n })),\n\n /* Back-to-top button */\n ...getComponentElements(\"top\")\n .map(el => mountBackToTop(el, { viewport$, header$, main$, target$ }))\n))\n\n/* Set up component observables */\nconst component$ = document$\n .pipe(\n switchMap(() => content$),\n mergeWith(control$),\n shareReplay(1)\n )\n\n/* Subscribe to all components */\ncomponent$.subscribe()\n\n/* ----------------------------------------------------------------------------\n * Exports\n * ------------------------------------------------------------------------- */\n\nwindow.document$ = document$ /* Document observable */\nwindow.location$ = location$ /* Location subject */\nwindow.target$ = target$ /* Location target observable */\nwindow.keyboard$ = keyboard$ /* Keyboard observable */\nwindow.viewport$ = viewport$ /* Viewport observable */\nwindow.tablet$ = tablet$ /* Media tablet observable */\nwindow.screen$ = screen$ /* Media screen observable */\nwindow.print$ = print$ /* Media print observable */\nwindow.alert$ = alert$ /* Alert subject */\nwindow.progress$ = progress$ /* Progress indicator subject */\nwindow.component$ = component$ /* Component observable */\n", "/******************************************************************************\nCopyright (c) Microsoft Corporation.\n\nPermission to use, copy, modify, and/or distribute this software for any\npurpose with or without fee is hereby granted.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH\nREGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY\nAND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,\nINDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM\nLOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR\nOTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR\nPERFORMANCE OF THIS SOFTWARE.\n***************************************************************************** */\n/* global Reflect, Promise, SuppressedError, Symbol, Iterator */\n\nvar extendStatics = function(d, b) {\n extendStatics = Object.setPrototypeOf ||\n ({ __proto__: [] } instanceof Array && function (d, b) { d.__proto__ = b; }) ||\n function (d, b) { for (var p in b) if (Object.prototype.hasOwnProperty.call(b, p)) d[p] = b[p]; };\n return extendStatics(d, b);\n};\n\nexport function __extends(d, b) {\n if (typeof b !== \"function\" && b !== null)\n throw new TypeError(\"Class extends value \" + String(b) + \" is not a constructor or null\");\n extendStatics(d, b);\n function __() { this.constructor = d; }\n d.prototype = b === null ? 
Object.create(b) : (__.prototype = b.prototype, new __());\n}\n\nexport var __assign = function() {\n __assign = Object.assign || function __assign(t) {\n for (var s, i = 1, n = arguments.length; i < n; i++) {\n s = arguments[i];\n for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p)) t[p] = s[p];\n }\n return t;\n }\n return __assign.apply(this, arguments);\n}\n\nexport function __rest(s, e) {\n var t = {};\n for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p) && e.indexOf(p) < 0)\n t[p] = s[p];\n if (s != null && typeof Object.getOwnPropertySymbols === \"function\")\n for (var i = 0, p = Object.getOwnPropertySymbols(s); i < p.length; i++) {\n if (e.indexOf(p[i]) < 0 && Object.prototype.propertyIsEnumerable.call(s, p[i]))\n t[p[i]] = s[p[i]];\n }\n return t;\n}\n\nexport function __decorate(decorators, target, key, desc) {\n var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;\n if (typeof Reflect === \"object\" && typeof Reflect.decorate === \"function\") r = Reflect.decorate(decorators, target, key, desc);\n else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;\n return c > 3 && r && Object.defineProperty(target, key, r), r;\n}\n\nexport function __param(paramIndex, decorator) {\n return function (target, key) { decorator(target, key, paramIndex); }\n}\n\nexport function __esDecorate(ctor, descriptorIn, decorators, contextIn, initializers, extraInitializers) {\n function accept(f) { if (f !== void 0 && typeof f !== \"function\") throw new TypeError(\"Function expected\"); return f; }\n var kind = contextIn.kind, key = kind === \"getter\" ? \"get\" : kind === \"setter\" ? \"set\" : \"value\";\n var target = !descriptorIn && ctor ? contextIn[\"static\"] ? ctor : ctor.prototype : null;\n var descriptor = descriptorIn || (target ? Object.getOwnPropertyDescriptor(target, contextIn.name) : {});\n var _, done = false;\n for (var i = decorators.length - 1; i >= 0; i--) {\n var context = {};\n for (var p in contextIn) context[p] = p === \"access\" ? {} : contextIn[p];\n for (var p in contextIn.access) context.access[p] = contextIn.access[p];\n context.addInitializer = function (f) { if (done) throw new TypeError(\"Cannot add initializers after decoration has completed\"); extraInitializers.push(accept(f || null)); };\n var result = (0, decorators[i])(kind === \"accessor\" ? { get: descriptor.get, set: descriptor.set } : descriptor[key], context);\n if (kind === \"accessor\") {\n if (result === void 0) continue;\n if (result === null || typeof result !== \"object\") throw new TypeError(\"Object expected\");\n if (_ = accept(result.get)) descriptor.get = _;\n if (_ = accept(result.set)) descriptor.set = _;\n if (_ = accept(result.init)) initializers.unshift(_);\n }\n else if (_ = accept(result)) {\n if (kind === \"field\") initializers.unshift(_);\n else descriptor[key] = _;\n }\n }\n if (target) Object.defineProperty(target, contextIn.name, descriptor);\n done = true;\n};\n\nexport function __runInitializers(thisArg, initializers, value) {\n var useValue = arguments.length > 2;\n for (var i = 0; i < initializers.length; i++) {\n value = useValue ? initializers[i].call(thisArg, value) : initializers[i].call(thisArg);\n }\n return useValue ? value : void 0;\n};\n\nexport function __propKey(x) {\n return typeof x === \"symbol\" ? 
x : \"\".concat(x);\n};\n\nexport function __setFunctionName(f, name, prefix) {\n if (typeof name === \"symbol\") name = name.description ? \"[\".concat(name.description, \"]\") : \"\";\n return Object.defineProperty(f, \"name\", { configurable: true, value: prefix ? \"\".concat(prefix, \" \", name) : name });\n};\n\nexport function __metadata(metadataKey, metadataValue) {\n if (typeof Reflect === \"object\" && typeof Reflect.metadata === \"function\") return Reflect.metadata(metadataKey, metadataValue);\n}\n\nexport function __awaiter(thisArg, _arguments, P, generator) {\n function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }\n return new (P || (P = Promise))(function (resolve, reject) {\n function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }\n function rejected(value) { try { step(generator[\"throw\"](value)); } catch (e) { reject(e); } }\n function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }\n step((generator = generator.apply(thisArg, _arguments || [])).next());\n });\n}\n\nexport function __generator(thisArg, body) {\n var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g = Object.create((typeof Iterator === \"function\" ? Iterator : Object).prototype);\n return g.next = verb(0), g[\"throw\"] = verb(1), g[\"return\"] = verb(2), typeof Symbol === \"function\" && (g[Symbol.iterator] = function() { return this; }), g;\n function verb(n) { return function (v) { return step([n, v]); }; }\n function step(op) {\n if (f) throw new TypeError(\"Generator is already executing.\");\n while (g && (g = 0, op[0] && (_ = 0)), _) try {\n if (f = 1, y && (t = op[0] & 2 ? y[\"return\"] : op[0] ? y[\"throw\"] || ((t = y[\"return\"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;\n if (y = 0, t) op = [op[0] & 2, t.value];\n switch (op[0]) {\n case 0: case 1: t = op; break;\n case 4: _.label++; return { value: op[1], done: false };\n case 5: _.label++; y = op[1]; op = [0]; continue;\n case 7: op = _.ops.pop(); _.trys.pop(); continue;\n default:\n if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }\n if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }\n if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }\n if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }\n if (t[2]) _.ops.pop();\n _.trys.pop(); continue;\n }\n op = body.call(thisArg, _);\n } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }\n if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };\n }\n}\n\nexport var __createBinding = Object.create ? (function(o, m, k, k2) {\n if (k2 === undefined) k2 = k;\n var desc = Object.getOwnPropertyDescriptor(m, k);\n if (!desc || (\"get\" in desc ? 
!m.__esModule : desc.writable || desc.configurable)) {\n desc = { enumerable: true, get: function() { return m[k]; } };\n }\n Object.defineProperty(o, k2, desc);\n}) : (function(o, m, k, k2) {\n if (k2 === undefined) k2 = k;\n o[k2] = m[k];\n});\n\nexport function __exportStar(m, o) {\n for (var p in m) if (p !== \"default\" && !Object.prototype.hasOwnProperty.call(o, p)) __createBinding(o, m, p);\n}\n\nexport function __values(o) {\n var s = typeof Symbol === \"function\" && Symbol.iterator, m = s && o[s], i = 0;\n if (m) return m.call(o);\n if (o && typeof o.length === \"number\") return {\n next: function () {\n if (o && i >= o.length) o = void 0;\n return { value: o && o[i++], done: !o };\n }\n };\n throw new TypeError(s ? \"Object is not iterable.\" : \"Symbol.iterator is not defined.\");\n}\n\nexport function __read(o, n) {\n var m = typeof Symbol === \"function\" && o[Symbol.iterator];\n if (!m) return o;\n var i = m.call(o), r, ar = [], e;\n try {\n while ((n === void 0 || n-- > 0) && !(r = i.next()).done) ar.push(r.value);\n }\n catch (error) { e = { error: error }; }\n finally {\n try {\n if (r && !r.done && (m = i[\"return\"])) m.call(i);\n }\n finally { if (e) throw e.error; }\n }\n return ar;\n}\n\n/** @deprecated */\nexport function __spread() {\n for (var ar = [], i = 0; i < arguments.length; i++)\n ar = ar.concat(__read(arguments[i]));\n return ar;\n}\n\n/** @deprecated */\nexport function __spreadArrays() {\n for (var s = 0, i = 0, il = arguments.length; i < il; i++) s += arguments[i].length;\n for (var r = Array(s), k = 0, i = 0; i < il; i++)\n for (var a = arguments[i], j = 0, jl = a.length; j < jl; j++, k++)\n r[k] = a[j];\n return r;\n}\n\nexport function __spreadArray(to, from, pack) {\n if (pack || arguments.length === 2) for (var i = 0, l = from.length, ar; i < l; i++) {\n if (ar || !(i in from)) {\n if (!ar) ar = Array.prototype.slice.call(from, 0, i);\n ar[i] = from[i];\n }\n }\n return to.concat(ar || Array.prototype.slice.call(from));\n}\n\nexport function __await(v) {\n return this instanceof __await ? (this.v = v, this) : new __await(v);\n}\n\nexport function __asyncGenerator(thisArg, _arguments, generator) {\n if (!Symbol.asyncIterator) throw new TypeError(\"Symbol.asyncIterator is not defined.\");\n var g = generator.apply(thisArg, _arguments || []), i, q = [];\n return i = Object.create((typeof AsyncIterator === \"function\" ? AsyncIterator : Object).prototype), verb(\"next\"), verb(\"throw\"), verb(\"return\", awaitReturn), i[Symbol.asyncIterator] = function () { return this; }, i;\n function awaitReturn(f) { return function (v) { return Promise.resolve(v).then(f, reject); }; }\n function verb(n, f) { if (g[n]) { i[n] = function (v) { return new Promise(function (a, b) { q.push([n, v, a, b]) > 1 || resume(n, v); }); }; if (f) i[n] = f(i[n]); } }\n function resume(n, v) { try { step(g[n](v)); } catch (e) { settle(q[0][3], e); } }\n function step(r) { r.value instanceof __await ? Promise.resolve(r.value.v).then(fulfill, reject) : settle(q[0][2], r); }\n function fulfill(value) { resume(\"next\", value); }\n function reject(value) { resume(\"throw\", value); }\n function settle(f, v) { if (f(v), q.shift(), q.length) resume(q[0][0], q[0][1]); }\n}\n\nexport function __asyncDelegator(o) {\n var i, p;\n return i = {}, verb(\"next\"), verb(\"throw\", function (e) { throw e; }), verb(\"return\"), i[Symbol.iterator] = function () { return this; }, i;\n function verb(n, f) { i[n] = o[n] ? function (v) { return (p = !p) ? 
{ value: __await(o[n](v)), done: false } : f ? f(v) : v; } : f; }\n}\n\nexport function __asyncValues(o) {\n if (!Symbol.asyncIterator) throw new TypeError(\"Symbol.asyncIterator is not defined.\");\n var m = o[Symbol.asyncIterator], i;\n return m ? m.call(o) : (o = typeof __values === \"function\" ? __values(o) : o[Symbol.iterator](), i = {}, verb(\"next\"), verb(\"throw\"), verb(\"return\"), i[Symbol.asyncIterator] = function () { return this; }, i);\n function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }\n function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }\n}\n\nexport function __makeTemplateObject(cooked, raw) {\n if (Object.defineProperty) { Object.defineProperty(cooked, \"raw\", { value: raw }); } else { cooked.raw = raw; }\n return cooked;\n};\n\nvar __setModuleDefault = Object.create ? (function(o, v) {\n Object.defineProperty(o, \"default\", { enumerable: true, value: v });\n}) : function(o, v) {\n o[\"default\"] = v;\n};\n\nexport function __importStar(mod) {\n if (mod && mod.__esModule) return mod;\n var result = {};\n if (mod != null) for (var k in mod) if (k !== \"default\" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);\n __setModuleDefault(result, mod);\n return result;\n}\n\nexport function __importDefault(mod) {\n return (mod && mod.__esModule) ? mod : { default: mod };\n}\n\nexport function __classPrivateFieldGet(receiver, state, kind, f) {\n if (kind === \"a\" && !f) throw new TypeError(\"Private accessor was defined without a getter\");\n if (typeof state === \"function\" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError(\"Cannot read private member from an object whose class did not declare it\");\n return kind === \"m\" ? f : kind === \"a\" ? f.call(receiver) : f ? f.value : state.get(receiver);\n}\n\nexport function __classPrivateFieldSet(receiver, state, value, kind, f) {\n if (kind === \"m\") throw new TypeError(\"Private method is not writable\");\n if (kind === \"a\" && !f) throw new TypeError(\"Private accessor was defined without a setter\");\n if (typeof state === \"function\" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError(\"Cannot write private member to an object whose class did not declare it\");\n return (kind === \"a\" ? f.call(receiver, value) : f ? f.value = value : state.set(receiver, value)), value;\n}\n\nexport function __classPrivateFieldIn(state, receiver) {\n if (receiver === null || (typeof receiver !== \"object\" && typeof receiver !== \"function\")) throw new TypeError(\"Cannot use 'in' operator on non-object\");\n return typeof state === \"function\" ? 
receiver === state : state.has(receiver);\n}\n\nexport function __addDisposableResource(env, value, async) {\n if (value !== null && value !== void 0) {\n if (typeof value !== \"object\" && typeof value !== \"function\") throw new TypeError(\"Object expected.\");\n var dispose, inner;\n if (async) {\n if (!Symbol.asyncDispose) throw new TypeError(\"Symbol.asyncDispose is not defined.\");\n dispose = value[Symbol.asyncDispose];\n }\n if (dispose === void 0) {\n if (!Symbol.dispose) throw new TypeError(\"Symbol.dispose is not defined.\");\n dispose = value[Symbol.dispose];\n if (async) inner = dispose;\n }\n if (typeof dispose !== \"function\") throw new TypeError(\"Object not disposable.\");\n if (inner) dispose = function() { try { inner.call(this); } catch (e) { return Promise.reject(e); } };\n env.stack.push({ value: value, dispose: dispose, async: async });\n }\n else if (async) {\n env.stack.push({ async: true });\n }\n return value;\n}\n\nvar _SuppressedError = typeof SuppressedError === \"function\" ? SuppressedError : function (error, suppressed, message) {\n var e = new Error(message);\n return e.name = \"SuppressedError\", e.error = error, e.suppressed = suppressed, e;\n};\n\nexport function __disposeResources(env) {\n function fail(e) {\n env.error = env.hasError ? new _SuppressedError(e, env.error, \"An error was suppressed during disposal.\") : e;\n env.hasError = true;\n }\n var r, s = 0;\n function next() {\n while (r = env.stack.pop()) {\n try {\n if (!r.async && s === 1) return s = 0, env.stack.push(r), Promise.resolve().then(next);\n if (r.dispose) {\n var result = r.dispose.call(r.value);\n if (r.async) return s |= 2, Promise.resolve(result).then(next, function(e) { fail(e); return next(); });\n }\n else s |= 1;\n }\n catch (e) {\n fail(e);\n }\n }\n if (s === 1) return env.hasError ? Promise.reject(env.error) : Promise.resolve();\n if (env.hasError) throw env.error;\n }\n return next();\n}\n\nexport default {\n __extends,\n __assign,\n __rest,\n __decorate,\n __param,\n __metadata,\n __awaiter,\n __generator,\n __createBinding,\n __exportStar,\n __values,\n __read,\n __spread,\n __spreadArrays,\n __spreadArray,\n __await,\n __asyncGenerator,\n __asyncDelegator,\n __asyncValues,\n __makeTemplateObject,\n __importStar,\n __importDefault,\n __classPrivateFieldGet,\n __classPrivateFieldSet,\n __classPrivateFieldIn,\n __addDisposableResource,\n __disposeResources,\n};\n", "/**\n * Returns true if the object is a function.\n * @param value The value to check\n */\nexport function isFunction(value: any): value is (...args: any[]) => any {\n return typeof value === 'function';\n}\n", "/**\n * Used to create Error subclasses until the community moves away from ES5.\n *\n * This is because compiling from TypeScript down to ES5 has issues with subclassing Errors\n * as well as other built-in types: https://github.com/Microsoft/TypeScript/issues/12123\n *\n * @param createImpl A factory function to create the actual constructor implementation. 
The returned\n * function should be a named function that calls `_super` internally.\n */\nexport function createErrorClass(createImpl: (_super: any) => any): T {\n const _super = (instance: any) => {\n Error.call(instance);\n instance.stack = new Error().stack;\n };\n\n const ctorFunc = createImpl(_super);\n ctorFunc.prototype = Object.create(Error.prototype);\n ctorFunc.prototype.constructor = ctorFunc;\n return ctorFunc;\n}\n", "import { createErrorClass } from './createErrorClass';\n\nexport interface UnsubscriptionError extends Error {\n readonly errors: any[];\n}\n\nexport interface UnsubscriptionErrorCtor {\n /**\n * @deprecated Internal implementation detail. Do not construct error instances.\n * Cannot be tagged as internal: https://github.com/ReactiveX/rxjs/issues/6269\n */\n new (errors: any[]): UnsubscriptionError;\n}\n\n/**\n * An error thrown when one or more errors have occurred during the\n * `unsubscribe` of a {@link Subscription}.\n */\nexport const UnsubscriptionError: UnsubscriptionErrorCtor = createErrorClass(\n (_super) =>\n function UnsubscriptionErrorImpl(this: any, errors: (Error | string)[]) {\n _super(this);\n this.message = errors\n ? `${errors.length} errors occurred during unsubscription:\n${errors.map((err, i) => `${i + 1}) ${err.toString()}`).join('\\n ')}`\n : '';\n this.name = 'UnsubscriptionError';\n this.errors = errors;\n }\n);\n", "/**\n * Removes an item from an array, mutating it.\n * @param arr The array to remove the item from\n * @param item The item to remove\n */\nexport function arrRemove(arr: T[] | undefined | null, item: T) {\n if (arr) {\n const index = arr.indexOf(item);\n 0 <= index && arr.splice(index, 1);\n }\n}\n", "import { isFunction } from './util/isFunction';\nimport { UnsubscriptionError } from './util/UnsubscriptionError';\nimport { SubscriptionLike, TeardownLogic, Unsubscribable } from './types';\nimport { arrRemove } from './util/arrRemove';\n\n/**\n * Represents a disposable resource, such as the execution of an Observable. A\n * Subscription has one important method, `unsubscribe`, that takes no argument\n * and just disposes the resource held by the subscription.\n *\n * Additionally, subscriptions may be grouped together through the `add()`\n * method, which will attach a child Subscription to the current Subscription.\n * When a Subscription is unsubscribed, all its children (and its grandchildren)\n * will be unsubscribed as well.\n *\n * @class Subscription\n */\nexport class Subscription implements SubscriptionLike {\n /** @nocollapse */\n public static EMPTY = (() => {\n const empty = new Subscription();\n empty.closed = true;\n return empty;\n })();\n\n /**\n * A flag to indicate whether this Subscription has already been unsubscribed.\n */\n public closed = false;\n\n private _parentage: Subscription[] | Subscription | null = null;\n\n /**\n * The list of registered finalizers to execute upon unsubscription. Adding and removing from this\n * list occurs in the {@link #add} and {@link #remove} methods.\n */\n private _finalizers: Exclude[] | null = null;\n\n /**\n * @param initialTeardown A function executed first as part of the finalization\n * process that is kicked off when {@link #unsubscribe} is called.\n */\n constructor(private initialTeardown?: () => void) {}\n\n /**\n * Disposes the resources held by the subscription. 
May, for instance, cancel\n * an ongoing Observable execution or cancel any other type of work that\n * started when the Subscription was created.\n * @return {void}\n */\n unsubscribe(): void {\n let errors: any[] | undefined;\n\n if (!this.closed) {\n this.closed = true;\n\n // Remove this from it's parents.\n const { _parentage } = this;\n if (_parentage) {\n this._parentage = null;\n if (Array.isArray(_parentage)) {\n for (const parent of _parentage) {\n parent.remove(this);\n }\n } else {\n _parentage.remove(this);\n }\n }\n\n const { initialTeardown: initialFinalizer } = this;\n if (isFunction(initialFinalizer)) {\n try {\n initialFinalizer();\n } catch (e) {\n errors = e instanceof UnsubscriptionError ? e.errors : [e];\n }\n }\n\n const { _finalizers } = this;\n if (_finalizers) {\n this._finalizers = null;\n for (const finalizer of _finalizers) {\n try {\n execFinalizer(finalizer);\n } catch (err) {\n errors = errors ?? [];\n if (err instanceof UnsubscriptionError) {\n errors = [...errors, ...err.errors];\n } else {\n errors.push(err);\n }\n }\n }\n }\n\n if (errors) {\n throw new UnsubscriptionError(errors);\n }\n }\n }\n\n /**\n * Adds a finalizer to this subscription, so that finalization will be unsubscribed/called\n * when this subscription is unsubscribed. If this subscription is already {@link #closed},\n * because it has already been unsubscribed, then whatever finalizer is passed to it\n * will automatically be executed (unless the finalizer itself is also a closed subscription).\n *\n * Closed Subscriptions cannot be added as finalizers to any subscription. Adding a closed\n * subscription to a any subscription will result in no operation. (A noop).\n *\n * Adding a subscription to itself, or adding `null` or `undefined` will not perform any\n * operation at all. (A noop).\n *\n * `Subscription` instances that are added to this instance will automatically remove themselves\n * if they are unsubscribed. Functions and {@link Unsubscribable} objects that you wish to remove\n * will need to be removed manually with {@link #remove}\n *\n * @param teardown The finalization logic to add to this subscription.\n */\n add(teardown: TeardownLogic): void {\n // Only add the finalizer if it's not undefined\n // and don't add a subscription to itself.\n if (teardown && teardown !== this) {\n if (this.closed) {\n // If this subscription is already closed,\n // execute whatever finalizer is handed to it automatically.\n execFinalizer(teardown);\n } else {\n if (teardown instanceof Subscription) {\n // We don't add closed subscriptions, and we don't add the same subscription\n // twice. Subscription unsubscribe is idempotent.\n if (teardown.closed || teardown._hasParent(this)) {\n return;\n }\n teardown._addParent(this);\n }\n (this._finalizers = this._finalizers ?? 
[]).push(teardown);\n }\n }\n }\n\n /**\n * Checks to see if a this subscription already has a particular parent.\n * This will signal that this subscription has already been added to the parent in question.\n * @param parent the parent to check for\n */\n private _hasParent(parent: Subscription) {\n const { _parentage } = this;\n return _parentage === parent || (Array.isArray(_parentage) && _parentage.includes(parent));\n }\n\n /**\n * Adds a parent to this subscription so it can be removed from the parent if it\n * unsubscribes on it's own.\n *\n * NOTE: THIS ASSUMES THAT {@link _hasParent} HAS ALREADY BEEN CHECKED.\n * @param parent The parent subscription to add\n */\n private _addParent(parent: Subscription) {\n const { _parentage } = this;\n this._parentage = Array.isArray(_parentage) ? (_parentage.push(parent), _parentage) : _parentage ? [_parentage, parent] : parent;\n }\n\n /**\n * Called on a child when it is removed via {@link #remove}.\n * @param parent The parent to remove\n */\n private _removeParent(parent: Subscription) {\n const { _parentage } = this;\n if (_parentage === parent) {\n this._parentage = null;\n } else if (Array.isArray(_parentage)) {\n arrRemove(_parentage, parent);\n }\n }\n\n /**\n * Removes a finalizer from this subscription that was previously added with the {@link #add} method.\n *\n * Note that `Subscription` instances, when unsubscribed, will automatically remove themselves\n * from every other `Subscription` they have been added to. This means that using the `remove` method\n * is not a common thing and should be used thoughtfully.\n *\n * If you add the same finalizer instance of a function or an unsubscribable object to a `Subscription` instance\n * more than once, you will need to call `remove` the same number of times to remove all instances.\n *\n * All finalizer instances are removed to free up memory upon unsubscription.\n *\n * @param teardown The finalizer to remove from this subscription\n */\n remove(teardown: Exclude): void {\n const { _finalizers } = this;\n _finalizers && arrRemove(_finalizers, teardown);\n\n if (teardown instanceof Subscription) {\n teardown._removeParent(this);\n }\n }\n}\n\nexport const EMPTY_SUBSCRIPTION = Subscription.EMPTY;\n\nexport function isSubscription(value: any): value is Subscription {\n return (\n value instanceof Subscription ||\n (value && 'closed' in value && isFunction(value.remove) && isFunction(value.add) && isFunction(value.unsubscribe))\n );\n}\n\nfunction execFinalizer(finalizer: Unsubscribable | (() => void)) {\n if (isFunction(finalizer)) {\n finalizer();\n } else {\n finalizer.unsubscribe();\n }\n}\n", "import { Subscriber } from './Subscriber';\nimport { ObservableNotification } from './types';\n\n/**\n * The {@link GlobalConfig} object for RxJS. It is used to configure things\n * like how to react on unhandled errors.\n */\nexport const config: GlobalConfig = {\n onUnhandledError: null,\n onStoppedNotification: null,\n Promise: undefined,\n useDeprecatedSynchronousErrorHandling: false,\n useDeprecatedNextContext: false,\n};\n\n/**\n * The global configuration object for RxJS, used to configure things\n * like how to react on unhandled errors. Accessible via {@link config}\n * object.\n */\nexport interface GlobalConfig {\n /**\n * A registration point for unhandled errors from RxJS. These are errors that\n * cannot were not handled by consuming code in the usual subscription path. 
For\n * example, if you have this configured, and you subscribe to an observable without\n * providing an error handler, errors from that subscription will end up here. This\n * will _always_ be called asynchronously on another job in the runtime. This is because\n * we do not want errors thrown in this user-configured handler to interfere with the\n * behavior of the library.\n */\n onUnhandledError: ((err: any) => void) | null;\n\n /**\n * A registration point for notifications that cannot be sent to subscribers because they\n * have completed, errored or have been explicitly unsubscribed. By default, next, complete\n * and error notifications sent to stopped subscribers are noops. However, sometimes callers\n * might want a different behavior. For example, with sources that attempt to report errors\n * to stopped subscribers, a caller can configure RxJS to throw an unhandled error instead.\n * This will _always_ be called asynchronously on another job in the runtime. This is because\n * we do not want errors thrown in this user-configured handler to interfere with the\n * behavior of the library.\n */\n onStoppedNotification: ((notification: ObservableNotification, subscriber: Subscriber) => void) | null;\n\n /**\n * The promise constructor used by default for {@link Observable#toPromise toPromise} and {@link Observable#forEach forEach}\n * methods.\n *\n * @deprecated As of version 8, RxJS will no longer support this sort of injection of a\n * Promise constructor. If you need a Promise implementation other than native promises,\n * please polyfill/patch Promise as you see appropriate. Will be removed in v8.\n */\n Promise?: PromiseConstructorLike;\n\n /**\n * If true, turns on synchronous error rethrowing, which is a deprecated behavior\n * in v6 and higher. This behavior enables bad patterns like wrapping a subscribe\n * call in a try/catch block. It also enables producer interference, a nasty bug\n * where a multicast can be broken for all observers by a downstream consumer with\n * an unhandled error. DO NOT USE THIS FLAG UNLESS IT'S NEEDED TO BUY TIME\n * FOR MIGRATION REASONS.\n *\n * @deprecated As of version 8, RxJS will no longer support synchronous throwing\n * of unhandled errors. All errors will be thrown on a separate call stack to prevent bad\n * behaviors described above. Will be removed in v8.\n */\n useDeprecatedSynchronousErrorHandling: boolean;\n\n /**\n * If true, enables an as-of-yet undocumented feature from v5: The ability to access\n * `unsubscribe()` via `this` context in `next` functions created in observers passed\n * to `subscribe`.\n *\n * This is being removed because the performance was severely problematic, and it could also cause\n * issues when types other than POJOs are passed to subscribe as subscribers, as they will likely have\n * their `this` context overwritten.\n *\n * @deprecated As of version 8, RxJS will no longer support altering the\n * context of next functions provided as part of an observer to Subscribe. Instead,\n * you will have access to a subscription or a signal or token that will allow you to do things like\n * unsubscribe and test closed status. 
Will be removed in v8.\n */\n useDeprecatedNextContext: boolean;\n}\n", "import type { TimerHandle } from './timerHandle';\ntype SetTimeoutFunction = (handler: () => void, timeout?: number, ...args: any[]) => TimerHandle;\ntype ClearTimeoutFunction = (handle: TimerHandle) => void;\n\ninterface TimeoutProvider {\n setTimeout: SetTimeoutFunction;\n clearTimeout: ClearTimeoutFunction;\n delegate:\n | {\n setTimeout: SetTimeoutFunction;\n clearTimeout: ClearTimeoutFunction;\n }\n | undefined;\n}\n\nexport const timeoutProvider: TimeoutProvider = {\n // When accessing the delegate, use the variable rather than `this` so that\n // the functions can be called without being bound to the provider.\n setTimeout(handler: () => void, timeout?: number, ...args) {\n const { delegate } = timeoutProvider;\n if (delegate?.setTimeout) {\n return delegate.setTimeout(handler, timeout, ...args);\n }\n return setTimeout(handler, timeout, ...args);\n },\n clearTimeout(handle) {\n const { delegate } = timeoutProvider;\n return (delegate?.clearTimeout || clearTimeout)(handle as any);\n },\n delegate: undefined,\n};\n", "import { config } from '../config';\nimport { timeoutProvider } from '../scheduler/timeoutProvider';\n\n/**\n * Handles an error on another job either with the user-configured {@link onUnhandledError},\n * or by throwing it on that new job so it can be picked up by `window.onerror`, `process.on('error')`, etc.\n *\n * This should be called whenever there is an error that is out-of-band with the subscription\n * or when an error hits a terminal boundary of the subscription and no error handler was provided.\n *\n * @param err the error to report\n */\nexport function reportUnhandledError(err: any) {\n timeoutProvider.setTimeout(() => {\n const { onUnhandledError } = config;\n if (onUnhandledError) {\n // Execute the user-configured error handler.\n onUnhandledError(err);\n } else {\n // Throw so it is picked up by the runtime's uncaught error mechanism.\n throw err;\n }\n });\n}\n", "/* tslint:disable:no-empty */\nexport function noop() { }\n", "import { CompleteNotification, NextNotification, ErrorNotification } from './types';\n\n/**\n * A completion object optimized for memory use and created to be the\n * same \"shape\" as other notifications in v8.\n * @internal\n */\nexport const COMPLETE_NOTIFICATION = (() => createNotification('C', undefined, undefined) as CompleteNotification)();\n\n/**\n * Internal use only. Creates an optimized error notification that is the same \"shape\"\n * as other notifications.\n * @internal\n */\nexport function errorNotification(error: any): ErrorNotification {\n return createNotification('E', undefined, error) as any;\n}\n\n/**\n * Internal use only. Creates an optimized next notification that is the same \"shape\"\n * as other notifications.\n * @internal\n */\nexport function nextNotification(value: T) {\n return createNotification('N', value, undefined) as NextNotification;\n}\n\n/**\n * Ensures that all notifications created internally have the same \"shape\" in v8.\n *\n * TODO: This is only exported to support a crazy legacy test in `groupBy`.\n * @internal\n */\nexport function createNotification(kind: 'N' | 'E' | 'C', value: any, error: any) {\n return {\n kind,\n value,\n error,\n };\n}\n", "import { config } from '../config';\n\nlet context: { errorThrown: boolean; error: any } | null = null;\n\n/**\n * Handles dealing with errors for super-gross mode. 
Creates a context, in which\n * any synchronously thrown errors will be passed to {@link captureError}. Which\n * will record the error such that it will be rethrown after the call back is complete.\n * TODO: Remove in v8\n * @param cb An immediately executed function.\n */\nexport function errorContext(cb: () => void) {\n if (config.useDeprecatedSynchronousErrorHandling) {\n const isRoot = !context;\n if (isRoot) {\n context = { errorThrown: false, error: null };\n }\n cb();\n if (isRoot) {\n const { errorThrown, error } = context!;\n context = null;\n if (errorThrown) {\n throw error;\n }\n }\n } else {\n // This is the general non-deprecated path for everyone that\n // isn't crazy enough to use super-gross mode (useDeprecatedSynchronousErrorHandling)\n cb();\n }\n}\n\n/**\n * Captures errors only in super-gross mode.\n * @param err the error to capture\n */\nexport function captureError(err: any) {\n if (config.useDeprecatedSynchronousErrorHandling && context) {\n context.errorThrown = true;\n context.error = err;\n }\n}\n", "import { isFunction } from './util/isFunction';\nimport { Observer, ObservableNotification } from './types';\nimport { isSubscription, Subscription } from './Subscription';\nimport { config } from './config';\nimport { reportUnhandledError } from './util/reportUnhandledError';\nimport { noop } from './util/noop';\nimport { nextNotification, errorNotification, COMPLETE_NOTIFICATION } from './NotificationFactories';\nimport { timeoutProvider } from './scheduler/timeoutProvider';\nimport { captureError } from './util/errorContext';\n\n/**\n * Implements the {@link Observer} interface and extends the\n * {@link Subscription} class. While the {@link Observer} is the public API for\n * consuming the values of an {@link Observable}, all Observers get converted to\n * a Subscriber, in order to provide Subscription-like capabilities such as\n * `unsubscribe`. Subscriber is a common type in RxJS, and crucial for\n * implementing operators, but it is rarely used as a public API.\n *\n * @class Subscriber\n */\nexport class Subscriber extends Subscription implements Observer {\n /**\n * A static factory for a Subscriber, given a (potentially partial) definition\n * of an Observer.\n * @param next The `next` callback of an Observer.\n * @param error The `error` callback of an\n * Observer.\n * @param complete The `complete` callback of an\n * Observer.\n * @return A Subscriber wrapping the (partially defined)\n * Observer represented by the given arguments.\n * @nocollapse\n * @deprecated Do not use. Will be removed in v8. There is no replacement for this\n * method, and there is no reason to be creating instances of `Subscriber` directly.\n * If you have a specific use case, please file an issue.\n */\n static create(next?: (x?: T) => void, error?: (e?: any) => void, complete?: () => void): Subscriber {\n return new SafeSubscriber(next, error, complete);\n }\n\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n protected isStopped: boolean = false;\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n protected destination: Subscriber | Observer; // this `any` is the escape hatch to erase extra type param (e.g. R)\n\n /**\n * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n * There is no reason to directly create an instance of Subscriber. 
This type is exported for typings reasons.\n */\n constructor(destination?: Subscriber | Observer) {\n super();\n if (destination) {\n this.destination = destination;\n // Automatically chain subscriptions together here.\n // if destination is a Subscription, then it is a Subscriber.\n if (isSubscription(destination)) {\n destination.add(this);\n }\n } else {\n this.destination = EMPTY_OBSERVER;\n }\n }\n\n /**\n * The {@link Observer} callback to receive notifications of type `next` from\n * the Observable, with a value. The Observable may call this method 0 or more\n * times.\n * @param {T} [value] The `next` value.\n * @return {void}\n */\n next(value?: T): void {\n if (this.isStopped) {\n handleStoppedNotification(nextNotification(value), this);\n } else {\n this._next(value!);\n }\n }\n\n /**\n * The {@link Observer} callback to receive notifications of type `error` from\n * the Observable, with an attached `Error`. Notifies the Observer that\n * the Observable has experienced an error condition.\n * @param {any} [err] The `error` exception.\n * @return {void}\n */\n error(err?: any): void {\n if (this.isStopped) {\n handleStoppedNotification(errorNotification(err), this);\n } else {\n this.isStopped = true;\n this._error(err);\n }\n }\n\n /**\n * The {@link Observer} callback to receive a valueless notification of type\n * `complete` from the Observable. Notifies the Observer that the Observable\n * has finished sending push-based notifications.\n * @return {void}\n */\n complete(): void {\n if (this.isStopped) {\n handleStoppedNotification(COMPLETE_NOTIFICATION, this);\n } else {\n this.isStopped = true;\n this._complete();\n }\n }\n\n unsubscribe(): void {\n if (!this.closed) {\n this.isStopped = true;\n super.unsubscribe();\n this.destination = null!;\n }\n }\n\n protected _next(value: T): void {\n this.destination.next(value);\n }\n\n protected _error(err: any): void {\n try {\n this.destination.error(err);\n } finally {\n this.unsubscribe();\n }\n }\n\n protected _complete(): void {\n try {\n this.destination.complete();\n } finally {\n this.unsubscribe();\n }\n }\n}\n\n/**\n * This bind is captured here because we want to be able to have\n * compatibility with monoid libraries that tend to use a method named\n * `bind`. 
In particular, a library called Monio requires this.\n */\nconst _bind = Function.prototype.bind;\n\nfunction bind any>(fn: Fn, thisArg: any): Fn {\n return _bind.call(fn, thisArg);\n}\n\n/**\n * Internal optimization only, DO NOT EXPOSE.\n * @internal\n */\nclass ConsumerObserver implements Observer {\n constructor(private partialObserver: Partial>) {}\n\n next(value: T): void {\n const { partialObserver } = this;\n if (partialObserver.next) {\n try {\n partialObserver.next(value);\n } catch (error) {\n handleUnhandledError(error);\n }\n }\n }\n\n error(err: any): void {\n const { partialObserver } = this;\n if (partialObserver.error) {\n try {\n partialObserver.error(err);\n } catch (error) {\n handleUnhandledError(error);\n }\n } else {\n handleUnhandledError(err);\n }\n }\n\n complete(): void {\n const { partialObserver } = this;\n if (partialObserver.complete) {\n try {\n partialObserver.complete();\n } catch (error) {\n handleUnhandledError(error);\n }\n }\n }\n}\n\nexport class SafeSubscriber extends Subscriber {\n constructor(\n observerOrNext?: Partial> | ((value: T) => void) | null,\n error?: ((e?: any) => void) | null,\n complete?: (() => void) | null\n ) {\n super();\n\n let partialObserver: Partial>;\n if (isFunction(observerOrNext) || !observerOrNext) {\n // The first argument is a function, not an observer. The next\n // two arguments *could* be observers, or they could be empty.\n partialObserver = {\n next: (observerOrNext ?? undefined) as (((value: T) => void) | undefined),\n error: error ?? undefined,\n complete: complete ?? undefined,\n };\n } else {\n // The first argument is a partial observer.\n let context: any;\n if (this && config.useDeprecatedNextContext) {\n // This is a deprecated path that made `this.unsubscribe()` available in\n // next handler functions passed to subscribe. This only exists behind a flag\n // now, as it is *very* slow.\n context = Object.create(observerOrNext);\n context.unsubscribe = () => this.unsubscribe();\n partialObserver = {\n next: observerOrNext.next && bind(observerOrNext.next, context),\n error: observerOrNext.error && bind(observerOrNext.error, context),\n complete: observerOrNext.complete && bind(observerOrNext.complete, context),\n };\n } else {\n // The \"normal\" path. 
Just use the partial observer directly.\n partialObserver = observerOrNext;\n }\n }\n\n // Wrap the partial observer to ensure it's a full observer, and\n // make sure proper error handling is accounted for.\n this.destination = new ConsumerObserver(partialObserver);\n }\n}\n\nfunction handleUnhandledError(error: any) {\n if (config.useDeprecatedSynchronousErrorHandling) {\n captureError(error);\n } else {\n // Ideal path, we report this as an unhandled error,\n // which is thrown on a new call stack.\n reportUnhandledError(error);\n }\n}\n\n/**\n * An error handler used when no error handler was supplied\n * to the SafeSubscriber -- meaning no error handler was supplied\n * do the `subscribe` call on our observable.\n * @param err The error to handle\n */\nfunction defaultErrorHandler(err: any) {\n throw err;\n}\n\n/**\n * A handler for notifications that cannot be sent to a stopped subscriber.\n * @param notification The notification being sent\n * @param subscriber The stopped subscriber\n */\nfunction handleStoppedNotification(notification: ObservableNotification, subscriber: Subscriber) {\n const { onStoppedNotification } = config;\n onStoppedNotification && timeoutProvider.setTimeout(() => onStoppedNotification(notification, subscriber));\n}\n\n/**\n * The observer used as a stub for subscriptions where the user did not\n * pass any arguments to `subscribe`. Comes with the default error handling\n * behavior.\n */\nexport const EMPTY_OBSERVER: Readonly> & { closed: true } = {\n closed: true,\n next: noop,\n error: defaultErrorHandler,\n complete: noop,\n};\n", "/**\n * Symbol.observable or a string \"@@observable\". Used for interop\n *\n * @deprecated We will no longer be exporting this symbol in upcoming versions of RxJS.\n * Instead polyfill and use Symbol.observable directly *or* use https://www.npmjs.com/package/symbol-observable\n */\nexport const observable: string | symbol = (() => (typeof Symbol === 'function' && Symbol.observable) || '@@observable')();\n", "/**\n * This function takes one parameter and just returns it. Simply put,\n * this is like `(x: T): T => x`.\n *\n * ## Examples\n *\n * This is useful in some cases when using things like `mergeMap`\n *\n * ```ts\n * import { interval, take, map, range, mergeMap, identity } from 'rxjs';\n *\n * const source$ = interval(1000).pipe(take(5));\n *\n * const result$ = source$.pipe(\n * map(i => range(i)),\n * mergeMap(identity) // same as mergeMap(x => x)\n * );\n *\n * result$.subscribe({\n * next: console.log\n * });\n * ```\n *\n * Or when you want to selectively apply an operator\n *\n * ```ts\n * import { interval, take, identity } from 'rxjs';\n *\n * const shouldLimit = () => Math.random() < 0.5;\n *\n * const source$ = interval(1000);\n *\n * const result$ = source$.pipe(shouldLimit() ? 
take(5) : identity);\n *\n * result$.subscribe({\n * next: console.log\n * });\n * ```\n *\n * @param x Any value that is returned by this function\n * @returns The value passed as the first parameter to this function\n */\nexport function identity(x: T): T {\n return x;\n}\n", "import { identity } from './identity';\nimport { UnaryFunction } from '../types';\n\nexport function pipe(): typeof identity;\nexport function pipe(fn1: UnaryFunction): UnaryFunction;\nexport function pipe(fn1: UnaryFunction, fn2: UnaryFunction): UnaryFunction;\nexport function pipe(fn1: UnaryFunction, fn2: UnaryFunction, fn3: UnaryFunction): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction,\n fn7: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction,\n fn7: UnaryFunction,\n fn8: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction,\n fn7: UnaryFunction,\n fn8: UnaryFunction,\n fn9: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction,\n fn7: UnaryFunction,\n fn8: UnaryFunction,\n fn9: UnaryFunction,\n ...fns: UnaryFunction[]\n): UnaryFunction;\n\n/**\n * pipe() can be called on one or more functions, each of which can take one argument (\"UnaryFunction\")\n * and uses it to return a value.\n * It returns a function that takes one argument, passes it to the first UnaryFunction, and then\n * passes the result to the next one, passes that result to the next one, and so on. \n */\nexport function pipe(...fns: Array>): UnaryFunction {\n return pipeFromArray(fns);\n}\n\n/** @internal */\nexport function pipeFromArray(fns: Array>): UnaryFunction {\n if (fns.length === 0) {\n return identity as UnaryFunction;\n }\n\n if (fns.length === 1) {\n return fns[0];\n }\n\n return function piped(input: T): R {\n return fns.reduce((prev: any, fn: UnaryFunction) => fn(prev), input as any);\n };\n}\n", "import { Operator } from './Operator';\nimport { SafeSubscriber, Subscriber } from './Subscriber';\nimport { isSubscription, Subscription } from './Subscription';\nimport { TeardownLogic, OperatorFunction, Subscribable, Observer } from './types';\nimport { observable as Symbol_observable } from './symbol/observable';\nimport { pipeFromArray } from './util/pipe';\nimport { config } from './config';\nimport { isFunction } from './util/isFunction';\nimport { errorContext } from './util/errorContext';\n\n/**\n * A representation of any set of values over any amount of time. This is the most basic building block\n * of RxJS.\n *\n * @class Observable\n */\nexport class Observable implements Subscribable {\n /**\n * @deprecated Internal implementation detail, do not use directly. 
Will be made internal in v8.\n */\n source: Observable | undefined;\n\n /**\n * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n */\n operator: Operator | undefined;\n\n /**\n * @constructor\n * @param {Function} subscribe the function that is called when the Observable is\n * initially subscribed to. This function is given a Subscriber, to which new values\n * can be `next`ed, or an `error` method can be called to raise an error, or\n * `complete` can be called to notify of a successful completion.\n */\n constructor(subscribe?: (this: Observable, subscriber: Subscriber) => TeardownLogic) {\n if (subscribe) {\n this._subscribe = subscribe;\n }\n }\n\n // HACK: Since TypeScript inherits static properties too, we have to\n // fight against TypeScript here so Subject can have a different static create signature\n /**\n * Creates a new Observable by calling the Observable constructor\n * @owner Observable\n * @method create\n * @param {Function} subscribe? the subscriber function to be passed to the Observable constructor\n * @return {Observable} a new observable\n * @nocollapse\n * @deprecated Use `new Observable()` instead. Will be removed in v8.\n */\n static create: (...args: any[]) => any = (subscribe?: (subscriber: Subscriber) => TeardownLogic) => {\n return new Observable(subscribe);\n };\n\n /**\n * Creates a new Observable, with this Observable instance as the source, and the passed\n * operator defined as the new observable's operator.\n * @method lift\n * @param operator the operator defining the operation to take on the observable\n * @return a new observable with the Operator applied\n * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n * If you have implemented an operator using `lift`, it is recommended that you create an\n * operator by simply returning `new Observable()` directly. See \"Creating new operators from\n * scratch\" section here: https://rxjs.dev/guide/operators\n */\n lift(operator?: Operator): Observable {\n const observable = new Observable();\n observable.source = this;\n observable.operator = operator;\n return observable;\n }\n\n subscribe(observerOrNext?: Partial> | ((value: T) => void)): Subscription;\n /** @deprecated Instead of passing separate callback arguments, use an observer argument. Signatures taking separate callback arguments will be removed in v8. Details: https://rxjs.dev/deprecations/subscribe-arguments */\n subscribe(next?: ((value: T) => void) | null, error?: ((error: any) => void) | null, complete?: (() => void) | null): Subscription;\n /**\n * Invokes an execution of an Observable and registers Observer handlers for notifications it will emit.\n *\n * Use it when you have all these Observables, but still nothing is happening.\n *\n * `subscribe` is not a regular operator, but a method that calls Observable's internal `subscribe` function. It\n * might be for example a function that you passed to Observable's constructor, but most of the time it is\n * a library implementation, which defines what will be emitted by an Observable, and when it be will emitted. This means\n * that calling `subscribe` is actually the moment when Observable starts its work, not when it is created, as it is often\n * the thought.\n *\n * Apart from starting the execution of an Observable, this method allows you to listen for values\n * that an Observable emits, as well as for when it completes or errors. 
You can achieve this in two\n * of the following ways.\n *\n * The first way is creating an object that implements {@link Observer} interface. It should have methods\n * defined by that interface, but note that it should be just a regular JavaScript object, which you can create\n * yourself in any way you want (ES6 class, classic function constructor, object literal etc.). In particular, do\n * not attempt to use any RxJS implementation details to create Observers - you don't need them. Remember also\n * that your object does not have to implement all methods. If you find yourself creating a method that doesn't\n * do anything, you can simply omit it. Note however, if the `error` method is not provided and an error happens,\n * it will be thrown asynchronously. Errors thrown asynchronously cannot be caught using `try`/`catch`. Instead,\n * use the {@link onUnhandledError} configuration option or use a runtime handler (like `window.onerror` or\n * `process.on('error)`) to be notified of unhandled errors. Because of this, it's recommended that you provide\n * an `error` method to avoid missing thrown errors.\n *\n * The second way is to give up on Observer object altogether and simply provide callback functions in place of its methods.\n * This means you can provide three functions as arguments to `subscribe`, where the first function is equivalent\n * of a `next` method, the second of an `error` method and the third of a `complete` method. Just as in case of an Observer,\n * if you do not need to listen for something, you can omit a function by passing `undefined` or `null`,\n * since `subscribe` recognizes these functions by where they were placed in function call. When it comes\n * to the `error` function, as with an Observer, if not provided, errors emitted by an Observable will be thrown asynchronously.\n *\n * You can, however, subscribe with no parameters at all. This may be the case where you're not interested in terminal events\n * and you also handled emissions internally by using operators (e.g. using `tap`).\n *\n * Whichever style of calling `subscribe` you use, in both cases it returns a Subscription object.\n * This object allows you to call `unsubscribe` on it, which in turn will stop the work that an Observable does and will clean\n * up all resources that an Observable used. Note that cancelling a subscription will not call `complete` callback\n * provided to `subscribe` function, which is reserved for a regular completion signal that comes from an Observable.\n *\n * Remember that callbacks provided to `subscribe` are not guaranteed to be called asynchronously.\n * It is an Observable itself that decides when these functions will be called. For example {@link of}\n * by default emits all its values synchronously. 
Always check documentation for how given Observable\n * will behave when subscribed and if its default behavior can be modified with a `scheduler`.\n *\n * #### Examples\n *\n * Subscribe with an {@link guide/observer Observer}\n *\n * ```ts\n * import { of } from 'rxjs';\n *\n * const sumObserver = {\n * sum: 0,\n * next(value) {\n * console.log('Adding: ' + value);\n * this.sum = this.sum + value;\n * },\n * error() {\n * // We actually could just remove this method,\n * // since we do not really care about errors right now.\n * },\n * complete() {\n * console.log('Sum equals: ' + this.sum);\n * }\n * };\n *\n * of(1, 2, 3) // Synchronously emits 1, 2, 3 and then completes.\n * .subscribe(sumObserver);\n *\n * // Logs:\n * // 'Adding: 1'\n * // 'Adding: 2'\n * // 'Adding: 3'\n * // 'Sum equals: 6'\n * ```\n *\n * Subscribe with functions ({@link deprecations/subscribe-arguments deprecated})\n *\n * ```ts\n * import { of } from 'rxjs'\n *\n * let sum = 0;\n *\n * of(1, 2, 3).subscribe(\n * value => {\n * console.log('Adding: ' + value);\n * sum = sum + value;\n * },\n * undefined,\n * () => console.log('Sum equals: ' + sum)\n * );\n *\n * // Logs:\n * // 'Adding: 1'\n * // 'Adding: 2'\n * // 'Adding: 3'\n * // 'Sum equals: 6'\n * ```\n *\n * Cancel a subscription\n *\n * ```ts\n * import { interval } from 'rxjs';\n *\n * const subscription = interval(1000).subscribe({\n * next(num) {\n * console.log(num)\n * },\n * complete() {\n * // Will not be called, even when cancelling subscription.\n * console.log('completed!');\n * }\n * });\n *\n * setTimeout(() => {\n * subscription.unsubscribe();\n * console.log('unsubscribed!');\n * }, 2500);\n *\n * // Logs:\n * // 0 after 1s\n * // 1 after 2s\n * // 'unsubscribed!' after 2.5s\n * ```\n *\n * @param {Observer|Function} observerOrNext (optional) Either an observer with methods to be called,\n * or the first of three possible handlers, which is the handler for each value emitted from the subscribed\n * Observable.\n * @param {Function} error (optional) A handler for a terminal event resulting from an error. If no error handler is provided,\n * the error will be thrown asynchronously as unhandled.\n * @param {Function} complete (optional) A handler for a terminal event resulting from successful completion.\n * @return {Subscription} a subscription reference to the registered handlers\n * @method subscribe\n */\n subscribe(\n observerOrNext?: Partial> | ((value: T) => void) | null,\n error?: ((error: any) => void) | null,\n complete?: (() => void) | null\n ): Subscription {\n const subscriber = isSubscriber(observerOrNext) ? observerOrNext : new SafeSubscriber(observerOrNext, error, complete);\n\n errorContext(() => {\n const { operator, source } = this;\n subscriber.add(\n operator\n ? // We're dealing with a subscription in the\n // operator chain to one of our lifted operators.\n operator.call(subscriber, source)\n : source\n ? // If `source` has a value, but `operator` does not, something that\n // had intimate knowledge of our API, like our `Subject`, must have\n // set it. 
We're going to just call `_subscribe` directly.\n this._subscribe(subscriber)\n : // In all other cases, we're likely wrapping a user-provided initializer\n // function, so we need to catch errors and handle them appropriately.\n this._trySubscribe(subscriber)\n );\n });\n\n return subscriber;\n }\n\n /** @internal */\n protected _trySubscribe(sink: Subscriber): TeardownLogic {\n try {\n return this._subscribe(sink);\n } catch (err) {\n // We don't need to return anything in this case,\n // because it's just going to try to `add()` to a subscription\n // above.\n sink.error(err);\n }\n }\n\n /**\n * Used as a NON-CANCELLABLE means of subscribing to an observable, for use with\n * APIs that expect promises, like `async/await`. You cannot unsubscribe from this.\n *\n * **WARNING**: Only use this with observables you *know* will complete. If the source\n * observable does not complete, you will end up with a promise that is hung up, and\n * potentially all of the state of an async function hanging out in memory. To avoid\n * this situation, look into adding something like {@link timeout}, {@link take},\n * {@link takeWhile}, or {@link takeUntil} amongst others.\n *\n * #### Example\n *\n * ```ts\n * import { interval, take } from 'rxjs';\n *\n * const source$ = interval(1000).pipe(take(4));\n *\n * async function getTotal() {\n * let total = 0;\n *\n * await source$.forEach(value => {\n * total += value;\n * console.log('observable -> ' + value);\n * });\n *\n * return total;\n * }\n *\n * getTotal().then(\n * total => console.log('Total: ' + total)\n * );\n *\n * // Expected:\n * // 'observable -> 0'\n * // 'observable -> 1'\n * // 'observable -> 2'\n * // 'observable -> 3'\n * // 'Total: 6'\n * ```\n *\n * @param next a handler for each value emitted by the observable\n * @return a promise that either resolves on observable completion or\n * rejects with the handled error\n */\n forEach(next: (value: T) => void): Promise;\n\n /**\n * @param next a handler for each value emitted by the observable\n * @param promiseCtor a constructor function used to instantiate the Promise\n * @return a promise that either resolves on observable completion or\n * rejects with the handled error\n * @deprecated Passing a Promise constructor will no longer be available\n * in upcoming versions of RxJS. This is because it adds weight to the library, for very\n * little benefit. If you need this functionality, it is recommended that you either\n * polyfill Promise, or you create an adapter to convert the returned native promise\n * to whatever promise implementation you wanted. 

    Reference

For a full reference of the YAML API parameters, see the YAML Reference document.

    \ No newline at end of file diff --git a/v2.20/developer/cluster-api/submit-rest/index.html b/v2.20/developer/cluster-api/submit-rest/index.html index 1b56e2b796..3348a04fba 100644 --- a/v2.20/developer/cluster-api/submit-rest/index.html +++ b/v2.20/developer/cluster-api/submit-rest/index.html @@ -1,4 +1,4 @@ - Submit Workload via HTTP/REST - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/developer/cluster-api/submit-yaml/index.html b/v2.20/developer/cluster-api/submit-yaml/index.html index af368d71cb..7a38accdcc 100644 --- a/v2.20/developer/cluster-api/submit-yaml/index.html +++ b/v2.20/developer/cluster-api/submit-yaml/index.html @@ -1,4 +1,4 @@ - Submit Workload via YAML - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/developer/cluster-api/workload-overview-dev/index.html b/v2.20/developer/cluster-api/workload-overview-dev/index.html index de928b03d2..5d6ea8080d 100644 --- a/v2.20/developer/cluster-api/workload-overview-dev/index.html +++ b/v2.20/developer/cluster-api/workload-overview-dev/index.html @@ -1,4 +1,4 @@ - Overview - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/developer/metrics/metrics-api/index.html b/v2.20/developer/metrics/metrics-api/index.html index f538d25d97..cf2f6ad332 100644 --- a/v2.20/developer/metrics/metrics-api/index.html +++ b/v2.20/developer/metrics/metrics-api/index.html @@ -1,4 +1,4 @@ - Metrics via API - Run:ai Documentation Library

    Metrics and telemetry

    Telemetry

Telemetry is a numeric measurement recorded in real time, at the moment it is emitted from the Run:ai cluster.

    Metrics

Metrics are numeric measurements recorded over time that are emitted from the Run:ai cluster. Typical metrics include utilization, allocation, and time measurements. Metrics are used in Run:ai dashboards as well as in the Run:ai administration user interface.


    The purpose of this document is to detail the structure and purpose of metrics emitted by Run:ai. This enables customers to create custom dashboards or integrate metric data into other monitoring systems.

Run:ai provides metrics via the Run:ai Control-plane API. Previously, Run:ai provided metrics via direct access to an internal metrics store. This method is deprecated but is still documented here.
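
As a rough illustration of how such a query might look, the hedged Python sketch below pulls one workload-scope metric over the last hour. The base URL, the endpoint path (`/api/v1/workloads/{id}/metrics`), the parameter names (`metricType`, `start`, `end`) and the `RUNAI_API_TOKEN` environment variable are assumptions made for illustration only; consult the Run:ai API reference for the exact endpoints and authentication flow. The metric name itself (`GPU_UTILIZATION`) is taken from the Supported Metrics table below.

```python
# Hedged sketch: query a workload-scope metric from the Run:ai Control-plane REST API.
# Endpoint path, parameter names and token handling are assumptions for illustration.
import os
import datetime as dt

import requests

BASE_URL = "https://<company>.run.ai"      # placeholder control-plane URL
TOKEN = os.environ["RUNAI_API_TOKEN"]      # assumed env var holding an API bearer token
WORKLOAD_ID = "<workload-uuid>"            # placeholder workload identifier

end = dt.datetime.now(dt.timezone.utc)
start = end - dt.timedelta(hours=1)

resp = requests.get(
    f"{BASE_URL}/api/v1/workloads/{WORKLOAD_ID}/metrics",   # assumed endpoint path
    headers={"Authorization": f"Bearer {TOKEN}"},
    params={
        "metricType": "GPU_UTILIZATION",   # metric name from the Supported Metrics table below
        "start": start.isoformat(),
        "end": end.isoformat(),
    },
    timeout=30,
)
resp.raise_for_status()
print(resp.json())
```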

    Metric and telemetry Scopes

Run:ai provides a Control-plane API that supports and aggregates metrics at the following levels.

Level Description
Cluster A cluster is a set of Node Pools and Nodes. Cluster metrics are aggregated at the Cluster level.
Node Data is aggregated at the Node level.
Node Pool Data is aggregated at the Node Pool level.
Workload Data is aggregated at the Workload level. For some workloads, e.g. distributed workloads, these metrics aggregate data from all worker pods.
Pod The basic execution unit.
Project The basic organizational unit. Projects are the tool for implementing resource allocation policies as well as the segregation between different initiatives.
Department Departments are a grouping of projects.

Supported Metrics

    Metric Cluster Node Pool Node Workload Pod Project Department
    API Cluster API Node Pool API Workload API Pod API
    ALLOCATED_GPU TRUE TRUE TRUE
    AVG_WORKLOAD_WAIT_TIME TRUE TRUE
    CPU_LIMIT_CORES TRUE
    CPU_MEMORY_LIMIT_BYTES TRUE
    CPU_MEMORY_REQUEST_BYTES TRUE
    CPU_MEMORY_USAGE_BYTES TRUE TRUE TRUE
    CPU_MEMORY_UTILIZATION TRUE TRUE TRUE
    CPU_REQUEST_CORES TRUE
    CPU_USAGE_CORES TRUE TRUE TRUE
    CPU_UTILIZATION TRUE TRUE TRUE
    GPU_ALLOCATION TRUE TRUE TRUE
    GPU_MEMORY_REQUEST_BYTES TRUE
    GPU_MEMORY_USAGE_BYTES TRUE TRUE
    GPU_MEMORY_USAGE_BYTES_PER_GPU TRUE TRUE
    GPU_MEMORY_UTILIZATION TRUE TRUE
GPU_MEMORY_UTILIZATION_PER_GPU TRUE
    GPU_QUOTA TRUE TRUE TRUE TRUE
    GPU_UTILIZATION TRUE TRUE TRUE TRUE
    GPU_UTILIZATION_PER_GPU TRUE TRUE
    POD_COUNT TRUE
    RUNNING_POD_COUNT TRUE
    TOTAL_GPU TRUE TRUE
    TOTAL_GPU_NODES TRUE TRUE
    GPU_UTILIZATION_DISTRIBUTION TRUE TRUE
    UNALLOCATED_GPU TRUE TRUE
    CPU_QUOTA_MILLICORES TRUE TRUE
    CPU_MEMORY_QUOTA_MB TRUE TRUE
    CPU_ALLOCATION_MILLICORES TRUE TRUE
    CPU_MEMORY_ALLOCATION_MB TRUE TRUE

    Advanced Metrics

NVIDIA provides extended metrics at the Pod level. These are documented here. To enable these metrics, contact Run:ai customer support.

    Metric Cluster Node Pool Workload Pod
    GPU_FP16_ENGINE_ACTIVITY_PER_GPU TRUE
    GPU_FP32_ENGINE_ACTIVITY_PER_GPU TRUE
    GPU_FP64_ENGINE_ACTIVITY_PER_GPU TRUE
    GPU_GRAPHICS_ENGINE_ACTIVITY_PER_GPU TRUE
    GPU_MEMORY_BANDWIDTH_UTILIZATION_PER_GPU TRUE
    GPU_NVLINK_RECEIVED_BANDWIDTH_PER_GPU TRUE
    GPU_NVLINK_TRANSMITTED_BANDWIDTH_PER_GPU TRUE
    GPU_PCIE_RECEIVED_BANDWIDTH_PER_GPU TRUE
    GPU_PCIE_TRANSMITTED_BANDWIDTH_PER_GPU TRUE
    GPU_SM_ACTIVITY_PER_GPU TRUE
    GPU_SM_OCCUPANCY_PER_GPU TRUE
    GPU_TENSOR_ACTIVITY_PER_GPU TRUE

    Supported telemetry

Telemetry Node Workload Project Department
    API Node API Workload API
    WORKLOADS_COUNT TRUE
    ALLOCATED_GPUS TRUE TRUE TRUE TRUE
    READY_GPU_NODES TRUE
    READY_GPUS TRUE
    TOTAL_GPU_NODES TRUE
    TOTAL_GPUS TRUE
    IDLE_ALLOCATED_GPUS TRUE
    FREE_GPUS TRUE
    TOTAL_CPU_CORES TRUE
    USED_CPU_CORES TRUE
    ALLOCATED_CPU_CORES TRUE TRUE TRUE
    TOTAL_GPU_MEMORY_BYTES TRUE
    USED_GPU_MEMORY_BYTES TRUE
    TOTAL_CPU_MEMORY_BYTES TRUE
    USED_CPU_MEMORY_BYTES TRUE
    ALLOCATED_CPU_MEMORY_BYTES TRUE TRUE TRUE
    GPU_QUOTA TRUE TRUE
    CPU_QUOTA TRUE TRUE
    MEMORY_QUOTA TRUE TRUE
    GPU_ALLOCATION_NON_PREEMPTIBLE TRUE TRUE
    CPU_ALLOCATION_NON_PREEMPTIBLE TRUE TRUE
    MEMORY_ALLOCATION_NON_PREEMPTIBLE TRUE TRUE
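
Telemetry values are point-in-time readings rather than time series, so a telemetry request would typically omit the time range. The sketch below is purely hypothetical: the endpoint path and the `telemetryType` parameter are assumptions, and only the telemetry name (`ALLOCATED_GPUS`) is taken from the table above.

```python
# Hypothetical sketch: read a current telemetry value (no time range).
# Endpoint path and parameter name are assumptions; the telemetry name is from the table above.
import os

import requests

BASE_URL = "https://<company>.run.ai"      # placeholder control-plane URL
TOKEN = os.environ["RUNAI_API_TOKEN"]      # assumed env var holding an API bearer token

resp = requests.get(
    f"{BASE_URL}/api/v1/workloads/telemetry",        # assumed telemetry endpoint
    headers={"Authorization": f"Bearer {TOKEN}"},
    params={"telemetryType": "ALLOCATED_GPUS"},      # telemetry name listed above
    timeout=30,
)
resp.raise_for_status()
print(resp.json())
```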

    \ No newline at end of file diff --git a/v2.20/developer/metrics/metrics/index.html b/v2.20/developer/metrics/metrics/index.html index 6b5e1714cf..652c05e96c 100644 --- a/v2.20/developer/metrics/metrics/index.html +++ b/v2.20/developer/metrics/metrics/index.html @@ -1,4 +1,4 @@ - Metrics API - Run:ai Documentation Library

    (Deprecated) Metrics via Prometheus

    What are Metrics

    Metrics are numeric measurements recorded over time that are emitted from the Run:ai cluster. Typical metrics involve utilization, allocation, time measurements and so on. Metrics are used in Run:ai dashboards as well as in the Run:ai administration user interface.

    The purpose of this document is to detail the structure and purpose of metrics emitted by Run:ai to enable customers to create custom dashboards or integrate metric data into other monitoring systems.

    Run:ai uses Prometheus for collecting and querying metrics.

    Warning

    From cluster version 2.17 and onwards, Run:ai supports metrics via the Run:ai Control-plane API. Direct metrics queries (metrics that are queried directly from Prometheus) are deprecated.

    Published Run:ai Metrics

    Following is the list of published Run:ai metrics for this cluster version:

    Metric name Labels Measurement Description
    runai_active_job_cpu_requested_cores {clusterId, job_name, job_uuid} CPU Cores Workload's requested CPU cores
    runai_active_job_memory_requested_bytes {clusterId, job_name, job_uuid} Bytes Workload's requested CPU memory
    runai_cluster_cpu_utilization {clusterId} 0 to 1 CPU utilization of the entire cluster
    runai_cluster_memory_used_bytes {clusterId} Bytes Used CPU memory of the entire cluster
    runai_cluster_memory_utilization {clusterId} 0 to 1 CPU memory utilization of the entire cluster
    runai_allocated_gpu_count_per_gpu {gpu, clusterId, node} 0/1 Is a GPU hosting a pod
    runai_last_gpu_utilization_time_per_gpu {gpu, clusterId, node} Unix time Last time GPU was not idle
    runai_requested_gpu_memory_mb_per_workload {clusterId, job_type, job_uuid, job_name, project, workload_id} MegaBytes Requested GPU memory per workload (0 if not specified by the user)
    runai_requested_gpus_per_workload {clusterId, workload_type, workload_id, workload_name, project} Double Number of requested GPUs per workload
    runai_run_time_seconds_per_workload {clusterId, workload_id, workload_name} Seconds Total run time per workload
    runai_wait_time_seconds_per_workload {clusterId, workload_id, workload_name} Seconds Total wait time per workload
    runai_node_cpu_requested_cores {clusterId, node} Double Sum of the requested CPU cores of all workloads running in a node
    runai_node_cpu_utilization {clusterId, node} 0 to 1 CPU utilization per node
    runai_node_memory_utilization {clusterId, node} 0 to 1 CPU memory utilization per node
    runai_node_requested_memory_bytes {clusterId, node} Bytes Sum of the requested CPU memory of all workloads running in a node
    runai_node_used_memory_bytes {clusterId, node} Bytes Used CPU memory per node
    runai_project_guaranteed_gpus {clusterId, project} Double Guaranteed GPU quota per project
    runai_project_info {memory_quota, cpu_quota, gpu_guaranteed_quota, clusterId, project, department} N/A Information on CPU, CPU memory, GPU quota per project
    runai_queue_info {memory_quota, cpu_quota, gpu_guaranteed_quota, clusterId, nodepool, queue_name, department} N/A Information on CPU, CPU memory, GPU quota per project/department per nodepool
    runai_cpu_limits_per_active_workload {clusterId, job_name, job_uuid} CPU Cores Workload's CPU limit (in number of cores)
    runai_job_cpu_usage {clusterId, workload_id, workload_name, project} Double Workload's CPU usage (in number of cores)
    runai_memory_limits_per_active_workload {clusterId, job_name, job_uuid} Bytes Workload's CPU memory limit
    runai_active_job_memory_requested_bytes {clusterId, job_name, job_uuid} Bytes Workload's requested CPU memory
    runai_job_memory_used_bytes {clusterId, workload_id, workload_name, project} Bytes Workload's used CPU memory
    runai_mig_mode_gpu_count {clusterId, node} Double Number of GPUs on MIG nodes (Deprecated)
    runai_gpu_utilization_per_gpu {clusterId, gpu, node} % GPU Utilization per GPU
    runai_gpu_utilization_per_node {clusterId, node} % GPU Utilization per Node
    runai_gpu_memory_used_mebibytes_per_gpu {clusterId, gpu, node} MiB Used GPU memory per GPU
    runai_gpu_memory_used_mebibytes_per_node {clusterId, node} MiB Used GPU memory per Node
    runai_gpu_memory_total_mebibytes_per_gpu {clusterId, gpu, node} MiB Total GPU memory per GPU
    runai_gpu_memory_total_mebibytes_per_node {clusterId, node} MiB Total GPU memory per Node
    runai_gpu_count_per_node {clusterId, node, modelName, ready, schedulable} Number Number of GPUs per Node
    runai_allocated_gpu_count_per_workload {clusterId, workload_id, workload_name, workload_type, user} Double Number of allocated GPUs per Workload
    runai_allocated_gpu_count_per_project {clusterId, project} Double Number of allocated GPUs per Project
    runai_gpu_memory_used_mebibytes_per_pod_per_gpu {clusterId, pod_name, pod_uuid, pod_namespace, node, gpu} MiB Used GPU Memory per Pod, per GPU on which the workload is running
    runai_gpu_memory_used_mebibytes_per_workload {clusterId, workload_id, workload_name, workload_type, user} MiB Used GPU Memory per Workload
    runai_gpu_utilization_per_pod_per_gpu {clusterId, pod_name, pod_uuid, pod_namespace, node, gpu} % GPU Utilization per Pod per GPU
    runai_gpu_utilization_per_workload {clusterId, workload_id, workload_name, workload_type, user} % Average GPU Utilization per Workload
    runai_gpu_utilization_per_project {clusterId, project} % Average GPU Utilization per Project
    runai_last_gpu_utilization_time_per_workload {clusterId, workload_id, workload_name, workload_type, user} Seconds (Unix Timestamp) The last time (Unix timestamp) that the workload utilized any of its allocated GPUs
    runai_gpu_idle_seconds_per_workload {clusterId, workload_id, workload_name, workload_type, user} Seconds Seconds passed since the workload utilized any of its allocated GPUs
    runai_allocated_gpu_count_per_pod {clusterId, pod_name, pod_uuid, pod_namespace, node} Double Number of allocated GPUs per Pod
    runai_allocated_gpu_count_per_node {clusterId, node} Double Number of allocated GPUs per Node
    runai_allocated_millicpus_per_pod {clusterId, pod_name, pod_uuid, pod_namespace, node} Integer Number of allocated millicpus per Pod
    runai_allocated_memory_per_pod {clusterId, pod_name, pod_uuid, pod_namespace, node} Bytes Allocated memory per Pod
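
    While direct Prometheus queries are deprecated, the metrics above can still be read with standard PromQL through the Prometheus HTTP API. A minimal sketch, assuming Prometheus is reachable at a placeholder PROM_URL (for example via port-forwarding) and using placeholder label values:

    ```python
    # Minimal sketch: run PromQL queries against the Prometheus instance that scrapes
    # the Run:ai metrics listed above. PROM_URL and "my-training" are placeholders.
    import requests

    PROM_URL = "http://localhost:9090"   # e.g. after port-forwarding to Prometheus

    queries = {
        # Total allocated GPUs across the cluster
        "cluster_allocated_gpus": "sum(runai_allocated_gpu_count_per_node)",
        # Average GPU utilization per project
        "gpu_util_by_project": "runai_gpu_utilization_per_project",
        # Used GPU memory (MiB) for a specific workload
        "gpu_mem_for_workload": 'runai_gpu_memory_used_mebibytes_per_workload{workload_name="my-training"}',
    }

    for name, promql in queries.items():
        resp = requests.get(f"{PROM_URL}/api/v1/query", params={"query": promql})
        resp.raise_for_status()
        print(name, resp.json()["data"]["result"])
    ```
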

    Following is a list of labels appearing in Run:ai metrics:

    Label Description
    clusterId Cluster Identifier
    department Name of Run:ai Department
    cpu_quota CPU limit per project
    gpu GPU index
    gpu_guaranteed_quota Guaranteed GPU quota per project
    image Name of Docker image
    namespace_name Namespace
    deployment_name Deployment name
    job_name Job name
    job_type Job type: training, interactive or inference
    job_uuid Job identifier
    workload_name Workload name
    workload_type Workload type: training, interactive or inference
    workload_uuid Workload identifier
    pod_name Pod name. A Workload can contain many pods.
    pod_namespace Pod namespace
    memory_quota CPU memory limit per project
    node Node name
    project Name of Run:ai Project
    status Workload status: Running, Pending, etc. For more information on Workload statuses, see the Workloads documentation.
    user User identifier

    Other Metrics

    Run:ai exports other metrics emitted by NVIDIA and Kubernetes packages, as follows:

    Metric name Description
    runai_gpu_utilization_per_gpu GPU utilization
    kube_node_status_capacity The capacity for different resources of a node
    kube_node_status_condition The condition of a cluster node
    kube_pod_container_resource_requests_cpu_cores The number of CPU cores requested by container
    kube_pod_container_resource_requests_memory_bytes Bytes of memory requested by a container
    kube_pod_info Information about pod

    For additional information, see Kubernetes kube-state-metrics and NVIDIA dcgm exporter.
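
    These third-party series can be queried alongside the Run:ai metrics in the same PromQL requests. A small sketch, under the same assumptions as above (Prometheus reachable at a placeholder PROM_URL):

    ```python
    # Minimal sketch: query a kube-state-metrics series exported alongside the Run:ai metrics.
    import requests

    PROM_URL = "http://localhost:9090"   # placeholder Prometheus endpoint

    # Total CPU cores requested by containers, summed per node
    promql = "sum by (node) (kube_pod_container_resource_requests_cpu_cores)"

    resp = requests.get(f"{PROM_URL}/api/v1/query", params={"query": promql})
    resp.raise_for_status()
    for sample in resp.json()["data"]["result"]:
        print(sample["metric"].get("node"), sample["value"][1])
    ```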

    Changed metrics and API mapping

    Starting in cluster version 2.17, some of the metric names have been changed. In addition, some Run:ai metrics are available as API endpoints. Using the API endpoints is more efficient and provides an easier way of retrieving metrics in any application. The following table lists the metrics that were changed.

    Metric name in version 2.16 2.17 Change Description 2.17 API Endpoint
    runai_active_job_cpu_requested_cores available also via API https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with "CPU_REQUEST_CORES" metricType
    runai_active_job_memory_requested_bytes available also via API https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with "CPU_MEMORY_REQUEST_BYTES" metricType
    runai_cluster_cpu_utilization available also via API https://app.run.ai/api/v2/clusters/{clusterUuid}/metrics ; with "CPU_UTILIZATION" metricType
    runai_cluster_memory_utilization available also via API https://app.run.ai/api/v2/clusters/{clusterUuid}/metrics ; with "CPU_MEMORY_UTILIZATION" metricType
    runai_gpu_utilization_non_fractional_jobs no longer available
    runai_allocated_gpu_count_per_workload labels changed
    runai_gpu_utilization_per_pod_per_gpu available also via API https://app.run.ai/api/v1/workloads/{workloadId}/pods/{podId}/metrics ; with "GPU_UTILIZATION_PER_GPU" metricType
    runai_gpu_utilization_per_workload available also via API and labels changed https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with "GPU_UTILIZATION" metricType
    runai_job_image no longer available
    runai_job_requested_gpu_memory available also via API and renamed to: "runai_requested_gpu_memory_mb_per_workload" with different labels https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with "GPU_MEMORY_REQUEST_BYTES" metricType
    runai_job_requested_gpus renamed to: "runai_requested_gpus_per_workload" with different labels
    runai_job_total_runtime renamed to: "runai_run_time_seconds_per_workload" with different labels
    runai_job_total_wait_time renamed to: "runai_wait_time_seconds_per_workload" with different labels
    runai_gpu_memory_used_mebibytes_per_workload available also via API and labels changed https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with "GPU_MEMORY_USAGE_BYTES" metricType
    runai_gpu_memory_used_mebibytes_per_pod_per_gpu available also via API and labels changed https://app.run.ai/api/v1/workloads/{workloadId}/pods/{podId}/metrics ; with "GPU_MEMORY_USAGE_BYTES_PER_GPU" metricType
    runai_node_gpu_used_memory_bytes renamed and changed units: "runai_gpu_memory_used_mebibytes_per_node"
    runai_node_total_memory_bytes renamed and changed units: "runai_gpu_memory_total_mebibytes_per_node"
    runai_project_info labels changed
    runai_active_job_cpu_limits available also via API and renamed to: "runai_cpu_limits_per_active_workload" https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with "CPU_LIMIT_CORES" metricType
    runai_job_cpu_usage available also via API and labels changed https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with "CPU_USAGE_CORES" metricType
    runai_active_job_memory_limits available also via API and renamed to: "runai_memory_limits_per_active_workload" https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with "CPU_MEMORY_LIMIT_BYTES" metricType
    runai_running_job_memory_requested_bytes was a duplication of "runai_active_job_memory_requested_bytes", see above
    runai_job_memory_used_bytes available also via API and labels changed https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with "CPU_MEMORY_USAGE_BYTES" metricType
    runai_job_swap_memory_used_bytes no longer available
    runai_gpu_count_per_node added labels
    runai_last_gpu_utilization_time_per_workload labels changed
    runai_gpu_idle_time_per_workload renamed to: "runai_gpu_idle_seconds_per_workload" with different labels
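
    As an example of the mapping above, a direct Prometheus query for runai_job_cpu_usage translates into a call to the workload metrics endpoint with the CPU_USAGE_CORES metric type. A minimal sketch, using placeholder IDs, token, and workload name:

    ```python
    # Minimal sketch: replace a direct Prometheus query (deprecated) with the
    # equivalent control-plane API call, per the mapping table above.
    import requests

    PROM_URL = "http://localhost:9090"     # placeholder Prometheus endpoint
    BASE_URL = "https://app.run.ai"        # control-plane URL
    TOKEN = "<API_TOKEN>"
    WORKLOAD_ID = "<WORKLOAD_ID>"

    # Before (2.16 and earlier): query Prometheus directly.
    old = requests.get(
        f"{PROM_URL}/api/v1/query",
        params={"query": 'runai_job_cpu_usage{workload_name="my-training"}'},  # placeholder name
    )

    # After (2.17 and later): use the Run:ai control-plane API instead.
    new = requests.get(
        f"{BASE_URL}/api/v1/workloads/{WORKLOAD_ID}/metrics",
        headers={"Authorization": f"Bearer {TOKEN}"},
        params={"metricType": "CPU_USAGE_CORES"},
    )
    print(old.json(), new.json(), sep="\n")
    ```
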

    Create custom dashboards

    To create custom dashboards based on the above metrics, please contact Run:ai customer support.

    \ No newline at end of file diff --git a/v2.20/developer/overview-developer/index.html b/v2.20/developer/overview-developer/index.html index beaa7b7c4e..938d40385a 100644 --- a/v2.20/developer/overview-developer/index.html +++ b/v2.20/developer/overview-developer/index.html @@ -1,4 +1,4 @@ - Developer Documentation Overview - Run:ai Documentation Library

    Overview

    Developers can access Run:ai through various programmatic interfaces.

    API Architecture

    Run:ai is composed of a single, multi-tenant control plane. Each tenant can be connected to one or more GPU clusters. See Run:ai system components for detailed information.

    The following programming interfaces are available:

    API Description Purpose
    Run:ai REST API Get and Modify any Run:ai business object This is the API mostly used by system developers. The API is also used by the Run:ai user interface as well as the new command-line interface
    Cluster API (Deprecated) Submit Workloads directly to the Cluster A YAML-based API allowing submission of Workloads directly to the Cluster. With Run:ai 2.18, this API is replaced by the Run:ai REST API above, which is now the recommended method
    Metrics API (deprecated) Get cluster metrics Get utilization metrics.

    Run:ai REST API

    Allows you to add, delete, modify, and list Run:ai metadata objects such as Projects, Departments, and Users. For clusters of Run:ai 2.18 and above, it also allows submitting Workloads.

    The API is provided as REST and is accessible via the control plane endpoint.

    For more information see Run:ai REST API.

    Important

    The endpoints and fields specified in the API reference are the ones that are officially supported by Run:ai. Endpoints and fields that are not listed in the API reference are not supported.

    Run:ai does not recommend using API endpoints and fields marked as deprecated and will not add functionality to them. Once an API endpoint or field is marked as deprecated, Run:ai will stop supporting it after 2 major releases for self-hosted deployments, and after 6 months for SaaS deployments.

    For details, see the Deprecation notifications.

    Cluster API (Deprecated)

    The Cluster API allows you to submit and delete Workloads directly to the cluster itself.

    The API is provided as a Kubernetes API.

    Cluster API is accessible via the GPU cluster itself. As such, multiple clusters may have multiple endpoints.

    Important

    • This API is replaced by the Run:ai REST API to submit jobs, which is now the recommended method for cluster versions 2.18 and above.
    • If you are looking to automate tasks with older versions of Run:ai, it's best to use the Run:ai Command-line interface which provides forward compatibility.

    Metrics API

    Retrieve metrics from multiple GPU clusters.

    See the Metrics API document.

    API Authentication

    See API Authentication for information on how to gain authenticated access to Run:ai APIs.

    \ No newline at end of file diff --git a/v2.20/developer/rest-auth/index.html b/v2.20/developer/rest-auth/index.html index 4d949716f3..6f51c636fb 100644 --- a/v2.20/developer/rest-auth/index.html +++ b/v2.20/developer/rest-auth/index.html @@ -1,4 +1,4 @@ - API Authentication - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/developer/user-applications/index.html b/v2.20/developer/user-applications/index.html index 8e6d9f0bf4..71e8184b65 100644 --- a/v2.20/developer/user-applications/index.html +++ b/v2.20/developer/user-applications/index.html @@ -1,4 +1,4 @@ - User Applications - Run:ai Documentation Library

    User Applications

    This article explains the procedure to create your own user applications.

    Applications are used for API integrations with Run:ai. An application contains a client ID and a client secret. With the client credentials, you can obtain a token as detailed in API authentication and use it within subsequent API calls.

    Note

    • User applications are supported in cluster version 2.20 and above.
    • The token obtained through user applications assumes the roles and permissions of the user.

    Creating Applications

    To create an application:

    1. Click the user icon, then select Settings
    2. Click +APPLICATION
    3. Enter the application’s name
    4. Click CREATE
    5. Copy the Client ID and Client secret and store securely
    6. Click DONE

    You can create up to 20 user applications.

    Note

    The client secret is visible only at the time of creation. It cannot be recovered but can be regenerated.

    Regenerating client secret

    To regenerate a client secret:

    1. Locate the application whose client secret you want to regenerate
    2. Click Regenerate client secret
    3. Click REGENERATE
    4. Copy the New client secret and store it securely
    5. Click DONE

    Warning

    Regenerating a client secret revokes the previous one.

    Deleting an application

    1. Locate the application you want to delete
    2. Click on the trash icon
    3. On the dialog, click DELETE to confirm

    Using API

    Go to the User Applications API reference to view the available actions
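
    After creating an application, the client credentials can be exchanged for a token and used in subsequent API calls, as described in API authentication. The sketch below is illustrative: the token endpoint path and the request/response field names are assumptions to confirm against the API authentication guide.

    ```python
    # Illustrative sketch: exchange user-application credentials for a token and build
    # the authorization header for later API calls. The token endpoint path and the
    # request/response field names are assumptions; confirm them in the API authentication guide.
    import requests

    BASE_URL = "https://app.run.ai"        # control-plane URL
    CLIENT_ID = "<CLIENT_ID>"              # from the application created above
    CLIENT_SECRET = "<CLIENT_SECRET>"      # visible only at creation (or after regeneration)

    token_resp = requests.post(
        f"{BASE_URL}/api/v1/token",        # assumed token endpoint
        json={"grantType": "app_token", "AppId": CLIENT_ID, "AppSecret": CLIENT_SECRET},
    )
    token_resp.raise_for_status()
    token = token_resp.json()["accessToken"]   # assumed response field

    # The token carries the creating user's roles and permissions.
    headers = {"Authorization": f"Bearer {token}"}
    # Use `headers` with any Run:ai REST API call listed in the API reference, for example:
    # requests.get(f"{BASE_URL}/api/v1/workloads/<WORKLOAD_ID>/metrics", headers=headers)
    print("token obtained, length:", len(token))
    ```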

    \ No newline at end of file diff --git a/v2.20/home/changelog/hotfixes-2-13/index.html b/v2.20/home/changelog/hotfixes-2-13/index.html index 52f2b64b8a..5aba3b413c 100644 --- a/v2.20/home/changelog/hotfixes-2-13/index.html +++ b/v2.20/home/changelog/hotfixes-2-13/index.html @@ -1,4 +1,4 @@ - Changelog Version 2.13 - Run:ai Documentation Library

    Hotfixes for 2.13

    The following is a list of the known and fixed issues for Run:ai V2.13.

    Version 2.13.48 - March 14, 2024

    Internal ID Description
    RUN-16787 Fixed an issue after an upgrade to 2.13 where distributed PyTorch jobs were not able to run due to PVCs being assigned to only worker pods.
    RUN-16626 Fixed an issue in SSO environments, where Workspaces created using a template were assigned the template creator's UID/GID and not the Workspace creator's UID/GID.
    RUN-16357 Fixed an issue where pressing the Project link in Jobs screen redirects the view to the Projects of a different cluster in multi-cluster environments.

    Version 2.13.43 - February 15, 2024

    Internal ID Description
    RUN-14946 Fixed an issue where Dashboards are displaying the hidden Grafana path.

    Version 2.13.37

    Internal ID Description
    RUN-13300 Fixed an issue where projects would appear with an empty status while waiting for the project controller to update their status. This was caused by the cluster-sync working faster than the project controller.

    Version 2.13.35 - December 19, 2023

    Release content

    • Added the ability to set node affinity for Prometheus.

    Fixed issues

    Internal ID Description
    RUN-14472 Fixed an issue where template updates were not being applied to the workload.
    RUN-14434 Fixed an issue where runai_allocated_gpu_count_per_gpu was multiplied by seven.
    RUN-13956 Fixed an issue where editing templates failed.
    RUN-13825 Fixed an issue when deleting a job that is allocated a fraction of a GPU, an associated configmap is not deleted.
    RUN-13343 Fixed an issue in pod status calculation.

    Version 2.13.31

    Internal ID Description
    RUN-11367 Fixed an issue where a double click on SSO Users redirects to a blank screen.
    RUN-10560 Fixed an issue where the RunaiDaemonSetRolloutStuck alert did not work.

    Version 2.13.25

    Internal ID Description
    RUN-13171 Fixed an issue where, when a cluster is not connected, the actions in the Workspace and Training pages were still enabled. After the correction, these actions are disabled.

    Version 2.13.21

    Internal ID Description
    RUN-12563 Fixed an issue where users were unable to log in after upgrading the control plane from 2.9.16 to 2.13.16. To correct the issue, secrets need to be upgraded manually in Keycloak.

    Version 2.13.20 - September 28, 2023

    Release content

    • Added the prevention of selecting tenant or department scopes for credentials, and the prevention of selecting s3, PVC, and Git data sources if the cluster version does not support these.
    • Quota management is now enabled by default.
    Internal ID Description
    RUN-12923 Fixed an issue in upgrading due to a misconfigured Docker image for airgapped systems in 2.13.19. The Helm chart contained an error, and the image was not used even though it was packaged as part of the tar file.
    RUN-12928, RUN-12968 Fixed an issue in upgrading Prometheus due to a misconfigured image for airgapped systems in 2.13.19. The Helm chart contained an error, and the image was not used even though it was packaged as part of the tar file.
    RUN-12751 Fixed an issue where upgrading from 2.9 to 2.13 resulted in a missing engine-config file.
    RUN-12717 Fixed an issue where a user logged in as a researcher manager could not see the clusters.
    RUN-12642 Fixed an issue where assets-sync could not restart due to failing to get a token from the control plane.
    RUN-12191 Fixed an issue where there was a timeout while waiting for the runai_allocated_gpu_count_per_project metric to return values.
    RUN-10474 Fixed an issue where the runai-container-toolkit-exporter DaemonSet failed to start.

    Version 2.13.19 - September 27, 2023

    Release content

    • Added the ability to identify Kubeflow notebooks and display them in the Jobs table.
    • Added the ability to schedule Kubeflow workloads.
    • Added functionality that displays Jobs that only belong to the user that is logged in.
    • Added and refined alerts to the state of Run:ai components, schedule latency, and warnings for out of memory on Jobs.
    • Added the ability to work with restricted PSA policy.

    Fixed issues

    Internal ID Description
    RUN-12650 Fixed an issue where the analytics GPU ALLOCATION PER NODE panel used an incorrect metric. The allocation is now correctly shown as a percentage.
    RUN-12602 Fixed an issue in runaiconfig where the WorkloadServices spec's memory requests/limits and CPU requests/limits were overwritten with the system defaults.
    RUN-12585 Fixed an issue where the workload-controller caused a delay in running jobs.
    RUN-12031 Fixed an issue when upgrading from 2.9 to 2.13 where the Scheduler pod failed to upgrade due to a change of owner.
    RUN-11091 Fixed an issue where, when the Departments feature is disabled, non-preemptible jobs could not be scheduled.

    Version 2.13.13

    Internal ID Description
    RUN-11321 Fixed an issue where metrics always showed CPU Memory Utilization and CPU Compute Utilization as 0.
    RUN-11307 Fixed an issue where node affinity might change midway through a job. Node affinity is now calculated only once at job submission.
    RUN-11129 Fixed an issue where CRDs are not automatically upgraded when upgrading from 2.9 to 2.13.

    Version 2.13.12 - August 7, 2023

    Internal ID Description
    RUN-11476 Fixed an issue with analytics node pool filter in Allocated GPUs per Project panel.

    Version 2.13.11

    Internal ID Description
    RUN-11408 Added two configurable parameters, QPS and Burst, to the Run:ai job-controller, which are applied as environment variables in the job-controller Deployment object.

    Version 2.13.7 - July 2023

    Release content

    • Added filters to the historic quota ratio widget on the Quota management dashboard.

    Fixed issues

    Internal ID Description
    RUN-11080 Fixed an issue in OpenShift environments where logging in via SSO with the kubeadmin user resulted in blank pages for every page.
    RUN-11119 Fixed an issue where values that should be in the Order of priority column were in the wrong column.
    RUN-11120 Fixed an issue where the Projects table does not show correct metrics when Run:ai version 2.13 is paired with a Run:ai 2.8 cluster.
    RUN-11121 Fixed an issue where the wrong over quota memory alert is shown in the Quota management pane in project edit form.
    RUN-11272 Fixed an issue in OpenShift environments where the selection in the cluster drop down in the main UI does not match the cluster selected on the login page.

    Version 2.13.4

    Release date

    July 2023

    Fixed issues

    Internal ID Description
    RUN-11089 Fixed an issue where, when creating an environment, commands in the Runtime settings pane were not persistent and could not be found in other assets (for example, in a new Training).

    Version 2.13.1 - July 2023

    Release content

    • Made an improvement so that occurrences of labels that are not in use anymore are deleted.

    Fixed issues

    N/A

    \ No newline at end of file diff --git a/v2.20/home/changelog/hotfixes-2-15/index.html b/v2.20/home/changelog/hotfixes-2-15/index.html index fe48ed25aa..b0e1c6abfd 100644 --- a/v2.20/home/changelog/hotfixes-2-15/index.html +++ b/v2.20/home/changelog/hotfixes-2-15/index.html @@ -1,4 +1,4 @@ - Changelog Version 2.15 - Run:ai Documentation Library

    Hotfixes for 2.15

    The following is a list of the known and fixed issues for Run:ai V2.15.

    Version 2.15.9 - February 5, 2024

    Fixed issues

    Internal ID Description
    RUN-15296 Fixed an issue where the resources parameter was deprecated in the Projects and Departments API.

    Version 2.15.4 - January 5, 2024

    Fixed issues

    Internal ID Description
    RUN-15026 Fixed an issue in workloads that were built on a cluster that does not support the NFS field.
    RUN-14907 Fixed an issue after an upgrade where the Analytics dashboard was missing the time ranges from before the upgrade.
    RUN-14903 Fixed an issue where internal operations were exposed to the customer audit log.
    RUN-14062 Fixed an issue in the Overview dashboard where the content for the Running Workload per Type panel did not fit.

    Version 2.15.2 - February 5, 2024

    Fixed issues

    Internal ID Description
    RUN-14434 Fixed an issue where the Allocated GPUs metric was multiplied by seven.

    Version 2.15.1 - December 17, 2023

    Release content

    • Added environment variables for customizable QPS and burst support.

    • Added the ability to support running multiple Prometheus replicas.

    Fixed issues

    Internal ID Description
    RUN-14292 Fixed an issue where BCM installations were failing due to missing create cluster permissions.
    RUN-14289 Fixed an issue where metrics were not working due to an incorrect parameter in the cluster-config file.
    RUN-14198 Fixed an issue in services where multi nodepool jobs were not scheduled due to an unassigned nodepool status.
    RUN-14191 Fixed an issue where a consolidation failure would cause unnecessary evictions.
    RUN-14154 Fixed an issue in the New cluster form, where the dropdown listed versions that were incompatible with the installed control plane.
    RUN-13956 Fixed an issue in the Jobs table where templates were not edited successfully.
    RUN-13891 Fixed an issue where Ray job statuses were shown as empty.
    RUN-13825 Fixed an issue where GPU sharing configmaps were not deleted.
    RUN-13628 Fixed an issue where the pre-install pod failed to run pre-install tasks due to the request being denied (Unauthorized).
    RUN-13550 Fixed an issue where environments were not recovering from a node restart due to a missing GPU runtime class for containerized nodes.
    RUN-11895 Fixed an issue where the wrong amount of GPU memory usage was shown (now shown in MB).
    RUN-11681 Fixed an issue in OpenShift environments where some metrics were not shown on dashboards when the GPU Operator from the RedHat marketplace was installed.

    Version 2.15.0

    Fixed issues

    Internal ID Description
    RUN-13456 Fixed an issue where the Researcher L1 role did not have permissions to create and manage credentials.
    RUN-13282 Fixed an issue where Workspace logs crashed unexpectedly after restarting.
    RUN-13121 Fixed an issue where jobs could not be launched using the API, after an upgrade overrode a Keycloak change for applications that have a custom mapping to an email.
    RUN-13103 Fixed an issue in the Workspaces and Trainings table where the action buttons were not greyed out for users with only the view role.
    RUN-12993 Fixed an issue where Prometheus was reporting metrics even though the cluster was disconnected.
    RUN-12978 Fixed an issue after an upgrade, where permissions fail to sync to a project due to a missing application name in the CRD.
    RUN-12900 Fixed an issue in the Projects table where, when sorting by Allocated GPUs, projects were sorted alphabetically rather than numerically.
    RUN-12846 Fixed an issue after a control-plane upgrade, where GPU, CPU, and Memory Cost fields (in the Consumption Reports) were missing when not using Grafana.
    RUN-12824 Fixed an issue where airgapped environments tried to pull an image from gcr.io (Internet).
    RUN-12769 Fixed an issue where SSO users were unable to see projects in Job Form unless the group they belong to was added directly to the project.
    RUN-12602 Fixed an issue in the documentation where the WorkloadServices configuration in the runaiconfig file was incorrect.
    RUN-12528 Fixed an issue where the Workspace duration scheduling rule was suspending workspaces regardless of the configured duration.
    RUN-12298 Fixed an issue where projects were not shown in the Projects table due to the API not sanitizing the project name at time of creation.
    RUN-12157 Fixed an issue where querying pods completion time returned a negative number.
    RUN-10560 Fixed an issue where no Prometheus alerts were sent due to a misconfiguration of the parameter RunaiDaemonSetRolloutStuck.


    Hotfixes for 2.16

    The following is a list of the known and fixed issues for Run:ai V2.16.

    Version 2.16.65

    Internal ID Description
    RUN-21448 Fixed an issue with degraded workloads so that the condition reflects the actual state.
    RUN-20680 Fixed an issue where the workload page did not present the requested GPU.

    Version 2.16.57

    Internal ID Description
    RUN-20388 Fixed an issue where cluster-sync caused a memory leak.

    Version 2.16.25

    Internal ID Description
    RUN-17241 Fixed an issue where the nodes page showed nodes as not ready due to "toolkit not installed".

    Version 2.16.21

    Internal ID Description
    RUN-16463 Fixed an issue after a cluster upgrade to v2.16, where some metrics of pre-existing workloads were displayed incorrectly in the Overview Dashboard.

    Version 2.16.18

    Internal ID Description
    RUN-16486 Fixed an issue in the Workloads creation form where the GPU fields of the compute resource tiles were showing no data.

    Version 2.16.16

    Internal ID Description
    RUN-16340 Fixed an issue in the Workloads table where filters were not saved correctly.

    Version 2.16.15

    Release content

    • Implemented a new Workloads API to support the Workloads feature.

    Fixed issues

    Internal ID Description
    RUN-16070 Fixed an issue where missing metrics caused the Nodepools table to appear empty.

    Version 2.16.14

    Release content

    • Improved overall performance by reducing the metrics update frequency from every 10 seconds to every 30 seconds.

    Fixed issues

    Internal ID Description
    RUN-16255 Fixed an issue in the Analytics dashboard where the GPU Allocation per Node and GPU Memory Allocation per Node panels were displaying incorrect data.
    RUN-16035 Fixed an issue in the Workloads table where completed pods continued to be counted in the requested resources column.

    Version 2.16.12

    Fixed issues

    Internal ID Description
    RUN-16110 Fixed an issue where creating a training workload (single or multi-node) with a new PVC or Volume, resulted in the Workloads table showing the workload in the Unknown/Pending status.
    RUN-16086 Fixed an issue in airgapped environments where incorrect installation commands were shown when upgrading to V2.15.

    Version 2.16.11

    N/A

    Version 2.16.9

    N/A

    Version 2.16.8

    Release content

    N/A

    Version 2.16.7

    Release content

    • Added an API endpoint that retrieves data from a workload's pod.

    Fixed issues

    N/A

    Version 2.16.6

    N/A


    Hotfixes for 2.17

    The following is a list of the known and fixed issues for Run:ai V2.17.

    Version 2.17.63

    Internal ID Description
    RUN-21448 Fixed an issue where a degraded workload was stuck and could not be released.

    Version 2.17.46

    Internal ID Description
    RUN-20136 Updated the PostgreSQL version.

    Version 2.17.43

    Internal ID Description
    RUN-19949 Fixed an issue where runai submit arguments were not parsed correctly to the command.

    Version 2.17.41

    Internal ID Description
    RUN-19870 Added debug logs to cluster-sync

    Version 2.17.26

    Internal ID Description
    RUN-19189 Fixed an issue in cluster-sync that sometimes caused an unnecessary sync process to the control plane.

    Version 2.17.25

    Internal ID Description
    RUN-16357 Fixed an issue where the Project button in the Jobs screen redirected to the Projects page of the wrong cluster.

    Version 2.17.10

    Internal ID Description
    RUN-18065 Fixed an issue where the legacy job submission configuration was not available in the Settings page.

    Version 2.17.0

    Internal ID Description
    RUN-20010 Fixed an issue with the reduced permissions that Run:ai grants users.


    Hotfixes for 2.18

    The following is a list of the known and fixed issues for Run:ai V2.18.

    Hotfixes

    Internal ID Hotfix # Description
    RUN-24521 2.18.83 Fixed a security vulnerability in golang.org/x/crypto (CVE-2024-45337, severity HIGH).
    RUN-24733 2.18.83 Fixed an issue where department admins were unable to load the quota management page.
    RUN-25094 2.18.82 Fixed an issue where OpenShift could not be upgraded due to a broken 3rd-party binary.
    RUN-24921 2.18.80 Fixed a security vulnerability in golang.org/x/net and golang.org/x/crypto.
    RUN-24632 2.18.80 Fixed an issue where an existing monitoring Prometheus setup deployed in an unexpected namespace was reported as missing, causing Run:ai installation to fail on the cluster. The installation mechanism now searches for the monitoring prerequisite in additional relevant namespaces.
    RUN-24693 2.18.80 Fixed an issue where users were unable to provide metric store authentication details using secret references.
    RUN-24752 2.18.79 Fixed an issue where a workload would move to a failed state when created with a custom NodePort that was already allocated.
    RUN-24649 2.18.79 Fixed an issue where submitting a workload with existingPvc=false and not providing a claimName resulted in auto-generating a claimName that included both upper and lower case letters. Since Kubernetes rejects uppercase letters, the workload would fail. The behavior has been updated to generate names using only lowercase letters (see the sketch after this table).
    RUN-24595 2.18.78 Fixed an issue where the new CLI did not parse master and worker commands/args simultaneously for distributed workloads.
    RUN-23914 2.18.78 Fixed an issue where unexpected behavior could occur if an application was capturing a graph while memory was being swapped in as part of the GPU memory swap feature.
    RUN-24020 2.18.77 Fixed a security vulnerability in k8s.io/kubernetes (CVE-2024-0793).
    RUN-24021 2.18.77 Fixed a security vulnerability in pam (CVE-2024-10963).
    RUN-23798 2.18.75 Fixed an issue in distributed PyTorch workloads where the worker pods were deleted immediately after completion, preventing logs from being viewed.
    RUN-23838 2.18.74 Fixed an issue where the command-line interface could not access resources when configured as single-sign on in a self-hosted environment.
    RUN-23561 2.18.74 Fixed an issue where the frontend in airgapped environment attempted to download font resources from the internet.
    RUN-23789 2.18.73 Fixed an issue where in some cases, it was not possible to download the latest version of the command line interface.
    RUN-23790 2.18.73 Fixed an issue where in some cases it was not possible to download the Windows version of the command line interface.
    RUN-23855 2.18.73 Fixed an issue where the pods list in the UI showed past pods.
    RUN-23909 2.18.73 Fixed an issue where users with group-based permissions could not see dashboards.
    RUN-23857 2.18.72 Transitioned dashboards from Grafana v9 to v10.
    RUN-24010 2.18.72 Fixed an infinite loop issue in the cluster-sync service.
    RUN-23040 2.18.72 Fixed an edge case where the Run:ai container toolkit hangs when a user spawns hundreds of sub-processes.
    RUN-23802 2.18.70 Fixed an issue where new scheduling rules were not applied to existing workloads, if those new rules were set on existing projects which had no scheduling rules before.
    RUN-23211 2.18.70 Fixed an issue where workloads were stuck at "Pending" when the command-line interface flag --gpu-memory was set to zero.
    RUN-23778 2.18.68 Fixed an issue where in single-sign-on configuration, the mapping of UID and other properties would sometimes disappear.
    RUN-23762 2.18.68 Fixed an issue where the wrong version of a Grafana dashboard was displayed in the UI.
    RUN-21198 2.18.66 Fixed an issue where creating a training workload via yaml (kubectl apply -f) and specifying spec.namePrefix, created infinite jobs.
    RUN-23541 2.18.65 Fixed an issue where in some cases workload authorization did not work properly due to wrong oidc configuration.
    RUN-23291 2.18.64 Changed CLI text to be more user friendly.
    RUN-23283 2.18.64 Fixed a permissions issue with the Analytics dashboard post upgrade for SSO users.
    RUN-23420 2.18.63 Replaced Redis with KeyDB.
    RUN-23140 2.18.63 Fixed an issue where distributed workloads were created with the wrong types.
    RUN-23130 2.18.63 Fixed an issue where inference-workload-controller crashed when WorkloadOwnershipProtection was enabled.
    RUN-23334 2.18.62 Updated core Dockerfiles to ubi9.
    RUN-23296 2.18.62 Fixed an issue in the CLI where runai attach did not work with auto-complete.
    RUN-23215 2.18.62 Fixed an issue where metrics requests from backend to mimir failed for certain tenants.
    RUN-22138 2.18.62 Fixed an issue where private URL user(s) input was an email and not a string.
    RUN-23282 2.18.61 CLI documentation fixes.
    RUN-23055 2.18.60 Fixed the unified Distributed and Training CLI commands.
    RUN-23243 2.18.59 Fixed an issue where the scope tree wasn't calculating permissions correctly.
    RUN-22463 2.18.59 Fixed an error in the CLI bash command.
    RUN-22314 2.18.59 Fixed distributed framework filtering in API commands.
    RUN-23142 2.18.58 Fixed an issue where advanced per-GPU metrics did not have a GPU label.
    RUN-23001 2.18.58 Fixed an issue of false overcommit on out-of-memory kills in the “swap” feature.
    RUN-22851 2.18.58 Fixed an issue where the client may get stuck on a device lock acquired during “swap” out-migration.
    RUN-22758 2.18.58 Fixed an issue where inference workload showed wrong status when submission failed.
    RUN-22544 2.18.58 Updated Grafana version for security vulnerabilities.
    RUN-23055 2.18.57 Fixed the unified Distributed and Training CLI commands
    RUN-23014 2.18.56 Fixed an issue where node-scale-adjuster might not create a scaling pod if it is in cool-down and the pod was not updated after that.
    RUN-22660 2.18.56 Fixed an issue where workload charts have an unclear state
    RUN-22457 2.18.55 Fixed an issue where in rare edge cases the cluster-sync pod was out of memory.
    RUN-21825 2.18.55 Fixed all CVEs in Run:ai's Goofys-based image used for S3 integration.
    RUN-22871 2.18.55 Fixed an issue in runai-container-toolkit where in certain cases when a process is preempted, OOMKill metrics were not published correctly.
    RUN-22250 2.18.55 Fixed an issue where workloads trying to use an ingress URL which is already in use were behaving inconsistently instead of failing immediately.
    RUN-22880 2.18.55 Fixed an issue where the minAvailable field for training-operator CRDs did not consider all possible replica specs.
    RUN-22073 2.18.55 Fixed an issue where runai-operator failed to parse cluster URLs ending with '/'.
    RUN-22453 2.18.55 Fixed an issue where in rare edge cases the workload-overseer pod experienced a crash.
    RUN-22763 2.18.55 Fixed an issue where in rare edge cases an 'attach' command from CLI-V2 caused a crash in the cluster-api service.
    RUN-21948 2.18.49 Fixed an issue where in rare edge cases workload child resources could have duplicate names, causing inconsistent behavior.
    RUN-22623 2.18.49 Fixed an issue in Openshift where workloads were not suspended when reaching their idle GPU time limit.
    RUN-22600 2.18.49 Fixed an issue in AWS EKS clusters where the V1-CLI returned an empty table when listing all projects as an administrator.
    RUN-21878 2.18.49 Added a label (run.ai/container-toolkit-enabled) to disable the container toolkit from running on certain nodes.
    RUN-22452 2.18.47 Fixed an issue where the scheduler had signature errors if TopologySpreadConstraints was partially defined.
    RUN-22570 2.18.47 Updated git-sync image to version v4.3.0.
    RUN-22054 2.18.46 Fixed an issue where users could not attach to jobs.
    RUN-22377 2.18.46 Removed uncached client from accessrule-controller.
    RUN-21697 2.18.46 Fixed an issue where client may deadlock on suspension during allocation request.
    RUN-20073 2.18.45 Fixed an issue where it wasn't possible to authenticate with user credentials in the CLI.
    RUN-21957 2.18.45 Fixed an issue where there was a missing username-loader container in inference workloads.
    RUN-22276 2.18.39 Fixed an issue where Knative external URL was missing from the Connections modal.
    RUN-22280 2.18.39 Fixed an issue when setting scale to zero - there was no pod counter in the Workload grid.
    RUN-19811 2.18.39 Added an option to set k8s tolerations to run:ai daemonsets (container-toolkit, runai-device-plugin, mig-parted, node-exporter, etc.).
    RUN-22128 2.18.39 Added GID, UID, Supplemental groups to the V1 CLI.
    RUN-21800 2.18.37 Fixed an issue with old workloads residing in the cluster.
    RUN-21907 2.18.34 Fixed an issue where the SSO user credentials contained supplementary groups as a string instead of an int.
    RUN-21272 2.18.31 Fixed an issue with multi-cluster credentials creation, specifically with the same name in different clusters.
    RUN-20680 2.18.29 Fixed an issue where the Workloads page did not present the requested GPU.
    RUN-21200 2.18.29 Fixed issues with upgrades and connections from v2.13.
    RUN-20970 2.18.27 Fixed an issue with PUT APIs.
    RUN-20927 2.18.26 Fixed an issue where node affinity was not updated correctly in projects edit.
    RUN-20084 2.18.26 Fixed an issue where the default department was deleted instead of a message being displayed.
    RUN-21062 2.18.26 Fixed issues with the API documentation.
    RUN-20434 2.18.25 Fixed an issue where creating a Project/Department with memory resources required 'units'.
    RUN-20923 2.18.25 Fixed an issue with projects/departments page loading slowly.
    RUN-19872 2.18.23 Fixed an issue where the Toolkit crashes and fails to create and replace the publishing binaries.
    RUN-20861 2.18.22 Fixed an issue where a pod is stuck on pending due to a missing resource reservation pod.
    RUN-20842 2.18.22 Fixed an issue of illegal model name with "." in hugging face integration.
    RUN-20791 2.18.22 Fixed an issue where notifications froze after startup.
    RUN-20865 2.18.22 Fixed an issue where default departments are not deleted when a cluster is deleted.
    RUN-20698 2.18.21 Fixed an issue where two processes requesting a device at the same time received the same GPU, causing failures.
    RUN-20760 2.18.18 Fixed an issue where the workload protection UI showed the wrong status.
    RUN-20612 2.18.15 Fixed an issue where it was impossible with the use-table-data to hide node pool columns when there is only one default node pool.
    RUN-20735 2.18.15 Fixed an issue where nodePool.name was undefined.
    RUN-20721 2.18.12 Added error handling to nodes pages.
    RUN-20578 2.18.10 Fixed an issue regarding policy enforcement.
    RUN-20188 2.18.10 Fixed issue with defining SSO in OpenShift identity provider.
    RUN-20673 2.18.9 Fixed an issue where, when a researcher used a distributed elastic job, a specific flow could schedule it on more than one node pool.
    RUN-20360 2.18.7 Fixed an issue where the workload network status was misleading.
    RUN-22107 2.18.7 Fixed an issue where passwords containing $ were removed from the configuration.
    RUN-20510 2.18.5 Fixed an issue with external workloads - argocd workflow failed to be updated.
    RUN-20516 2.18.4 Fixed an issue where, after deploying to production, the cluster-service and authorization-service were OOMKilled multiple times every ~1 hour.
    RUN-20485 2.18.2 Changed policy flags to Beta.
    RUN-20005 2.18.1 Fixed an issue where a sidecar container failure failed the workload.
    RUN-20169 2.18.1 Fixed an issue allowing the addition of annotations and labels to workload resources.
    RUN-20108 2.18.1 Fixed an issue exposing service node ports to workload status.
    RUN-20160 2.18.1 Fixed an issue with version display when installing a new cluster in an airgapped environment.
    RUN-19874 2.18.1 Fixed an issue where, when copying and editing a workload with group access to a tool, the group wasn't removed when selecting the users option.
    RUN-19893 2.18.1 Fixed an issue where using a float number in the custom scale-to-zero inactivity value sometimes caused the submission to fail.
    RUN-20087 2.18.1 Fixed an issue where inference graphs should be displayed only for minimum cluster versions.
    RUN-10733 2.18.1 Fixed an issue where we needed to minify and obfuscate our code in production.
    RUN-19962 2.18.1 Fixed an issue to fix sentry domains regex and map them to relevant projects.
    RUN-20104 2.18.1 Fixed an issue where an infinite loop in the frontend on Keycloak caused an error.
    RUN-19906 2.18.1 Fixed an issue where inference workload name validation failed with a 2.16 cluster.
    RUN-19605 2.18.1 Fixed an issue where authorized users should support multiple users (workload-controller).
    RUN-19903 2.18.1 Fixed an issue where inference chatbot creation failed with a 2.16 cluster.
    RUN-20409 2.18.1 Fixed an issue where clicking on create new compute during the runai model flow did nothing.
    RUN-11224 2.18.1 Fixed an issue where runai-adm collect all logs was not collecting all logs.
    RUN-20478 2.18.1 Improved workloads error status in overview panel.
    RUN-19850 2.18.1 Fixed an issue where an application administrator could not submit a job with CLI.
    RUN-19863 2.18.1 Fixed an issue where department admin received 403 on get tenants and cannot login to UI.
    RUN-19904 2.18.1 Fixed an issue when filtering by allocatedGPU in get workloads with operator returns incorrect result.
    RUN-19925 2.18.1 Fixed an issue where upgrading from v2.16 to v2.18 failed on workloads migrations.
    RUN-19887 2.18.1 Fixed an issue in the UI where, when there is a scheduling rule of timeout, the form opened with the rules collapsed and showing "none".
    RUN-19941 2.18.1 Fixed an issue where completed and failed jobs were shown in view pods in nodes screen.
    RUN-19940 2.18.1 Fixed an issue where setting the GPU quota failed because the department quota was taken from the wrong department.
    RUN-19890 2.18.1 Fixed an issue where editing a project by removing its node-affinity got stuck updating.
    RUN-20120 2.18.1 Fixed an issue where project update fails when there is no cluster version.
    RUN-20113 2.18.1 Fixed an issue in the Workloads table where a researcher does not see other workloads once they clear their filters.
    RUN-19915 2.18.1 Fixed an issue where turning the departments toggle on in cluster v2.11+ set the GPU limit to -1 and caused a UI error.
    RUN-20178 2.18.1 Fixed an issue where dashboard CPU tabs appeared in new overview.
    RUN-20247 2.18.1 Fixed an issue where you couldn't create a workload with namespace of a deleted project.
    RUN-20138 2.18.1 Fixed an issue where the system failed to create node-type on override-backend env.
    RUN-18994 2.18.1 Fixed an issue where some limitations for department administrator are not working as expected.
    RUN-19830 2.18.1 Resource units (GPU, CPU, Memory) were added to the k8s events published by the Run:ai scheduler, making the messages more readable.
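
    The RUN-24649 fix above relies on the fact that Kubernetes object names must be lowercase RFC 1123 names. The following is a minimal sketch of that kind of lowercase-only name generation, under the assumption that the claim name is derived from the workload name plus a random suffix; it is illustrative only and not the Run:ai implementation.

    // Minimal sketch (assumption, not the Run:ai implementation): auto-generating
    // a claimName using only characters that Kubernetes accepts in object names.
    package main

    import (
        "fmt"
        "math/rand"
        "strings"
    )

    // Lowercase letters and digits only, so the generated suffix is always a
    // valid part of an RFC 1123 name.
    const nameAlphabet = "abcdefghijklmnopqrstuvwxyz0123456789"

    // generateClaimName derives a claim name from the workload name plus a
    // random lowercase suffix, e.g. "my-workload-pvc-x7k2q".
    func generateClaimName(workloadName string) string {
        suffix := make([]byte, 5)
        for i := range suffix {
            suffix[i] = nameAlphabet[rand.Intn(len(nameAlphabet))]
        }
        // Lowercasing the base name avoids the uppercase rejection described above.
        return fmt.Sprintf("%s-pvc-%s", strings.ToLower(workloadName), suffix)
    }

    func main() {
        fmt.Println(generateClaimName("My-Workload"))
    }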

    Version 2.18.0 Fixes

    Internal ID Description
    RUN-20734 Fixed an issue where the enable/disable toggle for the feature presented the wrong information.
    RUN-19895 Fixed an issue where deleted workloads showed an incorrect empty state.
    RUN-19507 Fixed an issue in V1 where GET APIs were missing a required field in Swagger, leading to omit-empty behavior.
    RUN-20246 Fixed an issue in Departments v1 org unit where if unrecognizable params are sent, an error is returned.
    RUN-19947 Fixed an issue where pending multi-nodepool podgroups got stuck after cluster upgrade.
    RUN-20047 Fixed an issue where the workload status showed as "deleting" rather than "deleted" in the side panel.
    RUN-20163 Fixed an issue where, when a DV is shared with a department and a new project is added to that department, no PVC/PV was created.
    RUN-20484 Fixed an issue where Create Project requests returned 500 - services is not a valid ResourceType.
    RUN-20354 Fixed an issue where deleting a department with projects resulted in the projects remaining in the environment with the status NotReady.


    Hotfixes for 2.19

    The following is a list of the known and fixed issues for Run:ai V2.19.

    Hotfixes

    Internal ID Hotfix # Description
    RUN-17284 2.19.49 Fixed an issue where workloads were suspended when set with the termination after preemption option.
    RUN-25290 2.19.49 Fixed a security vulnerability in golang.org/x/net v0.33.0 with CVE-2024-45338 with severity HIGH.
    RUN-25234 2.19.49 Fixed security vulnerabilities by updating oauth2 proxy image to the latest.
    RUN-25234 2.19.48 Fixed an authentication issue in CLI V1.
    RUN-25062 2.19.45 Fixed a security vulnerability in github.com/go-git/go-git/v5 (CVE-2025-21614, severity HIGH).
    RUN-25061 2.19.45 Fixed a security vulnerability in github.com/go-git/go-git/v5 (CVE-2025-21613, severity HIGH).
    RUN-24857 2.19.45 Fixed a security vulnerability in golang.org/x/net (CVE-2024-45338, severity HIGH).
    RUN-24733 2.19.45 Fixed an issue where users were unable to load the quota management dashboard.
    RUN-25094 2.19.44 Fixed an issue where OpenShift could not be upgraded due to a broken 3rd-party binary.
    RUN-24026 2.19.40 Fixed a security vulnerability in krb5-libs (CVE-2024-3596).
    RUN-24649 2.19.40 Fixed an issue where submitting a workload with existingPvc=false and not providing a claimName resulted in auto-generating a claimName that included both upper and lower case letters. Since Kubernetes rejects uppercase letters, the workload would fail. The behavior has been updated to generate names using only lowercase letters.
    RUN-24632 2.19.40 Fixed an issue where an existing Prometheus monitoring setup deployed in an unexpected namespace was reported as missing, causing Run:ai installation to fail on the cluster. The installation mechanism now searches for the monitoring prerequisite in additional relevant namespaces.
    RUN-24693 2.19.40 Fixed an issue where users were unable to provide metric store authentication details using secret references.
    RUN-23744 2.19.40 Fixed an issue where refreshing some pages (such as the settings, policy, and access rules) removed the side navigation.
    RUN-24715 2.19.40 Fixed an issue in the templates form where selecting Secret as a data source got stuck in an infinite loading page.
    RUN-24831 2.19.40 Fixed an issue where some edge cases triggered consolidation without it actually being necessary.
    RUN-24873 2.19.40 Fixed an issue where users were unable to configure email notifications regarding workload statuses.
    RUN-24921 2.19.40 Fixed a security vulnerability in golang.org/x/net and golang.org/x/crypto.
    RUN-23914 2.19.38 Fixed an issue where unexpected behavior could occur if an application was capturing a graph while memory was being swapped in as part of the GPU memory swap feature.
    RUN-24521 2.19.36 Fixed a security vulnerability in golang.org/x/crypto (CVE-2024-45337, severity HIGH).
    RUN-24595 2.19.36 Fixed an issue where the new command-line interface did not parse master and worker commands/args simultaneously for distributed workloads.
    RUN-24565 2.19.34 Fixed an issue where the UI was hanging at times during Hugging Face model memory calculation.
    RUN-24021 2.19.33 Fixed a security vulnerability in pam with CVE-2024-10963.
    RUN-24506 2.19.33 Fixed a security vulnerability in krb5-libs with CVE-2024-3596.
    RUN-24259 2.19.31 Fixed an issue where the option to reset a local user password is sometimes not available.
    RUN-23798 2.19.30 Fixed an issue in distributed PyTorch workloads where the worker pods were deleted immediately after completion, preventing logs from being viewed.
    RUN-24184 2.19.28 Fixed an issue in database migration when upgrading from 2.16 to 2.19.
    RUN-23752 2.19.27 Fixed an issue in the distributed training submission form when a policy on the master pod was applied.
    RUN-23040 2.19.27 Fixed an edge case where the Run:ai container toolkit hangs when a user spawns hundreds of sub-processes.
    RUN-23211 2.19.27 Fixed an issue where workloads were stuck at "Pending" when the command-line interface flag --gpu-memory was set to zero.
    RUN-23561 2.19.27 Fixed an issue where the frontend in airgapped environment attempted to download font resources from the internet.
    RUN-23789 2.19.27 Fixed an issue where in some cases, it was not possible to download the latest version of the command-line interface.
    RUN-23790 2.19.27 Fixed an issue where in some cases it was not possible to download the Windows version of the command-line interface.
    RUN-23802 2.19.27 Fixed an issue where new scheduling rules were not applied to existing workloads, if those new rules were set on existing projects which had no scheduling rules before.
    RUN-23838 2.19.27 Fixed an issue where the command-line interface could not access resources when configured as single-sign on in a self-hosted environment.
    RUN-23855 2.19.27 Fixed an issue where the pods list in the UI showed past pods.
    RUN-23857 2.19.27 Transitioned dashboards from Grafana v9 to v10.
    RUN-24010 2.19.27 Fixed an infinite loop issue in the cluster-sync service.
    RUN-23669 2.19.25 Fixed an issue where the export function of the Consumption Grafana dashboard was not showing.
    RUN-23778 2.19.24 Fixed an issue where the mapping of UID and other properties disappeared.
    RUN-23770 2.19.24 Fixed an issue where the older overview dashboard did not filter by cluster, even though a cluster was selected.
    RUN-23762 2.19.24 Fixed an issue where the wrong version of a Grafana dashboard was displayed in the UI.
    RUN-23752 2.19.24 Fixed an issue in the distributed training submission form when a policy on the master pod was applied.
    RUN-23664 2.19.24 Fixed an issue where the GPU quota numbers on the department overview page did not match the department edit page.
    RUN-21198 2.19.22 Fixed an issue where creating a training workload via YAML (kubectl apply -f) and specifying spec.namePrefix created infinite jobs.
    RUN-23583 2.19.21 Fixed an issue where the new UI navigation bar sometimes showed multiple selections.
    RUN-23541 2.19.21 Fixed an issue where authorization was not working properly in SaaS due to wrong oidc URL being used.
    RUN-23376 2.19.21 Fixed an issue where the new command-line interface required re-login after 10 minutes.
    RUN-23162 2.19.21 Fixed an issue where older audit logs did not show on the new audit log UI.
    RUN-23385 2.19.20 Fixed an issue where calls to api/v1/notifications/config/notifications would return a 502 error.
    RUN-23382 2.19.20 Fixed an issue where all nodepools were deleted on cluster upgrade.
    RUN-23374 2.19.20 Fixed an issue where a "ghost" nodepool in project settings prevented workload creation via UI/API.
    RUN-23291 2.19.20 CLI - changed text to be more user friendly.
    RUN-23283 2.19.20 Fixed a permissions issue with the Analytics dashboard post upgrade for SSO users.
    RUN-23208 2.19.20 Upload the source map to Sentry only.
    RUN-22642 2.19.20 Added infw-controller service tests for the reconcile logic.
    RUN-23373 2.19.19 Fixed an issue where a new data source couldn't be created from the "New Workload" form.
    RUN-23368 2.19.19 Fixed an issue where the getProjects v1 API returned a list of users which was not always in the same order.
    RUN-23333 2.19.19 Fixed an issue where a node pool with overProvisioningRatio greater than 1 could not be created.
    RUN-23215 2.19.18 Fixed an issue where metrics requests from backend to mimir failed for certain tenants.
    RUN-23334 2.19.17 Updated some dockerfiles to the latest ubi9 image for security vulnerabilities.
    RUN-23318 2.19.16 Fixed an issue where some projects held faulty data, which caused the getProjectById API to fail.
    RUN-23140 2.19.16 Fixed an issue where distributed workloads were created with the wrong types.
    RUN-22069 2.19.16 Fixed an issue where JWT parsing with claims failed to parse a token without a Keyfunc.
    RUN-23321 2.19.15 Fixed an issue where the GetProjectById wrapper API of the org-unit client in the runai-common-packages ignored errors.
    RUN-23296 2.19.15 Fixed an issue in the CLI where runai attach did not work with auto-complete.
    RUN-23282 2.19.15 Fixed various CLI documentation issues.
    RUN-23245 2.19.15 Fixed an issue where the binder service didn't update the pod status.
    RUN-23057 2.19.15 Added OCP 2.19 upgrade troubleshooting guidance.
    RUN-22138 2.19.15 Fixed an issue where the private URL user(s) input was an email and not a string.
    RUN-23243 2.19.14 Fixed an issue where the scope tree wasn't calculating permissions correctly.
    RUN-23208 2.19.14 Upload the source map to Sentry only.
    RUN-23198 2.19.14 Fixed an issue where external-workload-integrator sometimes crashed for RayJob.
    RUN-23191 2.19.13 Fixed an issue where creating workloads in the UI returned only the first 50 projects.
    RUN-23142 2.19.12 Fixed an issue where advanced per-GPU metrics did not have a GPU label.
    RUN-23139 2.19.12 Fixed an issue where inference workload showed wrong status.
    RUN-23027 2.19.12 Deprecated the migProfiles API fields.
    RUN-23001 2.19.12 Fixed an issue of false overcommit on out-of-memory kills in the Swap feature.
    RUN-22851 2.19.12 Fixed an issue where a client could get stuck on a device lock acquired during “swap” out-migration.
    RUN-22771 2.19.12 Fixed an issue where getting a cluster by ID with metadata verbosity returned zero values.
    RUN-22742 2.19.12 Fixed a user experience issue in inference autoscaling.
    RUN-22725 2.19.12 Fixed an issue where the cloud operator failed to get pods in nodes UI.
    RUN-22720 2.19.12 Fixed an issue where the cloud operator failed to get projects in node pools UI.
    RUN-22700 2.19.12 Added auto refresh to the overview dashboard, Pods modal in the Workloads page, and Event history page
    RUN-22544 2.19.12 Updated Grafana version for security vulnerabilities.
    RUN-23083 2.19.11 Fixed an issue where workload actions were blocked in the UI when the cluster had any issues.
    RUN-22771 2.19.11 Fixed an issue where the getClusterById API with metadata verbosity returned zero values.

    Version 2.19.0 Fixes

    Internal ID Description
    RUN-21756 Fixed an issue where the NFS mount path did not accept “{}” characters.
    RUN-21475 Fixed an issue where users failed to select a compute resource from the UI if the compute resource was last in the list and had a long name.
    diff --git a/v2.20/home/changelog/hotfixes-2-20/index.html b/v2.20/home/changelog/hotfixes-2-20/index.html

    Hotfixes for 2.20

    This section provides details on all hotfixes available for version 2.20. Hotfixes are critical updates released between our major and minor versions to address specific issues or vulnerabilities. These updates ensure the system remains secure, stable, and optimized without requiring a full version upgrade.

    Hotfixes

    Version Date Internal ID Description
    2.20.15 24/01/2025 RUN-24354 Fixed an issue where migrating workloads failed due to a slow network connection.
    2.20.14 23/01/2025 RUN-24754 Fixed an issue where the status of training and interactive workloads was not updated correctly.
    2.20.14 23/01/2025 RUN-24838 Fixed an issue where an environment asset could not be created if it included an environment variable with no value specified.
    2.20.11 21/01/2025 RUN-25303 Fixed an issue where submitting with the --attach flag was supported only in a workspace workload.
    2.20.11 21/01/2025 RUN-25291 Fixed a security vulnerability in golang.org/x/net v0.33.0 with CVE-2024-45338 with severity HIGH.
    2.20.10 20/01/2025 RUN-25234 Fixed an authentication issue in CLI V1.
    2.20.9 19/01/2025 RUN-25032 Fixed an issue where inference workloads with large container sizes skipped the Initializing state.
    2.20.9 19/01/2025 RUN-24752 Fixed an issue where a workload would move to a failed state when created with a custom NodePort that was already allocated.
    2.20.9 19/01/2025 RUN-25031 Fixed an issue in the Templates form where existing credentials in the environment variables section were not displayed.
    2.20.5 14/01/2025 RUN-25061 Fixed a security vulnerability in github.com.go-git.go-git.v5 with CVE-2025-21613 with severity HIGH.

    diff --git a/v2.20/home/components/index.html b/v2.20/home/components/index.html

    Run:ai System Components

    Components

    Run:ai is made up of two components:

    • The Run:ai cluster provides scheduling services and workload management.
    • The Run:ai control plane provides resource management, Workload submission, and cluster monitoring.

    Technologically, both components are installed on top of a Kubernetes cluster.

    Run:ai users:

    • Researchers submit Machine Learning workloads via the Run:ai Console, the Run:ai Command-Line Interface (CLI), or directly by sending YAML files to Kubernetes (see the sketch after this list).
    • Administrators monitor and set priorities via the Run:ai User Interface.
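
    As a rough illustration of the CLI path, below is a minimal sketch of submitting a training workload with the Run:ai CLI. The image is the quickstart image listed in the Example Code section of this documentation; the project name team-a is a hypothetical placeholder.

        # select the project to submit into (project name is a placeholder)
        runai config project team-a

        # submit a training workload requesting a single GPU
        runai submit train1 -i runai.jfrog.io/demo/quickstart -g 1

    The same workload could equally be described in a YAML manifest and applied with kubectl, or submitted from the Run:ai Console.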

    (Diagram: multi-cluster architecture)

    Run:ai Cluster

    • Run:ai comes with its own Scheduler. The Run:ai scheduler extends the Kubernetes scheduler. It uses business rules to schedule workloads sent by Researchers.
    • Run:ai schedules Workloads. Workloads include the actual researcher code running as a Kubernetes container, together with all the system resources required to run the code, such as user storage, network endpoints to access the container etc.
    • The cluster uses an outbound-only, secure connection to synchronize with the Run:ai control plane. Information includes meta-data sync and various metrics on Workloads, Nodes etc.
    • The Run:ai cluster is installed as a Kubernetes Operator.
    • Run:ai is installed in its own Kubernetes namespace named runai.
    • Workloads are run in the context of Run:ai Projects. Each Project is mapped to a Kubernetes namespace with its own settings and access control.
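
    A quick way to see this layout on an installed cluster is to list the namespaces. This is a minimal sketch; it assumes the common convention that each Project namespace is prefixed with runai-, which may differ in your environment.

        # the runai namespace holds the Run:ai system components (operator, scheduler, etc.)
        # Project namespaces typically appear as runai-<project-name>
        kubectl get namespaces | grep runai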

    Run:ai Control Plane on the cloud

    The Run:ai control plane is used by multiple customers (tenants) to manage resources (such as Projects & Departments), submit Workloads, and monitor multiple clusters.

    A single Run:ai customer (tenant) defined in the control plane can manage multiple Run:ai clusters. This means a single customer can manage multiple GPU clusters in multiple locations/subnets from a single interface.

    Self-hosted Control-Plane

    The Run:ai control plane can also be locally installed. To understand the various installation options see the installation types document.

    diff --git a/v2.20/home/data-privacy-details/index.html b/v2.20/home/data-privacy-details/index.html

    Data Privacy

    This article details the data privacy and compliance considerations for deploying Run:ai. It is intended to help administrators and compliance teams understand the data management practices involved with Run:ai. This ensures the permissions align with organizational policies and regulatory requirements before installation and during integration and onboarding of the various teams.

    When using the Run:ai SaaS cluster, the Control plane operates through the Run:ai cloud, requiring the transmission of certain data for control and analytics. Below is a detailed breakdown of the specific data sent to the Run:ai cloud in the SaaS offering.

    Note

    For organizations where data privacy policies do not align with this data transmission, Run:ai offers a self-hosted version. This version includes the control plane on premise and does not communicate with the cloud.

    Data sent to the Run:ai cloud

    Asset Details
    Workload Metrics Includes workload names, CPU, GPU, and memory metrics, as well as parameters provided during the runai submit command.
    Workload Assets Covers environments, compute resources, and data resources associated with workloads.
    Resource Credentials Credentials for cluster resources, encrypted with a SHA-512 algorithm specific to each tenant.
    Node Metrics Node-specific data including names, IPs, and performance metrics (CPU, GPU, memory).
    Cluster Metrics Cluster-wide metrics such as names, CPU, GPU, and memory usage.
    Projects & Departments Includes names and quota information for projects and departments.
    Users User roles within Run:ai, email addresses, and passwords.

    Key consideration

    Run:ai ensures that no deep-learning artefacts, such as code, images, container logs, training data, models, or checkpoints, are transmitted to the cloud. These assets remain securely within your organization's firewalls, safeguarding sensitive intellectual property and data.

    See Also

    The Run:ai privacy policy.

    diff --git a/v2.20/home/overview/index.html b/v2.20/home/overview/index.html

    Run:ai Documentation Library

    Welcome to the Run:ai documentation area. For an introduction to the Run:ai platform, see Run:ai platform on the run.ai website.

    The Run:ai documentation targets four personas:

    • Infrastructure Administrator - An IT person, responsible for the installation, setup and IT maintenance of the Run:ai product. Infrastructure Administrator documentation can be found here.

    • Platform Administrator - Responsible for the day-to-day administration of the product. Platform Administrator documentation can be found here.

    • Researcher - Using Run:ai to spin up notebooks, submit Workloads, prompt models, etc. Researcher documentation can be found here.

    • Developer - Using various APIs to automate work with Run:ai. The Developer documentation can be found here.

    How to Get Support

    To get support use the following channels:

    • On the Run:ai user interface at <company-name>.run.ai, use the 'Contact Support' link on the top right.

    • Or submit a ticket by clicking the button below:

    Submit a Ticket

    Community

    Run:ai provides its customers with access to the Run:ai Customer Community portal to submit tickets, track ticket progress and update support cases.

    Customer Community Portal

    Reach out to customer support for credentials.

    Run:ai Cloud Status Page

    Run:ai cloud availability is monitored at status.run.ai.

    Collect Logs to Send to Support

    As an IT Administrator, you can collect Run:ai logs to send to support. For more information see logs collection.

    Example Code

    Code for the Docker images referred to on this site is available at https://github.com/run-ai/docs/tree/master/quickstart.

    The following images are used throughout the documentation:

    Image Description Source
    runai.jfrog.io/demo/quickstart Basic training image. Multi-GPU support https://github.com/run-ai/docs/tree/master/quickstart/main
    runai.jfrog.io/demo/quickstart-distributed Distributed training using MPI and Horovod https://github.com/run-ai/docs/tree/master/quickstart/distributed
    zembutsu/docker-sample-nginx Build (interactive) with Connected Ports https://github.com/zembutsu/docker-sample-nginx
    runai.jfrog.io/demo/quickstart-x-forwarding Use X11 forwarding from Docker image https://github.com/run-ai/docs/tree/master/quickstart/x-forwarding
    runai.jfrog.io/demo/pycharm-demo Image used for tool integration (PyCharm and VSCode) https://github.com/run-ai/docs/tree/master/quickstart/python%2Bssh
    runai.jfrog.io/demo/example-triton-client and runai.jfrog.io/demo/example-triton-server Basic Inference https://github.com/run-ai/models/tree/main/models/triton

    Contributing to the documentation

    This documentation is made better by individuals from our customer and partner community. If you see something worth fixing, please comment at the bottom of the page or create a pull request via GitHub. The public GitHub repository can be found on the top-right of this page.

    diff --git a/v2.20/home/saas-updates/index.html b/v2.20/home/saas-updates/index.html

    What's New for the Run:ai SaaS Platform

    The release notes aim to provide transparency into the latest changes and improvements to Run:ai’s SaaS platform. The updates include new features, optimizations, and fixes aimed at improving performance and user experience.

    Latest GA release notes (https://docs.run.ai/latest/home/whats-new-2-19/)

    Gradual Rollout

    SaaS features are gradually rolled out to customers over the course of a week to ensure a smooth transition and minimize any potential disruption.

    November Release

    Product Enhancements

    • The display of the default GPU quota for the default department has been updated. Previously, the GPU quota was shown as -1. It has now been changed to display as "-" for better clarity.
    • New permissions have been added for the Application Administrator role, enabling full CRUD (Create, Read, Update, Delete) capabilities for managing applications.

    Resolved Bugs

    ID Description
    RUN-23778 Resolved an issue where SAML mappers were displayed as null in the UI upon editing an Identity Provider (IdP). The mapper values now persist as expected, and associated attributes remain unchanged.
    RUN-23762 Fixed a bug that caused some customers to receive the incorrect version of the dashboard. This issue led to inconsistencies in the user interface and functionality, impacting affected users' ability to access the appropriate dashboard features.
    RUN-23735 Fixed an issue where the limit parameter on the Users page did not enforce the minimum value constraint. This allowed invalid values to be processed, potentially causing errors in pagination.
    RUN-23669 Consumption report: The Inspect feature in Grafana, which allows users to export consumption data from the portal, has been re-enabled.
    RUN-23664 An issue has been resolved where the GPU quota numbers displayed on the Department Overview page did not match the values shown on the Department Edit page.
    RUN-20116 An issue has been resolved where searching for certain pages in the UI only applied the search filter to the current page. Relevant tables are: Users, Applications, Workloads, Projects, Departments, Node pools.
    RUN-23575 Fixed an issue where the dynamic refresh did not properly preserve the user’s widget settings, causing them to reset to default values after each refresh cycle.
    RUN-23376 CLI v2: An issue was resolved where the runai logs command failed with a 401 Unauthorized error after a period of inactivity.
    RUN-23373 An issue where AWS storage classes were not appearing when creating a new data source within a new workload has been resolved. Previously, AWS storage classes were only visible when creating a data source directly from the Data Sources tab.

    What's New for the Run:ai SaaS Platform

    This What's New page provides transparency into the latest changes and improvements to Run:ai’s SaaS platform. The updates include new features, optimizations, and fixes aimed at improving performance and user experience.

    Latest GA release notes (https://docs.run.ai/latest/home/whats-new-2-20/)

    Gradual Rollout

    SaaS features are gradually rolled out to customers over the course of a week to ensure a smooth transition and minimize any potential disruption.

    February Release

    Product Enhancements

    • Workload Events API, /api/v1/workloads/{workloadId}/events, now supports the sort order parameter (asc, desc); see the example after this list.
    • MIG profile and MIG options are now marked as deprecated in CLI v2, following the deprecation notice in the last version.
    • As part of inference support in CLI v2, Knative readiness is now validated on submit requests.
    • Improved permission error messaging when attempting to delete a user with higher privileges.
    • Improved visibility of metrics in the Resources utilization widget by repositioning them above the graphs.
    • Added a new Idle workloads table widget to help users easily identify and manage underutilized resources.
    • Renamed and updated the "Workloads by type" widget to provide clearer insights into cluster usage with a focus on workloads.
    • Improved user experience by moving the date picker to a dedicated section within the overtime widgets, Resources allocation and Resources utilization.
    • Simplified configuration by enabling auto-creation of storage class for discovered storage classes.
    • Enhanced PVC underlying storage configuration by specifying allowed context for the selected storage (Workload Volume, PVC, both, or neither).
    • Added configurable grace period for workload preemption in CLI v2.
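
    As an example of the Workload Events API mentioned above, the following is a hedged sketch. The workload ID, token variable, and the exact name of the sort order query parameter are assumptions; check the API reference for the precise request format.

        # list events for a workload, newest first (parameter name is an assumption)
        curl -s "https://<company-name>.run.ai/api/v1/workloads/<workload-id>/events?sortOrder=desc" \
          -H "Authorization: Bearer $RUNAI_API_TOKEN"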

    Resolved Bugs

    ID Description
    RUN-24838 Fixed an issue where an environment asset could not be created if it included an environment variable with no value specified.
    RUN-25031 Fixed an issue in the Templates form where existing credentials in the environment variables section were not displayed.
    RUN-25303 Fixed an issue where submitting with the --attach flag was supported only in a workspace workload.
    RUN-24354 Fixed an issue where migrating workloads failed due to a slow network connection.
    RUN-25220 CLI v2: Changed --image flag from a required field to an optional one.
    RUN-25290 Fixed a security vulnerability in golang.org/x/net v0.33.0 with CVE-2024-45338 with severity HIGH.
    RUN-24688 Fixed an issue that blocked the Create Template submission due to a server error. This occurred when using the Copy & Edit Template form.
    diff --git a/v2.20/home/whats-new-2-13/index.html b/v2.20/home/whats-new-2-13/index.html

    Run:ai version 2.13

    Version 2.13.7

    Release date

    July 2023

    Release content

    • Added filters to the historic quota ratio widget on the Quota management dashboard.

    Fixed issues

    Internal ID Description
    RUN-11080 Fixed an issue in OpenShift environments where logging in via SSO with the kubeadmin user resulted in blank pages for every page.
    RUN-11119 Fixed an issue where values that should be in the Order of priority column appeared in the wrong column.
    RUN-11120 Fixed an issue where the Projects table did not show correct metrics when Run:ai version 2.13 was paired with a Run:ai 2.8 cluster.
    RUN-11121 Fixed an issue where the wrong over-quota memory alert was shown in the Quota management pane of the project edit form.
    RUN-11272 Fixed an issue in OpenShift environments where the selection in the cluster drop-down in the main UI did not match the cluster selected on the login page.

    Version 2.13.4

    Release date

    July 2023

    Fixed issues

    Internal ID Description
    RUN-11089 Fixed an issue where, when creating an environment, commands in the Runtime settings pane were not persistent and could not be found in other assets (for example, in a new Training).

    Version 2.13.1

    Release date

    July 2023

    Release content

    • Improved label handling so that occurrences of labels that are no longer in use are deleted.

    Fixed issues

    N/A

    Version 2.13.0

    Release content

    This version contains features and fixes from previous versions starting with 2.9. Refer to the prior versions for specific features and fixes.

    Projects

    • Improved the Projects UI for ease of use. The Projects UI follows upgrades and changes designed to make setting up components and assets easier for administrators and researchers. To configure a project, see Projects.

    Dashboards

    • Added a new dashboard for Quota management, which provides an efficient means to monitor and manage resource utilization within the AI cluster. The dashboard filters the display of resource quotas based on Departments, Projects, and Node pools. For more information, see Quota management dashboard.

    • Added to the Overview dashboard, the ability to filter the cluster by one or more node pools. For more information, see Node pools.

    Nodes and Node pools

    • The Run:ai scheduler supports two scheduling strategies: Bin Packing (default) and Spread. For more information, see Scheduling strategies. You can configure the scheduling strategy at the node pool level to improve support for clusters with mixed types of resources and workloads. For configuration information, see Creating new node pools.

    • GPU device-level DCGM metrics are collected per GPU and presented by Run:ai in the Nodes table. Each node contains a list of its embedded GPUs with their respective DCGM metrics. See DCGM Metrics for the list of metrics which are provided by NVIDIA DCGM and collected by Run:ai. Contact your Run:ai customer representative to enable this feature.

    • Added per node pool over-quota priority. Over-quota priority sets the relative amount of additional unused resources that an asset can get above its current quota. For more information, see Over-quota priority.
    • Added support for associating workspaces with node pools. The association between workspaces and node pools is done using the Compute resources section. To associate a compute resource with a node pool, in the Compute resource section, press More settings. Press Add new to add more node pools to the configuration. Drag and drop the node pools to set their priority.
    • Added Node pool selection as part of the workload submission form. This allows researchers to quickly determine the list of node pools available and their priority. Priority is set by dragging and dropping them in the desired order of priority. In addition, when the node pool priority list is locked by a policy, the list isn't editable by the Researcher even if the workspace is created from a template or copied from another workspace.

    Time limit duration

    • Improved the behavior of any workload time limit (for example, Idle time limit) so that the time limit will affect existing workloads that were created before the time limit was configured. This is an optional feature which provides help in handling situations where researchers leave sessions open even when they do not need to access the resources. For more information, see Limit duration of interactive training jobs.

    • Improved workspaces time limits. Workspaces that reach a time limit will now transition to a state of stopped so that they can be reactivated later.

    • Added time limits for training jobs per project. Administrators (Department Admin, Editor) can limit the duration of Run:ai Training jobs per Project using a specified time limit value. This capability can assist administrators to limit the duration and resources consumed over time by training jobs in specific projects. Each training job that reaches this duration will be terminated.

    Workload assets

    • Extended the collaboration functionality for any workload asset such as Environment, Compute resource, and some Data source types. These assets are now shared with Departments in the organization in addition to being shared with specific projects, or the entire cluster.
    • Added a search box for card galleries in any asset-based workload creation form to provide an easy way to search for assets and resources. To filter, use the asset name or one of the field values of the card.

    PVC data sources

    • Added support for PVC block storage in the New data source form. In the New data source form for a new PVC data source, in the Volume mode field, select from Filesystem or Block. For more information, see Create a PVC data source.
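
    For reference, the Volume mode option maps to the standard Kubernetes volumeMode field on a PersistentVolumeClaim. The following is a minimal sketch of a block-mode claim; the claim name, storage class, and size are placeholders.

        apiVersion: v1
        kind: PersistentVolumeClaim
        metadata:
          name: block-data                     # placeholder name
        spec:
          accessModes:
            - ReadWriteOnce
          volumeMode: Block                    # Filesystem is the Kubernetes default
          storageClassName: my-storage-class   # placeholder
          resources:
            requests:
              storage: 10Gi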

    Credentials

    • Added Docker registry to the Credentials menu. Users can create docker credentials for use in specific projects for image pulling. To configure credentials, see Configuring credentials.

    Policies

    • Improved policy support by adding DEFAULTS in the items section in the policy. The DEFAULTS section sets the default behavior for items declared in this section. For example, this can be used to limit the submission of workloads only to existing PVCs. For more information and an example, see Policies, Complex values.
    • Added support for making a PVC data source available to all projects. In the New data source form, when creating a new PVC data source, select All from the Project pane.

    Researcher API

    Integrations

    • Added support for Ray jobs. Ray is an open-source unified framework for scaling AI and Python applications. For more information, see Integrate Run:ai with Ray.

    • Added integration with Weights & Biases Sweep to allow data scientists to submit hyperparameter optimization workloads directly from the Run:ai UI. To configure sweep, see Sweep configuration.

    • Added support for XGBoost. XGBoost, which stands for Extreme Gradient Boosting, is a scalable, distributed gradient-boosted decision tree (GBDT) machine learning library. It provides parallel tree boosting and is the leading machine learning library for regression, classification, and ranking problems. For more information, see runai submit-dist xgboost.
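
    A hedged sketch of such a submission; the job name, image, and counts are illustrative placeholders, and the exact flags should be checked with runai submit-dist xgboost --help for your version.

        # job name, image, worker count, and GPU count are illustrative placeholders
        runai submit-dist xgboost dist-xgb1 \
          -i <image> \
          --workers 2 \
          -g 1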

    Compatibility

    Installation

    • The manual process of upgrading Kubernetes CRDs is no longer needed when upgrading to the most recent version (2.13) of Run:ai.
    • From Run:ai 2.12 and above, the control-plane installation has been simplified and no longer requires the creation of a backend values file. Instead, install directly using helm as described in Install the Run:ai Control Plane.
    • From Run:ai 2.12 and above, the air-gapped, control-plane installation now generates a custom-env.yaml values file during the preparation stage. This is used when installing the control-plane.

    Known issues

    Internal ID Description
    RUN-11005 Incorrect error messages when trying to run runai CLI commands in an OpenShift environment.
    RUN-11009 Incorrect error message when a user without permissions tries to delete another user.

    Fixed issues

    Internal ID Description
    RUN-9039 Fixed an issue where, in the new job screen, a job submitted after toggling off the preemptible flag still showed as preemptible.
    RUN-9323 Fixed an issue where a non-scalable error message appeared when scheduling hundreds of nodes was not successful.
    RUN-9324 Fixed an issue where the scheduler did not take the amount of storage into consideration, so there was no explanation that the PVC was not ready.
    RUN-9902 Fixed an issue in OpenShift environments where there were no metrics in the dashboard because Prometheus did not have permissions to monitor the runai namespace after an installation or upgrade to 2.9.
    RUN-9920 Fixed an issue where the canEdit key in a policy was not validated properly for itemized fields when configuring an interactive policy.
    RUN-10052 Fixed an issue where loading a new job from a template gave an error until changes were made on the form.
    RUN-10053 Fixed an issue where the Node pool column was unsearchable in the job list.
    RUN-10422 Fixed an issue where node details showed running workloads that were actually finished (successfully/failed/etc.).
    RUN-10500 Fixed an issue where jobs were shown as running even though they did not exist in the cluster.
    RUN-10813 Fixed an issue in adding a data source where the path was case sensitive and did not allow uppercase characters.
    diff --git a/v2.20/home/whats-new-2-15/index.html b/v2.20/home/whats-new-2-15/index.html

    Version 2.15

    Release Content

    Researcher

    Jobs, Workloads, Trainings, and Workspaces

    • Added support to run distributed workloads via the training view in the UI. You can configure distributed training on the following:

      • Trainings form
      • Environments form

      You can select single or multi-node (distributed) training. When configuring distributed training, you will need to select a framework from the list. Supported frameworks now include:

      • PyTorch
      • Tensorflow
      • XGBoost
      • MPI

      For Trainings configuration, see Adding trainings. See your Run:ai representative to enable this feature. For Environments configuration, see Creating an Environment.

    • Preview the new Workloads view. Workloads is a new view for jobs that are running in the AI cluster. The Workloads view provides a more advanced UI than the previous Jobs UI. The new table format provides:

      • Improved views of the data
      • Improved filters and search
      • More information

      Use the toggle at the top of the Jobs page to switch to the Workloads view. For more information, see the Workloads view documentation.

    • Improved support for Kubeflow Notebooks. Run:ai now supports the scheduling of Kubeflow notebooks with fractional GPUs. Kubeflow notebooks are identified automatically and appear with a dedicated icon in the Jobs UI.

    • Improved the Trainings and Workspaces forms. Now the runtime field for Command and Arguments can be edited directly in the new Workspace or Training creation form.
    • Added new functionality to the Run:ai CLI that allows submitting a workload with multiple service types at the same time in a CSV style format. Both the CLI and the UI now offer the same functionality. For more information, see runai submit.
    • Improved functionality in the runai submit command so that the port for the container is specified using the nodeport flag. For more information, see runai submit --service-type nodeport.
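
    To illustrate, a minimal sketch using the sample NGINX image from the Example Code table in this documentation set; the job name and node port value are placeholders.

        # expose container port 80 on node port 30500 (values are illustrative)
        runai submit nginx-test \
          -i zembutsu/docker-sample-nginx \
          --interactive \
          --service-type nodeport \
          --port 30500:80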

    Credentials

    • Improved Credentials creation. A Run:ai scope can now be added to credentials. For more information, see Credentials.

    Environments

    • Added support for workload types when creating a new or editing existing environments. Select from single-node or multi-node (distributed) workloads. The environment is available only on feature forms which are relevant to the workload type selected.

    Volumes and Storage

    • Added support for Ephemeral volumes in Workspaces. Ephemeral storage is temporary storage that gets wiped out and lost when the workspace is deleted. Adding Ephemeral storage to a workspace ties that storage to the lifecycle of the Workspace to which it was added. Ephemeral storage is added to the Workspace configuration form in the Volume pane. For configuration information, see Create a new workspace.

    Templates

    • Added support for a Run:ai Scope in the template form. For configuration information, see Creating templates.

    Deployments

    • Improvements in the New Deployment form include:
      • Support for Tolerations. Tolerations guide the system as to which node each pod can be scheduled on or evicted from, by matching between toleration rules and taints defined for each Kubernetes node.
      • Support for Multi-Process Service (MPS). MPS is a service which allows the running of parallel processes on the same GPU, which are all run by the same userid. To enable MPS support, use the toggle switch on the Deployments form.

      Note

      If you do not use the same userid, the processes will run in serial and could possibly degrade performance.

    Auto Delete Jobs

    • Added new functionality to the UI and CLI that provides configuration options to automatically delete jobs after a specified amount of time upon completion. Auto-deletion provides more efficient use of resources and makes it easier for researchers to manage their jobs. For more configuration options in the UI, see Auto deletion (Step 9) in Create a new workspace. For more information on the CLI flag, see --auto-deletion-time-after-completion.
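
    A hedged sketch of the CLI flag in use; the duration format shown is an assumption, so consult the flag reference for the accepted values.

        # delete the job and its resources roughly one hour after it completes (value format is an assumption)
        runai submit train1 -i runai.jfrog.io/demo/quickstart -g 1 \
          --auto-deletion-time-after-completion 1h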

    Run:ai Administrator

    Authorization

    • Run:ai has now revised and updated the Role Based Access Control (RBAC) mechanism, expanding the scope of Kubernetes. Using the new RBAC mechanism makes it easier for administrators to manage access policies across multiple clusters and to define specific access rules over specific scopes for specific users and groups. Along with the revised RBAC mechanism, new user interface views are introduced to support the management of users, groups, and access rules. For more information, see Role based access control.

    Policies

    • During Workspaces and Training creation, assets that do not comply with policies cannot be selected. These assets are greyed out and have a button on the cards when the item does not comply with a configured policy. The button displays information about which policies are non-compliant.
    • Added configuration options to Policies in order to prevent the submission of workloads that use data sources of type host path. This prevents data from being stored on the node, so that data is not lost when a node is deleted. For configuration information, see Prevent Data Storage on the Node.
    • Improved flexibility when creating policies which provide the ability to allocate a min and a max value for CPU and GPU memory. For configuration information, see GPU and CPU memory limits in Configuring policies.

    Nodes and Node Pools

    • Node pools are now enabled by default. There is no need to enable the feature in the settings.

    Quotas and Over-Quota

    • Improved control over how over-quota is managed by adding the ability to block over-subscription of the quota in Projects or Departments. For more information, see Limit Over-Quota.
    • Improved the scheduler fairness for departments using the over quota priority switch (in Settings). When the feature flag is disabled, over-quota weights are equal to the deserved quota and any excess resources are divided in the same proportion as the in-quota resources. For more information, see Over Quota Priority.
    • Added new functionality to always guarantee in-quota workloads at the expense of inter-Department fairness. Large distributed workloads from one department may preempt in-quota smaller workloads from another department. This new setting in the RunaiConfig file preserves in-quota workloads, even if the department quota or over-quota-fairness is not preserved. For more information, see Scheduler Fairness.

    Control and Visibility

    Dashboards

    • To ease the management of CPU and cluster resources for AI, a new CPU-focused dashboard was added for CPU-based environments. The dashboards display specific information for CPU-based nodes, node pools, clusters, or tenants. These dashboards also include additional metrics that are specific to CPU-based environments. This helps optimize visual information by eliminating views of empty GPU dashlets. For more information see CPU Dashboard.
    • Improved the Consumption report interface by moving the Cost settings to the General settings menu.
    • Added table to the Consumption dashboard that displays the consumption and cost per department. For more information, see Consumption dashboard.

    Nodes

    • Improved the readability of the Nodes table to include more detailed statuses and descriptions. The added information in the table makes it easier to inspect issues that may impact resource availability in the cluster. For more information, see Node and Node Pool Status.

    UI Enhancements

    • Added the ability to download a CSV file from any page that contains a table. Downloading a CSV provides a snapshot of the page's history over time, and helps with compliance tracking. All the columns that are selected (displayed) in the table are downloaded to the file.

    Installation and Configuration

    Cluster Installation and configuration

    • New cluster wizard for adding and installing new clusters to your system.

    OpenShift Support

    • Added support for restricted policy for Pod Security Admission (PSA) on OpenShift only. For more information, see Pod security admission.
    • Added the ability, in OpenShift environments, to configure cluster routes created by Run:ai instead of using the OpenShift certificate. For more information, see the table entry Dedicated certificate for the researcher service route.
    diff --git a/v2.20/home/whats-new-2-16/index.html b/v2.20/home/whats-new-2-16/index.html
    diff --git a/v2.20/home/whats-new-2-17/index.html b/v2.20/home/whats-new-2-17/index.html
    diff --git a/v2.20/home/whats-new-2-18/index.html b/v2.20/home/whats-new-2-18/index.html

    Version 2.18

    Release Content - June 30, 2024

    Researcher

    Jobs, Workloads, and Workspaces

    • Added backoff limit functionality for Training and Workspace workloads to the UI. The backoff limit is the maximum number of retry attempts for failed workloads. After reaching the limit, the workload's status changes to Failed. The UI displays the default number of retries based on 6 attempts for each pod in the workload (for example, 6 pods = 36 attempts).

    • Updated the Auto-deletion time default value from never to 30 days. The Auto-deletion time count starts when a Run:ai workload reaches a completed or failed status, after which the workload (including its logs) is automatically deleted. This change only affects new or cloned workloads.

    • Added new Data sources of type Secret to workload form. Data sources of type Secret are used to hide 3rd party access credentials when submitting workloads. For more information, see Submitting Workloads.

    • Added new graphs for Inference workloads. The new graphs provide more information for Inference workloads to help analyze performance of the workloads. New graphs include Latency, Throughput, and number of replicas. For more information, see Workloads View. (Requires minimum cluster version v2.18).

    • Added latency metric for autoscaling. This feature allows automatic scale-up/down the number of replicas of a Run:ai inference workload based on the threshold set by the ML Engineer. This ensures that response time is kept under the target SLA. (Requires minimum cluster version v2.18).

    • Improved autoscaling for inference models by removing the ChatBot UI from model images. By moving the ChatBot UI to predefined Environments, autoscaling is more accurate because it takes into account all types of requests (API and ChatBot UI). A ChatBot UI environment preset provided by Run:ai allows AI practitioners to easily connect it to workloads.

    • Added more precision to trigger auto-scaling to zero. Now users can configure a precise consecutive idle threshold custom setting to trigger Run:ai inference workloads to scale-to-zero. (Requires minimum cluster version v2.18).

    • Added Hugging Face catalog integration of community models. Run:ai has added Hugging Face integration directly to the inference workload form, providing the ability to select models (vLLM models) from Hugging Face. This allows organizations to quickly experiment with the latest open source community language models. For more information on how Hugging Face is integrated, see Hugging Face.

    • Improved access permissions to external tools. This improvement now allows more granular control over which personas can access external tools (external URLs) such as Jupyter Notebooks, Chatbot UI, and others. For configuration information, see Submitting workloads. (Requires minimum cluster version v2.18).

    • Added a new API for submitting Run:ai inference workloads. This API allows users to easily submit inference workloads. This new API provides a consistent user experience for workload submission which maintains data integrity across all the user interfaces in the Run:ai platform. (Requires minimum cluster version v2.18).
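
    As a rough, non-authoritative sketch of what a call to such an API might look like: the endpoint path, header, and body fields below are assumptions and should be verified against the Run:ai API reference.

        # endpoint path and body fields are assumptions
        curl -s -X POST "https://<company-name>.run.ai/api/v1/workloads/inferences" \
          -H "Authorization: Bearer $RUNAI_API_TOKEN" \
          -H "Content-Type: application/json" \
          -d '{"name": "my-inference", "projectId": "<project-id>", "clusterId": "<cluster-id>", "spec": {"image": "<image>"}}'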

    Command Line Interface V2

    • Added an improved, researcher-focused Command Line Interface (CLI). The improved CLI brings usability enhancements for the Researcher which include:

      • Support multiple clusters
      • Self-upgrade
      • Interactive mode
      • Align CLI to be data consistent with UI and API
      • Improved usability and performance

      This is an early access feature available for customers to use; however, be aware that there may be functional gaps versus the older, V1 CLI. For more information about installing and using the V2 CLI, see CLI V2. (Requires minimum cluster version v2.18).

    GPU memory swap

    • Added new GPU-to-CPU memory swap. To ensure efficient usage of an organization’s resources, Run:ai provides multiple features on multiple layers to help administrators and practitioners maximize their existing GPU resource utilization. Run:ai’s GPU memory swap feature helps administrators and AI practitioners further increase the utilization of existing GPU hardware by improving GPU sharing between AI initiatives and stakeholders. This is done by extending the GPU physical memory to the CPU memory, which is typically an order of magnitude larger than that of the GPU. For more information, see GPU Memory Swap. (Requires minimum cluster version v2.18).

    YAML Workload Reference table

    • Added a new YAML reference document that contains the value types and workload YAML references. Each table contains the field name, its description, and the supported Run:ai workload types. The YAML field details contain information on the value type and currently available example workload snippets. For more information, see the YAML Reference PDF.
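
    For orientation only, the following is a rough sketch of the kind of workload YAML such a reference describes. The apiVersion, kind, and field layout here are assumptions and should be taken from the YAML reference itself rather than from this sketch.

        apiVersion: run.ai/v2alpha1            # assumption; confirm against the YAML reference
        kind: TrainingWorkload                 # assumption
        metadata:
          name: train1
          namespace: runai-team-a              # assumes the runai-<project> namespace convention
        spec:
          image:
            value: runai.jfrog.io/demo/quickstart
          gpu:
            value: "1"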

    Email Notifications - Workload Status and timeouts

    • Added new Email notification system. AI Practitioners can setup the types of workload notifications they want to receive. In order to receive email notifications, you must ensure that the admin has enabled and configured notifications for the tenant. For more information, see Email notifications.

    Assets

    • Improved the UI asset creation form by adding a Description field. Asset creators can now add a free-text description (max 250 characters) to any asset they create. The description field is intended to help explain the nature and goal of the asset, so that AI practitioners can make better decisions when choosing assets during workload creation.

    Run:ai Administrator

    Data Sources

    • Added Data Volumes new feature. Data Volumes are snapshots of datasets stored in Kubernetes Persistent Volume Claims (PVCs). They act as a central repository for training data, and offer several key benefits.

      • Managed with dedicated permissions—Data Admins, a new role within Run:ai, have exclusive control over data volume creation, data population, and sharing.
      • Shared between multiple scopes—unlike other Run:ai data sources, data volumes can be shared across projects, departments, or clusters. This promotes data reuse and collaboration within your organization.
      • Coupled to workloads in the submission process—similar to other Run:ai data sources, Data volumes can be easily attached to AI workloads during submission, specifying the data path within the workload environment.

      For more information, see Data Volumes. (Requires minimum cluster version v2.18).

    • Added a new data source of type Secret. Run:ai now allows you to configure a Credential as a data source. A data source of type Secret is best used in workloads so that access credentials for 3rd party interfaces and storage used in containers remain hidden. For more information, see Secrets as a data source.

    • Updated the logic of the data source initializing state, which keeps the workload in “initializing” status until S3 data is fully mapped. For more information, see the Sidecar containers documentation.

    • Additional storage unit sizes MiB, GiB, and TiB (mebibyte, gibibyte, and tebibyte, respectively) were added to the UI and API when creating a new data source of type PVC.

    Credentials

    • Added a new Generic secret to Credentials. Credentials were previously used only for access to data sources (S3, Git, etc.). However, AI practitioners need to use secrets to access sensitive data (interacting with 3rd party APIs or other services) without having to put their credentials in their source code. Generic secrets leverage multiple key-value pairs, which helps reduce the number of Kubernetes resources and simplifies resource management by reducing the overhead associated with maintaining multiple Secrets. Generic secrets are best used as a data source of type Secret so that they can be used in containers to keep access credentials hidden. (Requires minimum cluster version v2.18).

    Single Sign On

    • Added support for Single Sign On using OpenShift v4 (OIDC based). When using OpenShift, you must first define an OAuthClient, which interacts with OpenShift's OAuth server to authenticate users and request access tokens. For more information, see Single Sign-On.

    • Added OIDC scopes to authentication requests. OIDC scopes specify what access privileges are being requested for access tokens. The scopes associated with the access tokens determine what resources are available when they are used to access OAuth 2.0 protected endpoints. Protected endpoints may perform different actions and return different information based on the scope values and other parameters used when requesting the presented access token. For more information, see UI configuration.

    Ownership protection

    • Added new ownership protection feature. Run:ai Ownership Protection ensures that only authorized users can delete or modify workloads. This feature is designed to safeguard important jobs and configurations from accidental or unauthorized modifications by users who did not originally create the workload. For configuration information, see your Run:ai representative.

    Email notifications

    • Added a new email notifications feature. Email notifications send alerts for critical workload life cycle changes, empowering data scientists to take necessary actions and prevent delays.

      • System administrators will need to configure the email notifications. For more information, see System notifications.

    Policy for distributed and inference workloads in the API

    • Added a new API for creating distributed training workload policies and inference workload policies. These new policies in the API allow administrators to set defaults, enforce rules, and impose setup on distributed training and inference workloads. For distributed policies, worker and master may require different rules due to their different specifications. The new capability is currently available via API only. Documentation on submitting policies will follow shortly.

    Deprecation Notifications

    The existing notifications feature, which requires cluster configuration, is being deprecated in favor of an improved notification system. If you have been using the existing notifications feature in the cluster, you can continue to use it for the next two versions. It is recommended that you change to the new notification system in the Control Plane for better control and improved message granularity.

    Feature deprecations

    Deprecated features will be available for two versions ahead of the notification. For questions, see your Run:ai representative.

    API support and endpoint deprecations

    The endpoints and parameters specified in the API reference are the ones that are officially supported by Run:ai. For more information about Run:ai's API support policy and deprecation process, see note under Developer overview.

    Deprecated APIs and API fields

    Cluster API Deprecation

    The Run:ai REST API now supports job submission. The older Cluster API is now deprecated. The tables below list the deprecated endpoints and their replacements; a request sketch using the replacement endpoints follows the tables.

    Departments API

      Deprecated: /v1/k8s/clusters/{clusterId}/departments
      Replacement: /api/v1/org-unit/departments

      Deprecated: /v1/k8s/clusters/{clusterId}/departments/{department-id}
      Replacement: /api/v1/org-unit/departments/{departmentId}

      Deprecated: /v1/k8s/clusters/{clusterId}/departments/{department-id}
      Replacement: /api/v1/org-unit/departments/{departmentId} + PUT/PATCH /api/v1/org-unit/departments/{departmentId}/resources

    Projects API

      Deprecated: /v1/k8s/clusters/{clusterId}/projects
      Replacement: /api/v1/org-unit/projects

      Deprecated: /v1/k8s/clusters/{clusterId}/projects/{id}
      Replacement: /api/v1/org-unit/projects/{projectId}

      Deprecated: /v1/k8s/clusters/{clusterId}/projects/{id}
      Replacement: /api/v1/org-unit/projects/{projectId} + /api/v1/org-unit/projects/{projectId}/resources
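    The following is a minimal sketch of calling the replacement org-unit endpoints listed above with Python's requests library. The endpoint paths are taken from the tables; the authentication header, query parameters, and response handling are illustrative assumptions, so check the API reference for details.

      # Minimal sketch: list departments and projects via the replacement org-unit endpoints.
      import requests

      BASE_URL = "https://<company>.run.ai"          # placeholder tenant URL
      HEADERS = {"Authorization": "Bearer <token>"}  # placeholder API token

      departments = requests.get(f"{BASE_URL}/api/v1/org-unit/departments", headers=HEADERS)
      projects = requests.get(f"{BASE_URL}/api/v1/org-unit/projects", headers=HEADERS)

      departments.raise_for_status()
      projects.raise_for_status()
      print(departments.json())
      print(projects.json())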

    Run:ai does not recommend using API endpoints and fields marked as deprecated and will not add functionality to them. Once an API endpoint or field is marked as deprecated, Run:ai will stop supporting it after 2 major releases for self-hosted deployments, and after 6 months for SaaS deployments.

    For a full explanation of the API Deprecation policy, see the Run:ai API Policy.

    Breaking changes

    Breaking changes notifications allow you to plan around potential changes that may interfere with your current workflow when interfacing with the Run:ai Platform.

    diff --git a/v2.20/home/whats-new-2-19/index.html b/v2.20/home/whats-new-2-19/index.html
    index 79e1867fda..f765c23254 100644

    Version 2.19 - Run:ai Documentation Library

    What’s New in Version 2.19

    Release Content

    Researchers

    Improved visibility into pending workloads

    For workloads with the status of "Pending," the user can click the “i” icon next to the status to view details of why the workload hasn’t been scheduled. (Requires a minimum cluster version of v2.19)

    New workload events

    There are now new GPU resource optimization-related messages that are viewable as workload events. These events help users understand the decisions made by the Run:ai GPU toolkit while handling Run:ai’s GPU resource optimization features.
    Run:ai’s GPU resource optimization offers unique capabilities that take GPU utilization to a new level and helps customers increase their productivity while maximizing their return on GPU investment. (Requires a minimum cluster version of v2.19)

    Improved command line interface autocompletion

    CLI V2 now autocompletes nouns such as project names and workload names for better data consistency with the UI, auto-upgrades, and interactive mode.

    Details pane in the Workloads view

    A new DETAILS tab for workloads has been added and presents additional workload information, including Container command, Environment variables, and CLI command syntax (if the workload was submitted via CLI).

    Container path outside the data source asset

    AI practitioners can now override the predefined container path for each data source when submitting a workload via the Run:ai UI. While the container path must still be specified as part of the data source asset, researchers can now override the default container path when submitting workloads. (Requires a minimum cluster version of v2.16)

    Node toleration for workloads

    Researchers can now optionally set tolerations for workloads, letting them bypass node taints during workload submission via the Run:ai UI.
    To use this feature, make sure it is activated under General Settings.
    For more information, refer to the Kubernetes Taints and Tolerations Guide. (Requires a minimum cluster version of v2.19)

    Topology-aware scheduling

    When submitting a distributed training workload through the Run:ai UI, researchers can enable topology-aware scheduling. This feature allows an optimized placement within specific placement groups, such as regions, availability zones, or other topologies. To use this, make sure it is activated under General Settings. (Requires a minimum cluster version of v2.19)

    Bulk deletion of workloads

    Users can now delete workloads in bulk via the Run:ai UI. They’ll be notified if they try to delete workloads for which they don’t have permissions (and those workloads will not be deleted in this process). Multi-selection can also be done using standard keyboard functions. (Requires a minimum cluster version of v2.19)

    Enhanced policy representation in the Run:ai UI

    To improve AI practitioners' understanding of administrators’ policy rules and defaults, the UI now presents more clearly the enforcement and default values for workload fields that are not encapsulated in the asset selection. This update aims to make policy enforcement more intuitive and transparent for practitioners. (Requires a minimum cluster version of v2.18)

    Configuration of credentials as environment variables

    Researchers can now easily define pre-configured credentials as environment variables to access private resources. This is available through the Run:ai UI during the workload submission process, specifically under the runtime settings section. (Requires a minimum cluster version of v2.18)

    Expanded scope of ConfigMap as data source

    When creating a data source of type ConfigMap, researchers can now not only select a project but also a cluster or department. (Requires a minimum cluster version of v2.19)

    Improved workload scheduling algorithm

    The Run:ai scheduler algorithm has been improved and is now more efficient, resulting in better handling of large distributed workloads and better overall performance. (Requires a minimum cluster version of v2.19)

    ML Engineer (Inference)

    Additional data sources for inference workloads

    When submitting an inference workload via the UI and API, users can now use NFS and hostPath data sources. (Requires a minimum cluster version of v2.19)

    Hugging Face integration improvements

    To reduce errors when submitting inference workloads, additional validations are done for the Hugging Face integration, ensuring that only valid models are submitted, thus enhancing overall reliability. (Requires a minimum cluster version of v2.19)

    Rolling inference updates

    ML engineers can now roll updates onto existing inference workloads. Once the revised workload (the update) is up and running, request traffic is redirected to the new version of the workload and the previous version is terminated, ensuring that services are not impacted during the update.

    See Inference overview for more information. (Requires a minimum cluster version of v2.19)

    Inference endpoint authorization

    When sharing inference endpoints securely using Run:ai, ML engineers can limit access to the endpoint by specifying the authorized users or groups allowed to use the service (i.e., send requests to the endpoint) after being authenticated. This restriction is especially important when handling sensitive information or when you want to manage costs by sharing the service with a controlled group of consumers. (Requires a minimum cluster version of v2.19)

    Run:ai Developer

    Metrics and telemetry

    Additional metrics and telemetry are available via the API. For more information, see the details below and in Metrics API; a request sketch follows the list below:

    • Metrics (over time)
      • Cluster
        • TOTAL_GPU_NODES
        • GPU_UTILIZATION_DISTRIBUTION
        • UNALLOCATED_GPU
      • Nodepool
        • TOTAL_GPU_NODES
        • GPU_UTILIZATION_DISTRIBUTION
        • UNALLOCATED_GPU
      • Workload
        • GPU_ALLOCATION
      • Node
        • GPU_UTILIZATION_PER_GPU
        • GPU_MEMORY_UTILIZATION_PER_GPU
        • GPU_MEMORY_USAGE_BYTES_PER_GPU
        • CPU_USAGE_CORES
        • CPU_UTILIZATION
        • CPU_MEMORY_USAGE_BYTES
        • CPU_MEMORY_UTILIZATION
    • Telemetry (current time)
      • Node
        • ALLOCATED_GPUS
        • TOTAL_CPU_CORES
        • USED_CPU_CORES
        • ALLOCATED_CPU_CORES
        • TOTAL_GPU_MEMORY_BYTES
        • USED_GPU_MEMORY_BYTES
        • TOTAL_CPU_MEMORY_BYTES
        • USED_CPU_MEMORY_BYTES
        • ALLOCATED_CPU_MEMORY_BYTES
        • IDLE_ALLOCATED_GPUS
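    Below is a minimal sketch of querying one of the metrics listed above. The URL path and query parameter names are assumptions made for illustration; refer to the Metrics API documentation for the exact request format.

      # Minimal sketch: fetch a cluster metric over a time range via the Metrics API.
      # The path and query parameters below are assumed for illustration only.
      import requests

      BASE_URL = "https://<company>.run.ai"          # placeholder tenant URL
      HEADERS = {"Authorization": "Bearer <token>"}  # placeholder API token
      CLUSTER_UUID = "<cluster-uuid>"

      params = {
          "metricType": "TOTAL_GPU_NODES",           # one of the metric names listed above
          "start": "2024-10-01T00:00:00Z",           # assumed timestamp format
          "end": "2024-10-02T00:00:00Z",
      }
      response = requests.get(
          f"{BASE_URL}/api/v1/clusters/{CLUSTER_UUID}/metrics",  # assumed endpoint path
          headers=HEADERS,
          params=params,
      )
      response.raise_for_status()
      print(response.json())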

    Administrator

    Pagination in user API

    Pagination has been added, removing the limitation to the number of users listed in the Run:ai UI.

    Audit log

    The audit log has been updated, so system admins can view audit logs directly in the Run:ai UI and download them in CSV or JSON formats, providing flexible options for data analysis and compliance reporting. Version 2.19 reintroduces a fully functional audit log (event history), ensuring comprehensive tracking across projects, departments, access rules, and more. In the new version, all entities are logged except logins and workloads.
    For more information, see Audit logs.

    Platform Administrator

    Department scheduling rules

    Scheduling rules have been added at the department level. For more information, see scheduling rules.

    Department node pool priority

    Node pool priority has been added at the department level. For more information, see node pools.

    Department and project grids

    There is now improved filtering and sorting in the Projects and Departments views, including a multi-cluster view and new filters.

    Overview dashboard

    “Idle allocated GPU devices” has been added to the Overview dashboard.

    Workload policy for distributed training workloads in the Run:ai UI

    Distributed workload policies can now be created via the Run:ai UI. Admins can set defaults, enforce rules, and impose setup on distributed training through the UI YAML, as well as view the distributed policies (both in the policy grid and while submitting workloads). For distributed policies, workers and leaders may require different rules due to their different specifications. (Requires a minimum cluster version of v2.18)

    Reconciliation of policy rules

    A reconciliation mechanism for policy rules has been added to enhance flexibility in the policy submission process. Previously, if a specific field was governed by a policy for a certain hierarchy, other organizational units couldn’t submit a policy with rules that regarded this specific field. Now, new policies for hierarchies that mention an existing policy field will no longer be blocked.
    The effective rules are selected based on the following logic:
    1. For the compute and security sections in the workload spec of the Run:ai API, the highest hierarchy is chosen for the effective policy (tenant > cluster > department > project).
    2. For any other fields in the policy, the lowest hierarchy closest to the actual workload becomes effective for the policy (similar to policy defaults).
    Additionally, while viewing the effective policy, each rule displays the origin policy it came from, allowing users to clearly understand the selected hierarchy of the effective policy. (Requires a minimum cluster version of v2.18)

    Infrastructure Administrator

    Support for COS over GKE

    With Run:ai version 2.19, the Run:ai cluster on Google Kubernetes Engine (GKE) supports Container-Optimized OS (COS) when NVIDIA GPU Operator 24.6 or newer is installed. This is in addition to the already supported Ubuntu on GKE.

    Run:ai and Karpenter

    Run:ai now supports working with Karpenter. Karpenter is an open-source Kubernetes cluster auto-scaler built for cloud deployments. Karpenter optimizes the cloud cost of a customer’s cluster by moving workloads between different node types, bin-packing nodes, using lower-cost nodes where possible, scaling up new nodes on demand, and shutting down unused nodes with the goal of optimizing and reducing costs. (Requires a minimum cluster version of v2.19)

    Please read the documentation for more information on Run:ai and Karpenter integration considerations.

    Control and Visibility (UI changes)

    New Run:ai UI navigation

    The platform navigation has been updated to offer a more modern design, easier navigation, and address all personas interacting with the UI.

    The left-side menu now has seven categories, each with its own reorganized sub-options that appear in the pane next to the menu options.

    If you close the sub-options pane, you can hover over the categories, and the sub-options float and can be used in the same way.

    The options presented in the menu and categories continue to match each user’s permissions, as in the legacy navigation.

    Below is the full list of menu and sub-options and changes:

    Analytics
    Displays the Run:ai dashboards, allowing the different users to analyze, plan, and improve system performance and AI workload execution.
    This category contains the following options:

    • Overview
    • Quota management
    • Analytics
    • Consumption
    • Multi-cluster overview

    Workload manager
    Enables AI practitioners to develop models, train them, and deploy them into production. All supported tools and capabilities can be found here. This category contains the following options:

    • Workloads
    • Deleted workloads (now separated from current workloads. If not visible, it can be activated from Settings -> Workloads -> Deleted workloads)
    • Templates
    • Assets (these options are visible via a collapsible menu)
      • Models
      • Environments
      • Compute resources
      • Data sources
      • Credentials

    Resources
    Enables viewing and managing all cluster resources. In the new navigation, nodes and node pools have been split into different grids.
    This category contains the following options:

    • Clusters
    • Node pools (separated from the Nodes page to its own page)
    • Nodes

    Organization
    Maps system organizations to ensure that resource allocation and policies align with the organizational structure, business projects, and priorities.
    This category contains the following options:

    • Departments
    • Projects

    Access
    Makes it possible to authorize the different system users to perform actions in alignment with their role and scope of projects within the organization.
    This was moved from the legacy menu where it appeared in the header of the screen under Tools and Settings.
    This category contains the following options:

    • Users
    • Applications
    • Roles (separated from the Access rules and roles page to its own page)
    • Access rules (separated from the Access rules and roles page to its own page)

    Policies
    Presents the tools to enforce controls over the AI infrastructure, enabling different users to be effective while working in alignment with organizational policies.
    This category contains the following options:

    • Workload policies

    Admin
    Presents all administrator functions of the Run:ai platform.
    This was moved from the legacy menu where it appeared in the header of the screen under Tools and Settings.
    This category contains the following options:

    • General settings (previously General)
    • Event history

    For users with more than one cluster, in the legacy version the cluster selection appeared in the header of the page. In the new navigation, the cluster selection is part of the grid and changes only affect the items on that page.

    If a user prefers not to use the new UI navigation, there is an option to switch back to the legacy navigation by clicking the Back to legacy navigation option.

    Installation and configuration

    • Tenant logos can now be uploaded to the Run:ai UI via API. The logo should be in base64 format and should not be white to avoid blending into the background. The logo should be no more than 20px tall. See Upload logo for tenant API; a brief encoding sketch follows this list.
    • Run:ai now supports NVIDIA GPU Operator version 24.6
    • Run:ai now supports Kubernetes version 1.31
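    The following is a minimal sketch of preparing a logo as a base64 string before sending it to the upload API. The endpoint path and request field name are hypothetical placeholders; see the Upload logo for tenant API documentation for the real request format.

      # Minimal sketch: encode a logo file as base64 and send it to the tenant logo upload API.
      # The endpoint path and JSON field name are hypothetical; only the base64 requirement comes from above.
      import base64
      import requests

      with open("logo.png", "rb") as f:
          logo_b64 = base64.b64encode(f.read()).decode("ascii")

      response = requests.post(
          "https://<company>.run.ai/<upload-logo-endpoint>",   # hypothetical path
          json={"logo": logo_b64},                             # assumed field name
          headers={"Authorization": "Bearer <token>"},
      )
      response.raise_for_status()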

    Deprecation notifications

    Feature deprecations

    Legacy Jobs view

    The legacy Jobs view will be fully deprecated in the Q1/25 release. We recommend that all users adopt the Workloads view, which offers all the capabilities of the legacy Jobs view with additional enhancements.
    SaaS customers will gradually be transitioned to the Workloads view during Q4/24.

    Note

    Users can still submit workloads via the legacy Jobs submission form.

    Dynamic MIG deprecation

    The Dynamic MIG deprecation process starts with Run:ai v2.19 (Q4/24 release):

    • The feature is still available and MIG Profile APIs still function but are marked as Deprecated. See the table below for more details.
    • In Q1/25 release, ‘Dynamic MIG’ will not be usable anymore but the APIs will still be accessible.
    • In Q2/25 all ‘Dynamic MIG’ APIs will be fully deprecated.

    Legacy navigation - Run:ai UI

    The legacy navigation will be fully deprecated in the Q1/25 release, and during Q1/25 for SaaS customers.

    API support and endpoint deprecations

    Deprecated: /v1/k8s/audit
    Replacement: /api/v1/audit/log

    Deprecated migProfile fields (no replacement):
      /api/v1/asset/compute/spec/migProfile
      /api/v1/workloads/spec/compute/migProfile
      /api/v1/workloads/workspaces/spec/compute/migProfile
      /api/v1/workloads/Trainings/spec/compute/migProfile
      /api/v1/workloads/Inferences/spec/compute/migProfile
      /api/v1/workloads/distributed/spec/compute/migProfile
      /api/v1/workloads/distributed/masterSpec/compute/migProfile

    Run:ai does not recommend using API endpoints and fields marked as deprecated and will not add functionality to them. Once an API endpoint or field is marked as deprecated, Run:ai will stop supporting it after 2 major releases for self-hosted deployments, and after 6 months for SaaS deployments.

    For a full explanation of the API Deprecation policy, see the Run:ai API Policy.

    Documentation enhancements

    Workload policy documentation

    A comprehensive set of articles detailing the usage and the process of submitting new workload policies has been introduced. It covers the structure, syntax, best practices, and examples for configuring policy YAML files. The new documentation includes step-by-step explanations of how to create a new rule in a policy, together with information on the different value types, rule types, and policy spec sections. For more information, refer to the Policies section.
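    To give a feel for the shape such a policy document might take, here is an illustrative-only sketch expressed as a Python dict and dumped as YAML. The section names and rule fields (defaults, rules, min, max) are assumptions and may not match the actual schema; the Policies documentation is the authoritative reference.

      # Illustrative-only sketch of a possible policy structure; all field names are assumed.
      # Requires PyYAML (pip install pyyaml).
      import yaml

      policy = {
          "defaults": {
              "compute": {"gpuDevicesRequest": 1},           # assumed default field
          },
          "rules": {
              "compute": {
                  "gpuDevicesRequest": {"min": 1, "max": 4}  # assumed rule fields
              },
          },
      }

      print(yaml.safe_dump(policy, sort_keys=False))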

    diff --git a/v2.20/home/whats-new-2-20/index.html b/v2.20/home/whats-new-2-20/index.html
    index c45b3b14b0..dab1c38f6d 100644

    Version 2.20 - Run:ai Documentation Library

    What’s New in Version 2.20

    Release Content

    The Run:ai v2.20 What's New provides a detailed summary of the latest features, enhancements, and updates introduced in this version. It serves as a guide to help users, administrators, and researchers understand the new capabilities and how to leverage them for improved workload management, resource optimization, and more.

    Important

    For a complete list of deprecations, see Deprecation notifications. Deprecated features and capabilities will be available for two versions ahead of the notification.

    Researchers

    Workloads - Workspaces and Training

    • Stop/run actions for distributed workloads - You can now stop and run distributed workloads from the UI, CLI, and API. Scheduling rules for training workloads also apply to distributed workloads. This enhances control over distributed workloads, enabling greater flexibility and resource management. (From cluster v2.20 onward)

    • Visibility into idle GPU devices - Idle GPU devices are now displayed in the UI and API showing the number of allocated GPU devices that have been idle for more than 5 minutes. This provides better visibility into resource utilization, enabling more efficient workload management.

    • Configurable workload completion with multiple runs - You can now define the number of runs a training workload must complete to be considered finished directly in the UI, API, and CLI v2. Running training workloads multiple times improves the reliability and validity of training results. Additionally, you can configure how many runs can be scheduled in parallel, helping to significantly reduce training time and simplifying the process of managing jobs that require multiple runs. See Train models using a standard training workload for more details. (From cluster v2.20 onward)

    • Configurable grace period for workload preemption - You can now set a grace period in the UI, API, and CLI v2, providing a buffer time for preempted workloads to reach a safe checkpoint before being forcibly preempted for standard and distributed training workloads. The grace period can be configured between 0 seconds and 5 minutes. This aims to minimize data loss and avoid unnecessary retraining, ensuring the latest checkpoints are saved. (From cluster v2.20 onward)

    • Pod deletion policy for terminal workloads - You can now specify which pods should be deleted when a distributed workload reaches a terminal state (completed/failed) using cleanPodPolicy in CLI v2 and API. This enhancement provides greater control over resource cleanup and helps maintain a more organized and efficient cluster environment. See cleanPodPolicy for more details.
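      As a rough illustration, the snippet below shows where a cleanPodPolicy value might sit in a distributed training submission payload. The endpoint, the surrounding fields, and the value names ("Running", "All", "None", following the Kubeflow training-operator convention) are assumptions; check the cleanPodPolicy documentation for the accepted values and exact placement.

      # Minimal sketch: include cleanPodPolicy when submitting a distributed training workload.
      # Endpoint path, payload fields, and accepted policy values are assumed for illustration.
      import requests

      payload = {
          "name": "distributed-train-example",   # hypothetical workload name
          "projectId": "<project-id>",
          "clusterId": "<cluster-uuid>",
          "spec": {
              "cleanPodPolicy": "Running",        # assumed value: delete only pods still running at completion
              "numWorkers": 4,                    # hypothetical field
          },
      }

      response = requests.post(
          "https://<company>.run.ai/api/v1/workloads/distributed",   # assumed endpoint path
          json=payload,
          headers={"Authorization": "Bearer <token>"},
      )
      response.raise_for_status()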

    Workload Assets

    • Instructions for environment variables - You can now add instructions to environment variables when creating new environments via the UI and API. In addition, Run:ai's environments now include default instructions. Adding instructions provides guidance enabling anyone using the environment to set the environment variable values correctly. (From cluster v2.20 onward)

    • Enhanced environments and compute resource management - The action bar now contains "Make a Copy" and "Edit" while the "Rename" option has been removed. A new "Last Updated" column has also been added for easier tracking of asset modifications. From cluster v2.20 onward

    • Enhanced data sources and credentials tables - Added a new "Kubernetes name" column to data sources and credentials tables for visibility into Kubernetes resource associations. The credentials table now includes an "Environments" column displaying the environments associated with the credential. From cluster v2.20 onward

    Authentication and authorization

    • User applications for API authentication - You can now create your own applications for API integrations with Run:ai. Each application includes client credentials which can be used to obtain an authentication token to utilize for subsequent API calls. See User applications for more details. From cluster v2.20 onward
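
    As a rough illustration of the flow, the sketch below exchanges a user application's client credentials for a token and then uses it on a subsequent API call. The token endpoint, grant name, and response field are assumptions; see User applications and the API authentication documentation for the exact flow.

```python
# Hedged sketch: exchange user-application client credentials for a token,
# then call the API with it. Endpoint paths and field names are assumptions.
import requests

BASE_URL = "https://my-org.run.ai"          # hypothetical control-plane URL

token_resp = requests.post(
    f"{BASE_URL}/api/v1/token",             # assumed token endpoint
    json={
        "grantType": "client_credentials",  # assumed grant name
        "clientId": "<client-id>",
        "clientSecret": "<client-secret>",
    },
    timeout=30,
)
token_resp.raise_for_status()
access_token = token_resp.json()["accessToken"]   # assumed response field

# Use the token on a subsequent call, e.g. listing projects (assumed path).
projects = requests.get(
    f"{BASE_URL}/api/v1/org-unit/projects",
    headers={"Authorization": f"Bearer {access_token}"},
    timeout=30,
)
projects.raise_for_status()
print(projects.json())
```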

    Scheduler

    • Support for multiple fractional GPUs in a single workload - Run:ai now supports submitting workloads that utilize multiple fractional GPUs within a single workload using the UI and CLI. This feature enhances GPU utilization, increases scheduling probability in shorter timeframes, and allows workloads to consume only the memory they need. It maximizes quota usage and enables more workloads to share the same GPUs effectively. See Multi-GPU fractions and Multi-GPU dynamic fractions for more details. Beta for Dynamic Fractions From cluster v2.20 onward

    • Support for GPU memory swap with multiple GPUs per workload - Run:ai now supports GPU memory swap for workloads utilizing multiple GPUs. By leveraging GPU memory swap, you can maximize GPU utilization and serve more workloads using the same hardware. The swap scheduler on each node ensures that all GPUs of a distributed model run simultaneously, maintaining synchronization across GPUs. Workload configurations combine swap settings with multi-GPU dynamic fractions, providing flexibility and efficiency for managing large-scale workloads. See Multi-GPU memory swap. Beta From cluster v2.20 onward
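
    As a hedged sketch of how the two scheduler items above might look in an API submission, the example below requests two GPU devices with a per-device memory request in a single training workload. The endpoint path and field names are assumptions; see Multi-GPU fractions, Multi-GPU dynamic fractions, and the Workloads API reference for the supported fields.

```python
# Hedged sketch: request two GPU devices, each with a memory request (a dynamic
# fraction of the device), in one training workload. Path and field names are assumptions.
import requests

BASE_URL = "https://my-org.run.ai"   # hypothetical control-plane URL
TOKEN = "<api-token>"

payload = {
    "name": "multi-fraction-train",  # hypothetical workload name
    "projectId": "<project-id>",
    "clusterId": "<cluster-id>",
    "spec": {
        "image": "my-registry/trainer:latest",
        "compute": {
            "gpuDevicesRequest": 2,       # two physical GPU devices
            "gpuMemoryRequest": "20G",    # memory requested on each device
        },
    },
}

resp = requests.post(
    f"{BASE_URL}/api/v1/workloads/trainings",   # assumed path
    headers={"Authorization": f"Bearer {TOKEN}"},
    json=payload,
    timeout=30,
)
resp.raise_for_status()
print(resp.json())
```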

    Command Line Interface (CLI v2)

    • Support for Windows OS - CLI v2 now supports Windows operating systems, enabling you to leverage the full capabilities of the CLI. From cluster v2.18 onward

    • Unified training command structure - Unified the distributed command into the training command to align with the Run:ai UI. The training command now includes a new sub-command to support distributed workloads, ensuring a more consistent and streamlined user experience across both the CLI v2 and UI.

    • New command for Kubernetes access - Added a new CLI v2 command, runai kubeconfig set, allowing users to set the kubeconfig file with a Run:ai authorization token. This enhancement enables users to gain access to the Kubernetes cluster, simplifying authentication and integration with Run:ai-managed environments.

    • Added view workload labels - You can now view the labels associated with a workload when using the CLI v2 runai workload describe command for all workload types. This enhancement provides better visibility into workload metadata.

    ML Engineers

    Workloads - Inference

    • Enhanced visibility into rolling updates for inference workloads - Run:ai now provides a phase message with detailed insights into the current state of an update, visible by hovering over the workload's status. This helps users monitor and manage updates more effectively. See Rolling inference updates for more details. From cluster v2.20 onward

    • Inference serving endpoint configuration - You can now define an inference serving endpoint directly within the environment using the Run:ai UI. From cluster v2.19 onward

    • Persistent token management for Hugging Face models - Run:ai allows users to save their Hugging Face tokens persistently as part of their credentials within the Run:ai UI. Once saved, tokens can be easily selected from a list of stored credentials, removing the need to manually enter them each time. This enhancement improves the process of deploying Hugging Face models, making it more efficient and user-friendly. See Deploy inference workloads from Hugging Face for more details. From cluster v2.13 onward

    • Deploy and manage NVIDIA NIM models in inference workloads - Run:ai now supports NVIDIA NIM models, enabling you to easily deploy and manage these models when submitting inference workloads. You can select a NIM model and leverage NVIDIA’s hardware optimizations directly through the Run:ai UI. This feature also allows you to take advantage of Run:ai capabilities such as autoscaling and GPU fractioning. See Deploy inference workloads with NVIDIA NIM for more details.

    • Customizable autoscaling plans for inference workloads - Run:ai allows advanced users practicing autoscaling for inference workloads to fine-tune their autoscaling plans using the Update inference spec API. This feature enables you to achieve optimal behavior to meet fluctuating request demands. Experimental From cluster v2.20 onward

    Platform Administrator

    Analytics

    • New Reports view for analytics - The new Reports view enables generating and organizing large amounts of data in a structured, CSV-formatted layout. With this feature, you can monitor resource consumption, identify trends, and make informed decisions to optimize your AI workloads with greater efficiency.

    Authentication and authorization

    • Client credentials for applications - Applications now use client credentials - Client ID and Client secret - to obtain an authentication token, aligned with the OAuth standard. See Applications for more details. From cluster v2.20 onward

    Node pools

    • Enhanced metric graphs for node pools - Enhanced metric graphs in the DETAILS tab for node pools by aligning these graphs with the dashboard and the node pools API. As part of this improvement, the following columns have been removed from the Node pools table:

      • Node GPU Allocation
      • GPU Utilization Distribution
      • GPU Utilization
      • GPU Memory Utilization
      • CPU Utilization
      • CPU Memory Utilization

    Organizations - Projects/Departments

    • Enhanced project deletion - Deleting a project will now attempt to delete the project's associated workloads and assets, allowing better management of your organization's assets. From cluster v2.20 onward

    • Enhanced resource prioritization for projects and departments - Run:ai has introduced advanced prioritization capabilities to manage resources between projects or between departments more effectively using the Projects and Departments APIs. From cluster v2.20 onward

      This feature allows administrators to:

      • Prioritize resource allocation and reclaim between different projects and departments.
      • Prioritize projects within the same department.
      • Set priorities per node-pool for both projects and departments.
      • Implement distinct SLAs by assigning strict priority levels to over-quota resources.
    • Updated over quota naming - Renamed over quota priority to over quota weight to reflect its actual functionality.

    Policy

    • Added policy-based default field values - Administrators can now set default values for fields that are automatically calculated based on the values of other fields using defaultFrom. This ensures that critical fields in the workload submission form are populated automatically if not provided by the user. From cluster v2.20 onward

      This feature supports various field types (a minimal sketch follows the list below):

      • Integer fields (e.g., cpuCoresRequest)
      • Number fields (e.g., gpuPortionRequest)
      • Quantity fields (e.g., gpuMemoryRequest)
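
    A minimal sketch of how a defaultFrom rule might be expressed in a policy payload sent over the API. Only defaultFrom and the field types above come from this release note; the endpoint, envelope, and factor value are placeholders, so consult the Policies documentation and API reference for the real schema.

```python
# Hedged sketch: a policy fragment in which gpuMemoryRequest gets a default
# derived from gpuPortionRequest via defaultFrom. Endpoint and envelope are assumptions.
import requests

BASE_URL = "https://my-org.run.ai"   # hypothetical control-plane URL
TOKEN = "<api-token>"

policy = {
    "meta": {"scope": {"projectId": "<project-id>"}},   # assumed envelope
    "policy": {
        "defaults": {
            "compute": {
                # If the user leaves gpuMemoryRequest empty, derive it from
                # gpuPortionRequest (illustrative structure and factor only).
                "gpuMemoryRequest": {
                    "defaultFrom": {
                        "field": "compute.gpuPortionRequest",
                        "factor": 40,
                    }
                }
            }
        }
    },
}

resp = requests.put(
    f"{BASE_URL}/api/v2/policy/trainings",   # assumed path
    headers={"Authorization": f"Bearer {TOKEN}"},
    json=policy,
    timeout=30,
)
resp.raise_for_status()
```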

    Data sources

    • Improved control over data source and storage class visibility - Run:ai now provides administrators with the ability to control the visibility of data source types and storage in the UI. Data source types that are restricted by policy will no longer appear during workload submission or when creating new data source assets. Additionally, administrators can configure storage classes as internal using the Storage class configuration API. From cluster v2.20 onward

    Email notifications

    • Added email notifications API - Email notifications can now be configured via API in addition to the UI, enabling integration with external tools. See NotificationChannels API for more details.
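
    As a hedged illustration of the API route, the sketch below registers an email notification channel. The endpoint path and payload fields are assumptions; check the NotificationChannels API reference for the actual schema.

```python
# Hedged sketch: create an email notification channel through the API.
# Endpoint path and payload fields are assumptions.
import requests

BASE_URL = "https://my-org.run.ai"   # hypothetical control-plane URL
TOKEN = "<api-token>"

channel = {
    "name": "ops-email",                       # hypothetical channel name
    "type": "EMAIL",                           # assumed type identifier
    "config": {
        "recipients": ["ml-ops@example.com"],  # who receives the notifications
    },
}

resp = requests.post(
    f"{BASE_URL}/api/v1/notification-channels",   # assumed path
    headers={"Authorization": f"Bearer {TOKEN}"},
    json=channel,
    timeout=30,
)
resp.raise_for_status()
print(resp.json())
```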

    Infrastructure Administrator

    NVIDIA Data Center GPUs - Grace-Hopper

    • Support for ARM-Based Grace-Hopper Superchip (GH200) - Run:ai now supports the ARM-based Grace-Hopper Superchip (GH200). Due to a limitation in version 2.20 with ARM64, the Run:ai control plane services must be scheduled on non-ARM based CPU nodes. This limitation will be addressed in a future release. See Self-Hosted installation over Kubernetes for more details. From cluster v2.20 onward

    System requirements

    • Run:ai now supports Kubernetes version 1.32.
    • Run:ai now supports OpenShift version 4.17.
    • Kubernetes version 1.28 is no longer supported.
    • OpenShift versions 4.12 to 4.13 are no longer supported.

    Advanced cluster configurations

    • Exclude nodes in mixed node clusters - Run:ai now allows you to exclude specific nodes in a mixed node cluster using the nodeSelectorTerms flag. See Advanced Cluster Configurations for more details. From cluster v2.20 onward

    • Advanced configuration options for cluster services - Introduced new cluster configuration options for setting node affinity and tolerations for Run:ai cluster services. These configurations ensure that the Run:ai cluster services are scheduled on the desired nodes; a configuration sketch follows this list. See Advanced Cluster Configurations for more details. From cluster v2.20 onward

      • global.affinity
      • global.tolerations
      • daemonSetsTolerations
    • Added Argo workflows auto-pod grouping - Introduced a new cluster configuration option, gangScheduleArgoWorkflow, to modify the default behavior for grouping ArgoWorkflow pods, allowing you to prevent pods from being grouped into a single pod-group. See Advanced Cluster Configurations for more details. Cluster v2.20 and v2.18

    • Added cloud auto-scaling for memory fractions - Run:ai now supports auto-scaling for workloads using memory fractions in cloud environments. Using the gpuMemoryToFractionRatio configuration option allows a failed scheduling attempt for a memory fractions workload to create Run:ai scaling pods, triggering the auto-scaler. See Advanced Cluster Configurations for more details. From cluster v2.19 onward

    • Added stale gang eviction timeout for improved stability - Run:ai has introduced a default timeout of 60 seconds for gang eviction in gang scheduling workloads using defaultStalenessGracePeriod. This timeout allows both the workload controller and the scheduler sufficient time to remediate the workload, improving the stability of large training jobs. See Advanced Cluster Configurations for more details. From cluster v2.18 onward

    • Added custom labels for built-in alerts - Administrators can now add their own custom labels to the built-in alerts from Prometheus by setting spec.prometheus.additionalAlertLabels in their cluster. See Advanced Cluster Configurations for more details. From cluster v2.20 onward

    • Enhanced configuration flexibility for cluster replica management - Administrators can now use the spec.global.replicaCount configuration to manage replicas for Run:ai services. See Advanced Cluster Configurations for more details. From cluster v2.20 onward
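
    The sketch below shows one way a couple of the options in this list might be applied by patching the Run:ai cluster configuration object with the Kubernetes Python client. The CRD group, version, plural, object name, and namespace are assumptions; only the keys spec.prometheus.additionalAlertLabels and spec.global.tolerations come from the items above, and the values are illustrative.

```python
# Hedged sketch: patch the Run:ai cluster configuration object to add a custom
# alert label and a toleration for cluster services. CRD coordinates are assumptions.
from kubernetes import client, config

config.load_kube_config()        # or config.load_incluster_config() inside the cluster
api = client.CustomObjectsApi()

patch = {
    "spec": {
        "prometheus": {
            "additionalAlertLabels": {"team": "ml-infra"}   # custom label on built-in alerts
        },
        "global": {
            "tolerations": [     # let cluster services schedule onto tainted nodes
                {
                    "key": "dedicated",
                    "operator": "Equal",
                    "value": "runai",
                    "effect": "NoSchedule",
                }
            ]
        },
    }
}

api.patch_namespaced_custom_object(
    group="run.ai",          # assumed API group
    version="v1",            # assumed version
    namespace="runai",       # assumed namespace of the config object
    plural="runaiconfigs",   # assumed plural
    name="runai",            # assumed object name
    body=patch,
)
```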

    Run:ai built-in alerts

    • Added two new Run:ai built-in alerts for Kubernetes nodes hosting GPU workloads. The unknown state alert notifies when the node's health and readiness cannot be determined, and the low memory alert warns when the node has insufficient memory to support current or upcoming workloads. From cluster v2.20 onward

    Run:ai Developer

    Metrics and Telemetry

    • Additional metrics and telemetry are available via the API. For more details, see the Metrics API (a query sketch follows the list below):

      • Metrics (over time)

        • Project
          • GPU_QUOTA
          • CPU_QUOTA_MILLICORES
          • CPU_MEMORY_QUOTA_MB
          • GPU_ALLOCATION
          • CPU_ALLOCATION_MILLICORES
          • CPU_MEMORY_ALLOCATION_MB
        • Department
          • GPU_QUOTA
          • CPU_QUOTA_MILLICORES
          • CPU_MEMORY_QUOTA_MB
          • GPU_ALLOCATION
          • CPU_ALLOCATION_MILLICORES
          • CPU_MEMORY_ALLOCATION_MB
      • Telemetry (current time)

        • Project
          • GPU_QUOTA
          • CPU_QUOTA
          • MEMORY_QUOTA
          • GPU_ALLOCATION
          • CPU_ALLOCATION
          • MEMORY_ALLOCATION
          • GPU_ALLOCATION_NON_PREEMPTIBLE
          • CPU_ALLOCATION_NON_PREEMPTIBLE
          • MEMORY_ALLOCATION_NON_PREEMPTIBLE
        • Department
          • GPU_QUOTA
          • CPU_QUOTA
          • MEMORY_QUOTA
          • GPU_ALLOCATION
          • CPU_ALLOCATION
          • MEMORY_ALLOCATION
          • GPU_ALLOCATION_NON_PREEMPTIBLE
          • CPU_ALLOCATION_NON_PREEMPTIBLE
          • MEMORY_ALLOCATION_NON_PREEMPTIBLE
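
    A hedged sketch of pulling one of the project metrics listed above over a time window. The endpoint path, query-parameter names, and response shape are assumptions; consult the Metrics API reference for the exact query format.

```python
# Hedged sketch: query GPU_ALLOCATION over the last six hours for a project.
# Endpoint path, parameter names, and response shape are assumptions.
from datetime import datetime, timedelta, timezone

import requests

BASE_URL = "https://my-org.run.ai"   # hypothetical control-plane URL
TOKEN = "<api-token>"
PROJECT_ID = "<project-id>"

end = datetime.now(timezone.utc)
start = end - timedelta(hours=6)

resp = requests.get(
    f"{BASE_URL}/api/v1/org-unit/projects/{PROJECT_ID}/metrics",   # assumed path
    headers={"Authorization": f"Bearer {TOKEN}"},
    params={
        "metricType": "GPU_ALLOCATION",   # one of the metrics listed above
        "start": start.isoformat(),
        "end": end.isoformat(),
    },
    timeout=30,
)
resp.raise_for_status()
for point in resp.json().get("measurements", []):   # assumed response shape
    print(point)
```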

    Deprecation notifications

    Ongoing Dynamic MIG deprecation process

    The Dynamic MIG deprecation process started in version 2.19. Run:ai supports standard MIG profiles as detailed in Configuring NVIDIA MIG profiles.

    • Before upgrading to version 2.20, workloads submitted with Dynamic MIG and their associated node configurations must be removed.
    • In version 2.20, MIG was removed from the Run:ai UI under compute resources.
    • In Q2/25 all ‘Dynamic MIG’ APIs and CLI commands will be fully deprecated.

    CLI v1 deprecation

    CLI v1 is deprecated and no new features will be developed for it. It will remain available for use for the next two releases to ensure a smooth transition for all users. We recommend switching to CLI v2, which provides feature parity, backward compatibility, and ongoing support for new enhancements. CLI v2 is designed to deliver a more robust, efficient, and user-friendly experience.

    Legacy Jobs view deprecation

    Starting with version 2.20, the legacy Jobs view will be discontinued in favor of the more advanced Workloads view. The legacy submission form will still be accessible via the Workload manager view for a smoother transition.

    appID and appSecret deprecation

    The appID and appSecret parameters used for requesting an API token are deprecated. They will remain available for use for the next two releases. To create application tokens, use your client credentials - Client ID and Client secret.

    \ No newline at end of file diff --git a/v2.20/platform-admin/aiinitiatives/org/departments/index.html b/v2.20/platform-admin/aiinitiatives/org/departments/index.html index 9763aa9d53..8a9a34cd9e 100644 --- a/v2.20/platform-admin/aiinitiatives/org/departments/index.html +++ b/v2.20/platform-admin/aiinitiatives/org/departments/index.html @@ -1,4 +1,4 @@ - Departments - Run:ai Documentation Library

    Departments

    This article explains the procedure for managing departments.

    Departments are a grouping of projects. By grouping projects into a department, you can set quota limitations for a set of projects, create policies that are applied to the department, and create assets that can be scoped to the whole department or to a subset of its descendant projects.

    For example, in an academic environment, a department can be the Physics Department grouping various projects (AI Initiatives) within the department, or grouping projects where each project represents a single student.

    Departments

    The Departments table can be found under Organization in the Run:ai platform.

    Note

    Departments are disabled by default. If you cannot see Departments in the menu, ask your administrator to enable them under General settings → Resources → Departments.

    The Departments table lists all departments defined for a specific cluster and allows you to manage them. You can switch between clusters by selecting your cluster using the filter at the top.

    The Departments table consists of the following columns:

    • Department - The name of the department
    • Node pool(s) with quota - The node pools associated with this department. By default, all node pools within a cluster are associated with each department. Administrators can change the node pools’ quota parameters for a department. Click the values under this column to view the list of node pools with their parameters (as described below)
    • GPU quota - The GPU quota associated with the department
    • Total GPUs for projects - The sum of all projects’ GPU quotas associated with this department
    • Project(s) - The list of projects associated with this department
    • Subject(s) - The users, SSO groups, or applications with access to the department. Click the values under this column to view the list of subjects with their parameters (as described below). This column is only viewable if your role in the Run:ai platform grants you those permissions
    • Allocated GPUs - The total number of GPUs allocated by successfully scheduled workloads in projects associated with this department
    • GPU allocation ratio - The ratio of Allocated GPUs to GPU quota. This number reflects how well the department’s GPU quota is utilized by its descendant projects. A number higher than 100% means the department is using over-quota GPUs. A number lower than 100% means not all projects are utilizing their quotas. A quota becomes allocated once a workload is successfully scheduled
    • Creation time - The timestamp for when the department was created
    • Workload(s) - The list of workloads under projects associated with this department. Click the values under this column to view the list of workloads with their resource parameters (as described below)
    • Cluster - The cluster that the department is associated with

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.

    Node pools with quota associated with the department

    Click one of the values in the Node pool(s) with quota column to view the list of node pools and their parameters.

    • Node pool - The name of the node pool, given by the administrator during node pool creation. All clusters have a default node pool, created automatically by the system and named ‘default’
    • GPU quota - The amount of GPU quota the administrator dedicated to the department for this node pool (floating number, e.g. 2.3 means 230% of a GPU capacity)
    • CPU (Cores) - The amount of CPU (cores) quota the administrator has dedicated to the department for this node pool (floating number, e.g. 1.3 Cores = 1300 millicores). The ‘unlimited’ value means the CPU (Cores) quota is not bounded and workloads using this node pool can use as many CPU (Cores) resources as they need (if available)
    • CPU memory - The amount of CPU memory quota the administrator has dedicated to the department for this node pool (floating number, in MB or GB). The ‘unlimited’ value means the CPU memory quota is not bounded and workloads using this node pool can use as much CPU memory as they need (if available)
    • Allocated GPUs - The total amount of GPUs allocated by workloads using this node pool under projects associated with this department. The number of allocated GPUs may temporarily surpass the GPU quota of the department if over-quota is used
    • Allocated CPU (Cores) - The total amount of CPUs (cores) allocated by workloads using this node pool under all projects associated with this department. The number of allocated CPUs (cores) may temporarily surpass the CPU (Cores) quota of the department if over-quota is used
    • Allocated CPU memory - The actual amount of CPU memory allocated by workloads using this node pool under all projects associated with this department. The amount of allocated CPU memory may temporarily surpass the CPU memory quota of the department if over-quota is used

    Subjects authorized for the department

    Click one of the values in the Subject(s) column to view the list of subjects and their parameters. This column is only viewable if your role in the Run:ai system grants you those permissions.

    • Subject - A user, SSO group, or application assigned a role in the scope of this department
    • Type - The type of subject assigned to the access rule (user, SSO group, or application)
    • Scope - The scope of this department within the organizational tree. Click the name of the scope to view the organizational tree diagram; you can only view the parts of the organizational tree for which you have permission to view
    • Role - The role assigned to the subject in this department’s scope
    • Authorized by - The user who granted the access rule
    • Last updated - The last time the access rule was updated

    Note

    A role given in a certain scope means the role applies to this scope and any of its descendant scopes in the organizational tree.

    Adding a new department

    To create a new Department:

    1. Click +NEW DEPARTMENT
    2. Select a scope.
      By default, the field contains the scope of the current UI context cluster, viewable at the top left side of your screen. You can change the current UI context cluster by clicking the ‘Cluster: cluster-name’ field and applying another cluster as the UI context. Alternatively, you can choose another cluster within the ‘+ New Department’ form by clicking the organizational tree icon on the right side of the scope field, opening the organizational tree and selecting one of the available clusters.
    3. Enter a name for the department. Department names must start with a letter and can only contain lowercase Latin letters, numbers, or a hyphen ('-').
    4. Under Quota Management, select a quota for the department. The Quota management section may contain different fields depending on pre-created system configuration. Possible system configurations are:
      • Existence of Node Pools
      • CPU Quota - Allow setting a quota for CPU resources.

    When no node pools are configured, you can set the following quota parameters:

    • GPU Devices
      The number of GPUs you want to allocate for this department (decimal number). This quota is consumed by the department’s subordinated projects.
    • CPUs (cores) (when CPU quota is set)
      The number of CPU cores you want to allocate for this department (decimal number). This quota is consumed by the department’s subordinated projects.
    • CPUs memory (when CPU quota is set)
      The amount of CPU memory you want to allocate for this department (in Megabytes or Gigabytes). This quota is consumed by the department’s subordinated projects.

    When node pools are enabled, it is possible to set the above quota parameters for each node-pool separately.

    • Order of priority - This column is displayed only if more than one node pool exists. It sets the default order in which the Scheduler uses node pools to schedule a workload: the Scheduler first tries to allocate resources using the highest-priority node pool, followed by the next in priority, until it reaches the lowest-priority node pool in the list, and then starts again from the highest priority. The Scheduler uses the department’s list of prioritized node pools only if the order of priority of node pools is not set in the project or in the workload during submission (either by an admin policy or by the user). An empty value indicates that the node pool is not part of the department’s default node pool priority list, but a node pool can still be chosen by the admin policy or a user during workload submission. Department node pool priority sets defaults for its subordinate projects but does not enforce them, meaning projects are free to change their priority.
    • In addition, you can decide whether to allow a department to go over-quota. Allowing over-quota at the department level means that one department can receive more resources than its quota when not required by other departments. If over-quota is disabled, workloads running under subordinated projects are not able to use more resources than the department’s quota, but each project can still go over-quota (if enabled at the project level) up to the department’s quota.

    Unlimited CPU (Cores) and CPU memory quotas are an exception - in this case, workloads of subordinated projects can consume available resources up to the physical limitations of the cluster or any of its node pools.

    Example of Quota management:

    5. Click CREATE DEPARTMENT

    Adding an access rule to a department

    To create a new access rule for a department:

    1. Select the department you want to add an access rule for
    2. Click ACCESS RULES
    3. Click +ACCESS RULE
    4. Select a subject
    5. Select or enter the subject identifier:
      • User Email for a local user created in Run:ai or for an SSO user as recognized by the IDP
      • Group name as recognized by the IDP
      • Application name as created in Run:ai
    6. Select a role
    7. Click SAVE RULE
    8. Click CLOSE

    Deleting an access rule from a department

    To delete an access rule from a department:

    1. Select the department you want to remove an access rule from
    2. Click ACCESS RULES
    3. Find the access rule you would like to delete
    4. Click on the trash icon
    5. Click CLOSE

    Editing a department

    1. Select the Department you want to edit
    2. Click EDIT
    3. Update the Department and click SAVE

    Viewing a department’s policy

    To view the policy of a department:

    1. Select the department for which you want to view its policies.
      This option is only active if the department has defined policies in place.
    2. Click VIEW POLICY and select the workload type for which you want to view the policies:
      a. Workspace workload type policy with its set of rules
      b. Training workload type policy with its set of rules
    3. In the Policy form, view the workload rules that are enforced on your department for the selected workload type, as well as the defaults:
      • Parameter - The workload submission parameter that the Rule and Default are applied to
      • Type (applicable for data sources only) - The data source type (Git, S3, NFS, PVC, etc.)
      • Default - The default value of the Parameter
      • Rule - Set up constraints on workload policy fields
      • Source - The origin of the applied policy (cluster, department or project)

    Notes

    • The policy affecting the department consists of rules and defaults. Some of these rules and defaults may be derived from the policies of a parent cluster (source). You can see the source of each rule in the policy form.
    • A policy set for a department affects all subordinated projects and their workloads, according to the policy workload type.

    Deleting a department

    1. Select the department you want to delete
    2. Click DELETE
    3. On the dialog, click DELETE to confirm the deletion

    Note

    Deleting a department permanently deletes its subordinated projects and any assets created in the scope of this department or of its subordinated projects, such as compute resources, environments, data sources, templates, and credentials. However, workloads running within the department’s subordinated projects, and the policies defined for this department or its subordinated projects, remain intact and running.

    Reviewing a department

    1. Select the department you want to review
    2. Click REVIEW
    3. Review and click CLOSE

    Using API

    Go to the Departments API reference to view the available actions
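
    For API-driven workflows, a hedged sketch of creating a department with a GPU quota is shown below. The endpoint path and payload fields are assumptions; rely on the Departments API reference for the authoritative schema.

```python
# Hedged sketch: create a department with a GPU quota through the API.
# Endpoint path and payload fields are assumptions.
import requests

BASE_URL = "https://my-org.run.ai"   # hypothetical control-plane URL
TOKEN = "<api-token>"

department = {
    "name": "physics-dept",        # must start with a letter; lowercase letters, digits, '-'
    "clusterId": "<cluster-id>",
    "resources": [                 # assumed quota structure, one entry per node pool
        {
            "nodePool": {"name": "default"},
            "gpu": {"deserved": 8},        # GPU devices reserved for the department (assumed key)
            "cpu": {"deserved": None},     # None illustrates an unlimited CPU (Cores) quota
            "memory": {"deserved": None},  # None illustrates an unlimited CPU memory quota
        }
    ],
}

resp = requests.post(
    f"{BASE_URL}/api/v1/org-unit/departments",    # assumed path; see the Departments API reference
    headers={"Authorization": f"Bearer {TOKEN}"},
    json=department,
    timeout=30,
)
resp.raise_for_status()
print(resp.json())
```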

    Departments

    This article explains the procedure for managing departments

    Departments are a grouping of projects. By grouping projects into a department, you can set quota limitations to a set of projects, create policies that are applied to the department, and create assets that can be scoped to the whole department or a partial group of descendent projects

    For example, in an academic environment, a department can be the Physics Department grouping various projects (AI Initiatives) within the department, or grouping projects where each project represents a single student.

    Departments

    The Departments table can be found under Organization in the Run:ai platform.

    Note

    Departments are disabled, by default. If you cannot see Departments in the menu, then it must be enabled by your Administrator, under General settings → Resources → Departments

    The Departments table lists all departments defined for a specific cluster and allows you to manage them. You can switch between clusters by selecting your cluster using the filter at the top.

    The Departments table consists of the following columns:

    Column Description
    Department The name of the department
    Node pool(s) with quota The node pools associated with this department. By default, all node pools within a cluster are associated with each department. Administrators can change the node pools’ quota parameters for a department. Click the values under this column to view the list of node pools with their parameters (as described below)
    GPU quota GPU quota associated with the department
    Total GPUs for projects The sum of all projects’ GPU quotas associated with this department
    Project(s) List of projects associated with this department
    Subject(s) The users, SSO groups, or applications with access to the project. Click the values under this column to view the list of subjects with their parameters (as described below). This column is only viewable if your role in Run:ai platform allows you those permissions.
    Allocated GPUs The total number of GPUs allocated by successfully scheduled workloads in projects associated with this department
    GPU allocation ratio The ratio of Allocated GPUs to GPU quota. This number reflects how well the department’s GPU quota is utilized by its descendant projects. A number higher than 100% means the department is using over-quota GPUs. A number lower than 100% means not all projects are utilizing their quotas. A quota becomes allocated once a workload is successfully scheduled.
    Creation time The timestamp for when the department was created
    Workload(s) The list of workloads under projects associated with this department. Click the values under this column to view the list of workloads with their resource parameters (as described below)
    Cluster The cluster that the department is associated with

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then Click Download as CSV. Export to CSV is limited to 20,000 rows.

    Node pools with quota associated with the department

    Click one of the values of Node pool(s) with quota column, to view the list of node pools and their parameters

    Column Description
    Node pool The name of the node pool is given by the administrator during node pool creation. All clusters have a default node pool created automatically by the system and named ‘default’.
    GPU quota The amount of GPU quota the administrator dedicated to the department for this node pool (floating number, e.g. 2.3 means 230% of a GPU capacity)
    CPU (Cores) The amount of CPU (cores) quota the administrator has dedicated to the department for this node pool (floating number, e.g. 1.3 Cores = 1300 mili-cores). The ‘unlimited’ value means the CPU (Cores) quota is not bound and workloads using this node pool can use as many CPU (Cores) resources as they need (if available)
    CPU memory The amount of CPU memory quota the administrator has dedicated to the department for this node pool (floating number, in MB or GB). The ‘unlimited’ value means the CPU memory quota is not bounded and workloads using this node pool can use as much CPU memory resource as they need (if available).
    Allocated GPUs The total amount of GPUs allocated by workloads using this node pool under projects associated with this department. The number of allocated GPUs may temporarily surpass the GPU quota of the department if over-quota is used.
    Allocated CPU (Cores) The total amount of CPUs (cores) allocated by workloads using this node pool under all projects associated with this department. The number of allocated CPUs (cores) may temporarily surpass the CPUs (Cores) quota of the department if over-quota is used.
    Allocated CPU memory The actual amount of CPU memory allocated by workloads using this node pool under all projects associated with this department. The number of Allocated CPU memory may temporarily surpass the CPU memory quota if over-quota is used.

    Subjects authorized for the project

    Click one of the values of the Subject(s) column, to view the list of subjects and their parameters. This column is only viewable if your role in the Run:ai system affords you those permissions.

    Column Description
    Subject A user, SSO group, or application assigned with a role in the scope of this department
    Type The type of subject assigned to the access rule (user, SSO group, or application).
    Scope The scope of this department within the organizational tree. Click the name of the scope to view the organizational tree diagram, you can only view the parts of the organizational tree for which you have permission to view.
    Role The role assigned to the subject, in this department’s scope
    Authorized by The user who granted the access rule
    Last updated The last time the access rule was updated

    Note

    A role given in a certain scope, means the role applies to this scope and any descendant scopes in the organizational tree.

    Adding a new department

    To create a new Department:

    1. Click +NEW DEPARTMENT
    2. Select a scope.
      By default, the field contains the scope of the current UI context cluster, viewable at the top left side of your screen. You can change the current UI context cluster by clicking the ‘Cluster: cluster-name’ field and applying another cluster as the UI context. Alternatively, you can choose another cluster within the ‘+ New Department’ form by clicking the organizational tree icon on the right side of the scope field, opening the organizational tree and selecting one of the available clusters.
    3. Enter a name for the department. Department names must start with a letter and can only contain lower case latin letters, numbers or a hyphen ('-’).
    4. Under Quota Management, select a quota for the department. The Quota management section may contain different fields depending on pre-created system configuration. Possible system configurations are:
      • Existence of Node Pools
      • CPU Quota - Allow setting a quota for CPU resources.

    When no node pools are configured, you can set the following quota parameters:

    • GPU Devices
      The number of GPUs you want to allocate for this department (decimal number). This quota is consumed by the department’s subordinated project.
    • CPUs (cores) (when CPU quota is set)
      The number of CPU cores you want to allocate for this department (decimal number). This quota is consumed by the department’s subordinated projects
    • CPUs memory (when CPU quota is set)
      The amount of CPU memory you want to allocate for this department (in Megabytes or Gigabytes). This quota is consumed by the department’s subordinated projects

    When node pools are enabled, it is possible to set the above quota parameters for each node-pool separately.

    • Order of priority This column is displayed only if more than one node pool exists. The default order in which the Scheduler uses node pools to schedule a workload. This means, the Scheduler first tries to allocate resources using the highest priority node pool, followed by the next in priority, until it reaches the lowest priority node pool list, then the Scheduler starts from the highest priority again. The Scheduler uses the department list of prioritized node pools, only if the order of priority of node pools is not set in project or the workload during submission (either by an admin policy or by the user). An empty value indicates that the node pool is not part of the department’s default node pool priority list, but a node pool can still be chosen by the admin policy or a user during workload submission. Department nodepool priority sets defaults to the subordinate projects but does not enforce it, meaning projects are free to change their priority.
    • In addition, you can decide whether to allow a department to go over-quota. Allowing over-quota at the department level means that one department can receive more resources than its quota when not required by other departments. If the over-quota is disabled, workloads running under subordinated projects are not able to use more resources than the department’s quota, but each project can still go over-quota (if enabled at the project level) up to the department’s quota.

    Unlimited CPU(Cores) and CPU memory quotas are an exception - in this case, workloads of subordinated projects can consume available resources up to the physical limitation of the cluster or any of the node pools.

    Example of Quota management:

    1. Click CREATE DEPARTMENT

    Adding an access rule to a department

    To create a new access rule for a department:

    1. Select the department you want to add an access rule for
    2. Click ACCESS RULES
    3. Click +ACCESS RULE
    4. Select a subject
    5. Select or enter the subject identifier:
      • User Email for a local user created in Run:ai or for SSO user as recognized by the IDP
      • Group name as recognized by the IDP
      • Application name as created in Run:ai
    6. Select a role
    7. Click SAVE RULE
    8. Click CLOSE

    Deleting an access rule from a department

    To delete an access rule from a department:

    1. Select the department you want to remove an access rule from
    2. Click ACCESS RULES
    3. Find the access rule you would like to delete
    4. Click on the trash icon
    5. Click CLOSE

    Editing a department

    1. Select the Department you want to edit
    2. Click EDIT
    3. Update the Department and click SAVE

    Viewing a department’s policy

    To view the policy of a department:

    1. Select the department for which you want to view its policies.
      This option is only active if the department has defined policies in place.
    2. Click VIEW POLICY and select the workload type for which you want to view the policies:
      a. Workspace workload type policy with its set of rules
      b. Training workload type policies with its set of rules
    3. In the Policy form, view the workload rules that are enforced on your department for the selected workload type, as well as the defaults:
      • Parameter - The workload submission parameter that the Rule and Default are applied to
      • Type (applicable for data sources only) - The data source type (Git, S3, NFS, PVC, etc.)
      • Default - The default value of the Parameter
      • Rule - Set up constraints on workload policy fields
      • Source - The origin of the applied policy (cluster, department or project)

    Notes

    • The policy affecting the department consists of rules and defaults. Some of these rules and defaults may be derived from the policies of a parent cluster (source). You can see the source of each rule in the policy form.
    • A policy set for a department affects all subordinated projects and their workloads, according to the policy workload type

    Deleting a department

    1. Select the department you want to delete
    2. Click DELETE
    3. On the dialog, click DELETE to confirm the deletion

    Note

    Deleting a department permanently deletes its subordinated projects, any assets created in the scope of this department, and any assets created in the scope of its subordinated projects, such as compute resources, environments, data sources, templates, and credentials. However, workloads running within the department’s subordinated projects, and the policies defined for this department or its subordinated projects, remain intact and running.

    Reviewing a department

    1. Select the department you want to review
    2. Click REVIEW
    3. Review and click CLOSE

    Using API

    Go to the Departments API reference to view the available actions
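
    For orientation, below is a minimal sketch of calling the Run:ai control plane REST API from a terminal. It assumes an application with a client ID and secret was created in Run:ai and that jq is installed; the token grant body and the departments endpoint path used here are assumptions for illustration only - confirm both against the Departments API reference for your version.

      # Hypothetical sketch: authenticate with an application and list departments.
      # <company>, <app-id> and <app-secret> are placeholders; the endpoint paths are assumptions.
      TOKEN=$(curl -s -X POST "https://<company>.run.ai/api/v1/token" \
        -H "Content-Type: application/json" \
        -d '{"grantType":"app_token","AppId":"<app-id>","AppSecret":"<app-secret>"}' | jq -r '.accessToken')

      curl -s "https://<company>.run.ai/api/v1/org-unit/departments" \
        -H "Authorization: Bearer $TOKEN"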

    \ No newline at end of file diff --git a/v2.20/platform-admin/aiinitiatives/org/projects/index.html b/v2.20/platform-admin/aiinitiatives/org/projects/index.html index f4d2935039..e02c683643 100644 --- a/v2.20/platform-admin/aiinitiatives/org/projects/index.html +++ b/v2.20/platform-admin/aiinitiatives/org/projects/index.html @@ -1,4 +1,4 @@ - Projects - Run:ai Documentation Library

    Projects

    This article explains the procedure to manage Projects.

    Researchers submit AI workloads. To streamline resource allocation and prioritize work, Run:ai introduces the concept of Projects. Projects are the tool to implement resource allocation policies as well as the segregation between different initiatives. A project may represent a team, an individual, or an initiative that shares resources or has a specific resource quota. Projects may be aggregated in Run:ai departments.

    For example, you may have several people involved in a specific face-recognition initiative collaborating under one project named “face-recognition-2024”. Alternatively, you can have a project per person in your team, where each member receives their own quota.

    Projects table

    The Projects table can be found under Organization in the Run:ai platform.

    The Projects table provides a list of all projects defined for a specific cluster, and allows you to manage them. You can switch between clusters by selecting your cluster using the filter at the top.

    The Projects table consists of the following columns:

    Column Description
    Project The name of the project
    Department The name of the parent department. Several projects may be grouped under a department.
    Status The Project creation status. Projects are manifested as Kubernetes namespaces. The project status represents the Namespace creation status.
    Node pool(s) with quota The node pools associated with the project. By default, a new project is associated with all node pools within its associated cluster. Administrators can change the node pools’ quota parameters for a project. Click the values under this column to view the list of node pools with their parameters (as described below)
    Subject(s) The users, SSO groups, or applications with access to the project. Click the values under this column to view the list of subjects with their parameters (as described below). This column is only viewable if your role in the Run:ai platform allows you those permissions.
    Allocated GPUs The total number of GPUs allocated by successfully scheduled workloads under this project
    GPU allocation ratio The ratio of Allocated GPUs to GPU quota. This number reflects how well the project’s GPU quota is utilized by its descendent workloads. A number higher than 100% indicates the project is using over-quota GPUs.
    GPU quota The GPU quota allocated to the project. This number represents the sum of all node pools’ GPU quota allocated to this project.
    Allocated CPUs (Core) The total number of CPU cores allocated by workloads submitted within this project. (This column is only available if the CPU Quota setting is enabled, as described below).
    Allocated CPU Memory The total amount of CPU memory allocated by successfully scheduled workloads under this project. (This column is only available if the CPU Quota setting is enabled, as described below).
    CPU quota (Cores) CPU quota allocated to this project. (This column is only available if the CPU Quota setting is enabled, as described below). This number represents the sum of all node pools’ CPU quota allocated to this project. The ‘unlimited’ value means the CPU (cores) quota is not bounded and workloads using this project can use as many CPU (cores) resources as they need (if available).
    CPU memory quota CPU memory quota allocated to this project. (This column is only available if the CPU Quota setting is enabled, as described below). This number represents the sum of all node pools’ CPU memory quota allocated to this project. The ‘unlimited’ value means the CPU memory quota is not bounded and workloads using this Project can use as much CPU memory resources as they need (if available).
    CPU allocation ratio The ratio of Allocated CPUs (cores) to CPU quota (cores). This number reflects how much the project’s ‘CPU quota’ is utilized by its descendent workloads. A number higher than 100% indicates the project is using over-quota CPU cores.
    CPU memory allocation ratio The ratio of Allocated CPU memory to CPU memory quota. This number reflects how well the project’s ‘CPU memory quota’ is utilized by its descendent workloads. A number higher than 100% indicates the project is using over-quota CPU memory.
    Node affinity of training workloads The list of Run:ai node-affinities. Any training workload submitted within this project must specify one of those Run:ai node affinities, otherwise it is not submitted.
    Node affinity of interactive workloads The list of Run:ai node-affinities. Any interactive (workspace) workload submitted within this project must specify one of those Run:ai node affinities, otherwise it is not submitted.
    Idle time limit of training workloads The time in days:hours:minutes after which the project stops a training workload not using its allocated GPU resources.
    Idle time limit of preemptible workloads The time in days:hours:minutes after which the project stops a preemptible interactive (workspace) workload not using its allocated GPU resources.
    Idle time limit of non preemptible workloads The time in days:hours:minutes after which the project stops a non-preemptible interactive (workspace) workload not using its allocated GPU resources.
    Interactive workloads time limit The duration in days:hours:minutes after which the project stops an interactive (workspace) workload
    Training workloads time limit The duration in days:hours:minutes after which the project stops a training workload
    Creation time The timestamp for when the project was created
    Workload(s) The list of workloads associated with the project. Click the values under this column to view the list of workloads with their resource parameters (as described below).
    Cluster The cluster that the project is associated with

    Node pools with quota associated with the project

    Click one of the values in the Node pool(s) with quota column to view the list of node pools and their parameters.

    Column Description
    Node pool The name of the node pool is given by the administrator during node pool creation. All clusters have a default node pool created automatically by the system and named ‘default’.
    GPU quota The amount of GPU quota the administrator dedicated to the project for this node pool (floating number, e.g. 2.3 means 230% of GPU capacity).
    CPU (Cores) The amount of CPUs (cores) quota the administrator has dedicated to the project for this node pool (floating number, e.g. 1.3 Cores = 1300 millicores). The ‘unlimited’ value means the CPU (Cores) quota is not bounded and workloads using this node pool can use as many CPU (Cores) resources as they require (if available).
    CPU memory The amount of CPU memory quota the administrator has dedicated to the project for this node pool (floating number, in MB or GB). The ‘unlimited’ value means the CPU memory quota is not bounded and workloads using this node pool can use as much CPU memory resource as they need (if available).
    Allocated GPUs The actual amount of GPUs allocated by workloads using this node pool under this project. The number of allocated GPUs may temporarily surpass the GPU quota if over-quota is used.
    Allocated CPU (Cores) The actual amount of CPUs (cores) allocated by workloads using this node pool under this project. The number of allocated CPUs (cores) may temporarily surpass the CPUs (Cores) quota if over-quota is used.
    Allocated CPU memory The actual amount of CPU memory allocated by workloads using this node pool under this Project. The number of Allocated CPU memory may temporarily surpass the CPU memory quota if over-quota is used.
    Order of priority The default order in which the Scheduler uses node-pools to schedule a workload. This is used only if the order of priority of node pools is not set in the workload during submission, either by an admin policy or the user. An empty value means the node pool is not part of the project’s default list, but can still be chosen by an admin policy or the user during workload submission

    Subjects authorized for the project

    Click one of the values in the Subject(s) column to view the list of subjects and their parameters. This column is only viewable if your role in the Run:ai system grants you those permissions.

    Column Description
    Subject A user, SSO group, or application assigned with a role in the scope of this Project
    Type The type of subject assigned to the access rule (user, SSO group, or application)
    Scope The scope of this project in the organizational tree. Click the name of the scope to view the organizational tree diagram; you can only view the parts of the organizational tree for which you have permission.
    Role The role assigned to the subject, in this project’s scope
    Authorized by The user who granted the access rule
    Last updated The last time the access rule was updated

    Workloads associated with the project

    Click one of the values in the Workload(s) column to view the list of workloads and their parameters.

    Column Description
    Workload The name of the workload, given during its submission. Optionally, an icon describing the type of workload is also visible
    Type The type of the workload, e.g. Workspace, Training, Inference
    Status The state of the workload and time elapsed since the last status change
    Created by The subject that created this workload
    Running/ requested pods The number of running pods out of the number of requested pods for this workload. For example, a distributed workload requesting 4 pods may be in a state where only 2 are running and 2 are pending
    Creation time The date and time the workload was created
    GPU compute request The amount of GPU compute requested (floating number, represents either a portion of the GPU compute, or the number of whole GPUs requested)
    GPU memory request The amount of GPU memory requested (floating number, can either be presented as a portion of the GPU memory, an absolute memory size in MB or GB, or a MIG profile)
    CPU memory request The amount of CPU memory requested (floating number, presented as an absolute memory size in MB or GB)
    CPU compute request The amount of CPU compute requested (floating number, represents the number of requested Cores)

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.

    Adding a new project

    To create a new Project:

    1. Click +NEW PROJECT
    2. Select a scope. You can only view clusters for which you have permission, within the scope of the roles assigned to you
    3. Enter a name for the project
      Project names must start with a letter and can only contain lower case Latin letters, numbers or a hyphen ('-')
    4. Namespace associated with Project
      Each project has an associated (Kubernetes) namespace in the cluster.
      All workloads under this project use this namespace.
      a. By default, Run:ai creates a namespace based on the Project name (in the form of runai-<name>)
      b. Alternatively, you can choose an existing namespace created for you by the cluster administrator
    5. In the Quota management section, you can set the quota parameters and prioritize resources

      • Order of priority
        This column is displayed only if more than one node pool exists. It sets the default order in which the Scheduler uses node pools to schedule a workload: the Scheduler first tries to allocate resources using the highest-priority node pool, then the next in priority, until it reaches the lowest-priority node pool in the list, and then starts again from the highest priority. The Scheduler uses the Project’s list of prioritized node pools only if the order of priority of node pools is not set in the workload during submission (either by an admin policy or by the user). An empty value means the node pool is not part of the Project’s default node pool priority list, but the node pool can still be chosen by an admin policy or a user during workload submission
      • Node pool
        This column is displayed only if more than one node pool exists. It represents the name of the node pool.
      • GPU devices
        The number of GPUs you want to allocate for this project in this node pool (decimal number).
      • CPUs (Cores)
        This column is displayed only if CPU quota is enabled via the General settings.
        Represents the number of CPU cores you want to allocate for this project in this node pool (decimal number).
      • CPU memory
        This column is displayed only if CPU quota is enabled via the General settings.
        The amount of CPU memory you want to allocate for this project in this node pool (in Megabytes or Gigabytes).

      • Over quota / Over quota weight - If over-quota weight is enabled via the General settings then over-quota weight is presented, otherwise over-quota is presented

        • Over quota
          When enabled, the project can use non-guaranteed overage resources above its quota in this node pool. The amount of the non-guaranteed overage resources for this project is calculated proportionally to the project quota in this node pool. When disabled, the project cannot use more resources than the guaranteed quota in this node pool.
        • Over quota weight - Represents a weight used to calculate the amount of non-guaranteed overage resources a project can get on top of its quota in this node pool. All unused resources are split between projects that require the use of overage resources:
          • Medium
            The default value. The Admin can change the default to any of the following values: High, Low, Lowest, or None.
          • None
            When set, the project cannot use more resources than the guaranteed quota in this node pool.
          • Lowest
            Over-quota weight ‘Lowest’ has a unique behavior: because its weight is 0, a project with this setting can only use over-quota (unused overage) resources if no other project needs them, and any project with a higher over-quota weight can take those overage resources at any time.

    Note

    Setting the quota to 0 (either GPU, CPU, or CPU memory) and the over-quota to ‘disabled’ or over-quota weight to ‘none’ means the project is blocked from using those resources on this node pool.

    When no node pools are configured, you can set the same parameters, but they apply to the whole project instead of per node pool.

    After node pools are created, you can set the above parameters for each node-pool separately.

    6. Set Scheduling rules as required. You can have a scheduling rule for:

      • Idle GPU timeout
        Preempt a workload that does not use GPUs for more than a specified duration. You can apply a single rule per workload type - Preemptive Workspaces, Non-preemptive Workspaces, and Training.

      Note

      To make ‘Idle GPU timeout’ effective, it must be set to a shorter duration than the workload duration of the same workload type.

      • Workspace duration
        Preempt workspaces after a specified duration. This applies to both preemptive and non-preemptive Workspaces.
      • Training duration
        Preempt a training workload after a specified duration.
      • Node type (Affinity)
        Node type is used to select a group of nodes, usually with specific characteristics such as a hardware feature, storage type, fast networking interconnection, etc. The scheduler uses node type as an indication of which nodes should be used for your workloads, within this project.
        Node type is a label in the form of run.ai/type and a value (e.g. run.ai/type = dgx200) that the administrator uses to tag a set of nodes. Adding the node type to the project’s scheduling rules enables the user to submit workloads with any node type label/value pairs in this list, according to the workload type - Workspace or Training. The Scheduler then schedules workloads using a node selector, targeting nodes tagged with the Run:ai node type label/value pair. Node pools and a node type can be used in conjunction with each other. For example, specifying a node pool and a smaller group of nodes from that node pool that includes a fast SSD memory or other unique characteristics.
    7. Click CREATE PROJECT
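
    As a quick sanity check after creating the project, you can list the projects visible to you from a terminal using the Run:ai CLI (assuming the CLI is installed and you are logged in); the exact columns shown depend on your version.

      # List the projects in the currently selected cluster
      runai project list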

    Adding an access rule to a project

    To create a new access rule for a project:

    1. Select the project you want to add an access rule for
    2. Click ACCESS RULES
    3. Click +ACCESS RULE
    4. Select a subject
    5. Select or enter the subject identifier:
      • User Email for a local user created in Run:ai or for an SSO user, as recognized by the IDP
      • Group name as recognized by the IDP
      • Application name as created in Run:ai
    6. Select a role
    7. Click SAVE RULE
    8. Click CLOSE

    Deleting an access rule from a project

    To delete an access rule from a project:

    1. Select the project you want to remove an access rule from
    2. Click ACCESS RULES
    3. Find the access rule you want to delete
    4. Click on the trash icon
    5. Click CLOSE

    Editing a project

    To edit a project:

    1. Select the project you want to edit
    2. Click EDIT
    3. Update the Project and click SAVE

    Viewing a project’s policy

    To view the policy of a project:

    1. Select the project for which you want to view its policies. This option is only active for projects with defined policies in place.
    2. Click VIEW POLICY and select the workload type for which you want to view the policies:
      a. Workspace workload type policy with its set of rules
      b. Training workload type policies with its set of rules
    3. In the Policy form, view the workload rules that are enforced on your project for the selected workload type, as well as the defaults:
      • Parameter - The workload submission parameter that the Rules and Defaults are applied to
      • Type (applicable for data sources only) - The data source type (Git, S3, NFS, PVC, etc.)
      • Default - The default value of the Parameter
      • Rule - Set up constraints on workload policy fields
      • Source - The origin of the applied policy (cluster, department or project)

    Note

    The policy affecting the project consists of rules and defaults. Some of these rules and defaults may be derived from policies of a parent cluster and/or department (source). You can see the source of each rule in the policy form.

    Deleting a project

    To delete a project:

    1. Select the project you want to delete
    2. Click DELETE
    3. On the dialog, click DELETE to confirm

    Note

    • Clusters < v2.20

      Deleting a project does not delete its associated namespace, any of the running workloads using this namespace, or the policies defined for this project. However, any assets created in the scope of this project such as compute resources, environments, data sources, templates and credentials, are permanently deleted from the system.

    • Clusters >=v2.20

      Deleting a project does not delete its associated namespace, but will attempt to delete its associated workloads and assets. Any assets created in the scope of this project such as compute resources, environments, data sources, templates and credentials, are permanently deleted from the system.

    Using API

    Go to the Projects API reference to view the available actions
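
    As with departments, a minimal sketch of listing projects over the REST API from a terminal is shown below. The endpoint path is an assumption for illustration only - confirm it against the Projects API reference - and $TOKEN is a bearer token obtained as in the Departments sketch above.

      # Hypothetical sketch: list projects (endpoint path is an assumption)
      curl -s "https://<company>.run.ai/api/v1/org-unit/projects" \
        -H "Authorization: Bearer $TOKEN"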

    \ No newline at end of file diff --git a/v2.20/platform-admin/aiinitiatives/org/scheduling-rules/index.html b/v2.20/platform-admin/aiinitiatives/org/scheduling-rules/index.html index f9d477c5c9..2f946b0a62 100644 --- a/v2.20/platform-admin/aiinitiatives/org/scheduling-rules/index.html +++ b/v2.20/platform-admin/aiinitiatives/org/scheduling-rules/index.html @@ -1,4 +1,4 @@ - Scheduling Rules - Run:ai Documentation Library

    Scheduling Rules

    This article explains the procedure for configuring and managing scheduling rules. Scheduling rules are restrictions applied to workloads. These restrictions apply either to the resources (nodes) on which workloads can run or to the duration of the workload’s run time. Scheduling rules are set for projects or departments and apply to a specific workload type. Once scheduling rules are set, all matching workloads associated with the project (or with its subordinate projects, in the case of a department) are subject to the restrictions defined at the time the workload was submitted. Newly added scheduling rules are not applied to workloads that were already created under that project/department.

    Scheduling Rules

    There are 3 types of scheduling rules:

    Workload duration (time limit)

    This rule limits the duration of a workload run time. Workload run time is calculated as the total time in which the workload was in status Running. You can apply a single rule per workload type - Preemptive Workspaces, Non-preemptive Workspaces, and Training.

    Idle GPU time limit

    This rule limits the total time a workload’s allocated GPUs can remain idle. Workload idle time is counted from the first time the workload is in status Running while the GPU is idle.
    Idleness is calculated using the runai_gpu_idle_seconds_per_workload metric. This metric measures the total duration of zero GPU utilization within each 30-second interval. If the GPU remains idle throughout the 30-second window, 30 seconds are added to the idleness sum; otherwise, the idleness count is reset. For example, a GPU that stays idle for ten consecutive 30-second windows accumulates 5 minutes of idle time.
    You can apply a single rule per workload type - Preemptible Workspaces, Non-preemptible Workspaces, and Training.

    Note

    To make Idle GPU timeout effective, it must be set to a shorter duration than the workload duration of the same workload type.

    Node type (Affinity)

    Node type is used to select a group of nodes, typically with specific characteristics such as a hardware feature, storage type, fast networking interconnection, etc. The scheduler uses node type as an indication of which nodes should be used for your workloads, within this project.

    Node type is a label in the form of run.ai/type and a value (e.g. run.ai/type = dgx200) that the administrator uses to tag a set of nodes. Adding the node type to the project’s scheduling rules requires the user to submit workloads with a node type label/value pair from this list, according to the workload type - Workspace or Training. The Scheduler then schedules workloads using a node selector, targeting nodes tagged with the Run:ai node type label/value pair. Node pools and a node type can be used in conjunction. For example, you can specify a node pool and, within it, a smaller group of nodes that includes fast SSD storage or other unique characteristics.

    Labelling nodes for node types grouping

    The administrator should use a node label with the key run.ai/type and any value of their choice.

    To assign a label to nodes you want to group, set the ‘node type (affinity)’ on each relevant node:

    1. Obtain the list of nodes and their current labels by copying the following into your terminal:

      kubectl get nodes --show-labels
       

    2. Apply the label to a specific node by copying the following into your terminal:

      kubectl label node <node-name> run.ai/type=<value>
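
      To verify which nodes carry a given node type label, you can filter with a standard kubectl label selector:

      kubectl get nodes -l run.ai/type=<value>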

    Adding a scheduling rule to a project/department

    To add a scheduling rule:

    1. Select the project/department for which you want to add a scheduling rule
    2. Click EDIT
    3. In the Scheduling rules section click +RULE
    4. Select the rule type
    5. Select the workload type and time limitation period
    6. For Node type, choose one or more labels for the desired nodes.
    7. Click SAVE

    Note

    You can review the defined rules in the Projects table in the relevant column.

    Editing the project/department scheduling rule

    To edit a scheduling rule:

    1. Select the project/department for which you want to edit its scheduling rule
    2. Click EDIT
    3. Find the scheduling rule you would like to edit
    4. Edit the rule
    5. Click SAVE

    Note

    When editing an inherited rule on a project/department (a rule defined by the department), you can only restrict the rule’s limitation

    Deleting the project/department scheduling rule

    To delete a scheduling rule:

    1. Select the project/department from which you want to delete a scheduling rule
    2. Click EDIT
    3. Find the scheduling rule you would like to delete
    4. Click on the x icon
    5. Click SAVE

    Note

    You cannot delete rules inherited from the department from the project's set of rules

    Using API

    Go to the Projects API reference to view the available actions

    \ No newline at end of file diff --git a/v2.20/platform-admin/aiinitiatives/overview/index.html b/v2.20/platform-admin/aiinitiatives/overview/index.html index b90704dfd1..14c9fde707 100644 --- a/v2.20/platform-admin/aiinitiatives/overview/index.html +++ b/v2.20/platform-admin/aiinitiatives/overview/index.html @@ -1,4 +1,4 @@ - Adapting AI initiatives to your organization - Run:ai Documentation Library

    Adapting AI initiatives to your organization

    AI initiatives refer to advancing research, development, and implementation of AI technologies. These initiatives represent your business needs and involve collaboration between individuals, teams, and other stakeholders. AI initiatives require compute resources and a methodology to effectively and efficiently use those compute resources and split them among the different AI initiatives stakeholders. The building blocks of AI compute resources are GPUs, CPUs, and CPU memory, which are built into nodes (servers) and can be further grouped into node pools. Nodes and node pools are part of a Kubernetes Cluster.

    To manage AI initiatives in Run:ai you should:

    • Map your organization and initiatives to projects and optionally departments
    • Map compute resources (node pools and quotas) to projects and optionally departments
    • Assign users (e.g. AI practitioners, ML engineers, Admins) to projects and departments

    Mapping your organization

    The way you map your AI initiatives and organization into Run:ai projects and departments should reflect your organization’s structure and Project management practices. There are multiple options, and we provide you here with 3 examples of typical forms in which to map your organization, initiatives, and users into Run:ai, but of course, other ways that suit your requirements are also acceptable.

    Based on individuals

    A typical use case would be students (individual practitioners) within a faculty (business unit) - an individual practitioner may be involved in one or more initiatives. In this example, the resources are accounted for by the student (project) and aggregated per faculty (department).
    Department = business unit / Project = individual practitioner

    Based on business units

    A typical use case would be an AI service (business unit) split into AI capabilities (initiatives) - an individual practitioner may be involved in several initiatives. In this example, the resources are accounted for by Initiative (project) and aggregated per AI service (department).

    Department = business unit / Project = initiative

    Based on the organizational structure

    A typical use case would be a business unit split into teams - an individual practitioner is involved in a single team (project) but the team may be involved in several AI initiatives. In this example, the resources are accounted for by team (project) and aggregated per business unit (department).

    Department = business unit / Project = team

    Mapping your resources

    AI initiatives require compute resources such as GPUs and CPUs to run. Compute resources in any organization are limited, whether because the number of servers (nodes) the organization owns is limited, or because the budget for leasing cloud resources or purchasing in-house servers is limited. Every organization strives to optimize the usage of its resources by maximizing their utilization and meeting all users’ needs. Therefore, the organization needs to split resources according to its internal priorities and budget constraints. But even after splitting the resources, the orchestration layer should still provide fairness between the resource consumers and allow access to unused resources, to minimize scenarios of idle resources.

    Another aspect of resource management is how to group your resources effectively, especially in large environments or environments made up of heterogeneous hardware types, where some users need specific hardware types, or where users should avoid occupying hardware that is critical to other users or initiatives.

    Run:ai assists you with all of these complex issues by allowing you to map your cluster resources to node pools, assign each Project and Department a quota allocation per node pool, and set access rights to unused resources (over-quota) per node pool.

    Grouping your resources

    There are several reasons why you would group resources (nodes) into node pools:

    • Control the GPU type to use in a heterogeneous hardware environment - in many cases, AI models are optimized for the hardware type they will use, e.g. a training workload that is optimized for H100 does not necessarily run optimally on an A100, and vice versa. Segmenting into node pools, each with a different hardware type, gives the AI researcher and ML engineer better control of where to run.
    • Quota control - splitting into node pools allows the admin to set a specific quota per hardware type, e.g. give a high-priority project guaranteed access to advanced GPU hardware, while keeping a lower-priority project with a lower quota, or even with no quota at all for that high-end GPU, giving it “best-effort” access only (i.e. it can use those resources only when the high-priority, guaranteed project is not using them).
    • Multi-region or multi-availability-zone cloud environments - if some or all of your clusters run on the cloud (or even on-premise) and any of your clusters uses different physical locations or different topologies (e.g. racks), you probably want to segment your resources per region/zone/topology to control where your workloads run and how much quota to assign to specific environments (per project, per department), even if all those locations use the same hardware type. This methodology can help optimize the performance of your workloads thanks to the superior performance of local computing, such as the locality of distributed workloads, local storage, etc.
    • Explainability and predictability - large environments are complex to understand, and this becomes even harder when the environment is heavily loaded. To maintain users’ satisfaction and their understanding of the resources’ state, as well as to keep the chances of your workloads getting scheduled predictable, segmenting your cluster into smaller pools may help significantly.
    • Scale - Run:ai’s implementation of node pools has many benefits, one of the main ones being scale. Each node pool has its own scheduler instance, allowing the cluster to handle more nodes and schedule workloads faster when segmented into node pools vs. one large cluster. To allow your workloads to use any resource within a cluster that is split into node pools, a second-level Scheduler is in charge of scheduling workloads between node pools according to your preferences and resource availability.
    • Prevent mutual exclusion - some AI workloads consume CPU-only resources. To prevent those workloads from consuming the CPU resources of GPU nodes and thus blocking GPU workloads from using those nodes, it is recommended to group CPU-only nodes into dedicated node pool(s) and assign quota for CPU projects to CPU node pools only, while keeping GPU node pools with zero quota and, optionally, “best-effort” over-quota access for CPU-only projects.

    Grouping Examples

    Set out below are illustrations of different grouping options.

    Example: grouping nodes by topology

    Example: grouping nodes by hardware type
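
    In addition to the grouping examples above, you can inspect the node pools that currently exist in a cluster from the Run:ai CLI (assuming the CLI is installed and you are logged in); the exact columns shown depend on your version.

      # List the node pools in the currently selected cluster
      runai nodepool list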

    Assigning your resources

    After the initial grouping of resources, it is time to associate resources to AI initiatives, this is performed by assigning quotas to projects and optionally to departments. Assigning GPU quota to a project, on a node pool basis, means that the workloads submitted by that project are entitled to use those GPUs as guaranteed resources and can use them for all workload types.

    However, what happens if the project requires more resources than its quota? This depends on the type of workloads that the user wants to submit. If the user requires more resources for non-preemptible workloads, then the quota must be increased, because non-preemptible workloads require guaranteed resources. On the other hand, if the type of workload is, for example, a model Training workload that is preemptible - in this case the project can exploit unused resources of other projects, as long as the other projects don’t need them. Over-quota is set per project on a node-pool basis and per department.

    Administrators can use quota allocations to prioritize resources between users, teams, and AI initiatives. The administrator can completely prevent the use of certain node pools by a project or department by setting the node pool quota to 0 and disabling over-quota for that node pool, or keep the quota at 0 and enable over-quota for that node pool, allowing access based on resource availability only (e.g. unused GPUs). However, when a project with a non-zero quota needs to use those resources, the Scheduler reclaims them and preempts the preemptible workloads of over-quota projects. As an administrator, you can also influence the amount of over-quota resources a project or department uses.

    It is essential to make sure that the sum of all projects' quotas does NOT surpass that of the department, and that the sum of all departments' quotas does not surpass the number of physical resources, per node pool and for the entire cluster (such behavior is called ‘over-subscription’). Over-subscription is not recommended because it may produce unexpected scheduling decisions, especially ones that preempt ‘non-preemptible’ workloads or fail to schedule workloads that are within quota (whether non-preemptible or preemptible), so quota can no longer be considered ‘guaranteed’. Admins can opt in to a system flag that helps prevent over-subscription scenarios.
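
    As a hypothetical illustration of the arithmetic (the numbers are made up): if a department has a GPU quota of 10 on a given node pool, its projects’ quotas on that node pool should sum to at most 10 (for example 6 + 4 = 10); and if the cluster has 16 GPUs in that node pool, the departments’ quotas there should sum to at most 16. Setting project quotas of 6 + 6 = 12 under a department quota of 10 would be over-subscription, and the quota could no longer be treated as guaranteed.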

    Example: assigning resources to projects

    Assigning users to projects and departments

    The Run:ai system uses ‘Role Based Access Control’ (RBAC) to manage users’ access rights to the different objects of the system, its resources, and the set of allowed actions.
    To allow AI researchers, ML engineers, Project Admins, or any other stakeholder of your AI initiatives to access projects and use AI compute resources for their AI initiatives, the administrator needs to assign users to projects. After a user is assigned to a project with the proper role, e.g. ‘L1 Researcher’, the user can submit and monitor their workloads under that project. Assigning users to departments is usually done to assign a ‘Department Admin’ to manage a specific department. Other roles, such as ‘L1 Researcher’, can also be assigned to departments; this gives the researcher access to all projects within that department.

    Scopes in the organization

    This is an example of an organization, as represented in the Run:ai platform:

    The organizational tree is structured from top down under a single node headed by the account. The account is comprised of clusters, departments and projects.

    Note

    Different roles and permissions can be granted to specific clusters, departments and projects within an organization.


    After mapping and building your hierarchically structured organization as shown above, you can assign or associate various Run:ai components (e.g. workloads, roles, assets, policies, and more) to different parts of the organization - these organizational parts are the Scopes. The following organizational example consists of 5 optional scopes:

    Note

    When a scope is selected, the unit itself and all of its subordinates (both existing and any added in the future) are selected as well.

    Next Steps

    Now that resources are grouped into node pools, organizational units or business initiatives are mapped into projects and departments, projects’ quota parameters are set per node pool, and users are assigned to projects, you can finally submit workloads from a project and use compute resources to run your AI initiatives.

    \ No newline at end of file diff --git a/v2.20/platform-admin/aiinitiatives/resources/configuring-mig-profiles/index.html b/v2.20/platform-admin/aiinitiatives/resources/configuring-mig-profiles/index.html index 6067659f2f..dba7dc95a4 100644 --- a/v2.20/platform-admin/aiinitiatives/resources/configuring-mig-profiles/index.html +++ b/v2.20/platform-admin/aiinitiatives/resources/configuring-mig-profiles/index.html @@ -1,4 +1,4 @@ - Configuring NVIDIA MIG Profiles - Run:ai Documentation Library

    Configuring NVIDIA MIG Profiles

    NVIDIA’s Multi-Instance GPU (MIG) enables splitting a GPU into multiple logical GPU devices, each with its own memory and compute portion of the physical GPU.

    NVIDIA provides two MIG strategies for splitting GPUs:

    • Single - A GPU can be divided evenly. This means all MIG profiles are the same.
    • Mixed - A GPU can be divided into different profiles.

    The Run:ai platform supports running workloads using NVIDIA MIG. Administrators can set the Kubernetes nodes to their preferred MIG strategy and configure the appropriate MIG profiles for researchers and MLOps engineers to use.

    This guide explains how to configure MIG in each strategy to submit workloads. It also outlines the individual implications of each strategy and best practices for administrators.

    Note

    • Starting from v2.19, the Dynamic MIG feature began a deprecation process and is no longer supported. With Dynamic MIG, the Run:ai platform automatically configured MIG profiles according to on-demand user requests for different MIG profiles or memory fractions.
    • GPU fractions and memory fractions are not supported with MIG profiles.
    • Single strategy supports both Run:ai and third-party workloads. Using mixed strategy can only be done using third-party workloads. For more details on Run:ai and third-party workloads, see Introduction to workloads.

    Before you start

    To use MIG single and mixed strategy effectively, make sure to familiarize yourself with the following NVIDIA resources:

    Configuring single MIG strategy

    When deploying MIG using single strategy, all GPUs within a node are configured with the same profile. For example, a node might have GPUs configured with 3 MIG slices of profile type 1g.20gb, or 7 MIG slices of profile 1g.10gb. With this strategy, MIG profiles are displayed as whole GPU devices by CUDA.

    The Run:ai platform discovers these MIG profiles as whole GPU devices as well, ensuring MIG devices are transparent to the end-user (practitioner). For example, a node that consists of 8 physical GPUs, each split into 3×2g.20gb MIG slices, is discovered by the Run:ai platform as a node with 24 GPU devices.

    Users can submit workloads by requesting a specific number of GPU devices (X GPU) and Run:ai will allocate X MIG slices (logical devices). The Run:ai platform deducts X GPUs from the workload’s Project quota, regardless of whether this ‘logical GPU’ represents 1/3 of a physical GPU device or 1/7 of a physical GPU device.
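    For illustration, a plain Kubernetes pod spec under single strategy simply requests a number of GPU devices; on the node described above, each requested device resolves to one 2g.20gb MIG slice. This is a minimal sketch (pod name and image are placeholders, and it is not the Run:ai submission flow):

      kubectl apply -f - <<EOF
      apiVersion: v1
      kind: Pod
      metadata:
        name: mig-single-example                 # placeholder name
      spec:
        restartPolicy: Never
        containers:
        - name: cuda
          image: your-registry/your-cuda-image   # placeholder image
          command: ["nvidia-smi", "-L"]
          resources:
            limits:
              nvidia.com/gpu: 2   # 2 MIG slices are allocated and deducted as 2 GPUs from the Project quota
      EOF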

    Configuring mixed MIG strategy

    When deploying MIG using mixed strategy, each GPU in a node can be configured with a different combination of MIG profiles such as 2×2g.20gb and 3×1g.10gb. For details on supported combinations per GPU type, refer to Supported MIG Profiles.

    In mixed strategy, physical GPU devices continue to be displayed as physical GPU devices by CUDA, and each MIG profile is shown individually. The Run:ai platform identifies the physical GPU devices normally; however, MIG profiles are not visible in the UI or node APIs.

    When submitting third-party workloads with this strategy, the user should explicitly specify the exact requested MIG profile (for example, nvidia.com/gpu.product: A100-SXM4-40GB-MIG-3g.20gb). The Run:ai Scheduler finds a node that can provide this specific profile and binds it to the workload.

    A third-party workload submitted with a MIG profile of type Xg.Ygb (e.g. 3g.40gb or 2g.20gb) is considered as consuming X GPUs. These X GPUs will be deducted from the workload’s project quota of GPUs. For example, a 3g.40gb profile deducts 3 GPUs from the associated Project’s quota, while 2g.20gb deducts 2 GPUs from the associated Project’s quota. This is done to maintain a logical ratio according to the characteristics of the MIG profile.
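    As a hedged sketch of such a third-party submission (pod name and image are placeholders; the assumption here is that the NVIDIA device plugin in mixed strategy exposes MIG slices as named resources such as nvidia.com/mig-3g.20gb):

      kubectl apply -f - <<EOF
      apiVersion: v1
      kind: Pod
      metadata:
        name: mig-mixed-example                  # placeholder name
      spec:
        restartPolicy: Never
        nodeSelector:
          nvidia.com/gpu.product: A100-SXM4-40GB-MIG-3g.20gb   # the profile label from the example above
        containers:
        - name: cuda
          image: your-registry/your-cuda-image   # placeholder image
          command: ["nvidia-smi", "-L"]
          resources:
            limits:
              nvidia.com/mig-3g.20gb: 1   # assumed resource name under mixed strategy; deducted as 3 GPUs from quota
      EOF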

    Best practices for administrators

    Single strategy

    • Configure proper and uniform sizes of MIG slices (profiles) across all GPUs within a node.
    • Set the same MIG profiles on all nodes of a single node pool.
    • Create separate node pools with different MIG profile configurations allowing users to select the pool that best matches their workloads’ needs.
    • Ensure Project quotas are allocated according to the MIG profile sizes.

    Mixed strategy

    • Use mixed strategy with workloads that require diverse resources. Make sure to evaluate the workload requirements and plan accordingly.
    • Configure individual MIG profiles on each node by using a limited set of MIG profile combinations to minimize complexity. Make sure to evaluate your requirements and node configurations.
    • Ensure Project quotas are allocated according to the MIG profile sizes.

    Note

    Since MIG slices are a fixed size, once configured, changing MIG profiles requires administrative intervention.
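    If MIG is managed through the NVIDIA GPU Operator’s MIG Manager, that intervention is typically a node relabel followed by an automatic reconfiguration. A hedged example (the node name is hypothetical, and the profile name must exist in your mig-parted configuration for your GPU model):

      # Request a uniform 1g.10gb layout on one node; 'all-1g.10gb' is only an example config name.
      kubectl label node node-1 nvidia.com/mig.config=all-1g.10gb --overwrite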

    \ No newline at end of file diff --git a/v2.20/platform-admin/aiinitiatives/resources/node-pools/index.html b/v2.20/platform-admin/aiinitiatives/resources/node-pools/index.html index 3abff422a9..556765cc6a 100644 --- a/v2.20/platform-admin/aiinitiatives/resources/node-pools/index.html +++ b/v2.20/platform-admin/aiinitiatives/resources/node-pools/index.html @@ -1,4 +1,4 @@ - Node Pools - Run:ai Documentation Library

    Node Pools

    This article explains the procedure for managing Node pools.

    Node pools assist in managing heterogeneous resources effectively. A node pool is a Run:ai construct representing a set of nodes grouped into a bucket of resources using a predefined node label (e.g. NVIDIA GPU type) or an administrator-defined node label (any key/value pair).

    Typically, the grouped nodes share a common feature or property, such as GPU type or other HW capability (such as Infiniband connectivity), or represent a proximity group (i.e. nodes interconnected via a local ultra-fast switch). Researchers and ML Engineers would typically use node pools to run specific workloads on specific resource types.

    Platform administrators can create, view, edit, and delete node pools. Creating a new node pool creates a new instance of the Run:ai scheduler; workloads submitted to a node pool are scheduled using the node pool’s designated scheduler instance.

    Once a new node pool is created, it is automatically assigned to all Projects and Departments with a quota of zero GPU resources, unlimited CPU resources, and over-quota enabled (Medium priority if over-quota weight is enabled). This allows any Project and Department to use any node pool when over-quota is enabled, even if the administrator has not assigned a quota for a specific node pool in a Project or Department.

    Workloads can be submitted using a prioritized list of node pools. The node pool selector picks one node pool at a time (according to the prioritized list), and the designated node pool scheduler instance handles the submission request and tries to match the requested resources within that node pool. If the scheduler cannot find resources to satisfy the submitted workload, the node pool selector moves the request to the next node pool in the prioritized list. If no node pool satisfies the request, the node pool selector starts from the first node pool again until one of the node pools satisfies the request.

    Node pools table

    The Node pools table can be found under Resources in the Run:ai platform.

    The Node pools table lists all the node pools defined in the Run:ai platform and allows you to manage them.

    Note

    By default, the Run:ai platform includes a single node pool named ‘default’. When no other node pool is defined, all existing and new nodes are associated with the ‘default’ node pool. When deleting a node pool, if no other node pool matches any of the nodes’ labels, the node will be included in the default node pool.

    The Node pools table consists of the following columns:

    Column Description
    Node pool The node pool name, set by the administrator during its creation (the node pool name cannot be changed after its creation).
    Status Node pool status. A ‘Ready’ status means the scheduler can use this node pool to schedule workloads. ‘Empty’ status means no nodes are currently included in that node pool.
    Label key / Label value The node pool controller will use this node-label key-value pair to match nodes into this node pool.
    Node(s) List of nodes included in this node pool. Click the field to view details (the details are in the Nodes article).
    GPU devices The total number of GPU devices installed into nodes included in this node pool. For example, a node pool that includes 12 nodes each with 8 GPU devices would show a total number of 96 GPU devices.
    GPU memory The total amount of GPU memory installed in nodes included in this node pool. For example, a node pool that includes 12 nodes, each with 8 GPU devices, and each device with 80 GB of memory would show a total memory amount of 7.68 TB.
    Allocated GPUs The total allocation of GPU devices in units of GPUs (decimal number). For example, if 3 GPUs are 50% allocated, the field prints out the value 1.50. This value represents the portion of GPU memory consumed by all running pods using this node pool. ‘Allocated GPUs’ can be larger than ‘Projects’ GPU quota’ if over-quota is used by workloads, but not larger than GPU devices.
    GPU resource optimization ratio Shows the Node Level Scheduler mode.
    CPUs (Cores) The number of CPU cores installed on nodes included in this node pool
    CPU memory The total amount of CPU memory installed on nodes included in this node pool
    Allocated CPUs (Cores) The total allocation of CPU compute in units of Cores (decimal number). This value represents the amount of CPU cores consumed by all running pods using this node pool. ‘Allocated CPUs’ can be larger than ‘Projects’ CPU quota’ if over-quota is used by workloads, but not larger than CPUs (Cores).
    Allocated CPU memory The total allocation of CPU memory in units of TB/GB/MB (decimal number). This value represents the amount of CPU memory consumed by all running pods using this node pool. ‘Allocated CPU memory’ can be larger than ‘Projects’ CPU memory quota’ if over-quota is used by workloads, but not larger than CPU memory.
    GPU placement strategy Sets the Scheduler strategy for the assignment of pods requesting both GPU and CPU resources to nodes, which can be either Bin-pack or Spread. By default, Bin-pack is used, but this can be changed to Spread by editing the node pool. When set to Bin-pack, the scheduler tries to fill nodes as much as possible before using empty or sparse nodes; when set to Spread, the scheduler tries to keep nodes as sparse as possible by spreading workloads across as many nodes as possible.
    CPU placement strategy Sets the Scheduler strategy for the assignment of pods requesting only CPU resources to nodes, which can be either Bin-pack or Spread. By default, Bin-pack is used, but this can be changed to Spread by editing the node pool. When set to Bin-pack, the scheduler tries to fill nodes as much as possible before using empty or sparse nodes; when set to Spread, the scheduler tries to keep nodes as sparse as possible by spreading workloads across as many nodes as possible.
    Last update The date and time when the node pool was last updated
    Creation time The date and time when the node pool was created
    Workload(s) List of workloads running on nodes included in this node pool, click the field to view details (described below in this article)

    Workloads associated with the node pool

    Click one of the values in the Workload(s) column, to view the list of workloads and their parameters.

    Note

    This column is only viewable if your role in the Run:ai platform gives you read access to workloads. Even if you are allowed to view workloads, you can only view the workloads within your allowed scope. This means there might be more workloads running in this node pool than appear in the list you are viewing.

    Column Description
    Workload The name of the workload. If the workload’s type is one of the recognized types (for example: PyTorch, MPI, Jupyter, Ray, Spark, Kubeflow, and many more), an appropriate icon is displayed.
    Type The Run:ai platform type of the workload - Workspace, Training, or Inference
    Status The state of the workload. The Workloads state is described in the ‘Run:ai Workloads’ article.
    Created by The user or application that created this workload
    Running/requested pods The number of running pods out of the number of requested pods within this workload.
    Creation time The workload’s creation date and time
    Allocated GPU compute The total amount of GPU compute allocated by this workload. A workload with 3 Pods, each allocating 0.5 GPU, will show a value of 1.5 GPUs for the workload.
    Allocated GPU memory The total amount of GPU memory allocated by this workload. A workload with 3 Pods, each allocating 20GB, will show a value of 60 GB for the workload.
    Allocated CPU compute (cores) The total amount of CPU compute allocated by this workload. A workload with 3 Pods, each allocating 0.5 Core, will show a value of 1.5 Cores for the workload.
    Allocated CPU memory The total amount of CPU memory allocated by this workload. A workload with 3 Pods, each allocating 5 GB of CPU memory, will show a value of 15 GB of CPU memory for the workload.

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
    • Show/Hide details - Click to view additional information on the selected row

    Show/Hide details

    Select a row in the Node pools table and then click Show details in the upper-right corner of the action bar. The details window appears, presenting metrics graphs for the whole node pool:

    • Node GPU allocation - This graph shows an overall sum of the Allocated, Unallocated, and Total number of GPUs for this node pool, over time. From observing this graph, you can learn about the occupancy of GPUs in this node pool, over time.

    • GPU Utilization Distribution - This graph shows the distribution of GPU utilization in this node pool over time. Observing this graph, you can learn how many GPUs are utilized up to 25%, 25%-50%, 50%-75%, and 75%-100%. This information helps to understand how many available resources you have in this node pool, and how well those resources are utilized by comparing the allocation graph to the utilization graphs, over time.

    • GPU Utilization - This graph shows the average GPU utilization in this node pool over time. Comparing this graph with the GPU Utilization Distribution helps to understand the actual distribution of GPU occupancy over time.

    • GPU Memory Utilization - This graph shows the average GPU memory utilization in this node pool over time, for example an average of all nodes’ GPU memory utilization over time.

    • CPU Utilization - This graph shows the average CPU utilization in this node pool over time, for example, an average of all nodes’ CPU utilization over time.

    • CPU Memory Utilization - This graph shows the average CPU memory utilization in this node pool over time, for example an average of all nodes’ CPU memory utilization over time.

    Adding a new node pool

    To create a new node pool:

    1. Click +NEW NODE POOL
    2. Enter a name for the node pool.
      Node pool names must start with a letter and can only contain lowercase Latin letters, numbers, or a hyphen ('-')
    3. Enter the node pool label:
      The node pool controller will use this node-label key-value pair to match nodes into this node pool.

      • Key is the unique identifier of a node label.

        • The key must fit the following regular expression: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?/?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$
        • The administrator can use an automatically preset label such as nvidia.com/gpu.product, which labels the GPU type, or any other key from a node label.
      • Value is the value of that label identifier (key). The same key may have different values; in this case, they are
        considered different labels.

        • Value must fit the following regular expression: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$
      • A node pool is defined by a single key-value pair. Do not use different labels that are set on the same node by
        different node pools; this situation may lead to unexpected results.
    4. Set the GPU placement strategy:

      • Bin-pack - Place as many workloads as possible in each GPU and node to use fewer resources and maximize GPU and node vacancy.
      • Spread - Spread workloads across as many GPUs and nodes as possible to minimize the load and maximize the available resources per workload.
      • GPU workloads are workloads that request both GPU and CPU resources
    5. Set the CPU placement strategy:

      • Bin-pack - Place as many workloads as possible in each CPU and node to use fewer resources and maximize CPU and node vacancy.
      • Spread - Spread workloads across as many CPUs and nodes as possible to minimize the load and maximize the available resources per workload.
      • CPU workloads are workloads that request purely CPU resources
    6. Click CREATE NODE POOL

    Labeling nodes for node-pool grouping:

    The Infrastructure Administrator can use a preset node label such as the nvidia.com/gpu.product that labels the GPU type, or configure any other node label (e.g. faculty=physics).

    To assign a label to nodes you want to group into a node pool, set a node label on each node:

    1. Get the list of nodes and their current labels using the following command:

      kubectl get nodes --show-labels
       

    2. Label a specific node with a new label using the following command:

      kubectl label node <node-name> <key>=<value>
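      For example, using the faculty=physics label mentioned above (the node name is hypothetical):

        kubectl label node node-1 faculty=physics
        # Verify the label before creating a node pool with key 'faculty' and value 'physics'
        kubectl get nodes -l faculty=physics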

    Editing a node pool

    1. Select the node pool you want to edit
    2. Click EDIT
    3. Update the node pool and click SAVE

    Deleting a node pool

    1. Select the node pool you want to delete
    2. Click DELETE
    3. On the dialog, click DELETE to confirm the deletion

    Note

    The default node pool cannot be deleted. When deleting a node pool, if no other node pool matches any of the nodes’ labels, the node will be included in the default node pool.

    Using API

    Go to the Node pools API reference to view the available actions

    \ No newline at end of file diff --git a/v2.20/platform-admin/aiinitiatives/resources/nodes/index.html b/v2.20/platform-admin/aiinitiatives/resources/nodes/index.html index 5f65a093fa..20515758fe 100644 --- a/v2.20/platform-admin/aiinitiatives/resources/nodes/index.html +++ b/v2.20/platform-admin/aiinitiatives/resources/nodes/index.html @@ -1,4 +1,4 @@ - Nodes - Run:ai Documentation Library

    Nodes

    This article explains the procedure for managing Nodes.

    Nodes are Kubernetes elements automatically discovered by the Run:ai platform. Once a node is discovered by the Run:ai platform, an associated instance is created in the Nodes table, administrators can view the node’s relevant information, and the Run:ai scheduler can use the node for scheduling.
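    The same nodes can also be inspected directly in Kubernetes. For example (assuming NVIDIA GPU feature discovery has labeled the nodes with their GPU product):

      # List nodes with their GPU product label shown as an extra column
      kubectl get nodes -L nvidia.com/gpu.product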

    Nodes table

    The Nodes table can be found under Resources in the Run:ai platform.

    The Nodes table displays a list of predefined nodes available to users in the Run:ai platform.

    Note

    • It is not possible to create additional nodes, or edit, or delete existing nodes.
    • Only users with relevant permissions can view the table.

    The Nodes table consists of the following columns:

    Column Description
    Node The Kubernetes name of the node
    Status The state of the node. Nodes in the Ready state are eligible for scheduling. If the state is Not ready, then the main reason appears in parentheses on the right side of the state field. Hovering over the state lists the reasons why a node is Not ready.
    Node pool The name of the associated node pool. By default, every node in the Run:ai platform is associated with the default node pool, if no other node pool is associated
    GPU type The GPU model, for example, H100, or V100
    GPU devices The number of GPU devices installed on the node. Clicking this field pops up a dialog with details per GPU (described below in this article)
    Free GPU devices The current number of fully vacant GPU devices
    GPU memory The total amount of GPU memory installed on this node. For example, if the number is 640GB and the number of GPU devices is 8, then each GPU is installed with 80GB of memory (assuming the node is assembled of homogenous GPU devices)
    Allocated GPUs The total allocation of GPU devices in units of GPUs (decimal number). For example, if 3 GPUs are 50% allocated, the field prints out the value 1.50. This value represents the portion of GPU memory consumed by all running pods using this node
    Used GPU memory The actual amount of memory (in GB or MB) used by pods running on this node.
    GPU compute utilization The average compute utilization of all GPU devices in this node
    GPU memory utilization The average memory utilization of all GPU devices in this node
    CPU (Cores) The number of CPU cores installed on this node
    CPU memory The total amount of CPU memory installed on this node
    Allocated CPU (Cores) The number of CPU cores allocated by pods running on this node (decimal number, e.g. a pod allocating 350 millicores shows an allocation of 0.35 cores).
    Allocated CPU memory The total amount of CPU memory allocated by pods running on this node (in GB or MB)
    Used CPU memory The total amount of actually used CPU memory by pods running on this node. Pods may allocate memory but not use all of it, or go beyond their CPU memory allocation if using Limit > Request for CPU memory (burstable workload)
    CPU compute utilization The utilization of all CPU compute resources on this node (percentage)
    CPU memory utilization The utilization of all CPU memory resources on this node (percentage)
    Used swap CPU memory The amount of CPU memory (in GB or MB) used for GPU swap memory (* future)
    Pod(s) List of pods running on this node, click the field to view details (described below in this article)

    GPU devices for node

    Click one of the values in the GPU devices column, to view the list of GPU devices and their parameters.

    Column Description
    Index The GPU index, read from the GPU hardware. The same index is used when accessing the GPU directly
    Used memory The amount of memory used by pods and drivers using the GPU (in GB or MB)
    Compute utilization The portion of time the GPU is being used by applications (percentage)
    Memory utilization The portion of the GPU memory that is being used by applications (percentage)
    Idle time The elapsed time since the GPU was last used (i.e. the GPU has been idle for ‘Idle time’)
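    Similar per-GPU figures can be read directly on a node with nvidia-smi, for example when run on the node or inside a pod that has access to its GPUs:

      # Per-GPU index, used memory, and compute/memory utilization
      nvidia-smi --query-gpu=index,memory.used,utilization.gpu,utilization.memory --format=csv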

    Pods associated with node

    Click one of the values in the Pod(s) column, to view the list of pods and their parameters.

    Note

    This column is only viewable if your role in the Run:ai platform gives you read access to workloads. Even if you are allowed to view workloads, you can only view the workloads within your allowed scope. This means there might be more pods running on this node than appear in the list you are viewing.

    Column Description
    Pod The Kubernetes name of the pod. Usually the name of the pod is made up of the name of the parent workload, if there is one, and a unique index for that pod instance within the workload
    Status The state of the pod. In a steady state, this should be Running, along with the amount of time the pod has been running
    Project The Run:ai project name the pod belongs to. Clicking this field takes you to the Projects table filtered by this project name
    Workload The workload name the pod belongs to. Clicking this field takes you to the Workloads table filtered by this workload name
    Image The full path of the image used by the main container of this pod
    Creation time The pod’s creation date and time

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
    • Show/Hide details - Click to view additional information on the selected row

    Show/Hide details

    Click a row in the Nodes table and then click the Show details button at the upper right side of the action bar. The details screen appears, presenting the following metrics graphs:

    • GPU utilization
      A per-GPU graph and an average-of-all-GPUs graph, shown on the same chart over an adjustable period, let you see the trends of all GPUs’ compute utilization (percentage of GPU compute) in this node.
    • GPU memory utilization
      A per-GPU graph and an average-of-all-GPUs graph, shown on the same chart over an adjustable period, let you see the trends of all GPUs’ memory usage (percentage of GPU memory) in this node.
    • CPU compute utilization
      The average compute utilization of all CPU cores, shown in a single graph over an adjustable period, lets you see the trend of CPU compute utilization (percentage of CPU compute) in this node.
    • CPU memory utilization
      The utilization of all CPU memory, shown in a single graph over an adjustable period, lets you see the trend of CPU memory utilization (percentage of CPU memory) in this node.
    • CPU memory usage
      The usage of all CPU memory, shown in a single graph over an adjustable period, lets you see the trend of CPU memory usage (in GB or MB of CPU memory) in this node.

    • For GPUs charts - Click the GPU legend on the right-hand side of the chart, to activate or deactivate any of the GPU lines.

    • You can click the date picker to change the presented period
    • You can use your mouse to mark a sub-period in the graph for zooming in, and use the ‘Reset zoom’ button to go back to the preset period
    • Changes in the period affect all graphs on this screen.

    Using API

    Go to the Nodes API reference to view the available actions

    \ No newline at end of file diff --git a/v2.20/platform-admin/authentication/accessrules/index.html b/v2.20/platform-admin/authentication/accessrules/index.html index cb1482eabc..3e78969b3b 100644 --- a/v2.20/platform-admin/authentication/accessrules/index.html +++ b/v2.20/platform-admin/authentication/accessrules/index.html @@ -1,4 +1,4 @@ - Access Rules - Run:ai Documentation Library

    Access Rules

    This article explains the procedure to manage Access rules.

    Access rules provide users, groups, or applications privileges to system entities.

    An access rule is the assignment of a role to a subject in a scope: <Subject> is a <Role> in a <Scope>.

    For example, user user@domain.com is a department admin in department A.

    Access rules table

    The Access rules table can be found under Access in the Run:ai platform.

    The Access rules table provides a list of all the access rules defined in the platform and allows you to manage them.

    Note

    Flexible management

    It is also possible to manage access rules directly for a specific user, application, project, or department.

    The Access rules table consists of the following columns:

    Column Description
    Type The type of subject assigned to the access rule (user, SSO group, or application).
    Subject The user, SSO group, or application assigned with the role
    Role The role assigned to the subject
    Scope The scope to which the subject has access. Click the name of the scope to see the scope and its subordinates
    Authorized by The user who granted the access rule
    Creation time The timestamp for when the rule was created
    Last updated The last time the access rule was updated

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.

    Adding new access rules

    To add a new access rule:

    1. Click +NEW ACCESS RULE
    2. Select a subject User, SSO Group, or Application
    3. Select or enter the subject identifier:
      • User Email for a local user created in Run:ai or for SSO user as recognized by the IDP
      • Group name as recognized by the IDP
      • Application name as created in Run:ai
    4. Select a role
    5. Select a scope
    6. Click SAVE RULE

    Note

    An access rule consists of a single subject with a single role in a single scope. To assign multiple roles or multiple scopes to the same subject, multiple access rules must be added.

    Editing an access rule

    Access rules cannot be edited. To change an access rule, you must delete the rule, and then create a new rule to replace it.

    Deleting an access rule

    1. Select the access rule you want to delete
    2. Click DELETE
    3. On the dialog, click DELETE to confirm the deletion

    Using API

    Go to the Access rules API reference to view the available actions
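
    For orientation, the sketch below shows what creating an access rule through the REST API might look like using Python's requests library. The endpoint path, field names, and values are assumptions for illustration only; the Access rules API reference is the authoritative source.

    ```python
    import requests

    BASE_URL = "https://<company>.run.ai"   # your tenant URL (placeholder)
    TOKEN = "<bearer-token>"                # obtained as described in API authentication

    # Field names below are assumed for illustration; check the API reference.
    payload = {
        "subjectType": "user",
        "subjectId": "user@domain.com",
        "roleId": 4,                        # numeric identifier of the role (assumed)
        "scopeType": "department",
        "scopeId": "<department-id>",
    }

    response = requests.post(
        f"{BASE_URL}/api/v1/authorization/access-rules",   # path assumed
        json=payload,
        headers={"Authorization": f"Bearer {TOKEN}"},
        timeout=30,
    )
    response.raise_for_status()
    print(response.json())
    ```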

    \ No newline at end of file diff --git a/v2.20/platform-admin/authentication/applications/index.html b/v2.20/platform-admin/authentication/applications/index.html index ab3f70197b..e99662e704 100644 --- a/v2.20/platform-admin/authentication/applications/index.html +++ b/v2.20/platform-admin/authentication/applications/index.html @@ -1,4 +1,4 @@ - Applications - Run:ai Documentation Library

    Applications

    This article explains the procedure to manage your organization's applications.

    Applications are used for API integrations with Run:ai. An application contains a client ID and a client secret. With the client credentials, you can obtain a token as detailed in API authentication and use it within subsequent API calls.
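
    As a rough sketch, the client-credentials exchange might look like the following in Python. The token endpoint, request body, and response field name are assumptions for illustration; follow the API authentication documentation for the exact format.

    ```python
    import requests

    BASE_URL = "https://<company>.run.ai"   # your tenant URL (placeholder)

    # Request body and endpoint path are assumed for illustration.
    response = requests.post(
        f"{BASE_URL}/api/v1/token",
        json={
            "grantType": "app_token",
            "AppId": "<client-id>",
            "AppSecret": "<client-secret>",
        },
        timeout=30,
    )
    response.raise_for_status()
    token = response.json()["accessToken"]  # response field name assumed

    # Send the token as a Bearer token on subsequent API calls.
    headers = {"Authorization": f"Bearer {token}"}
    ```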

    Applications are assigned with Access Rules to manage permissions. For example, application ci-pipeline-prod is assigned with a Researcher role in Cluster: A.

    Applications table

    The Applications table can be found under Access in the Run:ai platform.

    The Applications table provides a list of all the applications defined in the platform, and allows you to manage them.

    The Applications table consists of the following columns:

    Column Description
    Application The name of the application
    Client ID The client ID of the application
    Access rule(s) The access rules assigned to the application
    Last login The timestamp for the last time the application signed in
    Created by The user who created the application
    Creation time The timestamp for when the application was created
    Last updated The last time the application was updated

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.

    Creating an application

    To create an application:

    1. Click +NEW APPLICATION
    2. Enter the application’s name
    3. Click CREATE
    4. Copy the Client ID and Client secret and store them securely
    5. Click DONE

    Note

    The client secret is visible only at the time of creation. It cannot be recovered but can be regenerated.

    Adding an access rule to an application

    To create an access rule:

    1. Select the application you want to add an access rule for
    2. Click ACCESS RULES
    3. Click +ACCESS RULE
    4. Select a role
    5. Select a scope
    6. Click SAVE RULE
    7. Click CLOSE

    Deleting an access rule from an application

    To delete an access rule:

    1. Select the application you want to remove an access rule from
    2. Click ACCESS RULES
    3. Find the access rule you would like to delete
    4. Click on the trash icon
    5. Click CLOSE

    Regenerating client secret

    To regenerate a client secret:

    1. Locate the application whose client secret you want to regenerate
    2. Click REGENERATE CLIENT SECRET
    3. Click REGENERATE
    4. Copy the New client secret and store it securely
    5. Click DONE

    Warning

    Regenerating a client secret revokes the previous one.

    Deleting an application

    1. Select the application you want to delete
    2. Click DELETE
    3. On the dialog, click DELETE to confirm

    Using API

    Go to the Applications, Access rules API reference to view the available actions

    \ No newline at end of file diff --git a/v2.20/platform-admin/authentication/roles/index.html b/v2.20/platform-admin/authentication/roles/index.html index c020c5746e..1d57333fb0 100644 --- a/v2.20/platform-admin/authentication/roles/index.html +++ b/v2.20/platform-admin/authentication/roles/index.html @@ -1,4 +1,4 @@ - Roles - Run:ai Documentation Library

    Roles

    This article explains the available roles in the Run:ai platform.

    A role is a set of permissions that can be assigned to a subject in a scope.

    A permission is a set of actions (View, Edit, Create and Delete) over a Run:ai entity (e.g. projects, workloads, users).
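
    The role/permission model can be pictured as a mapping from entities to permitted actions. The sketch below is illustrative only; the entity names and the role shown are examples, not the actual Run:ai role definitions.

    ```python
    # Illustrative only: not the actual Run:ai role definitions.
    ACTIONS = {"view", "edit", "create", "delete"}

    viewer_role = {
        "projects":  {"view"},
        "workloads": {"view"},
        "users":     {"view"},
    }

    def allowed(role: dict, entity: str, action: str) -> bool:
        """Return True if the role grants the given action over the entity."""
        return action in ACTIONS and action in role.get(entity, set())

    print(allowed(viewer_role, "workloads", "view"))    # True
    print(allowed(viewer_role, "workloads", "delete"))  # False
    ```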

    Roles table

    The Roles table can be found under Access in the Run:ai platform.

    The Roles table displays a list of predefined roles available to users in the Run:ai platform. It is not possible to create additional roles, or to edit or delete existing roles.

    The Roles table consists of the following columns:

    Column Description
    Role The name of the role
    Created by The name of the role creator
    Creation time The timestamp when the role was created

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.

    Reviewing a role

    1. To review a role, click the role name in the table
    2. In the role form review the following:
      • Role name
        The name of the role
      • Entity
        A system-managed object that can be viewed, edited, created or deleted by a user based on their assigned role and scope
      • Actions
        The actions that the role assignee is authorized to perform for each entity
        • View If checked, an assigned user with this role can view instances of this type of entity within their defined scope
        • Edit If checked, an assigned user with this role can change the settings of an instance of this type of entity within their defined scope
        • Create If checked, an assigned user with this role can create new instances of this type of entity within their defined scope
        • Delete If checked, an assigned user with this role can delete instances of this type of entity within their defined scope

    Roles in Run:ai

    Run:ai supports the following roles and their permissions:
    Under each role is a detailed list of the actions that the role assignee is authorized to perform for each entity.

    Compute resource administrator

    Data source administrator

    Data volume administrator

    Department administrator

    Department viewer

    Editor

    Environment administrator

    L1 researcher

    L2 researcher

    ML engineer

    Research manager

    System administrator

    Template administrator

    Viewer

    Notes

    Keep the following in mind when upgrading from versions 2.13 or earlier:

    • Admin becomes System Admin with full access to all managed objects and scopes
    • Research Manager is not automatically assigned to all projects, but to projects set by the relevant Admin when assigning this role to a user, group or app
    • To preserve backwards compatibility, users with the role of Research Manager are assigned to all current projects, but not to new projects
    • To allow the Department Admin to assign a Researcher role to a user, group or app, the Department Admin must have View, Edit, Create and Delete (VECD) permissions for jobs and workspaces. This creates a broader span of managed objects.
    • To preserve backwards compatibility, users with the role of Editor are assigned to the same scope they had before the upgrade. However, with new user assignments, the Admin can limit the scope to only part of the organizational scope.

    Permitted workloads

    When assigning a role with any combination of the View, Edit, Create and Delete permissions for workloads, the subject has permissions to manage not only Run:ai native workloads (Workspace, Training, Inference), but also a list of supported third-party workloads.

    Using API

    Go to the Roles API reference to view the available actions.
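
    As an illustration, listing the predefined roles via the REST API might look like the sketch below; the endpoint path and response field names are assumptions, so defer to the Roles API reference.

    ```python
    import requests

    BASE_URL = "https://<company>.run.ai"   # your tenant URL (placeholder)
    TOKEN = "<bearer-token>"

    response = requests.get(
        f"{BASE_URL}/api/v1/authorization/roles",   # path assumed
        headers={"Authorization": f"Bearer {TOKEN}"},
        timeout=30,
    )
    response.raise_for_status()
    for role in response.json():
        print(role.get("name"))                     # field name assumed
    ```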

    \ No newline at end of file diff --git a/v2.20/platform-admin/authentication/users/index.html b/v2.20/platform-admin/authentication/users/index.html index a76be92ba6..9d7b21674e 100644 --- a/v2.20/platform-admin/authentication/users/index.html +++ b/v2.20/platform-admin/authentication/users/index.html @@ -1,4 +1,4 @@ - Users - Run:ai Documentation Library

    Users

    This article explains the procedure to manage users and their permissions.

    Users can be managed locally or via the identity provider, and are assigned Access Rules to manage their permissions.

    For example, user user@domain.com is a department admin in department A.

    Users table

    The Users table can be found under Access in the Run:ai platform.

    The users table provides a list of all the users in the platform.
    You can manage local users and manage user permissions (access rules) for both local and SSO users.

    Note

    Single Sign-On users

    SSO users are managed by the identity provider and appear once they have signed in to Run:ai

    The Users table consists of the following columns:

    Column Description
    User The unique identity of the user (email address)
    Type The type of the user - SSO / local
    Last login The timestamp for the last time the user signed in
    Access rule(s) The access rules assigned to the user
    Created By The user who created the user
    Creation time The timestamp for when the user was created
    Last updated The last time the user was updated

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.

    Creating a local user

    To create a local user:

    1. Click +NEW LOCAL USER
    2. Enter the user’s Email address
    3. Click CREATE
    4. Review and copy the user’s credentials:
      • User Email
      • Temporary password to be used on first sign-in
    5. Click DONE

    Note

    The temporary password is visible only at the time of user’s creation, and must be changed after the first sign-in

    Adding an access rule to a user

    To create an access rule:

    1. Select the user you want to add an access rule for
    2. Click ACCESS RULES
    3. Click +ACCESS RULE
    4. Select a role
    5. Select a scope
    6. Click SAVE RULE
    7. Click CLOSE

    Deleting a user’s access rule

    To delete an access rule:

    1. Select the user you want to remove an access rule from
    2. Click ACCESS RULES
    3. Find the access rule assigned to the user you would like to delete
    4. Click on the trash icon
    5. Click CLOSE

    Resetting a user password

    To reset a user’s password:

    1. Select the user whose password you want to reset
    2. Click RESET PASSWORD
    3. Click RESET
    4. Review and copy the user’s credentials:
      • User Email
      • Temporary password to be used on next sign-in
    5. Click DONE

    Deleting a user

    1. Select the user you want to delete
    2. Click DELETE
    3. In the dialog, click DELETE to confirm the deletion

    Note

    To ensure administrative operations are always available, at least one local user with the System Administrator role should exist.

    Using API

    Go to the Users, Access rules API reference to view the available actions
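
    For example, creating a local user via the REST API might resemble the sketch below; the endpoint path and request body are assumptions for illustration, and the Users API reference is authoritative.

    ```python
    import requests

    BASE_URL = "https://<company>.run.ai"   # your tenant URL (placeholder)
    TOKEN = "<bearer-token>"

    response = requests.post(
        f"{BASE_URL}/api/v1/users",                 # path assumed
        json={"email": "user@domain.com"},          # body assumed
        headers={"Authorization": f"Bearer {TOKEN}"},
        timeout=30,
    )
    response.raise_for_status()
    print(response.json())   # the response may include the temporary password (assumed)
    ```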

    \ No newline at end of file diff --git a/v2.20/platform-admin/integrations/integration-overview/index.html b/v2.20/platform-admin/integrations/integration-overview/index.html index e667592a84..0da387b298 100644 --- a/v2.20/platform-admin/integrations/integration-overview/index.html +++ b/v2.20/platform-admin/integrations/integration-overview/index.html @@ -1,4 +1,4 @@ - Overview - Run:ai Documentation Library

    Integrations with Run:ai

    The table below summarizes the integration capabilities with various third-party products.

    Integration support

    Support for integrations varies. Where noted below, the integration is supported out of the box with Run:ai. For other integrations, our customer success team has prior experience integrating with the third-party software, and in many cases the community portal contains additional reference documentation provided on an as-is basis.

    The Run:ai community portal is password protected and access is provided to customers and partners.

    Integrations

    Tool Category Run:ai support details Additional Information
    Triton Orchestration Supported Usage via docker base image. Quickstart inference example
    Spark Orchestration Community Support
    It is possible to schedule Spark workflows with the Run:ai scheduler. For details, please contact Run:ai customer support.
    Sample code can be found in the Run:ai customer success community portal: https://runai.my.site.com/community/s/article/How-to-Run-Spark-jobs-with-Run-AI
    Kubeflow Pipelines Orchestration Community Support It is possible to schedule kubeflow pipelines with the Run:ai scheduler. For details please contact Run:ai customer support. Sample code can be found in the Run:ai customer success community portal
    https://runai.my.site.com/community/s/article/How-to-integrate-Run-ai-with-Kubeflow
    Apache Airflow Orchestration Community Support It is possible to schedule Airflow workflows with the Run:ai scheduler. For details, please contact Run:ai customer support. Sample code can be found in the Run:ai customer success community portal: https://runai.my.site.com/community/s/article/How-to-integrate-Run-ai-with-Apache-Airflow
    Argo workflows Orchestration Community Support It is possible to schedule Argo workflows with the Run:ai scheduler. For details, please contact Run:ai customer support. Sample code can be found in the Run:ai customer success community portal https://runai.my.site.com/community/s/article/How-to-integrate-Run-ai-with-Argo-Workflows
    SeldonX Orchestration Community Support It is possible to schedule Seldon Core workloads with the Run:ai scheduler. For details, please contact Run:ai customer success. Sample code can be found in the Run:ai customer success community portal: https://runai.my.site.com/community/s/article/How-to-integrate-Run-ai-with-Seldon-Core
    Jupyter Notebook Development Supported Run:ai provides integrated support with Jupyter Notebooks. Quickstart example: https://docs.run.ai/latest/Researcher/Walkthroughs/quickstart-jupyter/
    Jupyter Hub Development Community Support It is possible to submit Run:ai workloads via JupyterHub. For more information please contact Run:ai customer support
    PyCharm Development Supported Containers created by Run:ai can be accessed via PyCharm. PyCharm example
    VScode Development Supported - Containers created by Run:ai can be accessed via Visual Studio Code. example
    - You can automatically launch Visual Studio Code Web from the Run:ai console. example.
    Kubeflow notebooks Development Community Support It is possible to launch a kubeflow notebook with the Run:ai scheduler. For details please contact Run:ai customer support. Sample code can be found in the Run:ai customer success community portal: https://runai.my.site.com/community/s/article/How-to-integrate-Run-ai-with-Kubeflow
    Ray training, inference, data processing. Community Support It is possible to schedule Ray jobs with the Run:ai scheduler. Sample code can be found in the Run:ai customer success community portal https://runai.my.site.com/community/s/article/How-to-Integrate-Run-ai-with-Ray
    TensorBoard Experiment tracking Supported Run:ai comes with a preset Tensorboard Environment asset. TensorBoard example.
    Additional sample
    Weights & Biases Experiment tracking Community Support It is possible to schedule W&B workloads with the Run:ai scheduler. For details, please contact Run:ai customer success.
    ClearML Experiment tracking Community Support It is possible to schedule ClearML workloads with the Run:ai scheduler. For details, please contact Run:ai customer success.
    MLFlow Model Serving Community Support It is possible to use ML Flow together with the Run:ai scheduler. For details, please contact Run:ai customer support. Sample code can be found in the Run:ai customer success community portal: https://runai.my.site.com/community/s/article/How-to-integrate-Run-ai-with-MLflow
    Additional MLFlow sample
    Hugging Face Repositories Supported Run:ai provides an out of the box integration with Hugging Face
    Docker Registry Repositories Supported Run:ai allows using a docker registry as a Credentials asset.
    S3 Storage Supported Run:ai communicates with S3 by defining a data source asset.
    Github Storage Supported Run:ai communicates with GitHub by defining it as a data source asset
    TensorFlow Training Supported Run:ai provides out of the box support for submitting TensorFlow workloads via API or by submitting workloads via user interface.
    PyTorch Training Supported Run:ai provides out of the box support for submitting PyTorch workloads via API or by submitting workloads via user interface.
    Kubeflow MPI Training Supported Run:ai provides out of the box support for submitting MPI workloads via API or by submitting workloads via user interface
    XGBoost Training Supported Run:ai provides out of the box support for submitting XGBoost workloads via API or by submitting workloads via user interface
    Karpenter Cost Optimization Supported Run:ai provides out of the box support for Karpenter to save cloud costs. Integration notes with Karpenter can be found here

    Kubernetes Workloads Integration

    Kubernetes has several built-in resources that encapsulate running Pods. These are called Kubernetes Workloads and should not be confused with Run:ai Workloads.

    Examples of such resources are a Deployment that manages a stateless application, or a Job that runs tasks to completion.

    Run:ai natively runs Run:ai Workloads. A Run:ai workload encapsulates all the resources needed to run, creates them, and deletes them together. However, Run:ai, being an open platform, allows the scheduling of any Kubernetes Workload.

    For more information see Kubernetes Workloads Integration.
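
    As a hedged illustration, handing a plain Kubernetes Job to the Run:ai scheduler could look like the sketch below, using the official Kubernetes Python client. The scheduler name and the project namespace naming convention are assumptions; check your cluster configuration for the actual values.

    ```python
    from kubernetes import client, config

    config.load_kube_config()

    job = client.V1Job(
        api_version="batch/v1",
        kind="Job",
        metadata=client.V1ObjectMeta(name="demo-job"),
        spec=client.V1JobSpec(
            template=client.V1PodTemplateSpec(
                spec=client.V1PodSpec(
                    scheduler_name="runai-scheduler",   # assumed scheduler name
                    restart_policy="Never",
                    containers=[client.V1Container(
                        name="main",
                        image="ubuntu",
                        command=["echo", "scheduled by Run:ai"],
                    )],
                )
            )
        ),
    )

    # Namespace of the Run:ai project (naming convention assumed).
    client.BatchV1Api().create_namespaced_job(namespace="runai-<project>", body=job)
    ```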

    \ No newline at end of file diff --git a/v2.20/platform-admin/integrations/karpenter/index.html b/v2.20/platform-admin/integrations/karpenter/index.html index 27c3b5f853..8f16ae7cf2 100644 --- a/v2.20/platform-admin/integrations/karpenter/index.html +++ b/v2.20/platform-admin/integrations/karpenter/index.html @@ -1,4 +1,4 @@ - Working with Karpenter - Run:ai Documentation Library

    Working with Karpenter

    Karpenter is an open-source, Kubernetes cluster autoscaler built for cloud deployments. Karpenter optimizes the cloud cost of a customer’s cluster by moving workloads between different node types, consolidating workloads into fewer nodes, using lower-cost nodes where possible, scaling up new nodes when needed, and shutting down unused nodes.

    Karpenter’s main goal is cost optimization. Unlike Karpenter, Run:ai’s scheduler optimizes for fairness and resource utilization. Therefore, there are a few potential friction points when using both on the same cluster.

    Friction points using Karpenter with Run:ai

    1. Karpenter looks for “unschedulable” pending workloads and may try to scale up new nodes to make those workloads schedulable. However, in some scenarios, these workloads may exceed their quota parameters, and the Run:ai scheduler will put them into a pending state.
    2. Karpenter is not aware of the Run:ai fractions mechanism and may try to interfere incorrectly.
    3. Karpenter preempts any type of workload (i.e., high-priority, non-preemptible workloads will potentially be interrupted and moved to save cost).
    4. Karpenter has no pod-group (i.e., workload) notion or gang scheduling awareness, meaning that Karpenter is unaware that a set of “arbitrary” pods is a single workload. This may cause Karpenter to schedule those pods into different node pools (in the case of multi-node-pool workloads) or scale up or down a mix of wrong nodes.

    Mitigating the friction points

    Run:ai scheduler mitigates the friction points using the following techniques (each numbered bullet below corresponds to the related friction point listed above):

    1. Karpenter uses a “nominated node” to recommend a node for the scheduler. The Run:ai scheduler treats this as a “preferred” recommendation, meaning it will try to use this node, but it’s not required and it may choose another node.
    2. Fractions - Karpenter won’t consolidate nodes with one or more pods that cannot be moved. The Run:ai reservation pod is marked as ‘do not evict’ to allow the Run:ai scheduler to control the scheduling of fractions.
    3. Non-preemptible workloads - Run:ai marks non-preemptible workloads as ‘do not evict’ and Karpenter respects this annotation.
    4. Run:ai node pools (single-node-pool workloads) - Karpenter respects the ‘node affinity’ that Run:ai sets on a pod, so Karpenter uses the node affinity for its recommended node. For the gang-scheduling/pod-group (workload) notion, Run:ai scheduler considers Karpenter directives as preferred recommendations rather than mandatory instructions and overrides Karpenter instructions where appropriate.

    Deployment Considerations

    • Using multi-node-pool workloads
      • Workloads may include a list of optional node pools. Karpenter is not aware that only a single node pool should be selected out of that list for the workload. It may therefore recommend putting pods of the same workload into different node pools, and may scale up nodes from different node pools to serve a “multi-node-pool” workload instead of nodes in the selected single node pool.
      • If this becomes an issue (i.e., if Karpenter scales up the wrong node types), users can set an inter-pod affinity using the node pool label or another common label as a ‘topology’ identifier, as shown in the sketch after this list. This forces Karpenter to choose nodes from a single node pool per workload, selecting from any of the node pools listed as allowed by the workload.
      • An alternative approach is to use a single node pool for each workload instead of multiple node pools.
    • Consolidation
      • To make Karpenter more effective when using its consolidation function, users should consider separating preemptible and non-preemptible workloads, either by using node pools, node affinities, taints/tolerations, or inter-pod anti-affinity.
      • If users don’t separate preemptible and non-preemptible workloads (i.e., make them run on different nodes), Karpenter’s ability to consolidate (bin-pack) and shut down nodes will be reduced, but it is still effective.
    • Conflicts between bin-packing and spread policies
      • If Run:ai is used with a scheduling spread policy, it will clash with Karpenter’s default bin-packing/consolidation policy, and the outcome may be a deployment that is not optimized for either of these policies.
      • Usually spread is used for Inference, which is non-preemptible and therefore not controlled by Karpenter (Run:ai scheduler will mark those workloads as ‘do not evict’ for Karpenter), so this should not present a real deployment issue for customers.
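
    Below is a sketch of the inter-pod affinity mentioned in the multi-node-pool bullet above, expressed as the pod-spec fragment you would merge into each pod of the workload. The node pool label key and the workload label shown are assumptions for illustration; use the labels actually present in your cluster.

    ```python
    # Illustrative pod-spec fragment (as a Python dict); label keys are assumptions.
    pod_affinity = {
        "affinity": {
            "podAffinity": {
                "requiredDuringSchedulingIgnoredDuringExecution": [
                    {
                        # Match the other pods of the same workload (label assumed).
                        "labelSelector": {"matchLabels": {"workload-name": "my-training"}},
                        # Treat each node pool as a topology domain so all pods of the
                        # workload are placed on nodes of a single node pool.
                        "topologyKey": "run.ai/node-pool",   # node pool label key assumed
                    }
                ]
            }
        }
    }
    print(pod_affinity)
    ```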

    \ No newline at end of file diff --git a/v2.20/platform-admin/overview/index.html b/v2.20/platform-admin/overview/index.html index 4ec4cb02ca..43ad0947d9 100644 --- a/v2.20/platform-admin/overview/index.html +++ b/v2.20/platform-admin/overview/index.html @@ -1,4 +1,4 @@ - Overview - Run:ai Documentation Library

    Overview: Platform Administrator

    The Platform Administrator is responsible for the day-to-day administration of the product.

    As part of the Platform Administrator documentation you will find:

    \ No newline at end of file diff --git a/v2.20/platform-admin/performance/dashboard-analysis/index.html b/v2.20/platform-admin/performance/dashboard-analysis/index.html index 9a3bd123dd..90d0bd539c 100644 --- a/v2.20/platform-admin/performance/dashboard-analysis/index.html +++ b/v2.20/platform-admin/performance/dashboard-analysis/index.html @@ -1,4 +1,4 @@ - Dashboard Analysis - Run:ai Documentation Library

    Introduction

    The Run:ai Administration User Interface provides a set of dashboards that help you monitor Clusters, Cluster Nodes, Projects, and Workloads. This document describes the key metrics to monitor, how to assess them, and suggested actions.

    Dashboards are used by system administrators to analyze and diagnose issues that relate to:

    • Physical Resources.
    • Organization resource allocation and utilization.
    • Usage characteristics.

    System administrators need key information about the physical resources that are currently being used, such as:

    • Resource health.
    • Available resources and their distribution.
    • Whether there is a lack of resources.
    • Whether resources are being utilized correctly.

    With this information, system administrators can focus on:

    • How resources are allocated across the organization.
    • How the different organizational units utilize their quotas and the resources within those quotas.
    • The actual performance of the organizational units.

    These dashboards give system administrators the ability to drill down to see details of the different types of workloads that each of the organizational units is running. These usage and performance metrics ensure that system administrators can then take actions to correct issues that affect performance.

    There are 5 dashboards:

    • GPU/CPU Overview dashboard—Provides information about what is happening right now in the cluster.
    • Quota Management dashboard—Provides information about quota utilization.
    • Analytics dashboard—Provides long term analysis of cluster behavior.
    • Multi-Cluster Overview dashboard—Provides a more holistic, multi-cluster view of what is happening right now. The dashboard is intended for organizations that have more than one connected cluster.
    • Consumption dashboard—Provides information about resource consumption.

    GPU/CPU Overview Dashboard (New and legacy)

    The Overview dashboard provides information about what is happening right now in the cluster. Administrators can view high-level information on the state of the cluster. The dashboard has two tabs that change the display to provide a focused view for GPU Dashboards (default view) and CPU Dashboards.

    GPU Dashboard

    The GPU dashboard displays specific information for GPU-based nodes, node pools, clusters, or tenants. These dashboards also include additional metrics that are specific to GPU-based environments. The dashboard contains tiles that show information about specific resource allocation and performance metrics. The tiles are interactive, allowing you to link directly to the assets or drill down to specific scopes. Use the time frame selector to choose a time frame for all the tiles in the dashboard.

    The dashboard has the following tiles:

    • Ready nodes—displays GPU nodes that are in the ready state.
    • Ready GPU devices—displays the number of GPUs in nodes that are in the ready state.
    • Allocated GPU compute—displays the total number of GPUs allocated from all the nodes.
    • Idle allocated GPU devices—displays the number of allocated GPU devices that have been idle for more than 5 minutes.
    • Running workloads—displays the number of running workloads.
    • Pending workloads—displays the number of workloads in the pending status.
    • Allocation ratio by node pool—displays the percentage of GPUs allocated per node pool. Hover over the bar for detailed information. Use the scope selector at the bottom of the graph to drill down for more details.
    • Free resources by node pool—displays the amount of free resources per node pool. Click an entry in the graph for more details. Hover over the resource bubbles for specific details for the workers in the node. Use the ellipsis to download the graph as a CSV file.
    • Resource allocation by workload type—displays the resource allocation by workload type. Hover over the bar for detailed information. Use the scope selector at the bottom of the graph to drill down for more details. Use the ellipsis to download the graph as a CSV file.
    • Workload by status—displays the number of workloads for each status in the workloads table. Hover over the bar for detailed information. Use the scope selector at the bottom of the graph to drill down for more details. Use the ellipsis to download the graph as a CSV file.
    • Resources utilization—displays the resource utilization over time. The right pane of the graph shows the average utilization of the selected time frame of the dashboard. Hover over the graph to see details of a specific time in the graph. Use the ellipsis to download the graph as a CSV file.
    • Resource allocation—displays the resource allocation over time. The right pane of the graph shows the average allocation of the selected time frame of the dashboard. Hover over the graph to see details of a specific time in the graph. Use the ellipsis to download the graph as a CSV file.

    CPU Dashboard

    The CPU dashboards display specific information for CPU-based nodes, node pools, clusters, or tenants. These dashboards also include additional metrics that are specific to CPU-based environments.

    To enable CPU Dashboards:

    1. Click General settings
    2. Open the Analytics pane and toggle the Show CPU dashboard switch to enable the feature.

    Toggle the switch off to disable the CPU Dashboards option.

    The dashboard contains the following tiles:

    • Total CPU Nodes—displays the total number of CPU nodes.
    • Ready CPU nodes—displays the total number of CPU nodes in the ready state.
    • Total CPUs—displays the total number of CPUs.
    • Ready CPUs—displays the total number of CPUs in the ready state.
    • Allocated CPUs—displays the number of allocated CPUs.
    • Running workloads—displays the number of workloads in the running state.
    • Pending workloads—displays the number of workloads in the pending state.
    • Allocated CPUs per project—displays the number of CPUs allocated per project.
    • Active projects—displays the active projects with the CPU allocation and amount of running and pending workloads.
    • Utilization per resource type—displays the CPU compute and CPU memory utilization over time.
    • CPU compute utilization—displays the current CPU compute utilization.
    • CPU memory utilization—displays the current CPU memory utilization.
    • Pending workloads—displays the requested resources and wait time for workloads in the pending status.
    • Workloads with error—displays the number of workloads that are currently not running due to an error.
    • Workload Count per CPU Compute Utilization—displays the number of workloads grouped by their CPU compute utilization.
    • 5 longest running workloads—displays up to 5 workloads with the longest running time.

    Analysis and Suggested actions:

    Review Analysis & Actions
    • Interactive Workloads are too frequently idle - Consider setting time limits for interactive Workloads through the Projects tab. Consider also reducing GPU/CPU quotas for specific Projects to encourage users to run more training Workloads as opposed to interactive Workloads (note that interactive Workloads cannot use more than the GPU/CPU quota assigned to their Project).
    • Training Workloads are too frequently idle - Identify and notify the right users and work with them to improve the utilization of their training scripts.

    Workloads with an Error

    Search for Workloads with an error status. These Workloads may be holding GPUs/CPUs without actually using them.

    Analysis and Suggested actions:

    Search for workloads with an Error status on the Workloads view and discuss with the Job owner. Consider deleting these Workloads to free up the resources for other users.

    Workloads with a Long Duration

    View the list of the 5 longest-running Workloads.

    Analysis and Suggested actions:

    • Training Workloads run for too long—Ask users to view their Workloads and analyze whether useful work is being done. If needed, stop their Workloads.
    • Interactive Workloads run for too long—Consider setting time limits for interactive Workloads via the Project editor.

    Job Queue

    Identify queueing bottlenecks.

    Analysis and Suggested actions:

    • Cluster is fully loaded—Go over the table of active Projects and check that fairness between Projects was enforced by reviewing the number of allocated GPUs/CPUs for each Project, ensuring that each Project was allocated its fair-share portion of the cluster.
    • Cluster is not fully loaded—Go to the Workloads view to review the resources requested for that Job (CPU, CPU memory, GPU, GPU memory). Go to the Nodes view to verify that there is no Node with enough free resources that can host that Job.

    Also, check the command that the user used to submit the job. The Researcher may have requested a specific Node for that Job.

    Analytics Dashboard

    The Analytics dashboard provides a means of viewing historical cluster data, such as:

    • Utilization across the cluster
    • GPU usage by different Projects, including allocation and utilization, broken down into interactive and training Workloads
    • Breakdown of running Workloads into interactive, training, and GPU versus CPU-only Workloads, including information on queueing (number of pending Workloads and requested GPUs)
    • Status of Nodes in terms of availability and allocated and utilized resources.

    The dashboard has dropdown filters for node pools and Departments. From each dropdown, select one or more items. The default setting is all.

    The information presented in Analytics can be used in different ways for identifying problems and fixing them. Below are a few examples.

    Node Downtime

    View the overall available resources per Node and identify cases where a Node is down and there was a reduction in the number of available resources.

    How to: view the following panel.

    Analysis and Suggested actions:

    Filter according to time range to understand for how long the Node is down.

    GPU Allocation

    Track GPU allocation across time.

    How to: view the following panels.

    The panel on the right-hand side shows the cluster-wide GPU allocation and utilization versus time, whereas the panels on the left-hand side show the cluster-wide GPU allocation and utilization averaged across the filtered time range.

    Analysis and Suggested actions:

    If the allocation is too low for a long period, work with users to run more workloads and to better utilize the Cluster.

    Track GPU utilization

    Track whether Researchers efficiently use the GPU resources they have allocated for themselves.

    How to: view the following panel:

    Analysis and Suggested actions:

    If utilization is too low for a long period, you will want to identify the source of the problem:

    • Go to “Average GPU Allocation & Utilization”
    • Look for Projects with large GPU allocations for interactive Workloads or Projects that poorly utilize their training Workloads. Users tend to poorly utilize their GPUs in interactive sessions because of the dev & debug nature of their work, which is typically an iterative process with long idle GPU time. On many occasions users also don’t shut down their interactive Workloads, holding their GPUs idle and preventing others from using them.
    • Low GPU utilization is due to interactive Workloads being used too frequently—Consider setting time limits for interactive Workloads through the Projects tab or reducing GPU quotas to encourage users to run more training Workloads as opposed to interactive Workloads (note that interactive Workloads cannot use more than the GPU quota assigned to their Project).
    • Low GPU utilization is due to users poorly utilizing their GPUs in training sessions—Identify Projects with poor GPU utilization in training Workloads, notify the users, and work with them to improve their code and the way they utilize their GPUs.

    Training vs. Interactive -- Researcher maturity

    Track the number of running Workloads and the breakdown into interactive, training, and CPU-only Workloads.

    How to: view the following panel:

    Analysis and Suggested actions:

    Encourage users to run more training Workloads than interactive Workloads, as this is key to achieving high GPU utilization across the Cluster:

    • Training Workloads run to completion and free up their resources automatically when training ends
    • Training Workloads can be preempted, queued, and resumed automatically by the Run:ai system according to predefined policies, which increases fairness and Cluster utilization.

    Pending Queue Size

    Track the length of the queue of pending Workloads.

    How to: view the following panels:

    Analysis and Suggested actions:

    Consider buying more GPUs:

    • When too many Workloads are waiting in the queue for too long.
    • With a large number of requested GPUs.
    • While the Cluster is fully loaded and well utilized.

    CPU & Memory Utilization

    Track CPU and memory Node utilization and identify times where the load on specific Nodes is high.

    How to: view the following panel:

    Analysis and Suggested actions:

    If the load on specific Nodes is too high, it may cause problems with the proper operation of the Cluster and the way workloads are running.

    Consider adding more CPUs, or adding CPU-only nodes for Workloads that perform CPU-only processing.

    Multi-Cluster overview dashboard

    Provides a holistic, aggregated view across Clusters, including information about Cluster and Node utilization, available resources, and allocated resources. With this dashboard, you can identify Clusters that are down or underutilized and go to the Overview of that Cluster to explore further.

    Consumption dashboard

    This dashboard enables users and admins to view the consumption of Run:ai services. The dashboard provides views based on configurable filters and timelines, as well as a costing analysis of GPU, CPU, and memory costs for the system.


    The dashboard has 4 tiles for:

    • Cumulative GPU allocation per Project or Department
    • Cumulative CPU allocation per Project or Department
    • Cumulative memory allocation per Project or Department
    • Consumption types

    Use the dropdown menus at the top of the dashboard to apply filters for:

    • Project or department
    • Per project (single, multiple, or all)
    • Per department (single, multiple, or all)
    • Per cluster (single, multiple, or all)

    To enable the Consumption Dashboard:

    1. Press the General settings icon, then press General.
    2. Open the Analytics pane and toggle the Consumption switch to enable the feature.
    3. Enter the cost of:
       • GPU compute / Hour
       • CPU compute / Hour
       • CPU memory / Hour

    Use the time picker dropdown to select relative time range options and set custom absolute time ranges. You can change the Timezone and fiscal year settings from the time range controls by clicking the Change time settings button.

    Note

    Dashboard data updates once an hour.

    You can change the refresh interval using the refresh interval dropdown.

    The dashboard has 2 consumption tables that display the total consumption of resources. Hover over an entry in the table to filter it in or out of the table.

    The Total consumption table includes consumption details based on the filters selected. Fields include:

    • Project
    • Department
    • GPU hours
    • CPU hours
    • Memory hours
    • GPU Idle allocated hours—the portion of the total allocation hours during which the GPUs were idle.
    • CPU usage hours—the actual usage time of CPU.
    • Memory usage time—the actual usage time of CPU memory.
    • GPU cost (only when configured)
    • CPU cost (only when configured)
    • CPU memory cost (only when configured)
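
    The cost columns appear only when hourly rates have been configured while enabling the dashboard, and a reasonable way to read them is as the consumed hours priced at those rates. The short sketch below is illustrative only; the project name, consumption figures, and rates are hypothetical.

    ```python
    # Illustrative only: how the cost columns relate to consumed hours and the
    # hourly rates entered under General settings -> Analytics -> Consumption.
    gpu_rate_per_hour = 2.50   # GPU compute / Hour
    cpu_rate_per_hour = 0.10   # CPU compute / Hour
    mem_rate_per_hour = 0.02   # CPU memory / Hour

    row = {"project": "team-a", "gpu_hours": 1200, "cpu_hours": 4800, "memory_hours": 9600}

    gpu_cost = row["gpu_hours"] * gpu_rate_per_hour
    cpu_cost = row["cpu_hours"] * cpu_rate_per_hour
    memory_cost = row["memory_hours"] * mem_rate_per_hour
    print(f"{row['project']}: GPU {gpu_cost:.2f}, CPU {cpu_cost:.2f}, memory {memory_cost:.2f}")
    ```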

    The Total department consumption table includes consumption details for each department, or details for departments selected in the filters. Fields include:

    • Department
    • GPU hours
    • CPU hours
    • Memory hours
    • GPU Idle allocated hours—the portion of the total allocation hours during which the GPUs were idle.
    • CPU usage hours—the actual usage time of CPU.
    • Memory usage time—the actual usage time of CPU memory.
    • GPU cost (only when configured)
    • CPU cost (only when configured)
    • CPU memory cost (only when configured)

    The dashboard has a graph of the GPU allocation over time.


    The dashboard has a graph of the Project over-quota GPU consumption.


    Quota management dashboard

    The Quota management dashboard provides an efficient means to monitor and manage resource utilization within the AI cluster. The dashboard is divided into sections with essential metrics and data visualizations to identify resource usage patterns, potential bottlenecks, and areas for optimization. The sections of the dashboard include:

    • Add Filter
    • Quota / Total
    • Allocated / Quota
    • Pending workloads
    • Quota by node pool
    • Allocation by node pool
    • Pending workloads by node pool
    • Departments with lowest allocation by node pool
    • Projects with lowest allocation ratio by node pool
    • Over time allocation / quota

    Add Filter

    Use the Add Filter dropdown to select filters for the dashboard. The filters will change the data shown on the dashboard. Available filters are:

    • Departments
    • Projects
    • Nodes

    Select a filter from the dropdown, then select an item from the list, and press Apply.

    Note

    You can create a filter with multiple categories, but you can use each category and item only once.

    Quota / Total

    This section shows the number of GPUs that are in the quota based on the filter selection. The quota of GPUs is the number of GPUs that are reserved for use.

    Allocated / Quota

    This section shows the number of GPUs that are allocated based on the filter selection. Allocated GPUs are the number of GPUs that are being used.

    Pending workloads

    This section shows the number of workloads that are pending based on the filter selection. Pending workloads are workloads that have not started.

    Quota by node pool

    This section shows the quota of GPUs by node pool based on the filter. The quota is the number of GPUs that are reserved for use. You can drill down into the data in this section by pressing on the graph or the link at the bottom of the section.

    Allocation by node pool

    This section shows the allocation of GPUs by node pool based on the filter. The allocation is the number of GPUs that are being used. You can drill down into the data in this section by pressing on the graph or the link at the bottom of the section.

    Pending workloads by node pool

    This section shows the number of pending workloads by node pool. You can drill down into the data in this section by pressing on the graph or the link at the bottom of the section.

    Departments with lowest allocation by node pool

    This section shows the departments with the lowest allocation of GPUs by percentage relative to the total number of GPUs.

    Projects with lowest allocation ratio by node pool

    This section shows the projects with the lowest allocation of GPUs by percentage relative to the total number of GPUs.

    Over time allocation / quota

    This section shows the allocation of GPUs from the quota over a period of time.
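
    To make the quota and allocation figures in these sections concrete, the sketch below computes an allocation ratio per node pool and orders node pools from lowest to highest, similar to what the "lowest allocation ratio" sections surface. The node pool names and numbers are hypothetical.

    ```python
    # Illustrative only: allocation ratio (allocated GPUs / GPU quota) per node pool.
    node_pools = {
        "default": {"quota": 32, "allocated": 24},
        "a100":    {"quota": 16, "allocated": 4},
    }

    for name, pool in sorted(node_pools.items(),
                             key=lambda kv: kv[1]["allocated"] / kv[1]["quota"]):
        ratio = pool["allocated"] / pool["quota"]
        print(f"{name}: {pool['allocated']}/{pool['quota']} GPUs allocated ({ratio:.0%})")
    ```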


    \ No newline at end of file diff --git a/v2.20/platform-admin/performance/reports/index.html b/v2.20/platform-admin/performance/reports/index.html index 34f537a705..1b784d1923 100644 --- a/v2.20/platform-admin/performance/reports/index.html +++ b/v2.20/platform-admin/performance/reports/index.html @@ -1,4 +1,4 @@ - Reports - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/platform-admin/workloads/assets/compute/index.html b/v2.20/platform-admin/workloads/assets/compute/index.html index 003ae3d0f4..b1cfe2c2b8 100644 --- a/v2.20/platform-admin/workloads/assets/compute/index.html +++ b/v2.20/platform-admin/workloads/assets/compute/index.html @@ -1,4 +1,4 @@ - Compute Resources - Run:ai Documentation Library

    Compute Resources

    This article explains what compute resources are and how to create and use them.

    Compute resources are one type of workload asset. A compute resource is a template that simplifies how workloads are submitted and can be used by AI practitioners when they submit their workloads.

    A compute resource asset is a preconfigured building block that encapsulates all the specifications of compute requirements for the workload including:

    • GPU devices and GPU memory
    • CPU memory and CPU compute

    Compute resource table

    The Compute resource table can be found under Workload manager in the Run:ai UI.

    The Compute resource table provides a list of all the compute resources defined in the platform and allows you to manage them.

    The Compute resource table consists of the following columns:

    Column Description
    Compute resource The name of the compute resource
    Description A description of the essence of the compute resource
    GPU devices request per pod The number of requested physical devices per pod of the workload that uses this compute resource
    GPU memory request per device The amount of GPU memory per requested device that is granted to each pod of the workload that uses this compute resource
    CPU memory request The minimum amount of CPU memory per pod of the workload that uses this compute resource
    CPU memory limit The maximum amount of CPU memory per pod of the workload that uses this compute resource
    CPU compute request The minimum number of CPU cores per pod of the workload that uses this compute resource
    CPU compute limit The maximum number of CPU cores per pod of the workload that uses this compute resource
    Scope The scope of this compute resource within the organizational tree. Click the name of the scope to view the organizational tree diagram
    Workload(s) The list of workloads associated with the compute resource
    Template(s) The list of workload templates that use this compute resource
    Created by The name of the user who created the compute resource
    Creation time The timestamp of when the compute resource was created
    Last updated The timestamp of when the compute resource was last updated
    Cluster The cluster that the compute resource is associated with

    Workloads associated with the compute resource

    Click one of the values in the Workload(s) column to view the list of workloads and their parameters.

    Column Description
    Workload The workload that uses the compute resource
    Type Workspace/Training/Inference
    Status Represents the workload lifecycle. See the full list of workload status.

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table

    Adding new compute resource

    To add a new compute resource:

    1. Go to the Compute resource table
    2. Click +NEW COMPUTE RESOURCE
    3. Select under which cluster to create the compute resource
    4. Select a scope
    5. Enter a name for the compute resource. The name must be unique.
    6. Optional: Provide a description of the essence of the compute resource
    7. Set the resource types needed within a single node
      (the Run:ai scheduler tries to match a single node that complies with the compute resource for each of the workload’s pods)

      • GPU

        • GPU devices per pod
          The number of devices (physical GPUs) per pod
          (for example, if you requested 3 devices per pod and the running workload using this compute resource consists of 3 pods, there are 9 physical GPU devices used in total)

        Note

        • When set to zero, a workload using this compute resource neither requests nor uses GPU resources while running
        • You can set any number of GPU devices and specify the memory requirement per device either as a portion of a device (1-100%) or as an explicit memory size in GB or MB
        • GPU memory per device
          • Select the memory request format
            • % (of device) - Fraction of a GPU device’s memory
            • MB (memory size) - An explicit GPU memory unit
            • GB (memory size) - An explicit GPU memory unit
          • Set the memory Request - The minimum amount of GPU memory that is provisioned per device. This means that any pod of a running workload that uses this compute resource receives this amount of GPU memory for each device the pod utilizes
          • Optional: Set the memory Limit - The maximum amount of GPU memory that is provisioned per device. This means that any pod of a running workload that uses this compute resource receives at most this amount of GPU memory for each device the pod utilizes.
            To set a Limit, first enable the limit toggle. The limit value must be equal to or higher than the request.

        Note

        • GPU memory limit is disabled by default. If you cannot see the Limit toggle in the compute resource form, then it must be enabled by your Administrator, under General settings → Resources → GPU resource optimization
        • When a Limit is set and is bigger than the Request, the scheduler allows each pod to reach the maximum amount of GPU memory in an opportunistic manner (only upon availability).
        • If the GPU Memory Limit is bigger than the Request, the pod is prone to be killed by the Run:ai toolkit (out-of-memory signal). The greater the difference between the GPU memory used and the request, the higher the risk of being killed
        • If GPU resource optimization is turned off, the minimum and maximum are in fact equal
      • CPU

        • CPU compute per pod
          • Select the units for the CPU compute (Cores / Millicores)
          • Set the CPU compute Request - The minimum amount of CPU compute that is provisioned per pod. This means that any pod of a running workload that uses this compute resource receives this amount of CPU compute.
          • Optional: Set the CPU compute Limit - The maximum amount of CPU compute that is provisioned per pod. This means that any pod of a running workload that uses this compute resource receives at most this amount of CPU compute.
            To set a Limit, first enable the limit toggle. The limit value must be equal to or higher than the request.
            By default, the limit is set to “Unlimited” - which means that the pod may consume all the node's free CPU compute resources.
        • CPU memory per pod
          • Select the units for the CPU memory (MB / GB)
          • Set the CPU memory Request - The minimum amount of CPU memory that is provisioned per pod. This means that any pod of a running workload that uses this compute resource receives this amount of CPU memory.
          • Optional: Set the CPU memory Limit - The maximum amount of CPU memory that is provisioned per pod. This means that any pod of a running workload that uses this compute resource receives at most this amount of CPU memory.
            To set a Limit, first enable the limit toggle. The limit value must be equal to or higher than the request.
            By default, the limit is set to “Unlimited” - meaning that the pod may consume all the node's free CPU memory resources.

        Note

        If the CPU Memory Limit is bigger than the Request, the pod is prone to be killed by the operating system (out-of-memory signal). The greater the difference between the CPU memory used and the request, the higher the risk of being killed. (A conceptual sketch of these request and limit semantics follows this procedure.)

    8. Optional: More settings

      • Increase shared memory size
        When enabled, the shared memory size available to the pod is increased from the default 64MB to the node's total available memory or the CPU memory limit, if set above.
      • Set extended resource(s)
        Click +EXTENDED RESOURCES to add resource/quantity pairs. For more information on how to set extended resources, see the Extended resources and Quantity guides
    9. Click CREATE COMPUTE RESOURCE

      Note

      It is also possible to add compute resources directly when creating a specific Workspace, training or inference workload.
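
    To make the request and limit semantics from step 7 concrete: for CPU compute and CPU memory they behave like standard Kubernetes resource requests and limits on the workload's pods. The sketch below, using the Kubernetes Python client, is a conceptual illustration only; the values are hypothetical, GPU requests are handled by Run:ai's own mechanisms and are omitted, and the platform generates the actual pod specification for you.

    ```python
    # Conceptual sketch only: CPU request/limit semantics expressed with the
    # Kubernetes Python client. Run:ai builds the real pod spec from the asset.
    from kubernetes import client

    resources = client.V1ResourceRequirements(
        requests={"cpu": "500m", "memory": "1Gi"},  # minimum guaranteed per pod
        limits={"cpu": "2", "memory": "4Gi"},       # maximum the pod may consume
    )

    container = client.V1Container(
        name="example",
        image="python:3.11",
        resources=resources,
    )
    # If actual memory usage exceeds the limit, the pod is killed with an
    # out-of-memory signal, which is the risk described in the notes above.
    ```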

    Editing a compute resource

    To edit a compute resource:

    1. Select the compute resource you want to edit
    2. Click Edit
    3. Click SAVE COMPUTE RESOURCE

    Note

    Workloads that are already using this asset are not affected by the change.

    Copying a compute resource

    To make a copy of an existing compute resource:

    1. Select the compute resource you want to copy
    2. Click MAKE A COPY
    3. Enter a name for the compute resource. The name must be unique.
    4. Update the compute resource as needed
    5. Click CREATE COMPUTE RESOURCE

    Deleting a compute resource

    1. Select the compute resource you want to delete
    2. Click DELETE
    3. On the dialog, click DELETE to confirm

    Note

    Workloads that are already using this asset are not affected.

    Using API

    Go to the Compute resources API reference to view the available actions
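
    As an example, compute resources can also be created programmatically through the REST API. The sketch below is a non-authoritative illustration: the endpoint path, payload fields, and token handling are assumptions, so check the Compute resources API reference for the exact schema and authentication flow.

    ```python
    # Hedged sketch: creating a compute resource via the REST API.
    # The path and payload below are assumptions; verify them in the API reference.
    import requests

    BASE_URL = "https://<company>.run.ai"   # your Run:ai control plane URL
    TOKEN = "<bearer-token>"                # obtain a token as described in the API docs

    payload = {
        "meta": {"name": "one-gpu-small", "scope": "project", "projectId": 1},
        "spec": {"gpuDevicesRequest": 1, "cpuCoreRequest": 0.5, "cpuMemoryRequest": "1G"},
    }

    response = requests.post(
        f"{BASE_URL}/api/v1/asset/compute",  # assumed path
        headers={"Authorization": f"Bearer {TOKEN}"},
        json=payload,
        timeout=30,
    )
    response.raise_for_status()
    print(response.json())
    ```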


    \ No newline at end of file diff --git a/v2.20/platform-admin/workloads/assets/credentials/index.html b/v2.20/platform-admin/workloads/assets/credentials/index.html index d430d8f7ef..da593ab7fa 100644 --- a/v2.20/platform-admin/workloads/assets/credentials/index.html +++ b/v2.20/platform-admin/workloads/assets/credentials/index.html @@ -1,4 +1,4 @@ - Credentials - Run:ai Documentation Library

    Credentials

    This article explains what credentials are and how to create and use them.

    Credentials are a workload asset that simplifies the complexities of Kubernetes secrets. They store and mask sensitive access information, such as passwords, tokens, and access keys, which are necessary for gaining access to various resources.

    Credentials are crucial for the security of AI workloads and the resources they require, as they restrict access to authorized users, verify identities, and ensure secure interactions. By enforcing the protection of sensitive data, credentials help organizations comply with industry regulations, fostering a secure environment overall.

    Essentially, credentials enable AI practitioners to access relevant protected resources, such as private data sources and Docker images, thereby streamlining the workload submission process.

    Credentials table

    The Credentials table can be found under Workload manager in the Run:ai User interface.

    The Credentials table provides a list of all the credentials defined in the platform and allows you to manage them.

    The Credentials table comprises the following columns:

    Column Description
    Credentials The name of the credentials
    Description A description of the credentials
    Type The type of credentials, e.g., Docker registry
    Status The different lifecycle phases and representation of the credentials’ condition
    Scope The scope of the credentials within the organizational tree. Click the name of the scope to view the organizational tree diagram
    Kubernetes name The unique name of the credentials Kubernetes name as it appears in the cluster
    Environment(s) The environment(s) that are associated with the credentials
    Data source(s) The private data source(s) that are accessed using the credentials
    Created by The user who created the credentials
    Creation time The timestamp of when the credentials were created
    Cluster The cluster with which the credentials are associated

    Credentials status

    The following table describes the credentials’ condition and whether they were created successfully for the selected scope.

    Status Description
    No issues found No issues were found while creating the credentials (this status may change while propagating the credentials to the selected scope)
    Issues found Issues found while propagating the credentials
    Issues found Failed to access the cluster
    Creating… Credentials are being created
    Deleting… Credentials are being deleted
    No status When the credentials’ scope is an account, or the current version of the cluster is not up to date, the status cannot be displayed

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click ‘Download as CSV’. Export to CSV is limited to 20,000 rows.
    • Refresh - Click REFRESH to update the table with the latest data

    Adding new credentials

    Creating credentials is limited to specific roles.

    To add a new credential:

    1. Go to the Credentials table:
    2. Click +NEW CREDENTIALS
    3. Select the credential type from the list
      Follow the step-by-step guide for each credential type:

    Docker registry

    These credentials allow users to authenticate and pull images from a Docker registry, enabling access to containerized applications and services.

    After the credentials are created, they are used automatically when pulling images.

    1. Select a scope.
    2. Enter a name for the credential. The name must be unique.
    3. Optional: Provide a description of the credentials
    4. Set how the credential is created
      • Existing secret (in the cluster)
        This option applies when the purpose is to create credentials based on an existing secret
        • Select a secret from the list (The list is empty if no secrets were created in advance)
      • New secret (recommended)
        A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
        • Enter the username, password, and Docker registry URL
    5. Click CREATE CREDENTIALS

    After the credentials are created, check their status to monitor their proper creation across the selected scope.
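
    Under the hood, Docker registry credentials typically correspond to a Kubernetes secret of type kubernetes.io/dockerconfigjson, which Run:ai creates and manages for you when you use the UI. The sketch below only illustrates what such a secret looks like when created with the Kubernetes Python client; the secret name, namespace, registry URL, and credentials are hypothetical.

    ```python
    # Illustrative only: the kind of secret that backs a Docker registry credential.
    import base64
    import json
    from kubernetes import client, config

    config.load_kube_config()

    auth = base64.b64encode(b"myuser:mypassword").decode()
    docker_config = {"auths": {"https://index.docker.io/v1/": {
        "username": "myuser", "password": "mypassword", "auth": auth}}}

    secret = client.V1Secret(
        metadata=client.V1ObjectMeta(name="my-registry-creds", namespace="runai"),
        type="kubernetes.io/dockerconfigjson",
        string_data={".dockerconfigjson": json.dumps(docker_config)},
    )
    client.CoreV1Api().create_namespaced_secret(namespace="runai", body=secret)
    ```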

    Access key

    These credentials are unique identifiers used to authenticate and authorize access to cloud services or APIs, ensuring secure communication between applications. They typically consist of two parts:

    • An access key ID
    • A secret access key

    The purpose of this credential type is to allow access to restricted data.

    1. Select a scope.
    2. Enter a name for the credential. The name must be unique.
    3. Optional: Provide a description of the credential
    4. Set how the credential is created
      • Existing secret (in the cluster)
        This option applies when the purpose is to create credentials based on an existing secret
        • Select a secret from the list (The list is empty if no secrets were created in advance)
      • New secret (recommended)
        A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
        • Enter the Access key and Access secret
    5. Click CREATE CREDENTIALS

    After the credentials are created, check their status to monitor their proper creation across the selected scope.

    Username & password

    These credentials require a username and corresponding password to access various resources, ensuring that only authorized users can log in.

    The purpose of this credential type is to allow access to restricted data.

    1. Select a scope
    2. Enter a name for the credential. The name must be unique.
    3. Optional: Provide a description of the credentials
    4. Set how the credential is created
      • Existing secret (in the cluster)
        This option applies when the purpose is to create credentials based on an existing secret
        • Select a secret from the list (The list is empty if no secrets were created in advance)
      • New secret (recommended)
        A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
        • Enter the username and password
    5. Click CREATE CREDENTIALS

    After the credentials are created, check their status to monitor their proper creation across the selected scope.

    Generic secret

    These credentials are a flexible option that consists of multiple keys & values and can store various sensitive information, such as API keys or configuration data, to be used securely within applications.

    The purpose of this credential type is to allow access to restricted data.

    1. Select a scope
    2. Enter a name for the credential. The name must be unique.
    3. Optional: Provide a description of the credentials
    4. Set how the credential is created
      • Existing secret (in the cluster)
        This option applies when the purpose is to create credentials based on an existing secret
        • Select a secret from the list (The list is empty if no secrets were created in advance)
      • New secret (recommended)
        A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
        • Click +KEY & VALUE - to add key/value pairs to store in the new secret
    5. Click CREATE CREDENTIALS

    Editing credentials

    To rename a credential:

    1. Select the credential from the table
    2. Click Rename to edit its name and description

    Deleting credentials

    To delete a credential:

    1. Select the credential you want to delete
    2. Click DELETE
    3. In the dialog, click DELETE to confirm

    Note

    Credentials cannot be deleted if they are being used by a workload or a template.

    Using credentials

    You can use credentials (secrets) in various ways within the system:

    Access private data sources

    To access private data sources, attach credentials to data sources of the following types: Git, S3 Bucket

    Use directly within the container

    To use the secret directly from within the container, you can choose between the following options:

    1. Get the secret mounted to the file system by using the Generic secret data source
    2. Get the secret as an environment variable injected into the container (see the sketch below). There are two equivalent ways to inject the environment variable:

      a. By adding it to the Environment asset.
      b. By adding it ad hoc as part of the workload.
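
    For illustration only, the environment-variable option corresponds to the standard Kubernetes secretKeyRef mechanism. The fragment below is a minimal sketch of what the injected variable maps to inside a pod spec; the container, secret, variable, and key names are placeholder assumptions, not values generated by Run:ai.

      containers:
        - name: my-workload              # placeholder container name
          image: my-image:latest         # placeholder image
          env:
            - name: S3_ACCESS_KEY        # variable name seen inside the container
              valueFrom:
                secretKeyRef:
                  name: my-credentials   # the secret behind the credentials asset (assumed name)
                  key: accessKeyId       # placeholder key within that secret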


    Creating secrets in advance

    Add secrets in advance to be used when creating credentials via the Run:ai UI.

    Follow the steps below for each required scope:

    Cluster scope:

    1. Create the secret in the Run:ai namespace (runai)
    2. To authorize Run:ai to use the secret, label it: run.ai/cluster-wide: "true"
    3. Label the secret with the correct credential type:
      1. Docker registry - run.ai/resource: "docker-registry"
      2. Access key - run.ai/resource: "access-key"
      3. Username and password - run.ai/resource: "password"
      4. Generic secret - run.ai/resource: "generic"

    Department scope:

    1. Create the secret in the Run:ai namespace (runai)
    2. To authorize Run:ai to use the secret, label it: run.ai/department: "<department id>"
    3. Label the secret with the correct credential type:
      1. Docker registry - run.ai/resource: "docker-registry"
      2. Access key - run.ai/resource: "access-key"
      3. Username and password - run.ai/resource: "password"
      4. Generic secret - run.ai/resource: "generic"

    Project scope:

    1. Create the secret in the project’s namespace
    2. Label the secret with the correct credential type:
      1. Docker registry - run.ai/resource: "docker-registry"
      2. Access key - run.ai/resource: "access-key"
      3. Username and password - run.ai/resource: "password"
      4. Generic secret - run.ai/resource: "generic"

    The secret is now displayed for that scope in the list of existing secrets.
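
    For reference, a Docker registry secret prepared in advance for the cluster scope might look like the minimal sketch below. The namespace and the two labels follow the steps above; the secret name, type, and contents are placeholders, and the same pattern applies to the other credential types with their respective run.ai/resource values.

      apiVersion: v1
      kind: Secret
      metadata:
        name: my-registry-creds                  # placeholder name
        namespace: runai                         # the Run:ai namespace
        labels:
          run.ai/cluster-wide: "true"            # authorizes Run:ai to use the secret
          run.ai/resource: "docker-registry"     # marks the credential type
      type: kubernetes.io/dockerconfigjson       # assumed type for a Docker registry secret
      data:
        .dockerconfigjson: <base64-encoded Docker config>   # placeholder payload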

    Using API

    To view the available actions, go to the Credentials API reference

    \ No newline at end of file diff --git a/v2.20/platform-admin/workloads/assets/data-volumes/index.html b/v2.20/platform-admin/workloads/assets/data-volumes/index.html index e681097619..79a6e9c881 100644 --- a/v2.20/platform-admin/workloads/assets/data-volumes/index.html +++ b/v2.20/platform-admin/workloads/assets/data-volumes/index.html @@ -1,4 +1,4 @@ - Data Volumes - Run:ai Documentation Library

    Data Volumes

    Data volumes offer a powerful solution for storing, managing, and sharing AI training data within the Run:ai platform. They promote collaboration, simplify data access control, and streamline the AI development lifecycle.

    Data volumes are snapshots of datasets stored in Kubernetes Persistent Volume Claims (PVCs). They act as a central repository for training data.

    Why use a data volume?

    1. Sharing with multiple scopes
      Unlike other Run:ai data sources, data volumes can be shared across projects, departments, or clusters, encouraging data reuse and collaboration within the organization.
    2. Storage saving
      A single copy of the data can be used across multiple scopes

    Typical use cases

    1. Sharing large data sets
      In large organizations, the data is often stored in a remote location, which can be a barrier for large model training. Even if the data is transferred into the cluster, sharing it easily with multiple users is still challenging. Data volumes can help share the data seamlessly, with maximum security and control.
    2. Sharing data with colleagues
      When you need to share training results, generated data sets, or other artifacts with team members, data volumes help make the data easily available.

    (Diagram: data volumes architecture)

    Prerequisites

    To create a data volume, there must be a project with a PVC in its namespace.

    Working with data volumes is currently available using the API. To view the available actions, go to the Data volumes API reference.

    Adding a new data volume

    Data volume creation is limited to specific roles

    Adding scopes for a data volume

    Data volume sharing (adding scopes) is limited to specific roles

    Once created, the data volume is available to its originating project (see the prerequisites above).

    Data volumes can be shared with additional scopes in the organization.

    Who can use a data volume?

    Data volumes are used when submitting workloads. Any user, application or SSO group with a role that has permissions to create workloads can also use data volumes.

    Researchers can list available data volumes within their permitted scopes for easy selection.

    \ No newline at end of file diff --git a/v2.20/platform-admin/workloads/assets/datasources/index.html b/v2.20/platform-admin/workloads/assets/datasources/index.html index 352d088cfe..60c5d7179c 100644 --- a/v2.20/platform-admin/workloads/assets/datasources/index.html +++ b/v2.20/platform-admin/workloads/assets/datasources/index.html @@ -1,4 +1,4 @@ - Data Sources - Run:ai Documentation Library

    Data Sources

    This article explains what data sources are and how to create and use them.

    Data sources are a type of workload asset and represent a location where data is actually stored. They may represent a remote data location, such as NFS, Git, or S3, or a Kubernetes local resource, such as PVC, ConfigMap, HostPath, or Secret.

    This configuration simplifies the mapping of the data into the workload’s file system and handles the mounting process during workload creation for reading and writing. These data sources are reusable and can be easily integrated and used by AI practitioners while submitting workloads across various scopes.

    Data sources table

    The data sources table can be found under Workload manager in the Run:ai platform.

    The data sources table provides a list of all the data sources defined in the platform and allows you to manage them.

    The data sources table comprises the following columns:

    Column Description
    Data source The name of the data source
    Description A description of the data source
    Type The type of data source connected – e.g., S3 bucket, PVC, or others
    Status The different lifecycle phases and representation of the data source condition
    Scope The scope of the data source within the organizational tree. Click the scope name to view the organizational tree diagram
    Kubernetes name The data source’s unique Kubernetes name as it appears in the cluster
    Workload(s) The list of existing workloads that use the data source
    Template(s) The list of workload templates that use the data source
    Created by The user who created the data source
    Creation time The timestamp for when the data source was created
    Cluster The cluster that the data source is associated with

    Data sources status

    The following table describes the data sources' condition and whether they were created successfully for the selected scope.

    Status Description
    No issues found No issues were found while creating the data source
    Issues found Issues were found while propagating the data source credentials
    Issues found The data source couldn’t be created at the cluster
    Creating… The data source is being created
    No status / “-” When the data source’s scope is an account, the current version of the cluster is not up to date, or the asset is not a cluster-syncing entity, the status can’t be displayed

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click ‘Download as CSV’
    • Refresh - Click REFRESH to update the table with the latest data

    Adding a new data source

    To create a new data source:

    1. Click +NEW DATA SOURCE
    2. Select the data source type from the list. Follow the step-by-step guide for each data source type:

    NFS

    A Network File System (NFS) is a Kubernetes concept used for sharing storage in the cluster among different pods. Like a PVC, the NFS volume’s content remains preserved, even outside the lifecycle of a single pod. However, unlike PVCs, which abstract storage management, NFS provides a method for network-based file sharing. The NFS volume can be pre-populated with data and can be mounted by multiple pod writers simultaneously. At Run:ai, an NFS-type data source is an abstraction that is mapped directly to a Kubernetes NFS volume. This integration allows multiple workloads under various scopes to mount and present the NFS data source.

    1. Select the cluster under which to create this data source
    2. Select a scope
    3. Enter a name for the data source. The name must be unique.
    4. Optional: Provide a description of the data source
    5. Set the data origin
      • Enter the NFS server (host name or host IP)
      • Enter the NFS path
    6. Set the data target location
      • Container path
    7. Optional: Restrictions
      • Prevent data modification - When enabled, the data will be mounted with read-only permissions
    8. Click CREATE DATA SOURCE
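
    For context, the Kubernetes NFS volume that such a data source maps to is sketched below as a pod spec fragment. The volume name, server, export path, and mount path are placeholder assumptions; the readOnly flag corresponds to the "Prevent data modification" restriction.

      volumes:
        - name: training-data            # placeholder volume name
          nfs:
            server: nfs.example.com      # the NFS server (host name or IP)
            path: /exports/datasets      # the NFS path
            readOnly: true               # "Prevent data modification"
      containers:
        - name: my-workload
          image: my-image:latest
          volumeMounts:
            - name: training-data
              mountPath: /data           # the container path (data target location)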

    PVC

    A Persistent Volume Claim (PVC) is a Kubernetes concept used for managing storage in the cluster, which can be provisioned by an administrator or dynamically by Kubernetes using a StorageClass. PVCs allow users to request specific sizes and access modes (read/write once, read-only many). Run:ai ensures that data remains consistent and accessible across various scopes and workloads, beyond the lifecycle of individual pods, which is efficient while working with large datasets typically associated with AI projects.

    1. Select the cluster under which to create this data source
    2. Select a scope
    3. Enter a name for the data source. The name must be unique.
    4. Optional: Provide a description of the data source
    5. Select PVC:
      • Existing PVC
        This option is relevant when the purpose is to create a PVC-type data source based on an existing PVC in the cluster
        • Select a PVC from the list - (The list is empty if no existing PVCs were created in advance)
      • New PVC - creates a new PVC in the cluster. New PVCs are not added to the Existing PVCs list.
        When creating a PVC-type data source and selecting the ‘New PVC’ option, the PVC is immediately created in the cluster (even if no workload has requested this PVC).
    6. Select the storage class
      • None - Proceed without defining a storage class
      • Custom storage class - This option applies when selecting a storage class based on existing storage classes.
        To add new storage classes to the storage class list, and for additional information, check Kubernetes storage classes
    7. Select the access mode(s) (multiple modes can be selected)
      • Read-write by one node - The volume can be mounted as read-write by a single node.
      • Read-only by many nodes - The volume can be mounted as read-only by many nodes.
      • Read-write by many nodes - The volume can be mounted as read-write by many nodes.
    8. Set the claim size and its units
    9. Select the volume mode
      • File system (default) - allows the volume to be mounted as a filesystem, enabling the usage of directories and files.
      • Block - exposes the volume as a block storage, which can be formatted or used by applications directly without a filesystem.
    10. Set the data target location
      • container path
    11. Optional: Prevent data modification - When enabled, the data will be mounted with read-only permission.
    12. Click CREATE DATA SOURCE

    After the data source is created, check its status to monitor its proper creation across the selected scope.
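
    For reference, a PVC equivalent to the selections above might look like the minimal sketch below; the name, namespace, storage class, and size are placeholder assumptions.

      apiVersion: v1
      kind: PersistentVolumeClaim
      metadata:
        name: my-dataset-pvc             # placeholder name
        namespace: runai-my-project      # placeholder project namespace
      spec:
        storageClassName: standard       # the selected storage class (omit when "None")
        accessModes:
          - ReadWriteOnce                # "Read-write by one node"
          - ReadOnlyMany                 # "Read-only by many nodes"
        volumeMode: Filesystem           # or "Block"
        resources:
          requests:
            storage: 10Gi                # claim size and units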

    S3 Bucket

    The S3 bucket data source enables the mapping of a remote S3 bucket into the workload’s file system. Similar to a PVC, this mapping remains accessible across different workload executions, extending beyond the lifecycle of individual pods. However, unlike PVCs, data stored in an S3 bucket resides remotely, which may lead to decreased performance during the execution of heavy machine learning workloads. As part of the Run:ai connection to the S3 bucket, you can create credentials in order to access and map private buckets.

    1. Select the cluster under which to create this data source
    2. Select a scope
    3. Enter a name for the data source. The name must be unique.
    4. Optional: Provide a description of the data source
    5. Set the data origin
      • Set the S3 service URL
      • Select the credentials
        • None - for public buckets
        • Credential names - This option is relevant for private buckets based on existing credentials that were created for the scope.
          To add new credentials to the credentials list, and for additional information, check the Credentials article.
      • Enter the bucket name
    6. Set the data target location
      • container path
    7. Click CREATE DATA SOURCE

    After a private data source is created, check its status to monitor its proper creation across the selected scope.

    Git

    A Git-type data source is a Run:ai integration that enables code to be copied from a Git branch into a dedicated folder in the container. It is mainly used to provide the workload with the latest code repository. As part of the integration with Git, in order to access private repositories, you can add predefined credentials to the data source mapping.

    1. Select the cluster under which to create this data source
    2. Select a scope
    3. Enter a name for the data source. The name must be unique.
    4. Optional: Provide a description of the data source
    5. Set the data origin
      • Set the Repository URL
      • Set the Revision (branch, tag, or hash) - If left empty, the 'HEAD' (latest) revision is used
      • Select the credentials
        • None - for public repositories
        • Credential names - This option applies to private repositories based on existing credentials that were created for the scope.
          To add new credentials to the credentials list, and for additional information, check the Credentials article.
    6. Set the data target location
      • container path
    7. Click CREATE DATA SOURCE

    After a private data source is created, check its status to monitor its proper creation across the selected scope.

    Host path

    A Host path volume is a Kubernetes concept that enables mounting a host path file or a directory on the workload’s file system. Like a PVC, the host path volume’s data persists across workloads under various scopes. It also enables data serving from the hosting node.

    1. Select the cluster under which to create this data source
    2. Select a scope
    3. Enter a name for the data source. The name must be unique.
    4. Optional: Provide a description of the data source
    5. Set the data origin
      • host path
    6. Set the data target location
      • container path
    7. Optional: Prevent data modification - When enabled, the data will be mounted with read-only permissions.
    8. Click CREATE DATA SOURCE
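
    For context, the underlying Kubernetes hostPath volume is sketched below as a pod spec fragment; the paths and names are placeholder assumptions.

      volumes:
        - name: local-data               # placeholder volume name
          hostPath:
            path: /mnt/data              # the host path (data origin)
      containers:
        - name: my-workload
          image: my-image:latest
          volumeMounts:
            - name: local-data
              mountPath: /data           # the container path (data target location)
              readOnly: true             # "Prevent data modification"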

    ConfigMap

    A ConfigMap data source is a Run:ai abstraction for the Kubernetes ConfigMap concept. The ConfigMap is used mainly for storage that can be mounted on the workload container for non-confidential data. It is usually represented as key-value pairs (e.g., environment variables, command-line arguments, etc.). It allows you to decouple environment-specific system configurations from your container images, so that your applications are easily portable. ConfigMaps must be created on the cluster prior to being used within the Run:ai system.

    1. Select the cluster under which to create this data source
    2. Select a scope
    3. Enter a name for the data source. The name must be unique.
    4. Optional: Provide a description of the data source
    5. Set the data origin
      • Select the ConfigMap name (The list is empty if no existing ConfigMaps were created in advance).
    6. Set the data target location
      • container path
    7. Click CREATE DATA SOURCE
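
    Since the ConfigMap must already exist in the cluster, a minimal sketch of one is shown below; the name, namespace, and keys are placeholder assumptions.

      apiVersion: v1
      kind: ConfigMap
      metadata:
        name: training-config            # placeholder name
        namespace: runai-my-project      # placeholder project namespace
      data:
        LOG_LEVEL: "info"                # example key-value pair
        config.yaml: |                   # a whole file can also be stored as a value
          batch_size: 32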

    Secret

    A secret-type data source enables the mapping of a credential into the workload’s file system. Credentials are a workload asset that simplify the complexities of Kubernetes Secrets. The credentials mask sensitive access information, such as passwords, tokens, and access keys, which are necessary for gaining access to various resources.

    1. Select the cluster under which to create this data source
    2. Select a scope
    3. Enter a name for the data source. The name must be unique.
    4. Optional: Provide a description of the data source
    5. Set the data origin
      • Select the credentials
        To add new credentials, and for additional information, check the Credentials article.
    6. Set the data target location
      • container path
    7. Click CREATE DATA SOURCE

    After the data source is created, check its status to monitor its proper creation across the selected scope.
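
    In plain Kubernetes terms, mapping a credential into the file system roughly corresponds to mounting the secret as a volume, as in the sketch below; the volume, secret, and path names are placeholder assumptions.

      volumes:
        - name: creds-vol
          secret:
            secretName: my-credentials   # the secret behind the selected credentials (assumed name)
      containers:
        - name: my-workload
          image: my-image:latest
          volumeMounts:
            - name: creds-vol
              mountPath: /etc/creds      # the container path; each secret key appears as a file
              readOnly: true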

    Note

    It is also possible to add data sources directly when creating a specific workspace, training or inference workload

    Editing a data source

    To edit a data source:

    1. Select the data source from the table
    2. Click Rename to provide it with a new name
    3. Click Copy & Edit to make any changes to the data source

    Deleting a data source

    To delete a data source:

    1. Select the data source you want to delete
    2. Click DELETE
    3. Confirm you want to delete the data source

    Note

    It is not possible to delete a data source that is being used by an existing workload or template.

    Using API

    To view the available actions, go to the Data sources API reference.

    \ No newline at end of file diff --git a/v2.20/platform-admin/workloads/assets/environments/index.html b/v2.20/platform-admin/workloads/assets/environments/index.html index 5b6a20a5c8..ed5dd52343 100644 --- a/v2.20/platform-admin/workloads/assets/environments/index.html +++ b/v2.20/platform-admin/workloads/assets/environments/index.html @@ -1,4 +1,4 @@ - Environments - Run:ai Documentation Library

    Environments

    This article explains what environments are and how to create and use them.

    Environments are one type of workload asset. An environment consists of a configuration that simplifies how workloads are submitted and can be used by AI practitioners when they submit their workloads.

    An environment asset is a preconfigured building block that encapsulates aspects for the workload such as:

    • Container image and container configuration
    • Tools and connections
    • The type of workload it serves

    Environments table

    The Environments table can be found under Workload manager in the Run:ai platform.

    The Environments table provides a list of all the environments defined in the platform and allows you to manage them.

    The Environments table consists of the following columns:

    Column Description
    Environment The name of the environment
    Description A description of the environment
    Scope The scope of this environment within the organizational tree. Click the name of the scope to view the organizational tree diagram
    Image The application or service to be run by the workload
    Workload Architecture This can be either standard for running workloads on a single node or distributed for running distributed workloads on multiple nodes
    Tool(s) The tools and connection types the environment exposes
    Workload(s) The list of existing workloads that use the environment
    Workload types The workload types that can use the environment (Workspace/ Training / Inference)
    Template(s) The list of workload templates that use this environment
    Created by The user who created the environment. By default, the Run:ai UI comes with preinstalled environments created by Run:ai
    Creation time The timestamp of when the environment was created
    Last updated The timestamp of when the environment was last updated
    Cluster The cluster with which the environment is associated

    Tools associated with the environment

    Click one of the values in the tools column to view the list of tools and their connection type.

    Column Description
    Tool name The name of the tool or application the AI practitioner can set up within the environment.
    Connection type The method by which you can access and interact with the running workload. It's essentially the "doorway" through which you can reach and use the tools the workload provides (e.g., node port, external URL, etc.)

    Workloads associated with the environment

    Click one of the values in the Workload(s) column to view the list of workloads and their parameters.

    Column Description
    Workload The workload that uses the environment
    Type The workload type (Workspace/Training/Inference)
    Status Represents the workload lifecycle. See the full list of workload status

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then Click Download as CSV. Export to CSV is limited to 20,000 rows.

    Environments created by Run:ai

    When installing Run:ai, you automatically get the environments created by Run:ai to ease the onboarding process and support different use cases out of the box.
    These environments are created at the scope of the account.

    Environment Image
    Jupyter-lab jupyter/scipy-notebook
    jupyter-tensorboard gcr.io/run-ai-demo/jupyter-tensorboard
    tensorboard tensorflow/tensorflow:latest
    llm-server runai.jfrog.io/core-llm/runai-vllm:v0.6.4-0.10.0
    chatbot-ui runai.jfrog.io/core-llm/llm-app
    gpt2 runai.jfrog.io/core-llm/quickstart-inference:gpt2-cpu

    Adding a new environment

    Environment creation is limited to specific roles

    To add a new environment:

    1. Go to the Environments table
    2. Click +NEW ENVIRONMENT
    3. Select under which cluster to create the environment
    4. Select a scope
    5. Enter a name for the environment. The name must be unique.
    6. Optional: Provide a description of the essence of the environment
    7. Enter the Image URL
      If a token or secret is required to pull the image, it is possible to create it via credentials of type Docker registry. These credentials are used automatically when the image is pulled (which happens when the workload is submitted)
    8. Set the image pull policy - the condition for when to pull the image from the registry
    9. Set the workload architecture:
      • Standard
        Only standard workloads can use the environment. A standard workload consists of a single process.
      • Distributed
        Only distributed workloads can use the environment. A distributed workload consists of multiple processes working together. These processes can run on different nodes.
      • Select a framework from the list.
    10. Set the workload type:
      • Workspace
      • Training
      • Inference
      • When inference is selected, define the endpoint of the model by providing both the protocol and the container’s serving port
    11. Optional: Set the connection for your tool(s). The tools must be configured in the image. When submitting a workload using the environment, it is possible to connect to these tools
      • Select the tool from the list (the available tools varies from IDE, experiment tracking, and more, including a custom tool for your choice)
      • Select the connection type
        • External URL
          • Auto generate
            A unique URL is automatically created for each workload using the environment
          • Custom URL
            The URL is set manually
        • Node port
          • Auto generate
            A unique port is automatically exposed for each workload using the environment
          • Custom port
            Set the port manually
        • Set the container port
    12. Optional: Set a command and arguments for the container running the pod
      • When no command is added, the default command of the image is used (the image entrypoint)
      • The command can be modified while submitting a workload using the environment
      • The argument(s) can be modified while submitting a workload using the environment
    13. Optional: Set the environment variable(s)
      • Click +ENVIRONMENT VARIABLE
      • Enter a name
      • Select the source for the environment variable
      • Custom
        • Enter a value
        • Leave empty
        • Add instructions for the expected value if any
      • Credentials - Select existing credentials as the environment variable
        • Select a credential name
          To add new credentials to the credentials list, and for additional information, see Credentials.
        • Select a secret key
      • The environment variables can be modified and new variables can be added while submitting a workload using the environment
    14. Optional: Set the container’s working directory to define where the container’s process starts running. When left empty, the default directory is used.
    15. Optional: Set where the UID, GID and supplementary groups are taken from (see the security context sketch after these steps). This can be:
      • From the image
      • From the IdP token (only available in SSO installations)
      • Custom (manually set) - decide whether the submitter can modify these values upon submission.
      • Set the User ID (UID), Group ID (GID) and the supplementary groups that can run commands in the container
        • Enter UID
        • Enter GID
        • Add Supplementary groups (multiple groups can be added, separated by commas)
        • Disable Allow the values above to be modified within the workload if you want the values above to be used as the defaults
    16. Optional: Select Linux capabilities - Grant certain privileges to a container without granting all the privileges of the root user.
    17. Click CREATE ENVIRONMENT
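
    In Kubernetes terms, the UID/GID, supplementary groups, and Linux capabilities settings above roughly correspond to the pod and container security context, as in the sketch below; all values are placeholder assumptions.

      securityContext:                   # pod level
        runAsUser: 1000                  # UID
        runAsGroup: 3000                 # GID
        supplementalGroups: [4000]       # supplementary groups
      containers:
        - name: my-workload
          image: my-image:latest
          securityContext:               # container level
            capabilities:
              add: ["NET_ADMIN"]         # example Linux capability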

    Note

    It is also possible to add environments directly when creating a specific workspace, training or inference workload.

    Editing an environment

    To edit an environment:

    1. Select the environment you want to edit
    2. Click Edit
    3. Click SAVE ENVIRONMENT

    Note

    • Workloads that are already bound to this asset will not be affected.
    • llm-server and chatbot-ui environments cannot be edited.

    Copying an environment

    To make a copy of an existing environment:

    1. Select the environment you want to copy
    2. Click MAKE A COPY
    3. Enter a name for the environment. The name must be unique.
    4. Update the environment
    5. Click CREATE ENVIRONMENT

    Deleting an environment

    To delete an environment:

    1. Select the environment you want to delete
    2. Click DELETE
    3. On the dialog, click DELETE to confirm

    Note

    Workloads that are already bound to this asset will not be affected.

    Using API

    Go to the Environment API reference to view the available actions

    Environments

    This article explains what environments are and how to create and use them.

    Environments are one type of workload asset. An environment consists of a configuration that simplifies how workloads are submitted and can be used by AI practitioners when they submit their workloads.

    An environment asset is a preconfigured building block that encapsulates aspects for the workload such as:

    • Container image and container configuration
    • Tools and connections
    • The type of workload it serves

    Environments table

    The Environments table can be found under Workload manager in the Run:ai platform.

    The Environments table provides a list of all the environments defined in the platform and allows you to manage them.

    The Environments table consists of the following columns:

    Column Description
    Environment The name of the environment
    Description A description of the environment
    Scope The scope of this environment within the organizational tree. Click the name of the scope to view the organizational tree diagram
    Image The application or service to be run by the workload
    Workload Architecture This can be either standard for running workloads on a single node or distributed for running distributed workloads on multiple nodes
    Tool(s) The tools and connection types the environment exposes
    Workload(s) The list of existing workloads that use the environment
    Workload types The workload types that can use the environment (Workspace/ Training / Inference)
    Template(s) The list of workload templates that use this environment
    Created by The user who created the environment. By default Run:ai UI comes with preinstalled environments created by Run:ai created by Run:ai
    Creation time The timestamp of when the environment was created
    Last updated The timestamp of when the environment was last updated
    Cluster The cluster with which the environment is associated

    Tools associated with the environment

    Click one of the values in the tools column to view the list of tools and their connection type.

    Column Description
    Tool name The name of the tool or application AI practitioner can set up within the environment.
    Connection type The method by which you can access and interact with the running workload. It's essentially the "doorway" through which you can reach and use the tools the workload provide. (E.g node port, external URL, etc)

    Workloads associated with the environment

    Click one of the values in the Workload(s) column to view the list of workloads and their parameters.

    Column Description
    Workload The workload that uses the environment
    Type The workload type (Workspace/Training/Inference)
    Status Represents the workload lifecycle. See the full list of workload status

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then Click Download as CSV. Export to CSV is limited to 20,000 rows.

    Environments created by Run:ai

    When installing Run:ai, you automatically get the environments created by Run:ai to ease up the onboarding process and support different use cases out of the box.
    These environments are created at the scope of the account.

    Environment Image
    Jupiter-lab jupyter/scipy-notebook
    jupyter-tensorboard gcr.io/run-ai-demo/jupyter-tensorboard
    tensorboard tensorflow/tensorflow:latest
    llm-server runai.jfrog.io/core-llm/runai-vllm:v0.6.4-0.10.0
    chatbot-ui runai.jfrog.io/core-llm/llm-app
    gpt2 runai.jfrog.io/core-llm/quickstart-inference:gpt2-cpu

    Adding a new environment

    Environment creation is limited to specific roles

    To add a new environment:

    1. Go to the Environments table
    2. Click +NEW ENVIRONMENT
    3. Select under which cluster to create the environment
    4. Select a scope
    5. Enter a name for the environment. The name must be unique.
    6. Optional: Provide a description of the essence of the environment
    7. Enter the Image URL
      If a token or secret is required to pull the image, it is possible to create it via credentials of type docker registry. These credentials are automatically used once the image is pulled (which happens when the workload is submitted)
    8. Set the image pull policy - the condition for when to pull the image from the registry
    9. Set the workload architecture:
      • Standard
        Only standard workloads can use the environment. A standard workload consists of a single process.
      • Distributed
        Only distributed workloads can use the environment. A distributed workload consists of multiple processes working together. These processes can run on different nodes.
      • Select a framework from the list.
    10. Set the workload type:
      • Workspace
      • Training
      • Inference
      • When inference is selected, define the endpoint of the model by providing both the protocol and the container’s serving port
    11. Optional: Set the connection for your tool(s). The tools must be configured in the image. When submitting a workload using the environment, it is possible to connect to these tools
      • Select the tool from the list (the available tools varies from IDE, experiment tracking, and more, including a custom tool for your choice)
      • Select the connection type
        • External URL
          • Auto generate
            A unique URL is automatically created for each workload using the environment
          • Custom URL
            The URL is set manually
        • Node port
          • Auto generate
            A unique port is automatically exposed for each workload using the environment
          • Custom URL
            Set the port manually
        • Set the container port
    12. Optional: Set a command and arguments for the container running the pod
      • When no command is added, the default command of the image is used (the image entrypoint)
      • The command can be modified while submitting a workload using the environment
      • The argument(s) can be modified while submitting a workload using the environment
    13. Optional: Set the environment variable(s)
      • Click +ENVIRONMENT VARIABLE
      • Enter a name
      • Select the source for the environment variable
      • Custom
        • Enter a value
        • Leave empty
        • Add instructions for the expected value if any
      • Credentials - Select existing credentials as the environment variable
        • Select a credential name
          To add new credentials to the credentials list, and for additional information, see Credentials.
        • Select a secret key
      • The environment variables can be modified and new variables can be added while submitting a workload using the environment
    14. Optional: Set the container’s working directory to define where the container’s process starts running. When left empty, the default directory is used.
    15. Optional: Set where the UID, GID and supplementary groups are taken from, this can be:
      • From the image
      • From the IdP token (only available in an SSO installations)
      • Custom (manually set) - decide whether the submitter can modify these value upon submission.
      • Set the User ID (UID), Group ID (GID) and the supplementary groups that can run commands in the container
        • Enter UID
        • Enter GID
        • Add Supplementary groups (multiple groups can be added, separated by commas)
        • Disable Allow the values above to be modified within the workload if you want the above values to be used as the default
    16. Optional: Select Linux capabilities - Grant certain privileges to a container without granting all the privileges of the root user.
    17. Click CREATE ENVIRONMENT

    Note

    It is also possible to add environments directly when creating a specific workspace, training or inference workload.

    Editing an environment

    To edit an environment:

    1. Select the environment you want to edit
    2. Click Edit
    3. Click SAVE ENVIRONMENT

    Note

    • The already bound workload that is using this asset will not be affected.
    • llm-server and chatbot-ui environments cannot be edited.

    Copying an environment

    To make a copy of an existing environment:

    1. Select the environment you want to copy
    2. Click MAKE A COPY
    3. Enter a name for the environment. The name must be unique.
    4. Update the environment
    5. Click CREATE ENVIRONMENT

    Deleting an environment

    To delete an environment:

    1. Select the environment you want to delete
    2. Click DELETE
    3. On the dialog, click DELETE to confirm

    Note

    The already bound workload that is using this asset will not be affected.

    Using API

    Go to the Environment API reference to view the available actions

    \ No newline at end of file diff --git a/v2.20/platform-admin/workloads/assets/overview/index.html b/v2.20/platform-admin/workloads/assets/overview/index.html index 8a26837a43..245a856bc0 100644 --- a/v2.20/platform-admin/workloads/assets/overview/index.html +++ b/v2.20/platform-admin/workloads/assets/overview/index.html @@ -1,4 +1,4 @@ - Overview - Run:ai Documentation Library

    Overview

    Workload assets enable organizations to:

    • Create and reuse preconfigured setups for code, data, storage, and resources, used by AI practitioners to simplify the process of submitting workloads
    • Share the preconfigured setup with a wide audience of AI practitioners with similar needs

    Note

    • The creation of assets is possible only via API and the Run:ai UI
    • The submission of workloads using assets is possible only via the Run:ai UI

    Workload asset types

    There are four workload asset types used by the workload:

    • Environments
      The container image, tools and connections for the workload
    • Data sources
      The type of data, its origin and the target storage location such as PVCs or cloud storage buckets where datasets are stored
    • Compute resources
      The compute specification, including GPU and CPU compute and memory
    • Credentials
      The secrets to be used to access sensitive data, services, and applications such as docker registry or S3 buckets

    Asset scope

    When a workload asset is created, a scope is required. The scope defines who in the organization can view and/or use the asset.

    Note

    When an asset is created via the API, the scope can be the entire account. This is currently an experimental feature.

    Who can create an asset?

    Any subject (user, application, or SSO group) with a role that has permissions to Create an asset can do so within their scope.

    Who can use an asset?

    Assets are used when submitting workloads. Any subject (user, application, or SSO group) with a role that has permissions to Create workloads can also use assets.

    Who can view an asset?

    Any subject (user, application, or SSO group) with a role that has permission to View an asset can do so within their scope.

    \ No newline at end of file diff --git a/v2.20/platform-admin/workloads/assets/templates/index.html b/v2.20/platform-admin/workloads/assets/templates/index.html index 0b6f6787d2..70d1176f6d 100644 --- a/v2.20/platform-admin/workloads/assets/templates/index.html +++ b/v2.20/platform-admin/workloads/assets/templates/index.html @@ -1,4 +1,4 @@ - Workspace Templates - Run:ai Documentation Library

    Workspace Templates

    This article explains the procedure to manage templates.

    A template is a pre-set configuration that is used to quickly configure and submit workloads using existing assets. A template consists of all the assets a workload needs, allowing researchers to submit a workload in a single click, or make subtle adjustments to differentiate them from each other.

    Workspace templates table

    The Templates table can be found under Workload manager in the Run:ai User interface.

    The Templates table provides a list of all the templates defined in the platform, and allows you to manage them.

    Flexible Management

    It is also possible to manage templates directly for a specific user, application, project, or department.

    The Templates table consists of the following columns:

    Column Description
    Scope The scope to which the subject has access. Click the name of the scope to see the scope and its subordinates
    Environment The name of the environment related to the workspace template
    Compute resource The name of the compute resource connected to the workspace template
    Data source(s) The name of the data source(s) connected to the workspace template
    Created by The subject that created the template
    Creation time The timestamp for when the template was created
    Cluster The cluster name containing the template

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
    • Refresh (optional) - Click REFRESH to update the table with the latest data
    • Show/Hide details (optional) - Click to view additional information on the selected row

    Adding a new workspace template

    To add a new template:

    1. Click +NEW TEMPLATE
    2. Set the scope for the template
    3. Enter a name for the template
    4. Select the environment for your workload
    5. Select the node resources needed to run your workload, or click +NEW COMPUTE RESOURCE
    6. Set the volume needed for your workload
    7. Create a new data source
    8. Set auto-deletion, annotations and labels, as required
    9. Click CREATE TEMPLATE

    Editing a template

    To edit a template:

    1. Select the template from the table
    2. Click Rename to provide it with a new name
    3. Click Copy & Edit to make any changes to the template

    Deleting a template

    To delete a template:

    1. Select the template you want to delete
    2. Click DELETE
    3. Confirm you want to delete the template

    Using API

    Go to the Workload template API reference to view the available actions.

    \ No newline at end of file diff --git a/v2.20/platform-admin/workloads/overviews/introduction-to-workloads/index.html b/v2.20/platform-admin/workloads/overviews/introduction-to-workloads/index.html index d6f5b8d1aa..3aca02828f 100644 --- a/v2.20/platform-admin/workloads/overviews/introduction-to-workloads/index.html +++ b/v2.20/platform-admin/workloads/overviews/introduction-to-workloads/index.html @@ -1,4 +1,4 @@ - Introduction to Workloads - Run:ai Documentation Library

    Introduction to Workloads

    Run:ai enhances visibility and simplifies management by monitoring, presenting, and orchestrating all AI workloads in the clusters it is installed on. Workloads are the fundamental building blocks for consuming resources, enabling AI practitioners such as researchers, data scientists and engineers to efficiently support the entire life cycle of an AI initiative.

    Workloads across the AI lifecycle

    A typical AI initiative progresses through several key stages, each with distinct workloads and objectives. With Run:ai, research and engineering teams can host and manage all these workloads to achieve the following:

    • Data preparation: Aggregating, cleaning, normalizing, and labeling data to prepare for training.
    • Training: Conducting resource-intensive model development and iterative performance optimization.
    • Fine-tuning: Adapting pre-trained models to domain-specific data sets while balancing efficiency and performance.
    • Inference: Deploying models for real-time or batch predictions with a focus on low latency and high throughput.
    • Monitoring and optimization: Ensuring ongoing performance by addressing data drift, usage patterns, and retraining as needed.

    What is a workload?

    A workload runs in the cluster, is associated with a namespace, and operates to fulfill its targets, whether that is running to completion for a batch job, allocating resources for experimentation in an integrated development environment (IDE)/notebook, or serving inference requests in production.

    The workload, defined by the AI practitioner, consists of:

    • Container images: This includes the application, its dependencies, and the runtime environment.
    • Compute resources: CPU, GPU, and RAM to execute efficiently and address the workload’s needs.
    • Data sets: The data needed for processing, such as training data sets or input from external databases.
    • Credentials: The access to certain data sources or external services, ensuring proper authentication and authorization.

    Workload scheduling and orchestration

    Run:ai’s core mission is to optimize AI resource usage at scale. This is achieved through efficient scheduling and orchestrating of all cluster workloads using the Run:ai Scheduler. The Scheduler allows the prioritization of workloads across different departments and projects within the organization at large scales, based on the resource distribution set by the system administrator.

    Run:ai and third-party workloads

    • Run:ai workloads: These workloads are submitted via the Run:ai platform. They are represented by Kubernetes Custom Resource Definitions (CRDs) and APIs (see the sketch after this list). When using Run:ai workloads, a complete Workload and Scheduling Policy solution is offered for administrators to ensure optimizations, governance and security standards are applied.
    • Third-party workloads: These workloads are submitted via third-party applications that use the Run:ai Scheduler. The Run:ai platform manages and monitors these workloads. They enable seamless integrations with external tools, allowing teams and individuals flexibility.
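
    Because Run:ai workloads are represented by Kubernetes CRDs, they can also be inspected with standard Kubernetes tooling. The minimal sketch below uses the kubernetes Python client; the API group, version, resource plural, and the runai-<project> namespace convention are assumptions for illustration and should be verified against the CRDs installed in your cluster.

        from kubernetes import client, config

        config.load_kube_config()  # or config.load_incluster_config() when running inside a pod
        api = client.CustomObjectsApi()

        # Group/version/plural are assumptions; check them with `kubectl api-resources | grep run.ai`.
        workloads = api.list_namespaced_custom_object(
            group="run.ai",
            version="v2alpha1",
            namespace="runai-myproject",  # assumed runai-<project> namespace naming
            plural="trainingworkloads",
        )
        for item in workloads.get("items", []):
            print(item["metadata"]["name"])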

    Levels of support

    Different types of workloads have different levels of support. It is important to understand which capabilities you need before selecting a workload type. The table below details the level of support for each workload type in Run:ai. Run:ai workloads are fully supported with all of Run:ai's advanced features and capabilities, while third-party workloads are only partially supported. The list of capabilities can change between Run:ai versions.

    The first four value columns apply to Run:ai workloads (by type); the last column applies to third-party workloads.

    Functionality                  Training - Standard | Workspace | Inference | Training - distributed | Third-party workloads
    Fairness                       v | v | v | v | v
    Priority and preemption        v | v | v | v | v
    Over quota                     v | v | v | v | v
    Node pools                     v | v | v | v | v
    Bin packing / Spread           v | v | v | v | v
    Multi-GPU fractions            v | v | v | v | v
    Multi-GPU dynamic fractions    v | v | v | v | v
    Node level scheduler           v | v | v | v | v
    Multi-GPU memory swap          v | v | v | v | v
    Elastic scaling                NA | NA | v | v | v
    Gang scheduling                v | v | v | v | v
    Monitoring                     v | v | v | v | v
    RBAC                           v | v | v | v |
    Workload awareness             v | v | v | v |
    Workload submission            v | v | v | v |
    Workload actions (stop/run)    v | v | v | v |
    Workload Policies              v | v | v | v |
    Scheduling rules               v | v | v | v |

    Note

    Workload awareness

    Specific workload-aware visibility, so that different pods are identified and treated as a single workload (for example GPU utilization, workload view, dashboards).

    \ No newline at end of file diff --git a/v2.20/platform-admin/workloads/overviews/managing-workloads/index.html b/v2.20/platform-admin/workloads/overviews/managing-workloads/index.html index 80980be559..daa333a2b4 100644 --- a/v2.20/platform-admin/workloads/overviews/managing-workloads/index.html +++ b/v2.20/platform-admin/workloads/overviews/managing-workloads/index.html @@ -1,4 +1,4 @@ - Workloads - Run:ai Documentation Library

    Workloads

    This article explains the procedure for managing workloads.

    Workloads table

    The Workloads table can be found under Workload manager in the Run:ai platform.

    The Workloads table provides a list of all the workloads scheduled on the Run:ai Scheduler and allows you to manage them.

    The Workloads table consists of the following columns:

    Column Description
    Workload The name of the workload
    Type The workload type
    Preemptible Is the workload preemptible
    Status The current phase in the workload's life cycle (see Workload status below)
    Project The project in which the workload runs
    Department The department that the workload is associated with. This column is visible only if the department toggle is enabled by your administrator.
    Created by The user who created the workload
    Running/requested pods The number of running pods out of the number requested
    Creation time The timestamp for when the workload was created
    Completion time The timestamp when the workload reached a terminal state (failed/completed)
    Connection(s) The method by which you can access and interact with the running workload. It is essentially the "doorway" through which you can reach and use the tools the workload provides (e.g., node port, external URL). Click one of the values in the column to view the list of connections and their parameters
    Data source(s) Data resources used by the workload
    Environment The environment used by the workload
    Workload architecture Standard or distributed. A standard workload consists of a single process. A distributed workload consists of multiple processes working together. These processes can run on different nodes.
    GPU compute request Number of GPU devices requested
    GPU compute allocation Number of GPU devices allocated
    GPU memory request Amount of GPU memory requested
    GPU memory allocation Amount of GPU memory allocated
    Idle GPU devices The number of allocated GPU devices that have been idle for more than 5 minutes
    CPU compute request Amount of CPU cores requested
    CPU compute allocation Amount of CPU cores allocated
    CPU memory request Amount of CPU memory requested
    CPU memory allocation Amount of CPU memory allocated
    Cluster The cluster that the workload is associated with

    Workload status

    The following table describes the different phases in a workload life cycle. The UI provides additional details for some of the below workload statuses which can be viewed by clicking the icon next to the status.

    Status Description Entry Condition Exit Condition
    Creating Workload setup is initiated in the cluster. Resources and pods are now provisioning. A workload is submitted. A multi-pod group is created.
    Pending Workload is queued and awaiting resource allocation. A pod group exists. All pods are scheduled.
    Initializing Workload is retrieving images, starting containers, and preparing pods. All pods are scheduled. All pods are initialized or a failure to initialize is detected.
    Running Workload is currently in progress with all pods operational. All pods initialized (all containers in pods are ready). Workload completion or failure.
    Degraded Pods may not align with specifications, network services might be incomplete, or persistent volumes may be detached. Check your logs for specific details. Pending - All pods are running but have issues. Running - All pods are running with no issues. Running - All resources are OK. Completed - Workload finished with fewer resources. Failed - Workload failure or user-defined rules.
    Deleting Workload and its associated resources are being decommissioned from the cluster. Deleting the workload. Resources are fully deleted.
    Stopped Workload is on hold and resources are intact but inactive. Stopping the workload without deleting resources. Transitioning back to the initializing phase or proceeding to deleting the workload.
    Failed Image retrieval failed or containers experienced a crash. Check your logs for specific details. An error occurs preventing the successful completion of the workload. Terminal state.
    Completed Workload has successfully finished its execution. The workload has finished processing without errors. Terminal state.

    Pods Associated with Workload

    Click one of the values in the Running/requested pods column, to view the list of pods and their parameters.

    Column Description
    Pod Pod name
    Status Pod lifecycle stages
    Node The node on which the pod resides
    Node pool The node pool in which the pod resides (applicable if node pools are enabled)
    Image The pod’s main image
    GPU compute allocation Amount of GPU devices allocated for the pod
    GPU memory allocation Amount of GPU memory allocated for the pod

    Connections Associated with Workload

    A connection refers to the method by which you can access and interact with the running workloads. It is essentially the "doorway" through which you can reach and use the applications (tools) these workloads provide.

    Click one of the values in the Connection(s) column, to view the list of connections and their parameters. Connections are network interfaces that communicate with the application running in the workload. Connections are either the URL the application exposes or the IP and the port of the node that the workload is running on.

    Column Description
    Name The name of the application running on the workload
    Connection type The network connection type selected for the workload
    Access Who is authorized to use this connection (everyone, specific groups/users)
    Address The connection URL
    Copy button Copy URL to clipboard
    Connect button Enabled only for supported tools
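
    For example, once an address is copied from this pane, it can be reached like any other HTTP endpoint. The address below is a hypothetical placeholder (node IP and node port); an external URL copied from the same pane would be used the same way.

        import requests

        # Hypothetical address copied from the Connections pane (node IP + node port).
        address = "http://192.0.2.10:30080"

        resp = requests.get(address, timeout=10)
        print(resp.status_code)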

    Data Sources Associated with Workload

    Click one of the values in the Data source(s) column, to view the list of data sources and their parameters.

    Column Description
    Data source The name of the data source mounted to the workload
    Type The data source type

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Download table - Click MORE and then Click Download as CSV. Export to CSV is limited to 20,000 rows.
    • Refresh - Click REFRESH to update the table with the latest data
    • Show/Hide details - Click to view additional information on the selected row

    Show/Hide details

    Click a row in the Workloads table and then click the SHOW DETAILS button at the upper-right side of the action bar. The details pane appears, presenting the following tabs:

    Event History

    Displays the workload status over time. It displays events describing the workload lifecycle and alerts on notable events. Use the filter to search through the history for specific events.

    Metrics

    • GPU utilization
      A per-GPU graph and an average of all GPUs, shown on the same chart over an adjustable time period, let you see the trends of GPU compute utilization (percentage of GPU compute) in this node.
    • GPU memory utilization
      A per-GPU graph and an average of all GPUs, shown on the same chart over an adjustable time period, let you see the trends of GPU memory usage (percentage of GPU memory) in this node.
    • CPU compute utilization
      A graph of the average compute utilization of all CPU cores, over an adjustable time period, lets you see the trends of CPU compute utilization (percentage of CPU compute) in this node.
    • CPU memory utilization
      A single graph of the memory utilization of all CPUs, over an adjustable time period, lets you see the trends of CPU memory utilization (percentage of CPU memory) in this node.
    • CPU memory usage
      A single graph of the memory usage of all CPUs, over an adjustable time period, lets you see the trends of CPU memory usage (in GB or MB of CPU memory) in this node.

    • For GPU charts - Click the GPU legend on the right-hand side of the chart to activate or deactivate any of the GPU lines.

    • You can click the date picker to change the presented period
    • You can use your mouse to mark a sub-period in the graph for zooming in, and use Reset zoom to go back to the preset period
    • Changes in the period affect all graphs on this screen.

    Logs

    Workload events are listed in chronological order. The logs contain events from the workload’s lifecycle to help monitor and debug issues.

    Adding new workload

    Before starting, make sure you have created a project or have one created for you to work with workloads.

    To create a new workload:

    1. Click +NEW WORKLOAD
    2. Select a workload type - Follow the links below to view the step-by-step guide for each workload type:
      • Workspace. Used for data preparation and model-building tasks.
      • Training. Used for standard training tasks of all sorts
      • Distributed Training. Used for distributed tasks of all sorts
      • Inference. Used for inference and serving tasks
      • Job (legacy). This type is displayed only if enabled by your Administrator, under General settings → Workloads → Workload policies
    3. Click CREATE WORKLOAD

    Stopping a workload

    Stopping a workload kills the workload pods and releases the workload resources.

    1. Select the workload you want to stop
    2. Click STOP

    Running a workload

    Running a workload spins up new pods and resumes the workload after it was stopped.

    1. Select the workload you want to run again
    2. Click RUN

    Connecting to a workload

    To connect to an application running in the workload (for example, Jupyter Notebook):

    1. Select the workload you want to connect to
    2. Click CONNECT
    3. Select the tool from the drop-down list
    4. The selected tool is opened in a new tab on your browser

    Deleting a workload

    1. Select the workload you want to delete
    2. Click DELETE
    3. On the dialog, click DELETE to confirm the deletion

    Note

    Once a workload is deleted you can view it in the Deleted tab in the workloads view.
    This tab is displayed only if enabled by your Administrator, under General settings → Workloads → Deleted workloads

    Copy & Edit a workload

    1. Select the workload you want to copy and edit
    2. Click COPY & EDIT
    3. Update the workload and click CREATE WORKLOAD

    Using API

    Go to the Workloads API reference to view the available actions.
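
    As a hedged sketch of scripting against this API, the snippet below polls a workload until it reaches a terminal phase (Completed or Failed, per the status table above). The endpoint path, the workload identifier, and the phase field name are assumptions; confirm them in the Workloads API reference.

        import os
        import time
        import requests

        BASE_URL = os.environ.get("RUNAI_BASE_URL", "https://company.run.ai")
        HEADERS = {"Authorization": f"Bearer {os.environ['RUNAI_API_TOKEN']}"}

        def wait_for_terminal_phase(workload_id: str, poll_seconds: int = 30, max_polls: int = 240) -> str:
            """Poll a workload (endpoint and field names are assumptions) until Completed or Failed."""
            for _ in range(max_polls):
                resp = requests.get(f"{BASE_URL}/api/v1/workloads/{workload_id}", headers=HEADERS, timeout=30)
                resp.raise_for_status()
                phase = resp.json().get("phase", "Unknown")  # field name is an assumption; check the reference
                if phase in ("Completed", "Failed"):
                    return phase
                time.sleep(poll_seconds)
            raise TimeoutError(f"Workload {workload_id} did not reach a terminal phase in time")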

    Troubleshooting

    To understand the condition of the workload, review the workload status in the Workloads table. For more information, check the workload’s event history.

    Listed below are a number of known issues when working with workloads and how to fix them:

    Issue: Cluster connectivity issues ("there are issues with your connection to the cluster" error message)
    Mediation: Verify that you are on a network that has been granted access to the cluster. Reach out to your cluster admin for instructions on verifying this. If you are an admin, see the troubleshooting section in the cluster documentation.

    Issue: Workload is in “Initializing” status for some time
    Mediation: Check that you have access to the container image registry. Check the statuses of the pods in the pods’ modal. Check the event history for more details.

    Issue: Workload has been pending for some time
    Mediation: Check that you have the required quota. Check the project’s available quota in the project dialog. Check that all services needed to run are bound to the workload. Check the event history for more details.

    Issue: PVCs created using the K8s API or kubectl are not visible or mountable in Run:ai
    Mediation: This is by design. Create a new data source of type PVC in the Run:ai UI. In the Data mount section, select Existing PVC and select the PVC you created via the K8s API. You are now able to select and mount this PVC in your Run:ai submitted workloads. (A programmatic example of creating such a PVC follows this list.)

    Issue: Workload is not visible in the UI
    Mediation: Check that the workload hasn’t been deleted. See the “Deleted” tab in the workloads view.
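
    The following sketch shows one way such a PVC might be created programmatically with the kubernetes Python client before wrapping it as an Existing PVC data source in the Run:ai UI. The PVC name, size, storage class, and the runai-<project> namespace are illustrative assumptions; adjust them to your cluster and project.

        from kubernetes import client, config

        config.load_kube_config()
        core = client.CoreV1Api()

        # All names and sizes below are illustrative placeholders.
        pvc_manifest = {
            "apiVersion": "v1",
            "kind": "PersistentVolumeClaim",
            "metadata": {"name": "my-dataset-pvc"},
            "spec": {
                "accessModes": ["ReadWriteMany"],
                "resources": {"requests": {"storage": "50Gi"}},
                "storageClassName": "nfs-client",  # assumption: use any storage class available in your cluster
            },
        }

        # Namespace assumed to follow the runai-<project> convention; adjust to your project's namespace.
        core.create_namespaced_persistent_volume_claim(namespace="runai-myproject", body=pvc_manifest)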

    \ No newline at end of file diff --git a/v2.20/platform-admin/workloads/overviews/workload-types/index.html b/v2.20/platform-admin/workloads/overviews/workload-types/index.html index cd6df8d3ad..979d72c6ab 100644 --- a/v2.20/platform-admin/workloads/overviews/workload-types/index.html +++ b/v2.20/platform-admin/workloads/overviews/workload-types/index.html @@ -1,4 +1,4 @@ - Workload Types - Run:ai Documentation Library

    Run:ai Workload Types

    In the world of machine learning (ML), the journey from raw data to actionable insights is a complex process that spans multiple stages. Each stage of the AI lifecycle requires different tools, resources, and frameworks to ensure optimal performance. Run:ai simplifies this process by offering specialized workload types tailored to each phase, facilitating a smooth transition across various stages of the ML workflows.

    The ML lifecycle usually begins with the experimental work on data and exploration of different modeling techniques to identify the best approach for accurate predictions. At this stage, resource consumption is usually moderate as experimentation is done on a smaller scale. As confidence grows in the model's potential and its accuracy, the demand for compute resources increases. This is especially true during the training phase, where vast amounts of data need to be processed, particularly with complex models such as large language models (LLMs), with their huge parameter sizes, that often require distributed training across multiple GPUs to handle the intensive computational load.

    Finally, once the model is ready, it moves to the inference stage, where it is deployed to make predictions on new, unseen data. Run:ai's workload types are designed to correspond with the natural stages of this lifecycle. They are structured to align with the specific resource and framework requirements of each phase, ensuring that AI researchers and data scientists can focus on advancing their models without worrying about infrastructure management.

    Run:ai offers three workload types that correspond to a specific phase of the researcher’s work:

    • Workspaces – For experimentation with data and models.
    • Training – For resource-intensive tasks such as model training and data preparation.
    • Inference – For deploying and serving the trained model.

    Workspaces: the experimentation phase

    The Workspace is where data scientists conduct initial research, experiment with different data sets, and test various algorithms. This is the most flexible stage in the ML lifecycle, where models and data are explored, tuned, and refined. The value of workspaces lies in the flexibility they offer, allowing the researcher to iterate quickly without being constrained by rigid infrastructure.

    • Framework flexibility

      Workspaces support a variety of machine learning frameworks, as researchers need to experiment with different tools and methods.

    • Resource requirements

      Workspaces are often lighter on resources compared to the training phase, but they still require significant computational power for data processing, analysis, and model iteration.

      Hence, the default for the Run:ai workspaces considerations is to allow scheduling those workloads without the ability to preempt them once the resources were allocated. However, this non-preemptable state doesn’t allow to utilize more resources outside of the project’s deserved quota.

    See Running workspaces to learn more about how to submit a workspace via the Run:ai platform. For quick starts, see Running Jupyter Notebook using workspaces.

    Training: scaling resources for model development

    As models mature and the need for more robust data processing and model training increases, Run:ai facilitates this shift through the Training workload. This phase is resource-intensive, often requiring distributed computing and high-performance clusters to process vast data sets and train models.

    • Training architecture

      For training workloads, Run:ai allows you to specify the architecture: standard or distributed. The distributed architecture is relevant for larger data sets and more complex models that require multiple nodes. For the distributed architecture, Run:ai allows you to specify different configurations for the master and the workers, and to select the framework to use: PyTorch, XGBoost, MPI, or TensorFlow. In addition, as part of the distributed configuration, Run:ai enables researchers to schedule their distributed workloads on nodes within the same region, zone, placement group, or any other topology (a minimal CLI sketch follows at the end of this section).

    • Resource requirements

      Training tasks demand high memory, compute power, and storage. Run:ai ensures that the allocated resources match the scale of the task and allows these workloads to utilize more compute resources than the project’s deserved quota. If you do not want your training workload to be preempted, make sure to request no more GPUs than are available in your project’s quota.

    See Standard training and Distributed training to learn more about how to submit a training workload via the Run:ai UI. For quick starts, see Run your first standard training and Run your first distributed training.
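
    Below is a minimal sketch of a distributed submission via the CLI V2, assuming a PyTorch workload; the worker-count flag shown (--workers) is an assumption and may differ between CLI versions, so check the CLI reference for the exact flags:

      runai project set team-a
      # distributed PyTorch training with a master and two workers (--workers is an assumed flag name)
      runai pytorch submit dist-train -i <training-image> -g 1 --workers 2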

    Inference: deploying and serving models

    Once a model is trained and validated, it moves to the Inference stage, where it is deployed to make predictions (usually in a production environment). This phase is all about efficiency and responsiveness, as the model needs to serve real-time or batch predictions to end-users or other systems.

    • Inference-specific use cases

      Inference workloads must adapt to ever-changing demand in order to meet SLAs. For example, additional replicas may be deployed, manually or automatically, to increase compute resources as part of a horizontal scaling approach, or a new version of the deployment may need to be rolled out without affecting the running services (see the API sketch at the end of this section).

    • Resource requirements

      Inference models differ in size and purpose, leading to varying computational requirements. For example, small OCR models can run efficiently on CPUs, whereas LLMs typically require significant GPU memory for deployment and serving. Inference workloads are considered production-critical and are given the highest priority to ensure compliance with SLAs. Additionally, Run:ai ensures that inference workloads cannot be preempted, maintaining consistent performance and reliability.

    See Deploy a custom inference workload to learn more about how to submit an inference workload via the Run:ai UI.
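
    As an illustration of the replica scaling described above, here is a minimal API sketch for an inference submission; the values are placeholders and the compute section is omitted for brevity, while the endpoint and field names follow the inference submission API:

      curl -L 'https://<COMPANY-URL>/api/v1/workloads/inferences' \
        -H 'Content-Type: application/json' \
        -H 'Authorization: Bearer <TOKEN>' \
        -d '{
          "name": "my-inference",
          "projectId": "<PROJECT-ID>",
          "clusterId": "<CLUSTER-UUID>",
          "spec": {
            "image": "<inference-server-image>",
            "servingPort": { "protocol": "http", "container": 8000 },
            "autoscaling": { "minReplicas": 1, "maxReplicas": 2, "metric": "concurrency", "metricThreshold": 3 }
          }
        }'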

    \ No newline at end of file diff --git a/v2.20/platform-admin/workloads/policies/old-policies/index.html b/v2.20/platform-admin/workloads/policies/old-policies/index.html index 883ad4649f..26cd1101b6 100644 --- a/v2.20/platform-admin/workloads/policies/old-policies/index.html +++ b/v2.20/platform-admin/workloads/policies/old-policies/index.html @@ -1,4 +1,4 @@ - Policies (YAML-based) - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/platform-admin/workloads/policies/overview/index.html b/v2.20/platform-admin/workloads/policies/overview/index.html index 7c23a68a5f..46b6ebb751 100644 --- a/v2.20/platform-admin/workloads/policies/overview/index.html +++ b/v2.20/platform-admin/workloads/policies/overview/index.html @@ -1,4 +1,4 @@ - Overview - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/platform-admin/workloads/policies/policy-examples/index.html b/v2.20/platform-admin/workloads/policies/policy-examples/index.html index 01cda63233..66a52c96d7 100644 --- a/v2.20/platform-admin/workloads/policies/policy-examples/index.html +++ b/v2.20/platform-admin/workloads/policies/policy-examples/index.html @@ -1,4 +1,4 @@ - Policies Examples - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/platform-admin/workloads/policies/policy-reference/index.html b/v2.20/platform-admin/workloads/policies/policy-reference/index.html index 97dc6c6a4a..8c9d7477f8 100644 --- a/v2.20/platform-admin/workloads/policies/policy-reference/index.html +++ b/v2.20/platform-admin/workloads/policies/policy-reference/index.html @@ -1,4 +1,4 @@ - Policies Reference - Run:ai Documentation Library
    \ No newline at end of file diff --git a/v2.20/platform-admin/workloads/policies/workspaces-policy/index.html b/v2.20/platform-admin/workloads/policies/workspaces-policy/index.html index 44df8b5039..1e231c824b 100644 --- a/v2.20/platform-admin/workloads/policies/workspaces-policy/index.html +++ b/v2.20/platform-admin/workloads/policies/workspaces-policy/index.html @@ -1,4 +1,4 @@ - Policies - Run:ai Documentation Library

    Policies

    This article explains the procedure to manage workload policies.

    Workload policies table

    The Workload policies table can be found under Policies in the Run:ai platform.

    Note

    Workload policies are disabled by default. If you cannot see Workload policies in the menu, ask your administrator to enable them under General settings → Workloads → Policies.

    The Workload policies table provides a list of all the policies defined in the platform, and allows you to manage them.

    The Workload policies table consists of the following columns:

    Column Description
    Policy The policy name which is a combination of the policy scope and the policy type
    Type The policy type is per Run:ai workload type. This allows administrators to set different policies for each workload type.
    Status Representation of the policy lifecycle (one of the following - “Creating…”, “Updating…”, “Deleting…”, Ready or Failed)
    Scope The scope the policy affects. Click the name of the scope to view the organizational tree diagram. You can only view the parts of the organizational tree for which you have permission to view.
    Created by The user who created the policy
    Creation time The timestamp for when the policy was created
    Last updated The last time the policy was updated

    Customizing the table view

    • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
    • Search - Click SEARCH and type the value to search by
    • Sort - Click each column header to sort by
    • Column selection - Click COLUMNS and select the columns to display in the table
    • Refresh - Click REFRESH to update the table with the latest data

    Adding a policy

    To create a new policy:

    1. Click +NEW POLICY
    2. Select a scope
    3. Select the workload type
    4. Click +POLICY YAML
    5. In the YAML editor, type or paste a YAML policy with defaults and rules.
      You can utilize the following references and examples:
      • Policy YAML reference
      • Policy YAML examples
    6. Click SAVE POLICY
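
    For orientation, here is a minimal sketch of such a YAML, prepared locally for pasting into the editor; the field names are illustrative only, so take the exact schema from the Policy YAML reference:

      # Field names below are illustrative, not the authoritative schema.
      cat > training-policy.yaml <<'EOF'
      defaults:
        compute:
          gpuDevicesRequest: 1
      rules:
        compute:
          gpuDevicesRequest:
            max: 2
        image:
          required: true
      EOF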

    Editing a policy

    1. Select the policy you want to edit
    2. Click EDIT
    3. Update the policy and click APPLY
    4. Click SAVE POLICY

    Troubleshooting

    Listed below are issues that might occur when creating or editing a policy via the YAML Editor:

    Issue Message Mitigation
    Cluster connectivity issues There's no communication from cluster “cluster_name“. Actions may be affected, and the data may be stale. Verify that you are on a network that has been allowed access to the cluster. Reach out to your cluster administrator for instructions on verifying the issue.
    Policy can’t be applied due to a rule that is occupied by a different policy Field “field_name” already has rules in cluster: “cluster_id” Remove the rule from the new policy or adjust the old policy for the specific rule.
    Policy is not visible in the UI - Check that the policy hasn’t been deleted.
    Policy syntax is not valid Add a valid policy YAML;json: unknown field "field_name" For correct syntax, check the Policy YAML reference or the Policy YAML examples.
    Policy can’t be saved for some reason The policy couldn't be saved due to a network or other unknown issue. Download your draft and try pasting and saving it again later. Possible cluster connectivity issues. Try updating the policy once again at a different time.
    Policies were submitted before version 2.18, you upgraded to version 2.18 or above and wish to submit new policies If you have policies and want to create a new one, first contact Run:ai support to prevent potential conflicts Contact Run:ai support. R&D can migrate your old policies to the new version.

    Viewing a policy

    To view a policy:

    1. Select the policy you want to view.
    2. Click VIEW POLICY
    3. In the Policy form per workload section, view the workload rules and defaults:
      • Parameter
        The workload submission parameter that Rules and Defaults are applied to
      • Type (applicable for data sources only)
        The data source type (Git, S3, NFS, PVC, etc.)
      • Default
        The default value of the Parameter
      • Rule
        The constraint set on the workload policy field
      • Source
        The origin of the applied policy (cluster, department or project)

    Note

    Some of the rules and defaults may be derived from policies of a parent cluster and/or department. You can see the source of each rule in the policy form. For more information, check the Scope of effectiveness documentation.

    Deleting a policy

    1. Select the policy you want to delete
    2. Click DELETE
    3. On the dialog, click DELETE to confirm the deletion

    Using API

    Go to the Policies API reference to view the available actions.

    \ No newline at end of file diff --git a/v2.20/search/search_index.json b/v2.20/search/search_index.json index b12f2ac314..6b8fb2b9f3 100644 --- a/v2.20/search/search_index.json +++ b/v2.20/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"Researcher/overview-researcher/","title":"Overview: Researcher Documentation","text":"

    Researchers, or AI practitioners, use Run:ai to submit Workloads.

    As part of the Researcher documentation you will find:

    • Quickstart Guides which provide step-by-step guides to Run:ai technology.
    • Command line interface reference documentation.
    • Best Practices for Deep Learning with Run:ai.
    • Information about the Run:ai Scheduler.
    • Using Run:ai with various developer tools.
    "},{"location":"Researcher/use-cases/","title":"Use Cases","text":"

    This is a collection of various client-requested use cases. Each use case is accompanied by a short live-demo video, along with all the files used.

    Note

    For the most up-to-date information, check out the official Run:ai use-cases GitHub page.

    • MLflow with Run:ai: experiment management is important for Data Scientists. This is a demo of how to set up and use MLflow with Run:ai.
    • Introduction to Docker: Run:ai runs using Docker images. This is a brief introduction to Docker, image creation, and how to use them in the context of Run:ai. Please also check out the Persistent Environments use case if you wish to keep the creation of Docker images to a minimum.
    • Tensorboard with Jupyter (ResNet demo): Many Data Scientists like to use Tensorboard to keep an eye on their current training experiments. They also like to have it side-by-side with Jupyter. In this demo, we will show how to integrate Tensorboard and Jupyter Lab within the context of Run:ai.
    • Persistent Environments (with Conda/Mamba & Jupyter): Some Data Scientists find creating Docker images for every single one of their environments a bit of a hindrance. They would often prefer the ability to create and alter environments on the fly and to have those environments remain, even after an image has finished running in a job. This demo shows users how they can create and persist Conda/Mamba environments using an NFS.
    • Weights & Biases with Run:ai: W&B (Weights & Biases) is one of the best tools for experiment tracking and management. W&B is an official Run:ai partner. In this tutorial, we will demo how to use W&B alongside Run:ai.
    "},{"location":"Researcher/Walkthroughs/quickstart-inference/","title":"Quickstart: Launch an Inference Workload","text":""},{"location":"Researcher/Walkthroughs/quickstart-inference/#introduction","title":"Introduction","text":"

    Machine learning (ML) inference refers to the process of using a trained machine learning model to make predictions or generate outputs based on new, unseen data. After a model has been trained on a dataset, inference involves applying this model to new examples to produce results such as classifications, predictions, or other types of insights.

    The quickstart below shows an inference server running the model and an inference client.

    There are various ways to submit a Workload:

    • Run:ai command-line interface (CLI)
    • Run:ai user interface
    • Run:ai API

    At this time, Inference services cannot be created via the CLI. The CLI can be used for creating a client to query the inference service.

    "},{"location":"Researcher/Walkthroughs/quickstart-inference/#prerequisites","title":"Prerequisites","text":"

    To complete this Quickstart, the Infrastructure Administrator will need to install some optional inference prerequisites as described here.

    To complete this Quickstart, the Platform Administrator will need to provide you with:

    • ML Engineer access to Project in Run:ai named \"team-a\"
    • The project should be assigned a quota of at least 1 GPU.
    • The URL of the Run:ai Console. E.g. https://acme.run.ai.

    As described, the inference client can be created via CLI. To perform this, you will need to have the Run:ai CLI installed on your machine. There are two available CLI variants:

    • The older V1 CLI. See installation here
    • A newer V2 CLI, supported with clusters of version 2.18 and up. See installation here
    "},{"location":"Researcher/Walkthroughs/quickstart-inference/#step-by-step-walkthrough","title":"Step by Step Walkthrough","text":""},{"location":"Researcher/Walkthroughs/quickstart-inference/#login","title":"Login","text":"CLI V1CLI V2User InterfaceAPI

    Run runai login and enter your credentials.

    Run runai login and enter your credentials.

    Browse to the provided Run:ai user interface and log in with your credentials.

    To use the API, you will need to obtain a token. Please follow the api authentication article.

    "},{"location":"Researcher/Walkthroughs/quickstart-inference/#create-an-inference-server-environment","title":"Create an Inference Server Environment","text":"

    To complete this Quickstart via the UI, you will need to create a new Inference Server Environment asset.

    This is a one-time step for all Inference workloads using the same image.

    Under Environments Select NEW ENVIRONMENT. Then select:

    • A default (cluster) scope.
    • Use the environment name inference-server.
    • The image runai.jfrog.io/demo/example-triton-server.
    • Under type of workload select inference.
    • Under endpoint, set the container port to 8000, which is the port that the Triton server uses.
    "},{"location":"Researcher/Walkthroughs/quickstart-inference/#run-an-inference-workload","title":"Run an Inference Workload","text":"CLI V1CLI V2User InterfaceAPI

    Not available right now.

    Not available right now.

    • In the Run:ai UI select Workloads
    • Select New Workload and then Inference
    • You should already have Cluster and Project selected. Enter inference-server-1 as the name and press CONTINUE.
    • Under Environment, select inference-server.
    • Under Compute Resource, select half-gpu.
    • Under Replica autoscaling, select a minimum of 1 and a maximum of 2.
    • Under conditions for a new replica select Concurrency and set the value as 3.
    • Set the scale to zero option to 5 minutes
    • Select CREATE INFERENCE.

    Note

    For more information on submitting Workloads and creating Assets via the user interface, see Workload documentation.

    curl -L 'https://<COMPANY-URL>/api/v1/workloads/inferences' \\ # (1)\n-H 'Content-Type: application/json' \\\n-H 'Authorization: Bearer <TOKEN>' \\ # (2)\n-d '{ \n    \"name\": \"inference-server-1\", \n    \"projectId\": \"<PROJECT-ID>\", '\\ # (3)\n    \"clusterId\": \"<CLUSTER-UUID>\", \\ # (4)\n    \"spec\": {\n        \"image\": \"runai.jfrog.io/demo/example-triton-server\",\n        \"servingPort\": {\n            \"protocol\": \"http\",\n            \"container\": 8000\n        },\n        \"autoscaling\": {\n            \"minReplicas\": 1,\n            \"maxReplicas\": 2,\n            \"metric\": \"concurrency\",\n            \"metricThreshold\": 3,\n            \"scaleToZeroRetentionSeconds\": 300\n        },\n        \"compute\": {\n            \"cpuCoreRequest\": 0.1,\n            \"gpuRequestType\": \"portion\",\n            \"cpuMemoryRequest\": \"100M\",\n            \"gpuDevicesRequest\": 1,\n            \"gpuPortionRequest\": 0.5\n        }\n    }\n}'\n
    1. <COMPANY-URL> is the link to the Run:ai user interface. For example acme.run.ai
    2. <TOKEN> is an API access token. See above on how to obtain a valid token.
    3. <PROJECT-ID> is the ID of the team-a Project. You can get the Project ID via the Get Projects API
    4. <CLUSTER-UUID> is the unique identifier of the Cluster. You can get the Cluster UUID by adding the \"Cluster ID\" column to the Clusters view.

    Note

    • The above API snippet will only work with Run:ai clusters of 2.18 and above. For older clusters, use the now deprecated Cluster API.
    • For more information on the Inference Submit API see API Documentation

    This would start a Triton inference server with a maximum of 2 instances, each instance consuming half a GPU.

    "},{"location":"Researcher/Walkthroughs/quickstart-inference/#query-the-inference-server","title":"Query the Inference Server","text":"

    You can use the Run:ai Triton demo client to send requests to the server

    "},{"location":"Researcher/Walkthroughs/quickstart-inference/#find-the-inference-server-endpoint","title":"Find the Inference Server Endpoint","text":"
    • Under Workloads, select Columns on the top right. Add the column Connections.
    • See the connections of the inference-server-1 workload:
    • Copy the inference endpoint URL.
    CLI V1CLI V2User Interface

    Open a terminal and run:

    runai config project team-a   \nrunai submit inference-client-1  -i runai.jfrog.io/demo/example-triton-client \\\n-- perf_analyzer -m inception_graphdef  -p 3600000 -u  <INFERENCE-ENDPOINT>    \n

    Open a terminal and run:

    runai project set team-a\nrunai training submit inference-client-1  -i runai.jfrog.io/demo/example-triton-client \\\n-- perf_analyzer -m inception_graphdef  -p 3600000 -u  <INFERENCE-ENDPOINT>    \n
    • In the Run:ai UI select Workloads
    • Select New Workload and then Training
    • You should already have Cluster, Project and a start from scratch Template selected. Enter inference-client-1 as the name and press CONTINUE.
    • Select NEW ENVIRONMENT. Enter inference-client as the name and runai.jfrog.io/demo/example-triton-client as the image. Select CREATE ENVIRONMENT.
    • When the previous screen comes up, select cpu-only under the Compute resource.
    • Under runtime settings enter the command as perf_analyzer and arguments -m inception_graphdef -p 3600000 -u <INFERENCE-ENDPOINT> (replace inference endpoint with the above URL).
    • Select CREATE TRAINING.

    In the user interface, under inference-server-1, go to the Metrics tab and watch as the various GPU and inference metrics graphs rise.

    "},{"location":"Researcher/Walkthroughs/quickstart-inference/#stop-workload","title":"Stop Workload","text":"

    Run the following:

    CLI V1CLI V2User Interface

    Not available right now

    Not available right now

    Select the two workloads and press DELETE.

    "},{"location":"Researcher/Walkthroughs/quickstart-overview/","title":"Run:ai Quickstart Guides","text":"

    Below is a set of Quickstart documents. The purpose of these documents is to get you acquainted with an aspect of Run:ai in the simplest possible form.

    Note

    The Quickstart documents are based solely on the command-line interface. The same functionality can be achieved by using the Workloads User interface which allows for Workload submission and log viewing.

    Follow the Quickstart documents below to learn more:

    • Training Quickstart documents:
      • Standard training sessions
      • Distributed Training
    • Build Quickstart documents:
      • Basic Interactive build sessions
      • Interactive build session with connected ports
      • Jupyter Notebook
      • Visual Studio Web
    • Inference
    • GPU Allocation documents:
      • Using GPU Fractions
    • Scheduling documents:
      • Over-Quota, Basic Fairness & Bin Packing
      • Fairness

    Most quickstarts rely on an image called runai.jfrog.io/demo/quickstart. The image is based on TensorFlow Release 20-08. This TensorFlow image has minimum requirements for CUDA and the NVIDIA Compute Capability.

    If your GPUs do not meet these requirements, use runai.jfrog.io/demo/quickstart:legacy instead.
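
    For example, a V1 CLI sketch using the legacy image (the workload name is a placeholder):

    runai submit quickstart-legacy -i runai.jfrog.io/demo/quickstart:legacy -g 1\n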

    "},{"location":"Researcher/Walkthroughs/quickstart-vscode/","title":"Quickstart: Launch Workspace with a Visual Studio Code for Web","text":""},{"location":"Researcher/Walkthroughs/quickstart-vscode/#introduction","title":"Introduction","text":"

    The purpose of this article is to provide a quick ramp-up to running a Workspace with Visual Studio Code (Web edition). Workspaces are containers that keep running until deleted by the user.

    There are various ways to submit a Workspace:

    • Run:ai command-line interface (CLI)
    • Run:ai user interface
    • Run:ai API
    "},{"location":"Researcher/Walkthroughs/quickstart-vscode/#prerequisites","title":"Prerequisites","text":"

    To complete this Quickstart, the Infrastructure Administrator will need to configure a wildcard certificate to Run:ai as described here.

    To complete this Quickstart, the Platform Administrator will need to provide you with:

    • Researcher access to Project in Run:ai named \"team-a\"
    • The project should be assigned a quota of at least 1 GPU.
    • A URL of the Run:ai Console. E.g. https://acme.run.ai.

    To complete this Quickstart via the CLI, you will need to have the Run:ai CLI installed on your machine. There are two available CLI variants:

    • The older V1 CLI. See installation here
    • A newer V2 CLI, supported with clusters of version 2.18 and up. See installation here
    "},{"location":"Researcher/Walkthroughs/quickstart-vscode/#step-by-step-walkthrough","title":"Step by Step Walkthrough","text":""},{"location":"Researcher/Walkthroughs/quickstart-vscode/#login","title":"Login","text":"CLI V1CLI V2User InterfaceAPI

    Run runai login and enter your credentials.

    Run runai login and enter your credentials.

    Browse to the provided Run:ai user interface and log in with your credentials.

    To use the API, you will need to obtain a token. Please follow the api authentication article.

    "},{"location":"Researcher/Walkthroughs/quickstart-vscode/#create-a-visual-studio-environment","title":"Create a Visual Studio Environment","text":"

    To complete this Quickstart via the UI, you will need to create a new Visual Studio Environment asset.

    This is a one-time step for all VSCode Workloads.

    Under Environments Select NEW ENVIRONMENT. Then select:

    • A scope (where you want your environment to live).
    • Use the environment name vscode.
    • The image quay.io/opendatahub-contrib/workbench-images:vscode-datascience-c9s-py311_2023c_latest.
    • Under Tools, add Visual Studio Code and change the port to 8787.
    "},{"location":"Researcher/Walkthroughs/quickstart-vscode/#run-workload","title":"Run Workload","text":"CLI V1CLI V2User InterfaceAPI

    Open a terminal and run:

    runai config project team-a   \nrunai submit vs1 --jupyter -g 1\n

    Note

    For more information on the workload submit command, see cli documentation.

    Open a terminal and run:

    runai project set team-a\nrunai workspace submit vs1  --image quay.io/opendatahub-contrib/workbench-images:vscode-datascience-c9s-py311_2023c_latest \\\n    --gpu-devices-request 1  --external-url container=8787  \n

    Note

    For more information on the workspace submit command, see cli documentation.

    • In the Run:ai UI select Workloads
    • Select New Workload and then Workspace
    • You should already have Cluster, Project and a start from scratch Template selected. Enter vs1 as the name and press CONTINUE.
    • Under Environment, select the previously created vscode environment.
    • Under Compute Resource, select one-gpu.
    • Select CREATE WORKSPACE.

    Note

    For more information on submitting Workloads and creating Assets via the user interface, see Workload documentation.

    curl -L 'https://<COMPANY-URL>/api/v1/workloads/workspaces' \\ # (1)\n-H 'Content-Type: application/json' \\\n-H 'Authorization: Bearer <TOKEN>' \\ # (2)\n-d '{ \n    \"name\": \"vs1\", \n    \"projectId\": \"<PROJECT-ID>\", '\\ # (3)\n    \"clusterId\": \"<CLUSTER-UUID>\", \\ # (4)\n    \"spec\": {\n        \"image\": \"quay.io/opendatahub-contrib/workbench-images:vscode-datascience-c9s-py311_2023c_latest\",\n        \"compute\": {\n            \"gpuDevicesRequest\": 1\n        },\n        \"exposedUrls\" : [\n            { \n                \"container\" : 8787,\n                \"toolType\": \"visual-studio-code\", \\ # (5)\n                \"toolName\": \"Visual Studio\" \\ # (6)\n            }\n        ]\n    }\n}'\n
    1. <COMPANY-URL> is the link to the Run:ai user interface. For example acme.run.ai
    2. <TOKEN> is an API access token. See above on how to obtain a valid token.
    3. <PROJECT-ID> is the ID of the team-a Project. You can get the Project ID via the Get Projects API
    4. <CLUSTER-UUID> is the unique identifier of the Cluster. You can get the Cluster UUID by adding the \"Cluster ID\" column to the Clusters view.
    5. toolType will show the Visual Studio icon when connecting to the Visual Studio tool via the user interface.
    6. toolName text will show when connecting to the Visual Studio tool via the user interface.

    Note

    • The above API snippet will only work with Run:ai clusters of 2.18 and above. For older clusters, use the now deprecated Cluster API.
    • For more information on the Training Submit API see API Documentation

    This would start a Workspace with a pre-configured Visual Studio Code image with an allocation of a single GPU.

    "},{"location":"Researcher/Walkthroughs/quickstart-vscode/#accessing-visual-studio-web","title":"Accessing Visual Studio Web","text":"

    Via the Run:ai user interface, go to Workloads, select the vs1 Workspace and press Connect.

    "},{"location":"Researcher/Walkthroughs/quickstart-vscode/#stop-workload","title":"Stop Workload","text":"

    Run the following:

    CLI V1CLI V2User Interface
    runai delete job vs1\n
    runai workspace delete vs1\n

    Select the Workspace and press DELETE.

    "},{"location":"Researcher/Walkthroughs/walkthrough-build-ports/","title":"Quickstart: Launch Interactive Build Workloads with Connected Ports","text":""},{"location":"Researcher/Walkthroughs/walkthrough-build-ports/#introduction","title":"Introduction","text":"

    This Quickstart is an extension of the Quickstart document: Start and Use Interactive Build Workloads

    When starting a container with the Run:ai Command-Line Interface (CLI), it is sometimes needed to expose internal ports to the user. Examples are: accessing a Jupyter notebook, using the container from a development environment such as PyCharm.

    "},{"location":"Researcher/Walkthroughs/walkthrough-build-ports/#exposing-a-container-port","title":"Exposing a Container Port","text":"

    There are three common ways to expose ports in Kubernetes: Port Forwarding, NodePort, and LoadBalancer. The first two will always work; LoadBalancer requires a special setup by your administrator. These methods are explained here.

    The document below provides an example based on Port Forwarding.

    "},{"location":"Researcher/Walkthroughs/walkthrough-build-ports/#port-forwarding-step-by-step-walkthrough","title":"Port Forwarding, Step by Step Walkthrough","text":""},{"location":"Researcher/Walkthroughs/walkthrough-build-ports/#setup","title":"Setup","text":"
    • Login to the Projects area of the Run:ai user interface.
    • Add a Project named team-a.
    "},{"location":"Researcher/Walkthroughs/walkthrough-build-ports/#run-workload","title":"Run Workload","text":"
    • At the command-line run:
    runai config project team-a\nrunai submit nginx-test -i zembutsu/docker-sample-nginx --interactive\nrunai port-forward nginx-test --port 8080:80\n
    • The Job is based on a sample NGINX webserver docker image zembutsu/docker-sample-nginx. Once accessed via a browser, the page shows the container name.
    • Note the interactive flag, which means the Job will not have a start or end. It is the Researcher's responsibility to close the Job.
    • In this example, we have chosen the simplest scheme to expose ports, which is port forwarding. We temporarily expose port 8080 to localhost for as long as the runai port-forward command is running.
    • It is possible to forward traffic from multiple IP addresses by using the \"--address\" parameter. Check the CLI reference for further details.

    The result will be:

    The job 'nginx-test-0' has been submitted successfully\nYou can run `runai describe job nginx-test-0 -p team-a` to check the job status\n\nForwarding from 127.0.0.1:8080 -> 80\nForwarding from [::1]:8080 -> 80\n
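
    For example, a sketch that also binds the forwarded port on other local interfaces using the --address parameter mentioned above; the exact value syntax may vary, so check the CLI reference:

    runai port-forward nginx-test --port 8080:80 --address 0.0.0.0\n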
    "},{"location":"Researcher/Walkthroughs/walkthrough-build-ports/#access-the-webserver","title":"Access the Webserver","text":"

    Open the browser at http://localhost:8080.

    You should see a web page with the name of the container.

    "},{"location":"Researcher/Walkthroughs/walkthrough-build-ports/#stop-workload","title":"Stop Workload","text":"

    Press Ctrl-C in the shell to stop port forwarding. Then delete the Job by running runai delete job nginx-test

    "},{"location":"Researcher/Walkthroughs/walkthrough-build-ports/#see-also","title":"See Also","text":"
    • Develop on Run:ai using Visual Studio Code
    • Develop on Run:ai using PyCharm
    • Use a Jupyter notebook with Run:ai.
    "},{"location":"Researcher/Walkthroughs/walkthrough-build/","title":"Quickstart: Launch Interactive Build Workloads","text":""},{"location":"Researcher/Walkthroughs/walkthrough-build/#introduction","title":"Introduction","text":"

    The purpose of this article is to provide a quick ramp-up to running an interactive Workspace for building data science programs. Data scientists typically use various tools such as Jupyter Notebook, PyCharm, or Visual Studio Code. However, in this quickstart, we will start by launching a bare-bones Workspace without such tools.

    With this Quickstart you will learn how to:

    • Start a workspace.
    • Open a shell session to the workspace.
    • Stop the workspace.
    "},{"location":"Researcher/Walkthroughs/walkthrough-build/#prerequisites","title":"Prerequisites","text":"

    To complete this Quickstart, the Platform Administrator will need to provide you with:

    • Researcher access to Project in Run:ai named \"team-a\"
    • The project should be assigned a quota of at least 1 GPU.
    • A URL of the Run:ai Console. E.g. https://acme.run.ai.

    To complete this Quickstart via the CLI, you will need to have the Run:ai CLI installed on your machine. There are two available CLI variants:

    • The older V1 CLI. See installation here
    • A newer V2 CLI, supported with clusters of version 2.18 and up. See installation here
    "},{"location":"Researcher/Walkthroughs/walkthrough-build/#step-by-step-quickstart","title":"Step by Step Quickstart","text":""},{"location":"Researcher/Walkthroughs/walkthrough-build/#login","title":"Login","text":"CLI V1CLI V2User InterfaceAPI

    Run runai login and enter your credentials.

    Run runai login and enter your credentials.

    Browse to the provided Run:ai user interface and log in with your credentials.

    To use the API, you will need to obtain a token. Please follow the api authentication article.

    "},{"location":"Researcher/Walkthroughs/walkthrough-build/#create-a-workspace","title":"Create a Workspace","text":"CLI V1CLI V2User InterfaceAPI

    Open a terminal and run:

    runai config project team-a   \nrunai submit build1 -i ubuntu -g 1 --interactive -- sleep infinity\n

    Note

    For more information on the workload submit command, see cli documentation.

    Open a terminal and run:

    runai project set team-a\nrunai workspace submit build1 -i ubuntu -g 1 --command -- sleep infinity\n

    Note

    For more information on the workspace submit command, see cli documentation.

    • In the Run:ai UI select Workloads
    • Select New Workload and then Workspace
    • You should already have Cluster, Project and a start from scratch Template selected. Enter build1 as the name and press CONTINUE.
    • Select NEW ENVIRONMENT. Enter ubuntu as the name and ubuntu as the image. Then select CREATE ENVIRONMENT.
    • When the previous screen comes up, select one-gpu under the Compute resource.
    • Select CREATE WORKSPACE.

    Note

    For more information on submitting Workloads and creating Assets via the user interface, see Workload documentation.

    curl -L 'https://<COMPANY-URL>/api/v1/workloads/workspaces' \\ # (1)\n-H 'Content-Type: application/json' \\\n-H 'Authorization: Bearer <TOKEN>' \\ # (2)\n-d '{ \n    \"name\": \"build1\", \n    \"projectId\": \"<PROJECT-ID>\", '\\ # (3)\n    \"clusterId\": \"<CLUSTER-UUID>\", \\ # (4)\n    \"spec\": {\n        \"command\" : \"sleep\",\n        \"args\" : \"infinity\",\n        \"image\": \"ubuntu\",\n        \"compute\": {\n        \"gpuDevicesRequest\": 1\n        }\n    }\n}'\n
    1. <COMPANY-URL> is the link to the Run:ai user interface. For example acme.run.ai
    2. <TOKEN> is an API access token. See above on how to obtain a valid token.
    3. <PROJECT-ID> is the ID of the team-a Project. You can get the Project ID via the Get Projects API
    4. <CLUSTER-UUID> is the unique identifier of the Cluster. You can get the Cluster UUID by adding the \"Cluster ID\" column to the Clusters view.

    Note

    • The above API snippet will only work with Run:ai clusters of 2.18 and above. For older clusters, use the now deprecated Cluster API.
    • For more information on the Workspace Submit API see API Documentation
    • This would start a workload of type Workspace for team-a with an allocation of a single GPU.
    • We named the Workload build1.
    • Note that, unlike a Training workload, a Workspace workload will not end automatically. It is the Researcher's responsibility to stop the Workload.
    • The command provided is sleep infinity. You must provide a command or the container will start and then exit immediately. Alternatively, when using the command line, replace these flags with --attach to attach immediately to a session.
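
    For instance, a minimal V1 sketch of the --attach alternative mentioned above, which drops you straight into a session instead of running sleep infinity:

    runai submit build1 -i ubuntu -g 1 --interactive --attach\n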
    "},{"location":"Researcher/Walkthroughs/walkthrough-build/#list-workloads","title":"List Workloads","text":"

    Follow up on the Workload's progress by running:

    CLI V1CLI V2User Interface

    runai list jobs\n
    The result:

    runai workspace list\n

    The result:

    Workload     Type        Status      Project     Preemptible      Running/Requested Pods     GPU Allocation\n\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\nvs1          Workspace   Running     team-a      No               1/1                        1.00\n
    • Open the Run:ai user interface.
    • Under \"Workloads\" you can view the new Workspace:

    Select the Workloads and press Show Details to see the Workload details

    Typical statuses you may see:

    • ContainerCreating - The container image is being downloaded from the repository
    • Pending - the job is waiting to be scheduled
    • Running - the job is running

    A full list of Job statuses can be found here

    "},{"location":"Researcher/Walkthroughs/walkthrough-build/#describe-workload","title":"Describe Workload","text":"

    To get additional status on your Workload run:

    CLI V1CLI V2User Interface
    runai describe job build1\n
    runai workspace describe build1\n

    Workload parameters can be viewed by adding more columns to the Workload list and by reviewing the Event History tab for the specific Workload.

    "},{"location":"Researcher/Walkthroughs/walkthrough-build/#get-a-shell-to-the-container","title":"Get a Shell to the container","text":"CLI V1CLI V2

    Run:

    runai bash build1\n

    runai workspace bash build1\n

    This should provide a direct shell into the container.

    "},{"location":"Researcher/Walkthroughs/walkthrough-build/#stop-workload","title":"Stop Workload","text":"

    Run the following:

    CLI V1CLI V2User Interface
    runai delete job build1\n
    runai workspace delete build1\n

    Select the Workspace and press DELETE.

    This would stop the workspace. You can verify this by running the list command again.

    "},{"location":"Researcher/Walkthroughs/walkthrough-fractions/","title":"Quickstart: Launch Workloads with GPU Fractions","text":""},{"location":"Researcher/Walkthroughs/walkthrough-fractions/#introduction","title":"Introduction","text":"

    Run:ai provides a Fractional GPU sharing system for containerized workloads on Kubernetes. The system supports workloads running CUDA programs and is especially suited for lightweight AI tasks such as inference and model building. The fractional GPU system transparently gives data science and AI engineering teams the ability to run multiple workloads simultaneously on a single GPU, enabling companies to run more workloads such as computer vision, voice recognition and natural language processing on the same hardware, lowering costs.

    Run:ai\u2019s fractional GPU system effectively creates logical GPUs, with their own memory and computing space that containers can use and access as if they were self-contained processors. This enables several workloads to run in containers side-by-side on the same GPU without interfering with each other. The solution is transparent, simple, and portable; it requires no changes to the containers themselves.

    A typical use case might run a couple of Workloads side by side on the same GPU, effectively multiplying the work done with the same hardware.

    The purpose of this article is to provide a quick ramp-up to running a training Workload with fractions of a GPU.

    There are various ways to submit a Workload:

    • Run:ai command-line interface (CLI)
    • Run:ai user interface
    • Run:ai API
    "},{"location":"Researcher/Walkthroughs/walkthrough-fractions/#prerequisites","title":"Prerequisites","text":"

    To complete this Quickstart, the Platform Administrator will need to provide you with:

    • Researcher access to Run:ai
    • To a Project named \"team-a\"
    • With at least 1 GPU assigned to the project.
    • A link to the Run:ai Console. E.g. https://acme.run.ai.
    • To complete this Quickstart via the CLI, you will need to have the Run:ai CLI installed on your machine. There are two available CLI variants:
      • The older V1 CLI. See installation here
      • A newer V2 CLI, supported with clusters of version 2.18 and up. See installation here
    "},{"location":"Researcher/Walkthroughs/walkthrough-fractions/#step-by-step-walkthrough","title":"Step by Step Walkthrough","text":""},{"location":"Researcher/Walkthroughs/walkthrough-fractions/#login","title":"Login","text":"CLI V1CLI V2User InterfaceAPI

    Run runai login and enter your credentials.

    Run runai login and enter your credentials.

    Browse to the provided Run:ai user interface and log in with your credentials.

    To use the API, you will need to obtain a token. Please follow the api authentication article.

    "},{"location":"Researcher/Walkthroughs/walkthrough-fractions/#run-workload","title":"Run Workload","text":"

    Open a terminal and run:

    CLI V1CLI V2User InterfaceAPI
    runai config project team-a   \nrunai submit frac05 -i runai.jfrog.io/demo/quickstart -g 0.5\nrunai submit frac05-2 -i runai.jfrog.io/demo/quickstart -g 0.5 \n
    runai project set team-a\nrunai training submit frac05 -i runai.jfrog.io/demo/quickstart --gpu-portion-request 0.5\nrunai training submit frac05-2 -i runai.jfrog.io/demo/quickstart --gpu-portion-request 0.5\n
    • In the Run:ai UI select Workloads
    • Select New Workload and then Training
    • You should already have Cluster, Project and a start from scratch Template selected. Enter frac05 as the name and press CONTINUE.
    • Select NEW ENVIRONMENT. Enter quickstart as the name and runai.jfrog.io/demo/quickstart as the image. Then select CREATE ENVIRONMENT.
    • When the previous screen comes up, select half-gpu under the Compute resource.
    • Select CREATE TRAINING.
    • Follow the process again to submit a second workload called frac05-2.

    Note

    For more information on submitting Workloads and creating Assets via the user interface, see Workload documentation.

    curl -L 'https://<COMPANY-URL>/api/v1/workloads/trainings' \\ # (1)\n-H 'Content-Type: application/json' \\\n-H 'Authorization: Bearer <TOKEN>' \\ # (2)\n-d '{ \n    \"name\": \"frac05\", \n    \"projectId\": \"<PROJECT-ID>\", '\\ # (3)\n    \"clusterId\": \"<CLUSTER-UUID>\", \\ # (4)\n    \"spec\": {\n        \"image\": \"runai.jfrog.io/demo/quickstart\",\n        \"compute\": {\n        \"gpuRequestType\": \"portion\",\n        \"gpuPortionRequest\" : 0.5\n        }\n    }\n}'\n
    1. <COMPANY-URL> is the link to the Run:ai user interface. For example acme.run.ai
    2. <TOKEN> is an API access token. See above on how to obtain a valid token.
    3. <PROJECT-ID> is the ID of the team-a Project. You can get the Project ID via the Get Projects API
    4. <CLUSTER-UUID> is the unique identifier of the Cluster. You can get the Cluster UUID by adding the \"Cluster ID\" column to the Clusters view.

    Note

    • The above API snippet will only work with Run:ai clusters of 2.18 and above. For older clusters, use the now deprecated Cluster API.
    • For more information on the Training Submit API see API Documentation
    • The Workloads are based on a sample docker image, runai.jfrog.io/demo/quickstart. The image contains a startup script that runs a deep learning TensorFlow-based workload.
    • We named the Workloads frac05 and frac05-2 respectively.
    • The Workloads are assigned to team-a with an allocation of half a GPU.
    "},{"location":"Researcher/Walkthroughs/walkthrough-fractions/#list-workloads","title":"List Workloads","text":"

    Follow up on the Workload's progress by running:

    CLI V1CLI V2User Interface

    runai list jobs\n
    The result:

    Showing jobs for project team-a\nNAME      STATUS   AGE  NODE                  IMAGE                          TYPE   PROJECT  USER   GPUs Allocated (Requested)  PODs Running (Pending)  SERVICE URL(S)\nfrac05    Running  9s   runai-cluster-worker  runai.jfrog.io/demo/quickstart  Train  team-a   yaron  0.50 (0.50)                 1 (0)\nfrac05-2  Running  8s   runai-cluster-worker  runai.jfrog.io/demo/quickstart  Train  team-a   yaron  0.50 (0.50)                 1 (0)\n
    runai training list\n

    The result:

    Workload               Type        Status      Project     Preemptible      Running/Requested Pods     GPU Allocation\n\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\nfrac05      Training    Running  team-a      Yes              0/1                        0.00\nfrac05-2    Training    Running  team-a      Yes              0/1                        0.00    \n
    • Open the Run:ai user interface.
    • Under Workloads you can view the two new Training Workloads
    "},{"location":"Researcher/Walkthroughs/walkthrough-fractions/#view-partial-gpu-memory","title":"View Partial GPU memory","text":"

    To verify that the Workload sees only part of the GPU memory, run:

    CLI V1CLI V2
    runai exec frac05 nvidia-smi\n
    runai training exec frac05 nvidia-smi\n

    The result:

    Notes:

    • The total memory is circled in red. It should be 50% of the GPU's memory size. In the picture above, we see 8GB, which is half of the 16GB of a Tesla V100 GPU.
    • The script running in the container is limited to 8GB. In this case, TensorFlow, which tends to allocate almost all of the GPU memory, has allocated 7.7GB (and not close to 16GB). Allocating beyond 8GB will lead to an out-of-memory exception. A programmatic version of this check is sketched below.
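
    If the container image includes the pynvml package (an assumption; it is not stated to be part of the quickstart image), the same check can be done programmatically from inside the Workload:

    import pynvml\n\n# Query the GPU memory visible inside the container via NVML (the same source nvidia-smi uses)\npynvml.nvmlInit()\nhandle = pynvml.nvmlDeviceGetHandleByIndex(0)\nmem = pynvml.nvmlDeviceGetMemoryInfo(handle)\nprint('Total GPU memory visible: %.1f GiB' % (mem.total / 2**30))  # expected ~8 GiB for a 0.5 fraction of a 16 GiB GPU\npynvml.nvmlShutdown()\n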
    "},{"location":"Researcher/Walkthroughs/walkthrough-fractions/#use-exact-gpu-memory","title":"Use Exact GPU Memory","text":"

    Instead of requesting a fraction of the GPU, you can ask for specific GPU memory requirements. For example:

    CLI V1CLI V2User Interface
    runai submit  -i runai.jfrog.io/demo/quickstart --gpu-memory 5G\n
    runai training submit -i runai.jfrog.io/demo/quickstart --gpu-memory-request 5G\n

    As part of the Workload submission, create a new Compute Resource with 1 GPU device and 5GB of GPU memory per device. See the picture below:

    This will provide 5GB of GPU memory.

    "},{"location":"Researcher/Walkthroughs/walkthrough-overquota/","title":"Quickstart: Over-Quota and Bin Packing","text":""},{"location":"Researcher/Walkthroughs/walkthrough-overquota/#goals","title":"Goals","text":"

    The goal of this Quickstart is to explain the concepts of over-quota and bin-packing (consolidation) and how they help in maximizing cluster utilization:

    • Show the simplicity of resource provisioning, and how resources are abstracted from users.
    • Show how the system eliminates compute bottlenecks by allowing teams/users to go over their resource quota if there are free GPUs in the cluster.
    "},{"location":"Researcher/Walkthroughs/walkthrough-overquota/#setup-and-configuration","title":"Setup and configuration:","text":"

    To complete this Quickstart, the Platform Administrator will need to provide you with:

    • Your cluster should have 4 GPUs on 2 machines with 2 GPUs each.
    • Researcher access to two Projects named \"team-a\" and \"team-b\"
    • Each project should be assigned an exact quota of 2 GPUs.
    • A URL of the Run:ai Console. E.g. https://acme.run.ai.
    • Run:ai CLI installed on your machine. There are two available CLI variants:

      • The older V1 CLI. See installation here
      • A newer V2 CLI, supported with clusters of version 2.18 and up. See installation here
    "},{"location":"Researcher/Walkthroughs/walkthrough-overquota/#login","title":"Login","text":"

    Run runai login and enter your credentials.

    "},{"location":"Researcher/Walkthroughs/walkthrough-overquota/#part-i-over-quota","title":"Part I: Over-quota","text":"

    Open a terminal and run the following command:

    CLI V1CLI V2
    runai submit a2 -i runai.jfrog.io/demo/quickstart -g 2 -p team-a\nrunai submit a1 -i runai.jfrog.io/demo/quickstart -g 1 -p team-a\nrunai submit b1 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\n
    runai training submit a2 -i runai.jfrog.io/demo/quickstart -g 2 -p team-a\nrunai training submit a1 -i runai.jfrog.io/demo/quickstart -g 1 -p team-a\nrunai training submit b1 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\n

    System status after run:

    Discussion

    • team-a has 3 GPUs allocated, which is over its quota by 1 GPU.
    • The system allows this over-quota as long as there are available resources.
    • The system is at full capacity with all GPUs utilized.
    "},{"location":"Researcher/Walkthroughs/walkthrough-overquota/#part-2-basic-fairness-via-preemption","title":"Part 2: Basic Fairness via Preemption","text":"

    Run the following command:

    CLI V1CLI V2
    runai submit b2 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\n
    runai training submit b2 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\n

    System status after run:

    Discussion

    • team-a can no longer remain in over-quota, so one Job must be preempted and moved out to allow team-b to grow.
    • The Run:ai scheduler chooses to preempt Job a1.
    • It is important that unattended Jobs save checkpoints. This ensures that whenever Job a1 resumes, it will do so from where it left off.
    "},{"location":"Researcher/Walkthroughs/walkthrough-overquota/#part-3-bin-packing","title":"Part 3: Bin Packing","text":"

    Run the following command:

    CLI V1CLI V2

    runai delete job a2 -p team-a

    runai training delete a2\n

    a1 is now going to start running again.

    Run:

    CLI V1CLI V2
    runai list jobs -A\n
    runai training list -A\n

    You have two Jobs that are running on the first node and one Job that is running alone on the second node.

    Choose one of the two Jobs from the full node and delete it:

    CLI V1CLI V2
    runai delete job <job-name> -p <project>\n
    runai training delete <job-name> -p <project>\n

    The status now is:

    Now, run a 2 GPU Job:

    CLI V1CLI V2
    runai submit a2 -i runai.jfrog.io/demo/quickstart -g 2 -p team-a\n
    runai training submit a2 -i runai.jfrog.io/demo/quickstart -g 2 -p team-a\n

    The status now is:

    Discussion

    Note that Job a1 has been preempted and then restarted on the second node, to clear space for the new a2 Job. This is bin-packing, or consolidation.

    "},{"location":"Researcher/Walkthroughs/walkthrough-queue-fairness/","title":"Quickstart: Queue Fairness","text":""},{"location":"Researcher/Walkthroughs/walkthrough-queue-fairness/#goal","title":"Goal","text":"

    The goal of this Quickstart is to explain fairness. The over-quota Quickstart shows basic fairness where allocated GPUs per Project are adhered to such that if a Project is in over-quota, its Job will be preempted once another Project requires its resources.

    This Quickstart is about queue fairness. It shows that Jobs will be scheduled fairly regardless of the time they have been submitted. As such, if a person in Project A has submitted 50 Jobs and soon after that, a person in Project B has submitted 25 Jobs, the Jobs in the queue will be processed fairly.

    "},{"location":"Researcher/Walkthroughs/walkthrough-queue-fairness/#setup-and-configuration","title":"Setup and configuration:","text":"

    To complete this Quickstart, the Platform Administrator will need to provide you with:

    • Your cluster should have 4 GPUs on 2 machines with 2 GPUs each.
    • Researcher access to two Projects named \"team-a\" and \"team-b\"
    • Each project should be assigned an exact quota of 1 GPU.
    • A URL of the Run:ai Console. E.g. https://acme.run.ai.
    • Run:ai CLI installed on your machine. There are two available CLI variants:

      • The older V1 CLI. See installation here
      • A newer V2 CLI, supported with clusters of version 2.18 and up. See installation here
    "},{"location":"Researcher/Walkthroughs/walkthrough-queue-fairness/#login","title":"Login","text":"

    Run runai login and enter your credentials.

    "},{"location":"Researcher/Walkthroughs/walkthrough-queue-fairness/#part-i-immediate-displacement-of-over-quota","title":"Part I: Immediate Displacement of Over-Quota","text":"

    Run the following commands:

    CLI V1CLI V2
    runai submit a1 -i runai.jfrog.io/demo/quickstart -g 1 -p team-a\nrunai submit a2 -i runai.jfrog.io/demo/quickstart -g 1 -p team-a\nrunai submit a3 -i runai.jfrog.io/demo/quickstart -g 1 -p team-a\nrunai submit a4 -i runai.jfrog.io/demo/quickstart -g 1 -p team-a\n
    runai training submit a1 -i runai.jfrog.io/demo/quickstart -g 1 -p team-a\nrunai training submit a2 -i runai.jfrog.io/demo/quickstart -g 1 -p team-a\nrunai training submit a3 -i runai.jfrog.io/demo/quickstart -g 1 -p team-a\nrunai training submit a4 -i runai.jfrog.io/demo/quickstart -g 1 -p team-a\n

    System status after run:

    Discussion

    team-a, even though it has a single GPU as quota, is now using all 4 GPUs.

    Run the following commands:

    CLI V1CLI V2
    runai submit b1 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\nrunai submit b2 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\nrunai submit b3 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\nrunai submit b4 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\n
    runai training submit b1 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\nrunai training submit b2 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\nrunai training submit b3 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\nrunai training submit b4 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\n

    System status after run:

    Discussion

    • Two team-b Jobs have immediately displaced team-a.
    • team-a and team-b each have a quota of 1 GPU; thus, the remaining over-quota (2 GPUs) is distributed equally between the Projects.
    "},{"location":"Researcher/Walkthroughs/walkthrough-queue-fairness/#part-2-queue-fairness","title":"Part 2: Queue Fairness","text":"

    Now let's start deleting Jobs. Alternatively, you can wait for Jobs to complete.

    CLI V1CLI V2
    runai delete job b2 -p team-b\n
    runai training delete b2 -p team-b\n

    Discussion

    As the quotas are equal (1 for each Project), the remaining pending Jobs will get scheduled one by one, alternating between Projects, regardless of the time at which they were submitted.

    "},{"location":"Researcher/best-practices/bare-metal-to-docker-images/","title":"Best Practice: From Bare Metal to Docker Images","text":""},{"location":"Researcher/best-practices/bare-metal-to-docker-images/#introduction","title":"Introduction","text":"

    Some Researchers do data science on bare metal. The term bare metal refers to connecting to a server and working directly on its operating system and disks.

    This is the fastest way to start working, but it introduces problems when the data science organization scales:

    • More Researchers mean that the machine resources need to be efficiently shared
    • Researchers need to collaborate and share data, code, and results

    To overcome that, people working on bare metal typically write scripts to gather data, code, and code dependencies. This soon becomes an overwhelming task.

    "},{"location":"Researcher/best-practices/bare-metal-to-docker-images/#why-use-docker-images","title":"Why Use Docker Images?","text":"

    Docker images and containerization in general provide a level of abstraction which, by and large, frees developers and Researchers from the mundane tasks of setting up an environment. The image is an operating system by itself and thus the 'environment' is, by and large, part of the image.

    When a docker image is instantiated, it creates a container. A container is the running manifestation of a docker image.

    "},{"location":"Researcher/best-practices/bare-metal-to-docker-images/#moving-a-data-science-environment-to-docker","title":"Moving a Data Science Environment to Docker","text":"

    A data science environment typically includes:

  • Training data
  • Machine Learning (ML) code and inputs
  • Libraries: Code dependencies that must be installed before the ML code can be run
  • "},{"location":"Researcher/best-practices/bare-metal-to-docker-images/#training-data","title":"Training data","text":"

    Training data is usually significantly large (from several Gigabytes to Petabytes) and is read-only in nature. Thus, training data is typically left outside of the docker image. Instead, the data is mounted onto the image when it is instantiated. Mounting a volume allows the code within the container to access the data as though it was within a directory on the local file system.

    The best practice is to store the training data on a shared file system. This allows the data to be accessed uniformly on whichever machine the Researcher is currently using, allowing the Researcher to easily migrate between machines.

    Organizations without a shared file system typically write scripts to copy data from machine to machine.

    "},{"location":"Researcher/best-practices/bare-metal-to-docker-images/#machine-learning-code-and-inputs","title":"Machine Learning Code and Inputs","text":"

    As a rule, code needs to be saved and versioned in a code repository.

    There are two alternative practices:

    • The code resides in the image and is periodically pulled from the repository. This practice requires building a new container image each time a change is introduced to the code.
    • When a shared file system exists, the code can reside outside the image on a shared disk and mounted via a volume onto the container.

    Both practices are valid.

    Inputs to machine learning models and artifacts of training sessions, like model checkpoints, are also better stored in and loaded from a shared file system.

    "},{"location":"Researcher/best-practices/bare-metal-to-docker-images/#code-dependencies","title":"Code Dependencies","text":"

    Any code has code dependencies. These libraries must be installed for the code to run. As the code is changing, so do the dependencies.

    ML code is typically written in Python, and Python dependencies are typically declared in a single requirements.txt file that is saved together with the code.

    The best practice is to have your docker startup script (see below) install from this file using pip install -r requirements.txt. This allows the flexibility of adding and removing code dependencies dynamically.

    "},{"location":"Researcher/best-practices/bare-metal-to-docker-images/#ml-lifecycle-build-and-train","title":"ML Lifecycle: Build and Train","text":"

    Deep learning workloads can be divided into two generic types:

  • Interactive \"build\" sessions. With these types of workloads, the data scientist opens an interactive session, via bash, Jupyter Notebook, remote PyCharm, or similar and accesses GPU resources directly. Build workloads are typically meant for debugging and development sessions.
  • Unattended \"training\" sessions. Training is characterized by a machine learning run that has a start and a finish. With these types of workloads, the data scientist prepares a self-running workload and sends it for execution. During the execution, the data scientist can examine the results. A Training session can take from a few minutes to a couple of days. It can be interrupted in the middle and later restored (though the data scientist should save checkpoints for that purpose). Training workloads typically utilize large percentages of the GPU and at the end of the run automatically frees the resources.
  • Getting your docker ready is also a matter of which type of workload you are currently running.

    "},{"location":"Researcher/best-practices/bare-metal-to-docker-images/#build-workloads","title":"Build Workloads","text":"

    With \"build\" you are actually coding and debugging small experiments. You are interactive. In that mode, you can typically take a well known standard image (e.g. https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) and use it directly.

    Start a docker container by running:

    docker run -it .... -v /where/my/code/resides:/where/my/code/resides \"the well known image\" bash 

    You get a shell prompt to a container with a mounted volume of where your code is. You can then install your prerequisites and run your code via ssh.

    You can also access the container remotely from tools such as PyCharm, Jupyter Notebook, and more. In this case, the docker image needs to be customized to install the \"server software\" (e.g. a Jupyter Notebook service).

    "},{"location":"Researcher/best-practices/bare-metal-to-docker-images/#training-workloads","title":"Training Workloads","text":"

    For training workloads, you can use a well-known image (e.g. the TensorFlow image from the link above) but more often than not, you want to create your own docker image. The best practice is to use the well-known image (e.g. TensorFlow from above) as a base image and add your own customizations on top of it. To achieve that, you create a Dockerfile. A Dockerfile is a declarative way to build a docker image and is built in layers. e.g.:

    1. Base image is nvidia-tensorflow
    2. Install popular software
    3. (Optional) Run a script

    The script can be part of the image or can be provided as part of the command line to run the docker. It will typically include additional dependencies to install as well as a reference to the ML code to be run.

    The best practice for running training workloads is to test the container image in a \"build\" session and then send it for execution as a training Job. For further information on how to set up and parameterize a training workload via docker or Run:ai see Converting your Workload to use Unattended Training Execution.

    "},{"location":"Researcher/best-practices/convert-to-unattended/","title":"Best Practice: Convert your Workload to Run Unattended","text":""},{"location":"Researcher/best-practices/convert-to-unattended/#motivation","title":"Motivation","text":"

    Run:ai allows non-interactive training workloads to extend beyond guaranteed quotas and into over-quota as long as computing resources are available. To achieve this kind of flexibility, the system needs to be able to safely stop a workload and restart it again later. This requires Researchers to switch workloads from running interactively, to running unattended, thus allowing Run:ai to pause/resume the run.

    Unattended workloads are a good fit for long-duration runs, or sets of smaller hyperparameter optimization runs.

    "},{"location":"Researcher/best-practices/convert-to-unattended/#best-practices","title":"Best Practices","text":""},{"location":"Researcher/best-practices/convert-to-unattended/#docker-image","title":"Docker Image","text":"

    A docker container is based on a docker image. Some Researchers use generic images such as ones provided by Nvidia, for example: NVIDIA NGC TensorFlow. Others use generic images as the base for a more customized image, built using a Dockerfile.

    Since Researchers are not always proficient in building Dockerfiles, as a best practice you will want to:

    • Use the same docker image both for interactive and unattended jobs. In this way, you can keep the difference between both methods of invocation to a minimum. This can be a stock image from Nvidia or a custom image.
    • Leave some degree of flexibility, which allows the Researcher to add/remove python dependencies without re-creating images.
    "},{"location":"Researcher/best-practices/convert-to-unattended/#code-location","title":"Code Location","text":"

    You will want to minimize the cycle of code change-and-run. There are a couple of best practices which you can choose from:

    1. Code resides on the network file storage. This way you can change the code and immediately run the Job. The Job picks up the new files from the network.
    2. Use the runai submit flag --git-sync. The flag allows the Researcher to provide details of a Git repository. The repository will be automatically cloned into a specified directory when the container starts.
    3. The code can be embedded within the image. In this case, you will want to create an automatic CI/CD process, which packages the code into a modified image.

    The document below assumes option #1.

    "},{"location":"Researcher/best-practices/convert-to-unattended/#create-a-startup-script","title":"Create a Startup Script","text":"

    Gather the commands you ran inside the interactive Job into a single script. The script is provided on the command line at the start of the unattended execution (see the section Running the Job below). This script should be kept next to your code, on a shared network drive (e.g. /nfs/john).

    An example of a common startup script start.sh:

    pip install -r requirements.txt\n...\npython training.py\n

    The first line of this script is there to make sure that all required python libraries are installed before the training script executes. It also allows the Researcher to add/remove libraries without needing changes to the image itself.

    "},{"location":"Researcher/best-practices/convert-to-unattended/#support-variance-between-different-runs","title":"Support Variance Between Different Runs","text":"

    Your training script must be flexible enough to support variance in execution without changing the code. For example, you will want to change the number of epochs to run, apply a different set of hyperparameters, etc. There are two ways to handle this in your script. You can use one or both methods:

    1. Your script can read arguments passed to the script:

      python training.py --number-of-epochs=30

    In which case, change your start.sh script to:

    pip install -r requirements.txt\n...\npython training.py \"$@\"\n
    2. Your script can read from environment variables during script execution. In case you use environment variables, the variables will be passed to the training script automatically. No special action is required in this case. A minimal sketch covering both methods is shown below.
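
    A minimal Python sketch of both methods, using the EPOCHS and LEARNING_RATE environment variables from the submission example later in this document (the argument name and default values are illustrative):

    import argparse\nimport os\n\n# Method 1: read command-line arguments passed through the startup script ($@)\nparser = argparse.ArgumentParser()\nparser.add_argument('--number-of-epochs', type=int, default=10)\nargs = parser.parse_args()\n\n# Method 2: read environment variables passed at submit time\nepochs = int(os.environ.get('EPOCHS', args.number_of_epochs))\nlearning_rate = float(os.environ.get('LEARNING_RATE', '0.01'))\n\nprint('epochs=%d learning_rate=%g' % (epochs, learning_rate))\n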
    "},{"location":"Researcher/best-practices/convert-to-unattended/#checkpoints","title":"Checkpoints","text":"

    Run:ai can pause unattended executions, giving your GPU resources to another workload. When the time comes, Run:ai will give you back the resources and restore your workload. Thus, it is a good practice to save your weights at various checkpoints and start a workload from the latest checkpoint (typically between epochs).

    TensorFlow, PyTorch, and others have mechanisms to help save checkpoints (e.g. https://www.tensorflow.org/guide/checkpoint for TensorFlow and https://pytorch.org/tutorials/recipes/recipes/saving_and_loading_a_general_checkpoint.html for PyTorch).

    It is important to save the checkpoints to network storage and not the machine itself. When your workload resumes, it can, in all probability, be allocated to a different node (machine) than the original node.

    For more information on best practices for saving checkpoints, see Saving Deep Learning Checkpoints.

    "},{"location":"Researcher/best-practices/convert-to-unattended/#running-the-job","title":"Running the Job","text":"

    Using runai submit, drop the --interactive flag. To submit a Job that uses the script created above, use the -- [COMMAND] syntax to specify the command and pass its arguments, and pass environment variables using the --environment flag.

    Example with Environment variables:

    runai submit train1 -i tensorflow/tensorflow:1.14.0-gpu-py3  \n    -v /nfs/john:/mydir -g 1  --working-dir /mydir/  \n    -e 'EPOCHS=30'  -e 'LEARNING_RATE=0.02'  \n    -- ./startup.sh  \n

    Example with Command-line arguments:

    runai submit train1 -i tensorflow/tensorflow:1.14.0-gpu-py3  \n    -v /nfs/john:/mydir -g 1  --working-dir /mydir/  \n    -- ./startup.sh batch-size=64 number-of-epochs=3\n

    Please refer to Command-Line Interface, runai submit for a list of all arguments accepted by the Run:ai CLI.

    "},{"location":"Researcher/best-practices/convert-to-unattended/#use-cli-policies","title":"Use CLI Policies","text":"

    Different run configurations may vary significantly and can be tedious to write out each time on the command line. To make life easier, the CLI offers a way to set administrator policies for these configurations and to use the pre-configured settings when submitting a Workload. Please refer to Configure Policies.

    "},{"location":"Researcher/best-practices/convert-to-unattended/#attached-files","title":"Attached Files","text":"

    The three relevant files mentioned in this document can be downloaded from GitHub.

    "},{"location":"Researcher/best-practices/convert-to-unattended/#see-also","title":"See Also","text":"

    See the unattended training Quickstart: Launch Unattended Training Workloads

    "},{"location":"Researcher/best-practices/env-variables/","title":"Environment Variables inside a Run:ai Workload","text":""},{"location":"Researcher/best-practices/env-variables/#identifying-a-job","title":"Identifying a Job","text":"

    There may be use cases where your container needs to uniquely identify the Job it is currently running in. A typical use case is for saving Job artifacts under a unique name. Run:ai provides pre-defined environment variables you can use. These variables are guaranteed to be unique even if the Job is preempted or evicted and then runs again.

    Run:ai provides the following environment variables:

    • JOB_NAME - the name of the Job.
    • JOB_UUID - a unique identifier for the Job.

    Note that the Job can be deleted and then recreated with the same name. A Job UUID will be different even if the Job names are the same.

    "},{"location":"Researcher/best-practices/env-variables/#gpu-allocation","title":"GPU Allocation","text":"

    Run:ai provides an environment variable, visible inside the container, to help identify the number of GPUs allocated for the container. Use RUNAI_NUM_OF_GPUS

    "},{"location":"Researcher/best-practices/env-variables/#node-name","title":"Node Name","text":"

    There may be use cases where your container needs to identify the node it is currently running on. Run:ai provides an environment variable, visible inside the container, to help identify the name of the node on which the pod was scheduled. Use NODE_NAME

    "},{"location":"Researcher/best-practices/env-variables/#usage-example-in-python","title":"Usage Example in Python","text":"
    import os\n\njobName = os.environ['JOB_NAME']\njobUUID = os.environ['JOB_UUID']\n
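
    The GPU-allocation and node-name variables described above can be read the same way; a minimal sketch (assuming the variables are present in the container):

    import os\n\nnum_gpus = os.environ.get('RUNAI_NUM_OF_GPUS', '0')   # number of GPUs allocated to the container\nnode_name = os.environ.get('NODE_NAME', 'unknown')    # node on which the pod was scheduled\nprint('Running on node ' + node_name + ' with ' + num_gpus + ' GPU(s)')\n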
    "},{"location":"Researcher/best-practices/researcher-notifications/","title":"Researcher Email Notifications","text":""},{"location":"Researcher/best-practices/researcher-notifications/#importance-of-email-notifications-for-data-scientists","title":"Importance of Email Notifications for Data Scientists","text":"

    Managing numerous data science workloads requires monitoring various stages, including submission, scheduling, initialization, execution, and completion. Additionally, handling suspensions and failures is crucial for ensuring timely workload completion. Email Notifications address this need by sending alerts for critical workload life cycle changes. This empowers data scientists to take necessary actions and prevent delays.

    Once the system administrator configures the email notifications, users will receive notifications about their jobs that transition from one status to another. In addition, the user will get warning notifications before workload termination due to project-defined timeouts. Details included in the email are:

    • Workload type
    • Project and cluster information
    • Event timestamp

    To configure the types of email notifications you can receive:

    1. Log in to your account.
    2. Press the user icon, then select Settings.
    3. In the Email notifications section, under Send me an email about my workloads when, select the relevant workload statuses.
    4. When complete, press Save.
    "},{"location":"Researcher/best-practices/save-dl-checkpoints/","title":"Best Practice: Save Deep-Learning Checkpoints","text":""},{"location":"Researcher/best-practices/save-dl-checkpoints/#introduction","title":"Introduction","text":"

    Run:ai can pause unattended executions, giving your GPU resources to another workload. When the time comes, Run:ai will give you back the resources and restore your workload. Thus, it is a good practice to save the state of your run at various checkpoints and start a workload from the latest checkpoint (typically between epochs).

    "},{"location":"Researcher/best-practices/save-dl-checkpoints/#how-to-save-checkpoints","title":"How to Save Checkpoints","text":"

    TensorFlow, PyTorch, and others have mechanisms to help save checkpoints (e.g. https://www.tensorflow.org/guide/checkpoint for TensorFlow and https://pytorch.org/tutorials/recipes/recipes/saving_and_loading_a_general_checkpoint.html for PyTorch).

    This document uses Keras as an example. The code itself can be found here

    "},{"location":"Researcher/best-practices/save-dl-checkpoints/#where-to-save-checkpoints","title":"Where to Save Checkpoints","text":"

    It is important to save the checkpoints to network storage and not the machine itself. When your workload resumes, it can, in all probability, be allocated to a different node (machine) than the original node. Example:

    runai submit train-with-checkpoints -i tensorflow/tensorflow:1.14.0-gpu-py3 \\\n  -v /mnt/nfs_share/john:/mydir -g 1  --working-dir /mydir --command -- ./startup.sh\n

    The command saves the checkpoints in an NFS checkpoints folder /mnt/nfs_share/john

    "},{"location":"Researcher/best-practices/save-dl-checkpoints/#when-to-save-checkpoints","title":"When to Save Checkpoints","text":""},{"location":"Researcher/best-practices/save-dl-checkpoints/#save-periodically","title":"Save Periodically","text":"

    It is a best practice to save checkpoints at intervals. For example, every epoch as the Keras code below shows:

    checkpoints_file = \"weights.best.hdf5\"\ncheckpoint = ModelCheckpoint(checkpoints_file, monitor='val_acc', verbose=1, \n    save_best_only=True, mode='max')\n
    "},{"location":"Researcher/best-practices/save-dl-checkpoints/#save-on-exit-signal","title":"Save on Exit Signal","text":"

    If periodic checkpoints are not enough, you can use a signal-hook provided by Run:ai (via Kubernetes). The hook is python code that is called before your Job is suspended and allows you to save your checkpoints as well as other state data you may wish to store.

    import signal\nimport time\n\ndef graceful_exit_handler(signum, frame):\n    # save your checkpoints to shared storage\n\n    # exit with status \"1\" is important for the Job to return later.  \n    exit(1)\n\nsignal.signal(signal.SIGTERM, graceful_exit_handler)\n

    By default, you will have 30 seconds to save your checkpoints.

    Important

    For the signal to be captured, it must be propagated from the startup script to the python child process. See code here

    "},{"location":"Researcher/best-practices/save-dl-checkpoints/#resuming-using-saved-checkpoints","title":"Resuming using Saved Checkpoints","text":"

    A Run:ai unattended workload that is resumed will run the same startup script as on the first run. It is the responsibility of the script developer to add code that:

    • Checks if saved checkpoints exist (see above)
    • If saved checkpoints exist, load them and start the run using these checkpoints
    import os\n\ncheckpoints_file = \"weights.best.hdf5\"\nif os.path.isfile(checkpoints_file):\n    print(\"loading checkpoint file: \" + checkpoints_file)\n    model.load_weights(checkpoints_file)\n
    "},{"location":"Researcher/best-practices/secrets-as-env-var-in-cli/","title":"Propogating secrets as environment variables to workloads via the CLI","text":"

    The following is a knowledge article for Run:ai command-line interface users who wish to propagate a Kubernetes secret as an environment variable.

    "},{"location":"Researcher/best-practices/secrets-as-env-var-in-cli/#kubernetes-secrets","title":"Kubernetes Secrets","text":"

    Sometimes you want to use sensitive information within your code, for example, passwords, OAuth tokens, or SSH keys. The best practice for saving such information in Kubernetes is via Kubernetes Secrets. Kubernetes Secrets let you store and manage sensitive information, and access to secrets is limited via configuration.

    A Kubernetes secret may hold multiple key-value pairs.

    "},{"location":"Researcher/best-practices/secrets-as-env-var-in-cli/#using-secrets-in-runai-workloads","title":"Using Secrets in Run:ai Workloads","text":"

    Our goal is to provide Run:ai Workloads with secrets as input in a secure way. Using the Run:ai command line, you will be able to pass a reference to a secret that already exists in Kubernetes.

    "},{"location":"Researcher/best-practices/secrets-as-env-var-in-cli/#creating-a-secret","title":"Creating a secret","text":"

    For details on how to create a Kubernetes secret see: https://kubernetes.io/docs/concepts/configuration/secret/. Here is an example:

    apiVersion: v1\nkind: Secret\nmetadata:\n  name: my-secret\n  namespace: runai-<project-name>\ndata:\n  username: am9obgo=\n  password: bXktcGFzc3dvcmQK\n

    Then run:

    kubectl apply -f <file-name>\n

    Notes

    • Secret values are base64 encoded (see the encoding sketch after this list)
    • Secrets are stored in the scope of a namespace and will not be accessible from other namespaces. Hence the reference to the Run:ai Project name above. Run:ai provides the ability to propagate secrets throughout all Run:ai Projects. See below.
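
    If you need to produce the base64-encoded values yourself, this can be done from Python; a minimal sketch (the plain-text values below are illustrative, not necessarily identical to the ones in the YAML above):

    import base64\n\n# Encode plain-text values for the data section of a Kubernetes Secret manifest\nprint(base64.b64encode(b'john').decode())         # value for the username key\nprint(base64.b64encode(b'my-password').decode())  # value for the password key\n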
    "},{"location":"Researcher/best-practices/secrets-as-env-var-in-cli/#attaching-a-secret-to-a-workload-on-submit-via-cli","title":"Attaching a secret to a Workload on Submit via CLI","text":"

    When you submit a new Workload, you will want to connect the secret to the new Workload. To do that, run:

    runai submit -e <ENV-VARIABLE>=SECRET:<secret-name>,<secret-key> ....\n

    For example:

    runai submit -i ubuntu -e MYUSERNAME=SECRET:my-secret,username\n
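
    Inside the container, the referenced secret key is then available as an ordinary environment variable; a minimal Python sketch using the MYUSERNAME variable from the example above:

    import os\n\n# The secret value is injected by Kubernetes as a regular environment variable\nusername = os.environ['MYUSERNAME']\nprint('username length: %d' % len(username))\n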
    "},{"location":"Researcher/cli-reference/Introduction/","title":"Introduction","text":"

    The Run:ai Command-line Interface (CLI) is one of the ways for a Researcher to send deep learning workloads, acquire GPU-based containers, list jobs, etc.

    To install and configure the Run:ai CLI see Researcher Setup - Start Here

    "},{"location":"Researcher/cli-reference/runai-attach/","title":"runai attach","text":""},{"location":"Researcher/cli-reference/runai-attach/#description","title":"Description","text":"

    Attach to a running Job.

    The command attaches to the standard input, output, and error streams of a running Job. If the Job has multiple pods, the command will attach to the first pod unless a specific pod is set.

    "},{"location":"Researcher/cli-reference/runai-attach/#synopsis","title":"Synopsis","text":"
    runai attach <job-name>\n    [--no-stdin]\n    [--no-tty]\n    [--pod string]\n\n    [--loglevel value] \n    [--help | -h]\n
    "},{"location":"Researcher/cli-reference/runai-attach/#options","title":"Options","text":"

    <job-name> - The name of the Job to run the command with. Mandatory.

    "},{"location":"Researcher/cli-reference/runai-attach/#-no-stdin","title":"--no-stdin","text":"

    Do not attach STDIN.

    "},{"location":"Researcher/cli-reference/runai-attach/#-no-tty","title":"--no-tty","text":"

    Do not allocate a pseudo-TTY

    "},{"location":"Researcher/cli-reference/runai-attach/#-pod-string","title":"--pod string","text":"

    Attach to a specific pod within the Job. To find the list of pods run runai describe job <job-name> and then use the pod name with the --pod flag.

    "},{"location":"Researcher/cli-reference/runai-attach/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-attach/#-loglevel-string","title":"--loglevel (string)","text":"

    Set the logging level. One of: debug | info | warn | error (default \"info\").

    "},{"location":"Researcher/cli-reference/runai-attach/#-help-h","title":"--help | -h","text":"

    Show help text.

    "},{"location":"Researcher/cli-reference/runai-attach/#output","title":"Output","text":"

    None

    "},{"location":"Researcher/cli-reference/runai-bash/","title":"runai bash","text":""},{"location":"Researcher/cli-reference/runai-bash/#description","title":"Description","text":"

    Get a bash session inside a running Job

    This command is a shortcut to runai exec (runai exec -it job-name bash). See runai exec for full documentation of the exec command.

    "},{"location":"Researcher/cli-reference/runai-bash/#synopsis","title":"Synopsis","text":"
    runai bash <job-name> \n    [--pod string]\n\n    [--loglevel value] \n    [--project string | -p string] \n    [--help | -h]\n
    "},{"location":"Researcher/cli-reference/runai-bash/#options","title":"Options","text":"

    <job-name> - The name of the Job to run the command with. Mandatory.

    "},{"location":"Researcher/cli-reference/runai-bash/#-pod-string","title":"--pod string","text":"

    Specify a pod of a running Job. To get a list of the pods of a specific Job, run runai describe job <job-name> command

    "},{"location":"Researcher/cli-reference/runai-bash/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-bash/#-loglevel-string","title":"--loglevel (string)","text":"

    Set the logging level. One of: debug | info | warn | error (default \"info\")

    "},{"location":"Researcher/cli-reference/runai-bash/#-project-p-string","title":"--project | -p (string)","text":"

    Specify the Project to which the command applies. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

    "},{"location":"Researcher/cli-reference/runai-bash/#-help-h","title":"--help | -h","text":"

    Show help text

    "},{"location":"Researcher/cli-reference/runai-bash/#output","title":"Output","text":"

    The command will access the container that should be currently running in the current cluster and attempt to create a command-line shell based on bash.

    The command will return an error if the container does not exist or has not been in a running state yet.

    "},{"location":"Researcher/cli-reference/runai-bash/#see-also","title":"See also","text":"

    Build Workloads. See Quickstart document: Launch Interactive Build Workloads.

    "},{"location":"Researcher/cli-reference/runai-config/","title":"runai config","text":""},{"location":"Researcher/cli-reference/runai-config/#description","title":"Description","text":"

    Set a default Project or Cluster

    "},{"location":"Researcher/cli-reference/runai-config/#synopsis","title":"Synopsis","text":"
    runai  config project <project-name>\n    [--loglevel value] \n    [--help | -h]\n\nrunai  config cluster <cluster-name>\n    [--loglevel value] \n    [--help | -h]\n
    "},{"location":"Researcher/cli-reference/runai-config/#options","title":"Options","text":"

    <project-name> - The name of the Project you want to set as default. Mandatory.

    <cluster-name> - The name of the cluster you want to set as the current cluster. Mandatory.

    "},{"location":"Researcher/cli-reference/runai-config/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-config/#-loglevel-string","title":"--loglevel (string)","text":"

    Set the logging level. One of: debug | info | warn | error (default \"info\").

    "},{"location":"Researcher/cli-reference/runai-config/#-help-h","title":"--help | -h","text":"

    Show help text.

    "},{"location":"Researcher/cli-reference/runai-config/#output","title":"Output","text":"

    None

    "},{"location":"Researcher/cli-reference/runai-delete/","title":"runai delete","text":""},{"location":"Researcher/cli-reference/runai-delete/#description","title":"Description","text":"

    Delete a Workload and its associated Pods.

    Note that once you delete a Workload, its entire data will be gone:

    • You will no longer be able to enter it via bash.
    • You will no longer be able to access logs.
    • Any data saved on the container and not stored in a shared location will be lost.
    "},{"location":"Researcher/cli-reference/runai-delete/#synopsis","title":"Synopsis","text":"
    runai delete job <job-name> \n    [--all | -A]\n\n    [--loglevel value] \n    [--project string | -p string] \n    [--help | -h]\n
    "},{"location":"Researcher/cli-reference/runai-delete/#options","title":"Options","text":"

    <job-name> - The name of the Workload to run the command with. Mandatory.

    "},{"location":"Researcher/cli-reference/runai-delete/#-all-a","title":"--all | -A","text":"

    Delete all Workloads.

    "},{"location":"Researcher/cli-reference/runai-delete/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-delete/#-loglevel-string","title":"--loglevel (string)","text":"

    Set the logging level. One of: debug | info | warn | error (default \"info\").

    "},{"location":"Researcher/cli-reference/runai-delete/#-project-p-string","title":"--project | -p (string)","text":"

    Specify the Project to which the command applies. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

    "},{"location":"Researcher/cli-reference/runai-delete/#-help-h","title":"--help | -h","text":"

    Show help text.

    "},{"location":"Researcher/cli-reference/runai-delete/#output","title":"Output","text":"
    • The Workload will be deleted and not available via the command runai list jobs.

    • The Workloads will show as deleted from the Run:ai user interface Job list.

    "},{"location":"Researcher/cli-reference/runai-delete/#see-also","title":"See Also","text":"
    • Build Workloads. See Quickstart document: Launch Interactive Build Workloads.

    • Training Workloads. See Quickstart document: Launch Unattended Training Workloads.

    "},{"location":"Researcher/cli-reference/runai-describe/","title":"runai describe","text":""},{"location":"Researcher/cli-reference/runai-describe/#description","title":"Description","text":"

    Display details of a Workload or Node.

    "},{"location":"Researcher/cli-reference/runai-describe/#synopsis","title":"Synopsis","text":"
    runai describe job <job-name> \n    [--output value | -o value]  \n\n    [--loglevel value] \n    [--project string | -p string] \n    [--help | -h]\n\n\nrunai describe node [node-name] \n\n    [--loglevel value] \n    [--help | -h]\n
    "},{"location":"Researcher/cli-reference/runai-describe/#options","title":"Options","text":"
    • <job-name> - The name of the Workload to run the command with. Mandatory.
    • <node-name> - The name of the Node to run the command with. If a Node name is not specified, a description of all Nodes is shown.

    -o | --output

    Output format. One of: json|yaml|wide. Default is 'wide'

    "},{"location":"Researcher/cli-reference/runai-describe/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-describe/#-loglevel-string","title":"--loglevel (string)","text":"

    Set the logging level. One of: debug | info | warn | error (default \"info\").

    "},{"location":"Researcher/cli-reference/runai-describe/#-project-p-string","title":"--project | -p (string)","text":"

    Specify the Project to which the command applies. By default, commands apply to the default Project. To change the default Project, use: runai config project <project-name>.

    "},{"location":"Researcher/cli-reference/runai-describe/#-help-h","title":"--help | -h","text":"

    Show help text

    "},{"location":"Researcher/cli-reference/runai-describe/#output","title":"Output","text":"
    • The runai describe job command will show Workload properties and status as well as lifecycle events and the list of related resources and pods.
    • The runai describe node command will show Node properties.
    "},{"location":"Researcher/cli-reference/runai-exec/","title":"runai exec","text":""},{"location":"Researcher/cli-reference/runai-exec/#description","title":"Description","text":"

    Execute a command inside a running Job

    Note: to execute a bash command, you can also use the shorthand runai bash

    "},{"location":"Researcher/cli-reference/runai-exec/#synopsis","title":"Synopsis","text":"
    runai exec <job-name> <command> \n    [--stdin | -i] \n    [--tty | -t]\n\n    [--loglevel value] \n    [--project string | -p string] \n    [--help | -h]\n
    "},{"location":"Researcher/cli-reference/runai-exec/#options","title":"Options","text":"

    <job-name> - The name of the Job to run the command with. Mandatory.

    <command> - the command itself (e.g. bash).

    "},{"location":"Researcher/cli-reference/runai-exec/#-stdin-i","title":"--stdin | -i","text":"

    Keep STDIN open even if not attached.

    "},{"location":"Researcher/cli-reference/runai-exec/#-tty-t","title":"--tty | -t","text":"

    Allocate a pseudo-TTY.

    "},{"location":"Researcher/cli-reference/runai-exec/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-exec/#-loglevel-string","title":"--loglevel (string)","text":"

    Set the logging level. One of: debug | info | warn | error (default \"info\").

    "},{"location":"Researcher/cli-reference/runai-exec/#-project-p-string","title":"--project | -p (string)","text":"

    Specify the Project to which the command applies. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

    "},{"location":"Researcher/cli-reference/runai-exec/#-help-h","title":"--help | -h","text":"

    Show help text.

    "},{"location":"Researcher/cli-reference/runai-exec/#output","title":"Output","text":"

    The command will run in the context of the container.

    "},{"location":"Researcher/cli-reference/runai-exec/#see-also","title":"See Also","text":""},{"location":"Researcher/cli-reference/runai-list/","title":"runai list","text":""},{"location":"Researcher/cli-reference/runai-list/#description","title":"Description","text":"

    Show lists of Workloads, Projects, Clusters or Nodes.

    "},{"location":"Researcher/cli-reference/runai-list/#synopsis","title":"Synopsis","text":"
    runai list jobs \n    [--all-projects | -A]  \n\n    [--loglevel value] \n    [--project string | -p string] \n    [--help | -h]\n\nrunai list projects \n    [--loglevel value] \n    [--help | -h]\n\nrunai list clusters  \n    [--loglevel value] \n    [--help | -h]\n\nrunai list nodes [node-name]\n    [--loglevel value] \n    [--help | -h]\n
    "},{"location":"Researcher/cli-reference/runai-list/#options","title":"Options","text":"

    node-name - Name of a specific node to list (optional).

    "},{"location":"Researcher/cli-reference/runai-list/#-all-projects-a","title":"--all-projects | -A","text":"

    Show Workloads from all Projects.

    "},{"location":"Researcher/cli-reference/runai-list/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-list/#-loglevel-string","title":"--loglevel (string)","text":"

    Set the logging level. One of: debug | info | warn | error (default \"info\").

    "},{"location":"Researcher/cli-reference/runai-list/#-project-p-string","title":"--project | -p (string)","text":"

    Specify the Project to which the command applies. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

    "},{"location":"Researcher/cli-reference/runai-list/#-help-h","title":"--help | -h","text":"

    Show help text.

    "},{"location":"Researcher/cli-reference/runai-list/#output","title":"Output","text":"
    • A list of Workloads, Nodes, Projects, or Clusters.
    • To filter 'runai list nodes' for a specific Node, add the Node name.
    "},{"location":"Researcher/cli-reference/runai-list/#see-also","title":"See Also","text":"

    To show details for a specific Workload or Node see runai describe.

    "},{"location":"Researcher/cli-reference/runai-login/","title":"runai login","text":""},{"location":"Researcher/cli-reference/runai-login/#description","title":"Description","text":"

    Login to Run:ai

    When Researcher Authentication is enabled, you will need to log in to Run:ai using your username and password before accessing resources

    "},{"location":"Researcher/cli-reference/runai-login/#synopsis","title":"Synopsis","text":"
    runai login \n    [--loglevel value]\n    [--help | -h]\n
    "},{"location":"Researcher/cli-reference/runai-login/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-login/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-login/#-loglevel-string","title":"--loglevel (string)","text":"

    Set the logging level. One of: debug | info | warn | error (default \"info\").

    "},{"location":"Researcher/cli-reference/runai-login/#-help-h","title":"--help | -h","text":"

    Show help text.

    "},{"location":"Researcher/cli-reference/runai-login/#output","title":"Output","text":"

    You will be prompted for a user name and password

    "},{"location":"Researcher/cli-reference/runai-login/#see-also","title":"See Also","text":"
    • runai logout.
    "},{"location":"Researcher/cli-reference/runai-logout/","title":"runai logout","text":""},{"location":"Researcher/cli-reference/runai-logout/#description","title":"Description","text":"

    Log out from Run:ai

    "},{"location":"Researcher/cli-reference/runai-logout/#synopsis","title":"Synopsis","text":"
    runai logout \n    [--loglevel value]\n    [--help | -h]\n
    "},{"location":"Researcher/cli-reference/runai-logout/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-logout/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-logout/#-loglevel-string","title":"--loglevel (string)","text":"

    Set the logging level. One of: debug | info | warn | error (default \"info\").

    "},{"location":"Researcher/cli-reference/runai-logout/#-help-h","title":"--help | -h","text":"

    Show help text.

    "},{"location":"Researcher/cli-reference/runai-logout/#output","title":"Output","text":"

    You will be logged out from Run:ai

    "},{"location":"Researcher/cli-reference/runai-logout/#see-also","title":"See Also","text":"
    • runai login.
    "},{"location":"Researcher/cli-reference/runai-logs/","title":"runai logs","text":""},{"location":"Researcher/cli-reference/runai-logs/#description","title":"Description","text":"

    Show the logs of a Job.

    "},{"location":"Researcher/cli-reference/runai-logs/#synopsis","title":"Synopsis","text":"
    runai logs <job-name> \n    [--follow | -f] \n    [--pod string | -p string] \n    [--since duration] \n    [--since-time date-time] \n    [--tail int | -t int] \n    [--timestamps]  \n\n    [--loglevel value] \n    [--project string | -p string] \n    [--help | -h]\n
    "},{"location":"Researcher/cli-reference/runai-logs/#options","title":"Options","text":"

    <job-name> - The name of the Job to run the command with. Mandatory.

    "},{"location":"Researcher/cli-reference/runai-logs/#-follow-f","title":"--follow | -f","text":"

    Stream the logs.

    "},{"location":"Researcher/cli-reference/runai-logs/#-pod-p","title":"--pod | -p","text":"

    Specify a specific pod name. When a Job fails, it may start a couple of times in an attempt to succeed. The flag allows you to see the logs of a specific instance (called 'pod'). Get the name of the pod by running runai describe job <job-name>.

    "},{"location":"Researcher/cli-reference/runai-logs/#-instance-string-i-string","title":"--instance (string) | -i (string)","text":"

    Show logs for a specific instance in cases where a Job contains multiple pods.

    "},{"location":"Researcher/cli-reference/runai-logs/#-since-duration","title":"--since (duration)","text":"

    Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs. The flags since and since-time cannot be used together.

    "},{"location":"Researcher/cli-reference/runai-logs/#-since-time-date-time","title":"--since-time (date-time)","text":"

    Return logs after specified date. Date format should be RFC3339, example: 2020-01-26T15:00:00Z.

    "},{"location":"Researcher/cli-reference/runai-logs/#-tail-int-t-int","title":"--tail (int) | -t (int)","text":"

    # of lines of recent log file to display.

    "},{"location":"Researcher/cli-reference/runai-logs/#-timestamps","title":"--timestamps","text":"

    Include timestamps on each line in the log output.

    "},{"location":"Researcher/cli-reference/runai-logs/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-logs/#-loglevel-string","title":"--loglevel (string)","text":"

    Set the logging level. One of: debug | info | warn | error (default \"info\").

    "},{"location":"Researcher/cli-reference/runai-logs/#-project-p-string","title":"--project | -p (string)","text":"

    Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use runai config project <project-name>.

    "},{"location":"Researcher/cli-reference/runai-logs/#-help-h","title":"--help | -h","text":"

    Show help text.

    "},{"location":"Researcher/cli-reference/runai-logs/#output","title":"Output","text":"

    The command will show the logs of the first process in the container. For training Jobs, this would be the command run at startup. For interactive Jobs, the command may not show anything.

    "},{"location":"Researcher/cli-reference/runai-logs/#see-also","title":"See Also","text":"
    • Training Workloads. See Quickstart document: Launch Unattended Training Workloads.
    "},{"location":"Researcher/cli-reference/runai-port-forwarding/","title":"runai port-forward","text":""},{"location":"Researcher/cli-reference/runai-port-forwarding/#description","title":"Description","text":"

    Forward one or more local ports to the selected job or a pod within the job. The forwarding session ends when the selected job terminates or the terminal is interrupted.

    "},{"location":"Researcher/cli-reference/runai-port-forwarding/#examples","title":"Examples","text":"
    1. Port forward connections from localhost:8080 (localhost is the default) to the job on port 8090.

      runai port-forward <job-name> --port 8080:8090

    2. Port forward connections from 192.168.1.23:8080 to the job on port 8080.

      runai port-forward <job-name> --port 8080 --address 192.168.1.23

    3. Port forward multiple connections, from localhost:8080 to the job on port 8090 and from localhost:6443 to the job on port 443.

      runai port-forward <job-name> --port 8080:8090 --port 6443:443

    4. Port forward into a specific pod in a multi-pod job.

      runai port-forward <job-name> --port 8080:8090 --pod <pod-name>

    5. "},{"location":"Researcher/cli-reference/runai-port-forwarding/#global-flags","title":"Global flags","text":"

      --loglevel <string>\u2014Set the logging level. Choose: (default \"info\").

      -p | --project <string>\u2014Specify the project name. To change the default project use runai config project <project name>.

      "},{"location":"Researcher/cli-reference/runai-port-forwarding/#flags","title":"Flags","text":"

      --address <string> | [local-interface-ip\\host] |localhost | 0.0.0.0 [privileged]\u2014The listening address of your local machine. (default \"localhost\").

      -h | --help\u2014Help for the command.

      --port\u2014forward ports based on one of the following arguments:

      • <stringArray>\u2014a list of port forwarding combinations.

      • [local-port]:[remote-port]\u2014different local and remote ports.

      • [local-port=remote-port]\u2014the same port is used for both local and remote.

      --pod\u2014Specify a pod of a running job. To get a list of the pods of a specific job, run the command runai describe <job-name>.

      --pod-running-timeout\u2014The length of time (like 5s, 2m, or 3h, higher than zero) to wait until the pod is running. Default is 10 minutes.

      Filter based flags

      --mpi\u2014search only for mpi jobs.

      --interactive\u2014search only for interactive jobs.

      --pytorch\u2014search only for pytorch jobs.

      --tf\u2014search only for tensorflow jobs.

      --train\u2014search only for training jobs.

      "},{"location":"Researcher/cli-reference/runai-resume/","title":"runai resume","text":""},{"location":"Researcher/cli-reference/runai-resume/#description","title":"Description","text":"

      Resume a suspended Job

      Resuming a previously suspended Job will return it to the queue for scheduling. The Job may or may not start immediately, depending on available resources.

      Suspend and resume do not work with mpi Jobs.

      "},{"location":"Researcher/cli-reference/runai-resume/#synopsis","title":"Synopsis","text":"
      runai resume <job-name>\n    [--all | -A]\n\n    [--loglevel value]\n    [--project string | -p string]\n    [--help | -h]\n
      "},{"location":"Researcher/cli-reference/runai-resume/#options","title":"Options","text":"

      <job-name> - The name of the Job to run the command with. Mandatory.

      "},{"location":"Researcher/cli-reference/runai-resume/#-all-a","title":"--all | -A","text":"

      Resume all suspended Jobs in the current Project.

      "},{"location":"Researcher/cli-reference/runai-resume/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-resume/#-loglevel-string","title":"--loglevel (string)","text":"

      Set the logging level. One of: debug | info | warn | error (default \"info\").

      "},{"location":"Researcher/cli-reference/runai-resume/#-project-p-string","title":"--project | -p (string)","text":"

      Specify the Project to which the command applies. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

      "},{"location":"Researcher/cli-reference/runai-resume/#-help-h","title":"--help | -h","text":"

      Show help text.

      "},{"location":"Researcher/cli-reference/runai-resume/#output","title":"Output","text":"
      • The Job will be resumed. When running runai list jobs the Job status will no longer be Suspended.
      "},{"location":"Researcher/cli-reference/runai-resume/#see-also","title":"See Also","text":"
      • Suspending Jobs: Suspend.
      "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/","title":"runai submit-dist tf","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#description","title":"Description","text":"

      Submit a distributed TensorFlow training Run:ai Job to run.

      Note

      To use distributed training you need to have installed the TensorFlow operator as specified in Distributed training.

      Syntax notes:

      • Options with a value type of stringArray mean that you can add multiple values. You can either separate values with a comma or add the flag twice.
      "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#examples","title":"Examples","text":"
      runai submit-dist tf --name distributed-job --workers=2 -g 1 \\\n    -i <image_name>\n
      "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#distributed","title":"Distributed","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-clean-pod-policy-string","title":"--clean-pod-policy < string >","text":"

      The\u00a0CleanPodPolicy\u00a0controls deletion of pods when a job terminates. The policy can be one of the following values:

      • Running\u2014only pods still running when a job completes (for example, parameter servers) will be deleted immediately. Completed pods will not be deleted so that the logs will be preserved. (Default)
      • All\u2014all (including completed) pods will be deleted immediately when the job finishes.
      • None\u2014no pods will be deleted when the job completes.
      "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-workers-int","title":"--workers < int>","text":"

      The number of worker pods for the distributed training Job.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#naming-and-shortcuts","title":"Naming and Shortcuts","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-job-name-prefix-string","title":"--job-name-prefix <string>","text":"

      The prefix to use to automatically generate a Job name with an incremental index. When a Job name is omitted Run:ai will generate a Job name. The optional --job-name-prefix flag creates Job names with the provided prefix.
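
      A hedged illustration (the prefix is a placeholder, and the exact index format is an assumption):

      runai submit-dist tf --job-name-prefix tf-exp --workers=2 -g 1 -i <image_name>

      This would create a Job whose name is derived from the prefix plus an incremental index, for example tf-exp-1.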

      "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-name-string","title":"--name <string>","text":"

      The name of the Job.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-template-string","title":"--template <string>","text":"

      Load default values from a workload.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#container-definition","title":"Container Definition","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-add-capability-stringarray","title":"--add-capability <stringArray>","text":"

      Add linux capabilities to the container.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-a-annotation-stringarray","title":"-a | --annotation <stringArray>","text":"

      Set annotation variables in the container.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-attach","title":"--attach","text":"

      Default is false. If set to true, wait for the Pod to start running. When the pod starts running, attach to the Pod. The flag is equivalent to the command runai attach.

      The --attach flag also sets --tty and --stdin to true.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-command","title":"--command","text":"

      Overrides the image's entry point with the command supplied after '--'. When the --command flag is not used, the entry point is not overridden and the string after -- is appended as arguments to the entry point command.

      Example:

      --command -- run.sh 1 54 will start the container and run run.sh 1 54

      -- script.py 10000 will append script.py 10000 as arguments to the entry point command (e.g. python)
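
      Putting this together, a hedged full submission (the image name and the training script are placeholders) that overrides the entry point:

      runai submit-dist tf --name distributed-job --workers=2 -g 1 -i <image_name> --command -- python train.py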

      "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-create-home-dir","title":"--create-home-dir","text":"

      Create a temporary home directory for the user in the container. Data saved in this directory will not be saved when the container exits. For more information see non root containers.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-e-stringarray-environment","title":"-e <stringArray> | --environment`

      Define environment variables to be set in the container. To set multiple values add the flag multiple times (-e BATCH_SIZE=50 -e LEARNING_RATE=0.2).

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-image-string-i-string","title":"--image <string> | -i <string>

      Image to use when creating the container for this Job

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-image-pull-policy-string","title":"--image-pull-policy <string>

      Pulling policy of the image when starting a container. Options are:

      • Always (default): force image pulling to check whether local image already exists. If the image already exists locally and has the same digest, then the image will not be downloaded.
      • IfNotPresent: the image is pulled only if it is not already present locally.
      • Never: the image is assumed to exist locally. No attempt is made to pull the image.

      For more information see Kubernetes documentation.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-l-label-stringarray","title":"-l | --label <stringArray>

      Set label variables in the container.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-master-args-string-string","title":"--master-args string <string>

      Arguments to pass to the master pod container command. If used together with --command, overrides the image's entrypoint of the master pod container with the given command.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-master-environment-stringarray","title":"--master-environment <stringArray>

      Set environment variables in the master pod container. To prevent a worker environment variable from being set in the master, use the format: name=-.
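
      For example (the variable and its value are placeholders), to set BATCH_SIZE in the workers while leaving it unset in the master:

      runai submit-dist tf --name distributed-job --workers=2 -g 1 -i <image_name> -e BATCH_SIZE=50 --master-environment BATCH_SIZE=-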

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-master-extended-resource-stringarray","title":"--master-extended-resource <stringArray>

      Request access to an extended resource in the master pod. Use the format: resource_name=quantity.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-master-gpu-float","title":"--master-gpu <float>

      GPU units to allocate for the master pod.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-master-no-pvcs","title":"--master-no-pvcs

      Do not mount any persistent volumes in the master pod.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-no-master","title":"--no-master

      Do not create a separate pod for the master.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-preferred-pod-topology-key-string","title":"--preferred-pod-topology-key <string>

      If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-required-pod-topology-key-string","title":"--required-pod-topology-key <string>

      Enforce scheduling pods of this job onto nodes that have a label with this key and identical values.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-stdin","title":"--stdin

      Keep stdin open for the container(s) in the pod, even if nothing is attached.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-t-tty","title":"-t | --tty

      Allocate a pseudo-TTY.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-working-dir-string","title":"--working-dir <string>

      Starts the container with the specified directory as the current directory.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#resource-allocation","title":"Resource Allocation","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-cpu-double","title":"--cpu <double>

      CPU units to allocate for the Job (0.5, 1, etc.). The Job will receive at least this amount of CPU. Note that the Job will not be scheduled unless the system can guarantee this amount of CPUs to the Job.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-cpu-limit-double","title":"--cpu-limit <double>

      Limitations on the number of CPUs consumed by the Job (for example 0.5, 1). The system guarantees that this Job will not be able to consume more than this amount of CPUs.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-extended-resource","title":"--extended-resource `

      Request access to an extended resource. Syntax: <resource-name>=<resource_quantity>.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-g-gpu-float","title":"-g | --gpu <float>

      GPU units to allocate for the Job (0.5, 1).

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-gpu-memory","title":"--gpu-memory

      GPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of GPU memory to the Job.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-memory-string","title":"--memory <string>

      CPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive at least this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of memory to the Job.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-memory-limit","title":"--memory-limit `

      CPU memory to allocate for this Job (1G, 20M, etc.). The system guarantees that this Job will not be able to consume more than this amount of memory. The Job will receive an error when trying to allocate more memory than this limit.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-mig-profile-string-deprecated","title":"--mig-profile <string> (Deprecated)

      MIG profile to allocate for the job (1g.5gb, 2g.10gb, 3g.20gb, 4g.20gb, 7g.40gb)

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#job-lifecycle","title":"Job Lifecycle","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-backoff-limit-int","title":"--backoff-limit <int>

      The number of times the Job will be retried before failing. The default is 6.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-ttl-after-finish-duration","title":"--ttl-after-finish < duration >

      The duration, after which a finished job is automatically deleted (e.g. 5s, 2m, 3h).

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#storage","title":"Storage","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-git-sync-stringarray","title":"--git-sync <stringArray>

      Clone a git repository into the container running the Job. The parameter should follow the syntax: source=REPOSITORY,branch=BRANCH_NAME,rev=REVISION,username=USERNAME,password=PASSWORD,target=TARGET_DIRECTORY_TO_CLONE.
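
      A hedged example following the syntax above (the repository URL, branch, and target directory are placeholders):

      runai submit-dist tf --name distributed-job --workers=2 -g 1 -i <image_name> --git-sync source=https://github.com/<org>/<repo>.git,branch=main,target=/workspace/repo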

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-large-shm","title":"--large-shm

      Mount a large /dev/shm device.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-mount-propagation","title":"--mount-propagation

      Enable HostToContainer mount propagation for all container volumes

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-nfs-server-string","title":"--nfs-server <string>

      Use this flag to specify a default NFS host for --volume flag. Alternatively, you can specify NFS host for each volume individually (see --volume for details).

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-pvc-storage_class_namesizecontainer_mount_pathro","title":"--pvc [Storage_Class_Name]:Size:Container_Mount_Path:[ro]","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-pvc-pvc_namecontainer_mount_pathro","title":"--pvc Pvc_Name:Container_Mount_Path:[ro]

      Mount a persistent volume claim into a container.

      Note

      This option is being deprecated from version 2.10 and above. To mount existing or newly created Persistent Volume Claim (PVC), use the parameters --pvc-exists and --pvc-new.

      The 2 syntax types of this command are mutually exclusive. You can either use the first or second form, but not a mixture of both.

      Storage_Class_Name is a storage class name that can be obtained by running kubectl get storageclasses.storage.k8s.io. This parameter may be omitted if there is a single storage class in the system, or you are using the default storage class.

      Size is the volume size you want to allocate. See Kubernetes documentation for how to specify volume sizes

      Container_Mount_Path. A path internal to the container where the storage will be mounted

      Pvc_Name. The name of a pre-existing Persistent Volume Claim to mount into the container

      Examples:

      --pvc :3Gi:/tmp/john:ro - Allocate 3GB from the default Storage class. Mount it to /tmp/john as read-only

      --pvc my-storage:3Gi:/tmp/john:ro - Allocate 3GB from the my-storage storage class. Mount it to /tmp/john as read-only

      --pvc :3Gi:/tmp/john - Allocate 3GB from the default storage class. Mount it to /tmp/john as read-write

      --pvc my-pvc:/tmp/john - Use a Persistent Volume Claim named my-pvc. Mount it to /tmp/john as read-write

      --pvc my-pvc-2:/tmp/john:ro - Use a Persistent Volume Claim named my-pvc-2. Mount it to /tmp/john as read-only

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-pvc-exists-string","title":"--pvc-exists <string>

      Mount a persistent volume. You must include a claimname and path.

      • claim name\u2014The name of the persistent volume claim. Can be obtained by running

      kubectl get pvc

      • path\u2014the path internal to the container where the storage will be mounted

      Use the format:

      claimname=<CLAIM_NAME>,path=<PATH>
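
      For example (the claim name and mount path are placeholders):

      runai submit-dist tf --name distributed-job --workers=2 -g 1 -i <image_name> --pvc-exists claimname=my-pvc,path=/data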

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-pvc-new-string","title":"--pvc-new <string>

      Mount a persistent volume claim (PVC). If the PVC does not exist, it will be created based on the parameters entered. If a PVC exists, it will be used with its defined attributes and the parameters in the command will be ignored.

      • claim name\u2014The name of the persistent volume claim.
      • storage class\u2014A storage class name that can be obtained by running

      kubectl get storageclasses.storage.k8s.io.

      storageclass may be omitted if there is a single storage class in the system, or you are using the default storage class.

      • size\u2014The volume size you want to allocate for the PVC when creating it. See Kubernetes documentation to specify volume sizes.
      • accessmode\u2014The description of the desired volume capabilities for the PVC.
      • ro\u2014Mount the PVC with read-only access.
      • ephemeral\u2014The PVC will be created as volatile temporary storage which is only present during the running lifetime of the job.

      Use the format:

      storageclass=<storageclass>,size=<size>,path=<path>,ro,accessmode-rwm

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-s3-string","title":"--s3 <string>

      Mount an S3 compatible storage into the container running the job. The parameter should follow the syntax:

      bucket=BUCKET,key=KEY,secret=SECRET,url=URL,target=TARGET_PATH

      All the fields, except url=URL, are mandatory. Default for url is

      url=https://s3.amazon.com
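
      A hedged example (bucket, credentials, and target path are placeholders; url is omitted to use the default):

      runai submit-dist tf --name distributed-job --workers=2 -g 1 -i <image_name> --s3 bucket=my-bucket,key=MY_ACCESS_KEY,secret=MY_SECRET_KEY,target=/data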

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-v-volume-sourcecontainer_mount_pathronfs-host","title":"-v | --volume 'Source:Container_Mount_Path:[ro]:[nfs-host]'

      Volumes to mount into the container.

      Examples:

      -v /raid/public/john/data:/root/data:ro

      Mount the local path /raid/public/john/data into the container at /root/data with read-only access.

      -v /public/data:/root/data::nfs.example.com

      Mount the NFS path /public/data from server nfs.example.com into the container at /root/data with read-write access.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-configmap-volume-namepath","title":"--configmap-volume name=,path= ...'

      Mount a ConfigMap object for use as a data volume.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#network","title":"Network","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-address-string","title":"--address <string>

      Comma separated list of IP addresses to listen to when running with --service-type portforward (default: localhost)

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-host-ipc","title":"--host-ipc

      Use the host's ipc namespace. Controls whether the pod containers can share the host IPC namespace. IPC (POSIX/SysV IPC) namespace provides separation of named shared memory segments, semaphores, and message queues. Shared memory segments are used to accelerate inter-process communication at memory speed, rather than through pipes or the network stack.

      For further information see docker run reference documentation.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-host-network","title":"--host-network

      Use the host's network stack inside the container. For further information see the docker run reference documentation.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-port-stringarray","title":"--port <stringArray>

      Expose ports from the Job container.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-s-service-type-string","title":"-s | --service-type <string>

      External access type to interactive jobs. Options are: portforward, loadbalancer, nodeport, ingress.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#access-control","title":"Access Control","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-allow-privilege-escalation","title":"--allow-privilege-escalation

      Allow the job to gain additional privileges after start.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-run-as-user","title":"--run-as-user

      Run in the context of the current user running the Run:ai command rather than the root user. While the default container user is root (same as in Docker), this command allows you to submit a Job running under your Linux user. This would manifest itself in access to operating system resources, in the owner of new folders created under shared directories, etc. Alternatively, if your cluster is connected to Run:ai via SAML, you can map the container to use the Linux UID/GID which is stored in the organization's directory. For more information see non root containers.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#scheduling","title":"Scheduling","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-node-pools-string","title":"--node-pools <string>

      Instructs the scheduler to run this workload using a specific set of nodes that are part of a Node Pool. You can specify one or more node pools to form a prioritized list; the scheduler uses it to find one node pool that can satisfy the workload's specification. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group, or use existing node labels, then create a node-pool and assign the label to the node-pool. This flag can be used in conjunction with node-type and Project-based affinity. In this case, the flag is used to refine the list of allowable node groups set from a node-pool. For more information see: Working with Projects.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-node-type-string","title":"--node-type <string>

      Allows defining specific Nodes (machines) or a group of Nodes on which the workload will run. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-toleration-string","title":"--toleration <string>

      Specify one or more toleration criteria, to ensure that the workload is not scheduled onto an inappropriate node. This is done by matching the workload tolerations to the taints defined for each node. For further details see Kubernetes Taints and Tolerations Guide.

      The format of the string:

      operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n
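
      For example (the taint key is a placeholder), to tolerate a NoSchedule taint that exists on the target nodes:

      runai submit-dist tf --name distributed-job --workers=2 -g 1 -i <image_name> --toleration operator=Exists,key=<taint-key>,effect=NoSchedule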
      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-loglevel-string","title":"--loglevel (string)

      Set the logging level. One of: debug | info | warn | error (default \"info\")

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-project-p-string","title":"--project | -p (string)

      Specify the Project to which the command applies. Run:ai Projects are used by the scheduler to calculate resource eligibility. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-help-h","title":"--help | -h

      Show help text.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#output","title":"Output","text":"

      The command will attempt to submit a distributed TensorFlow Job. You can follow up on the Job by running runai list jobs or runai describe job <job-name>.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#see-also","title":"See Also","text":"
      • See Quickstart document Running Distributed Training.
      "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/","title":"runai submit-dist mpi","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#description","title":"Description","text":"

      Submit a Distributed Training (MPI) Run:ai Job to run.

      Note

      To use distributed training you need to have installed the Kubeflow MPI Operator as specified in Distributed training.

      Syntax notes:

      • Options with a value type of stringArray mean that you can add multiple values. You can either separate values with a comma or add the flag twice.
      "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#examples","title":"Examples","text":"

      You can start an unattended MPI training Job named dist1, based on Project team-a, using the quickstart-distributed image:

      runai submit-dist mpi --name dist1 --workers=2 -g 1 \\\n    -i runai.jfrog.io/demo/quickstart-distributed:v0.3.0 -e RUNAI_SLEEP_SECS=60\n

      (see: distributed training Quickstart).

      "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#distributed","title":"Distributed","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-clean-pod-policy-string","title":"--clean-pod-policy < string >","text":"

      The\u00a0CleanPodPolicy\u00a0controls deletion of pods when a job terminates. The policy can be one of the following values:

      • Running\u2014only pods still running when a job completes (for example, parameter servers) will be deleted immediately. Completed pods will not be deleted so that the logs will be preserved. (Default)
      • All\u2014all (including completed) pods will be deleted immediately when the job finishes.
      • None\u2014no pods will be deleted when the job completes.
      "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-workers-int","title":"--workers < int >","text":"

      The number of worker pods for the distributed training Job.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-slots-per-worker-int","title":"--slots-per-worker < int >","text":"

      Number of slots to allocate for each worker.
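
      For example, a hedged variant of the submission shown above that allocates 2 slots per worker (the image is the quickstart image from the example):

      runai submit-dist mpi --name dist1 --workers=2 --slots-per-worker=2 -g 1 -i runai.jfrog.io/demo/quickstart-distributed:v0.3.0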

      "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#naming-and-shortcuts","title":"Naming and Shortcuts","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-job-name-prefix-string","title":"--job-name-prefix <string>","text":"

      The prefix to use to automatically generate a Job name with an incremental index. When a Job name is omitted Run:ai will generate a Job name. The optional --job-name-prefix flag creates Job names with the provided prefix.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-name-string","title":"--name <string>","text":"

      The name of the Job.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-template-string","title":"--template <string>","text":"

      Load default values from a workload.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#container-definition","title":"Container Definition","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-add-capability-stringarray","title":"--add-capability <stringArray>","text":"

      Add linux capabilities to the container.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-a-annotation-stringarray","title":"-a | --annotation <stringArray>","text":"

      Set annotation variables in the container.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-attach","title":"--attach","text":"

      Default is false. If set to true, wait for the Pod to start running. When the pod starts running, attach to the Pod. The flag is equivalent to the command runai attach.

      The --attach flag also sets --tty and --stdin to true.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-command","title":"--command","text":"

      Overrides the image's entry point with the command supplied after '--'. When the --command flag is not used, the entry point is not overridden and the string after -- is appended as arguments to the entry point command.

      Example:

      --command -- run.sh 1 54 will start the container and run run.sh 1 54

      -- script.py 10000 will append script.py 10000 as arguments to the entry point command (e.g. python)

      "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-create-home-dir","title":"--create-home-dir","text":"

      Create a temporary home directory for the user in the container. Data saved in this directory will not be saved when the container exits. For more information see non root containers.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-e-stringarray-environment","title":"-e <stringArray> | --environment`

      Define environment variables to be set in the container. To set multiple values add the flag multiple times (-e BATCH_SIZE=50 -e LEARNING_RATE=0.2).

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-image-string-i-string","title":"--image <string> | -i <string>

      Image to use when creating the container for this Job

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-image-pull-policy-string","title":"--image-pull-policy <string>

      Pulling policy of the image when starting a container. Options are:

      • Always (default): force image pulling to check whether local image already exists. If the image already exists locally and has the same digest, then the image will not be downloaded.
      • IfNotPresent: the image is pulled only if it is not already present locally.
      • Never: the image is assumed to exist locally. No attempt is made to pull the image.

      For more information see Kubernetes documentation.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-l-label-stringarray","title":"-l | --label <stringArray>

      Set label variables in the container.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-master-args-string-string","title":"--master-args string <string>

      Arguments to pass to the master pod container command. If used together with --command, overrides the image's entrypoint of the master pod container with the given command.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-master-environment-stringarray","title":"--master-environment <stringArray>

      Set environment variables in the master pod container. To prevent a worker environment variable from being set in the master, use the format: name=-.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-master-extended-resource-stringarray","title":"--master-extended-resource <stringArray>

      Request access to an extended resource in the master pod. Use the format: resource_name=quantity.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-master-gpu-float","title":"--master-gpu <float>

      GPU units to allocate for the master pod.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-master-no-pvcs","title":"--master-no-pvcs

      Do not mount any persistent volumes in the master pod.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-preferred-pod-topology-key-string","title":"--preferred-pod-topology-key <string>

      If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-required-pod-topology-key-string","title":"--required-pod-topology-key <string>

      Enforce scheduling pods of this job onto nodes that have a label with this key and identical values.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-stdin","title":"--stdin

      Keep stdin open for the container(s) in the pod, even if nothing is attached.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-t-tty","title":"-t | --tty

      Allocate a pseudo-TTY.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-working-dir-string","title":"--working-dir <string>

      Starts the container with the specified directory as the current directory.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#resource-allocation","title":"Resource Allocation","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-cpu-double","title":"--cpu <double>

      CPU units to allocate for the Job (0.5, 1, etc.). The Job will receive at least this amount of CPU. Note that the Job will not be scheduled unless the system can guarantee this amount of CPUs to the Job.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-cpu-limit-double","title":"--cpu-limit <double>

      Limitations on the number of CPUs consumed by the Job (for example 0.5, 1). The system guarantees that this Job will not be able to consume more than this amount of CPUs.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-extended-resource","title":"--extended-resource `

      Request access to an extended resource. Syntax: <resource-name>=<resource_quantity>.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-g-gpu-float","title":"-g | --gpu <float>

      GPU units to allocate for the Job (0.5, 1).

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-gpu-memory","title":"--gpu-memory

      GPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of GPU memory to the Job.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-memory-string","title":"--memory <string>

      CPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive at least this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of memory to the Job.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-memory-limit","title":"--memory-limit `

      CPU memory to allocate for this Job (1G, 20M, etc.). The system guarantees that this Job will not be able to consume more than this amount of memory. The Job will receive an error when trying to allocate more memory than this limit.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-mig-profile-string-deprecated","title":"--mig-profile <string> (Deprecated)

      MIG profile to allocate for the job (1g.5gb, 2g.10gb, 3g.20gb, 4g.20gb, 7g.40gb)

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#job-lifecycle","title":"Job Lifecycle","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-backoff-limit-int","title":"--backoff-limit <int>

      The number of times the Job will be retried before failing. The default is 6.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-ttl-after-finish-duration","title":"--ttl-after-finish < duration >

      The duration, after which a finished job is automatically deleted (e.g. 5s, 2m, 3h).

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#storage","title":"Storage","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-git-sync-stringarray","title":"--git-sync <stringArray>

      Clone a git repository into the container running the Job. The parameter should follow the syntax: source=REPOSITORY,branch=BRANCH_NAME,rev=REVISION,username=USERNAME,password=PASSWORD,target=TARGET_DIRECTORY_TO_CLONE.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-large-shm","title":"--large-shm

      Mount a large /dev/shm device.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-mount-propagation","title":"--mount-propagation

      Enable HostToContainer mount propagation for all container volumes

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-nfs-server-string","title":"--nfs-server <string>

      Use this flag to specify a default NFS host for --volume flag. Alternatively, you can specify NFS host for each volume individually (see --volume for details).

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-pvc-storage_class_namesizecontainer_mount_pathro","title":"--pvc [Storage_Class_Name]:Size:Container_Mount_Path:[ro]","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-pvc-pvc_namecontainer_mount_pathro","title":"--pvc Pvc_Name:Container_Mount_Path:[ro]

      Mount a persistent volume claim into a container.

      Note

      This option is being deprecated from version 2.10 and above. To mount existing or newly created Persistent Volume Claim (PVC), use the parameters --pvc-exists and --pvc-new.

      The 2 syntax types of this command are mutually exclusive. You can either use the first or second form, but not a mixture of both.

      Storage_Class_Name is a storage class name that can be obtained by running kubectl get storageclasses.storage.k8s.io. This parameter may be omitted if there is a single storage class in the system, or you are using the default storage class.

      Size is the volume size you want to allocate. See Kubernetes documentation for how to specify volume sizes

      Container_Mount_Path. A path internal to the container where the storage will be mounted

      Pvc_Name. The name of a pre-existing Persistent Volume Claim to mount into the container

      Examples:

      --pvc :3Gi:/tmp/john:ro - Allocate 3GB from the default Storage class. Mount it to /tmp/john as read-only

      --pvc my-storage:3Gi:/tmp/john:ro - Allocate 3GB from the my-storage storage class. Mount it to /tmp/john as read-only

      --pvc :3Gi:/tmp/john - Allocate 3GB from the default storage class. Mount it to /tmp/john as read-write

      --pvc my-pvc:/tmp/john - Use a Persistent Volume Claim named my-pvc. Mount it to /tmp/john as read-write

      --pvc my-pvc-2:/tmp/john:ro - Use a Persistent Volume Claim named my-pvc-2. Mount it to /tmp/john as read-only

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-pvc-exists-string","title":"--pvc-exists <string>

      Mount a persistent volume. You must include a claimname and path.

      • claim name\u2014The name of the persistent volume claim. Can be obtained by running

      kubectl get pvc

      • path\u2014the path internal to the container where the storage will be mounted

      Use the format:

      claimname=<CLAIM_NAME>,path=<PATH>

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-pvc-new-string","title":"--pvc-new <string>

      Mount a persistent volume claim (PVC). If the PVC does not exist, it will be created based on the parameters entered. If a PVC exists, it will be used with its defined attributes and the parameters in the command will be ignored.

      • claim name\u2014The name of the persistent volume claim.
      • storage class\u2014A storage class name that can be obtained by running

      kubectl get storageclasses.storage.k8s.io.

      storageclass may be omitted if there is a single storage class in the system, or you are using the default storage class.

      • size\u2014The volume size you want to allocate for the PVC when creating it. See Kubernetes documentation to specify volume sizes.
      • accessmode\u2014The description of the desired volume capabilities for the PVC.
      • ro\u2014Mount the PVC with read-only access.
      • ephemeral\u2014The PVC will be created as volatile temporary storage which is only present during the running lifetime of the job.

      Use the format:

      storageclass=<storageclass>,size=<size>,path=<path>,ro,accessmode-rwm

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-s3-string","title":"--s3 <string>

      Mount an S3 compatible storage into the container running the job. The parameter should follow the syntax:

      bucket=BUCKET,key=KEY,secret=SECRET,url=URL,target=TARGET_PATH

      All the fields, except url=URL, are mandatory. Default for url is

      url=https://s3.amazon.com

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-v-volume-sourcecontainer_mount_pathronfs-host","title":"-v | --volume 'Source:Container_Mount_Path:[ro]:[nfs-host]'

      Volumes to mount into the container.

      Examples:

      -v /raid/public/john/data:/root/data:ro

      Mount the local path /raid/public/john/data into the container at /root/data with read-only access.

      -v /public/data:/root/data::nfs.example.com

      Mount the NFS path /public/data from server nfs.example.com into the container at /root/data with read-write access.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-configmap-volume-namepath","title":"--configmap-volume name=,path= ...'

      Mount a ConfigMap object for use as a data volume.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#network","title":"Network","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-address-string","title":"--address <string>

      Comma separated list of IP addresses to listen to when running with --service-type portforward (default: localhost)

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-host-ipc","title":"--host-ipc

      Use the host's ipc namespace. Controls whether the pod containers can share the host IPC namespace. IPC (POSIX/SysV IPC) namespace provides separation of named shared memory segments, semaphores, and message queues. Shared memory segments are used to accelerate inter-process communication at memory speed, rather than through pipes or the network stack.

      For further information see docker run reference documentation.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-host-network","title":"--host-network

      Use the host's network stack inside the container. For further information see the docker run reference documentation.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-port-stringarray","title":"--port <stringArray>

      Expose ports from the Job container.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-s-service-type-string","title":"-s | --service-type <string>

      External access type to interactive jobs. Options are: portforward, loadbalancer, nodeport, ingress.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#access-control","title":"Access Control","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-allow-privilege-escalation","title":"--allow-privilege-escalation

      Allow the job to gain additional privileges after start.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-run-as-user","title":"--run-as-user

      Run in the context of the current user running the Run:ai command rather than the root user. While the default container user is root (same as in Docker), this command allows you to submit a Job running under your Linux user. This would manifest itself in access to operating system resources, in the owner of new folders created under shared directories, etc. Alternatively, if your cluster is connected to Run:ai via SAML, you can map the container to use the Linux UID/GID which is stored in the organization's directory. For more information see non root containers.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#scheduling","title":"Scheduling","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-node-pools-string","title":"--node-pools <string>

      Instructs the scheduler to run this workload using a specific set of nodes that are part of a Node Pool. You can specify one or more node pools to form a prioritized list; the scheduler uses it to find one node pool that can satisfy the workload's specification. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group, or use existing node labels, then create a node-pool and assign the label to the node-pool. This flag can be used in conjunction with node-type and Project-based affinity. In this case, the flag is used to refine the list of allowable node groups set from a node-pool. For more information see: Working with Projects.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-node-type-string","title":"--node-type <string>

      Allows defining specific Nodes (machines) or a group of Nodes on which the workload will run. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-toleration-string","title":"--toleration <string>

      Specify one or more toleration criteria, to ensure that the workload is not scheduled onto an inappropriate node. This is done by matching the workload tolerations to the taints defined for each node. For further details see Kubernetes Taints and Tolerations Guide.

      The format of the string:

      operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n
      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-loglevel-string","title":"--loglevel (string)

      Set the logging level. One of: debug | info | warn | error (default \"info\")

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-project-p-string","title":"--project | -p (string)

      Specify the Project to which the command applies. Run:ai Projects are used by the scheduler to calculate resource eligibility. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-help-h","title":"--help | -h

      Show help text.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#output","title":"Output","text":"

      The command will attempt to submit an mpi Job. You can follow up on the Job by running runai list jobs or runai describe job <job-name>.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#see-also","title":"See Also","text":"
      • See Quickstart document Running Distributed Training.
      "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/","title":"runai submit-dist pytorch","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#description","title":"Description","text":"

      Submit a distributed PyTorch training Run:ai Job to run.

      Note

      To use distributed training you need to have installed the PyTorch operator as specified in Distributed training.

      Syntax notes:

      • Options with a value type of stringArray mean that you can add multiple values. You can either separate values with a comma or add the flag twice.
      "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#examples","title":"Examples","text":"
      runai submit-dist pytorch --name distributed-job --workers=2 -g 1 \\\n    -i <image_name>\n
      "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#distributed","title":"Distributed","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-clean-pod-policy-string","title":"--clean-pod-policy < string >","text":"

      The\u00a0CleanPodPolicy\u00a0controls deletion of pods when a job terminates. The policy can be one of the following values:

      • Running\u2014only pods still running when a job completes (for example, parameter servers) will be deleted immediately. Completed pods will not be deleted so that the logs will be preserved. (Default)
      • All\u2014all (including completed) pods will be deleted immediately when the job finishes.
      • None\u2014no pods will be deleted when the job completes.
      "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-max-replicas-int","title":"--max-replicas < int >","text":"

      Maximum number of replicas for elastic PyTorch job.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-min-replicas-int","title":"--min-replicas < int >","text":"

      Minimum number of replicas for elastic PyTorch job.
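
      For illustration, a hedged elastic submission (the image name is a placeholder) that lets the job scale between 2 and 4 worker replicas:

      runai submit-dist pytorch --name elastic-job --workers=4 --min-replicas=2 --max-replicas=4 -g 1 -i <image_name>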

      "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-workers-int","title":"--workers < int>","text":"

      The number of worker pods for the distributed training Job.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#naming-and-shortcuts","title":"Naming and Shortcuts","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-job-name-prefix-string","title":"--job-name-prefix <string>","text":"

      The prefix to use to automatically generate a Job name with an incremental index. When a Job name is omitted Run:ai will generate a Job name. The optional --job-name-prefix flag creates Job names with the provided prefix.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-name-string","title":"--name <string>","text":"

      The name of the Job.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-template-string","title":"--template <string>","text":"

      Load default values from a workload.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#container-definition","title":"Container Definition","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-add-capability-stringarray","title":"--add-capability <stringArray>","text":"

      Add linux capabilities to the container.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-a-annotation-stringarray","title":"-a | --annotation <stringArray>","text":"

      Set annotation variables in the container.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-attach","title":"--attach","text":"

      Default is false. If set to true, wait for the Pod to start running. When the pod starts running, attach to the Pod. The flag is equivalent to the command runai attach.

      The --attach flag also sets --tty and --stdin to true.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-command","title":"--command","text":"

      Overrides the image's entry point with the command supplied after '--'. When the --command flag is not used, the entry point is not overridden and the string after -- is appended as arguments to the entry point command.

      Example:

      --command -- run.sh 1 54 will start the container and run run.sh 1 54

      -- script.py 10000 will append script.py 10000 as arguments to the entry point command (e.g. python)

      "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-create-home-dir","title":"--create-home-dir","text":"

      Create a temporary home directory for the user in the container. Data saved in this directory will not be saved when the container exits. For more information see non root containers.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-e-stringarray-environment","title":"-e <stringArray> | --environment`

      Define environment variables to be set in the container. To set multiple values add the flag multiple times (-e BATCH_SIZE=50 -e LEARNING_RATE=0.2).

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-image-string-i-string","title":"--image <string> | -i <string>

      Image to use when creating the container for this Job.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-image-pull-policy-string","title":"--image-pull-policy <string>

      Pulling policy of the image when starting a container. Options are:

      • Always (default): force image pulling to check whether local image already exists. If the image already exists locally and has the same digest, then the image will not be downloaded.
      • IfNotPresent: the image is pulled only if it is not already present locally.
      • Never: the image is assumed to exist locally. No attempt is made to pull the image.

      For more information see Kubernetes documentation.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-l-label-stringarray","title":"-l | --label <stringArray>

      Set label variables in the container.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-master-args-string-string","title":"--master-args string <string>

      Arguments to pass to the master pod container command. If used together with --command, overrides the image's entrypoint of the master pod container with the given command.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-master-environment-stringarray","title":"--master-environment <stringArray>

      Set environment variables in the master pod container. To prevent a worker environment variable from being set in the master, use the format: name=-.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-master-extended-resource-stringarray","title":"--master-extended-resource <stringArray>

      Request access to an extended resource in the master pod. Use the format: resource_name=quantity.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-master-gpu-float","title":"--master-gpu <float>

      GPU units to allocate for the master pod.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-master-no-pvcs","title":"--master-no-pvcs

      Do not mount any persistent volumes in the master pod.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-no-master","title":"--no-master

      Do not create a separate pod for the master.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-preferred-pod-topology-key-string","title":"--preferred-pod-topology-key <string>

      If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-required-pod-topology-key-string","title":"--required-pod-topology-key <string>

      Enforce scheduling pods of this job onto nodes that have a label with this key and identical values.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-stdin","title":"--stdin

      Keep stdin open for the container(s) in the pod, even if nothing is attached.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-t-tty","title":"-t | --tty

      Allocate a pseudo-TTY.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-working-dir-string","title":"--working-dir <string>

      Starts the container with the specified directory as the current directory.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#resource-allocation","title":"Resource Allocation","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-cpu-double","title":"--cpu <double>

      CPU units to allocate for the Job (0.5, 1, etc.). The Job will receive at least this amount of CPU. Note that the Job will not be scheduled unless the system can guarantee this amount of CPUs to the Job.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-cpu-limit-double","title":"--cpu-limit <double>

      Limitations on the number of CPUs consumed by the Job (for example 0.5, 1). The system guarantees that this Job will not be able to consume more than this amount of CPUs.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-extended-resource","title":"--extended-resource `

      Request access to an extended resource. Syntax: <resource-name>=<resource_quantity>.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-g-gpu-float","title":"-g | --gpu <float>

      GPU units to allocate for the Job (0.5, 1).

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-gpu-memory","title":"--gpu-memory

      GPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of GPU memory to the Job.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-memory-string","title":"--memory <string>

      CPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive at least this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of memory to the Job.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-memory-limit","title":"--memory-limit `

      CPU memory to allocate for this Job (1G, 20M, etc.). The system guarantees that this Job will not be able to consume more than this amount of memory. The Job will receive an error when trying to allocate more memory than this limit.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-mig-profile-string-deprecated","title":"--mig-profile <string> (Deprecated)

      MIG profile to allocate for the job (1g.5gb, 2g.10gb, 3g.20gb, 4g.20gb, 7g.40gb)

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#job-lifecycle","title":"Job Lifecycle","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-backoff-limit-int","title":"--backoff-limit <int>

      The number of times the Job will be retried before failing. The default is 6.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-ttl-after-finish-duration","title":"--ttl-after-finish < duration >

      The duration, after which a finished job is automatically deleted (e.g. 5s, 2m, 3h).

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#storage","title":"Storage","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-git-sync-stringarray","title":"--git-sync <stringArray>

      Clone a git repository into the container running the Job. The parameter should follow the syntax: source=REPOSITORY,branch=BRANCH_NAME,rev=REVISION,username=USERNAME,password=PASSWORD,target=TARGET_DIRECTORY_TO_CLONE.
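
      A hypothetical invocation following the documented syntax (the repository URL and all values are placeholders):

      --git-sync source=https://github.com/example/repo.git,branch=main,rev=<revision>,username=<username>,password=<password>,target=/workspace/repo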

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-large-shm","title":"--large-shm

      Mount a large /dev/shm device.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-mount-propagation","title":"--mount-propagation

      Enable HostToContainer mount propagation for all container volumes

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-nfs-server-string","title":"--nfs-server <string>

      Use this flag to specify a default NFS host for the --volume flag. Alternatively, you can specify an NFS host for each volume individually (see --volume for details).

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-pvc-storage_class_namesizecontainer_mount_pathro","title":"--pvc [Storage_Class_Name]:Size:Container_Mount_Path:[ro]","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-pvc-pvc_namecontainer_mount_pathro","title":"--pvc Pvc_Name:Container_Mount_Path:[ro]

      Mount a persistent volume claim into a container.

      Note

      This option is being deprecated from version 2.10 and above. To mount existing or newly created Persistent Volume Claim (PVC), use the parameters --pvc-exists and --pvc-new.

      The two syntax forms of this flag are mutually exclusive. You can use either the first or the second form, but not a mixture of both.

      Storage_Class_Name is a storage class name that can be obtained by running kubectl get storageclasses.storage.k8s.io. This parameter may be omitted if there is a single storage class in the system, or you are using the default storage class.

      Size is the volume size you want to allocate. See Kubernetes documentation for how to specify volume sizes

      Container_Mount_Path. A path internal to the container where the storage will be mounted

      Pvc_Name. The name of a pre-existing Persistent Volume Claim to mount into the container

      Examples:

      --pvc :3Gi:/tmp/john:ro - Allocate 3GB from the default Storage class. Mount it to /tmp/john as read-only

      --pvc my-storage:3Gi:/tmp/john:ro - Allocate 3GB from the my-storage storage class. Mount it to /tmp/john as read-only

      --pvc :3Gi:/tmp/john - Allocate 3GB from the default storage class. Mount it to /tmp/john as read-write

      --pvc my-pvc:/tmp/john - Use a Persistent Volume Claim named my-pvc. Mount it to /tmp/john as read-write

      --pvc my-pvc-2:/tmp/john:ro - Use a Persistent Volume Claim named my-pvc-2. Mount it to /tmp/john as read-only

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-pvc-exists-string","title":"--pvc-exists <string>

      Mount a persistent volume. You must include a claimname and path.

      • claim name\u2014The name of the persistent volume claim. Can be obtained by running

      kubectl get pvc

      • path\u2014The path internal to the container where the storage will be mounted.

      Use the format:

      claimname=<CLAIM_NAME>,path=<PATH>
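
      For example (the claim name and path are hypothetical):

      --pvc-exists claimname=my-pvc,path=/mnt/data - Mount the existing PVC my-pvc at /mnt/data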

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-pvc-new-string","title":"--pvc-new <string>

      Mount a persistent volume claim (PVC). If the PVC does not exist, it will be created based on the parameters entered. If a PVC exists, it will be used with its defined attributes and the parameters in the command will be ignored.

      • claim name\u2014The name of the persistent volume claim.
      • storage class\u2014A storage class name that can be obtained by running

      kubectl get storageclasses.storage.k8s.io.

      storageclass may be omitted if there is a single storage class in the system, or you are using the default storage class.

      • size\u2014The volume size you want to allocate for the PVC when creating it. See Kubernetes documentation to specify volume sizes.
      • accessmode\u2014The description of the desired volume capabilities for the PVC.
      • ro\u2014Mount the PVC with read-only access.
      • ephemeral\u2014The PVC will be created as volatile temporary storage which is only present during the running lifetime of the job.

      Use the format:

      storageclass=<storageclass>,size=<size>,path=<path>,ro,accessmode-rwm
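
      A hypothetical example following this format (the storage class, size, and path are placeholders):

      --pvc-new storageclass=my-storage-class,size=10Gi,path=/mnt/data,ro - Create (or reuse) a 10Gi PVC from my-storage-class and mount it read-only at /mnt/data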

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-s3-string","title":"--s3 <string>

      Mount an S3 compatible storage into the container running the job. The parameter should follow the syntax:

      bucket=BUCKET,key=KEY,secret=SECRET,url=URL,target=TARGET_PATH

      All the fields, except url=URL, are mandatory. Default for url is

      url=https://s3.amazon.com
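
      A hypothetical example (the bucket, credentials, and target path are placeholders; url is omitted to use the default):

      --s3 bucket=my-bucket,key=<ACCESS_KEY>,secret=<SECRET_KEY>,target=/mnt/s3-data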

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-v-volume-sourcecontainer_mount_pathronfs-host","title":"-v | --volume 'Source:Container_Mount_Path:[ro]:[nfs-host]'

      Volumes to mount into the container.

      Examples:

      -v /raid/public/john/data:/root/data:ro

      Mount the local path /raid/public/john/data to /root/data in the container for read-only access.

      -v /public/data:/root/data::nfs.example.com

      Mount the NFS path /public/data on NFS server nfs.example.com to /root/data in the container for read-write access.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-configmap-volume-namepath","title":"--configmap-volume name=,path= ...'

      Mount a ConfigMap object for use as a data volume.
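
      Assuming the name=...,path=... format implied by the flag (the ConfigMap name and mount path are hypothetical):

      --configmap-volume name=my-config,path=/etc/my-config - Mount the ConfigMap my-config at /etc/my-config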

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#network","title":"Network","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-address-string","title":"--address <string>

      Comma-separated list of IP addresses to listen on when running with --service-type portforward (default: localhost).

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-host-ipc","title":"--host-ipc

      Use the host's ipc namespace. Controls whether the pod containers can share the host IPC namespace. IPC (POSIX/SysV IPC) namespace provides separation of named shared memory segments, semaphores, and message queues. Shared memory segments are used to accelerate inter-process communication at memory speed, rather than through pipes or the network stack.

      For further information see docker run reference documentation.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-host-network","title":"--host-network

      Use the host's network stack inside the container. For further information see the docker run reference documentation.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-port-stringarray","title":"--port <stringArray>

      Expose ports from the Job container.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-s-service-type-string","title":"-s | --service-type <string>

      External access type to interactive jobs. Options are: portforward, loadbalancer, nodeport, ingress.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#access-control","title":"Access Control","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-allow-privilege-escalation","title":"--allow-privilege-escalation

      Allow the job to gain additional privileges after start.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-run-as-user","title":"--run-as-user

      Run in the context of the current user running the Run:ai command rather than the root user. While the default container user is root (same as in Docker), this command allows you to submit a Job running under your Linux user. This would manifest itself in access to operating system resources, in the owner of new folders created under shared directories, etc. Alternatively, if your cluster is connected to Run:ai via SAML, you can map the container to use the Linux UID/GID which is stored in the organization's directory. For more information see non root containers.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#scheduling","title":"Scheduling","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-node-pools-string","title":"--node-pools <string>

      Instructs the scheduler to run this workload using a specific set of nodes which are part of a Node Pool. You can specify one or more node pools to form a prioritized list of node pools from which the scheduler will pick one that can satisfy the workload's specification. To use this feature, your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group, or use existing node labels, then create a node-pool and assign the label to it. This flag can be used in conjunction with node-type and Project-based affinity; in this case, the flag is used to refine the list of allowable node groups set from a node-pool. For more information see: Working with Projects.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-node-type-string","title":"--node-type <string>

      Allows defining specific Nodes (machines) or a group of Nodes on which the workload will run. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-toleration-string","title":"--toleration <string>

      Specify one or more toleration criteria, to ensure that the workload is not scheduled onto an inappropriate node. This is done by matching the workload tolerations to the taints defined for each node. For further details see Kubernetes Taints and Tolerations Guide.

      The format of the string:

      operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n
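
      For example (the taint key and value are hypothetical):

      --toleration operator=Equal,key=dedicated,value=gpu,effect=NoSchedule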
      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-loglevel-string","title":"--loglevel (string)

      Set the logging level. One of: debug | info | warn | error (default \"info\")

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-project-p-string","title":"--project | -p (string)

      Specify the Project to which the command applies. Run:ai Projects are used by the scheduler to calculate resource eligibility. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-help-h","title":"--help | -h

      Show help text.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#output","title":"Output","text":"

      The command will attempt to submit a distributed pytorch workload. You can follow up on the workload by running runai list jobs or runai describe job <job-name>.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/","title":"runai submit-dist xgboost","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#description","title":"Description","text":"

      Submit a distributed XGBoost training Run:ai Job for execution.

      Syntax notes:

      • Options with a value type of stringArray mean that you can add multiple values. You can either separate values with a comma or add the flag twice.
      "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#examples","title":"Examples","text":"
      runai submit-dist xgboost --name distributed-job --workers=2 -g 1 \\\n    -i <image_name>\n
      "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#distributed","title":"Distributed","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-clean-pod-policy-string","title":"--clean-pod-policy < string >","text":"

      The\u00a0CleanPodPolicy\u00a0controls deletion of pods when a job terminates. The policy can be one of the following values:

      • Running\u2014only pods still running when a job completes (for example, parameter servers) will be deleted immediately. Completed pods will not be deleted so that the logs will be preserved. (Default)
      • All\u2014all (including completed) pods will be deleted immediately when the job finishes.
      • None\u2014no pods will be deleted when the job completes.
      "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-workers-int","title":"--workers < int>","text":"

      Number of worker pods for the distributed training Job.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#naming-and-shortcuts","title":"Naming and Shortcuts","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-job-name-prefix-string","title":"--job-name-prefix <string>","text":"

      The prefix to use to automatically generate a Job name with an incremental index. When a Job name is omitted Run:ai will generate a Job name. The optional --job-name-prefix flag creates Job names with the provided prefix.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-name-string","title":"--name <string>","text":"

      The name of the Job.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-template-string","title":"--template <string>","text":"

      Load default values from a workload.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#container-definition","title":"Container Definition","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-add-capability-stringarray","title":"--add-capability <stringArray>","text":"

      Add linux capabilities to the container.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-a-annotation-stringarray","title":"-a | --annotation <stringArray>","text":"

      Set annotations in the container.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-attach","title":"--attach","text":"

      Default is false. If set to true, wait for the Pod to start running. When the pod starts running, attach to the Pod. The flag is equivalent to the command runai attach.

      The --attach flag also sets --tty and --stdin to true.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-command","title":"--command","text":"

      Overrides the image's entry point with the command supplied after '--'. When not using the --command flag, the entry point will not be overridden and the string after -- will be appended as arguments to the entry point command.

      Example:

      --command -- run.sh 1 54 will start the container and run run.sh 1 54

      -- script.py 10000 will append script.py 10000 as arguments to the entry point command (e.g. python)

      "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-create-home-dir","title":"--create-home-dir","text":"

      Create a temporary home directory for the user in the container. Data saved in this directory will not be saved when the container exits. For more information see non root containers.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-e-stringarray-environment","title":"-e <stringArray> | --environment`

      Define environment variables to be set in the container. To set multiple values add the flag multiple times (-e BATCH_SIZE=50 -e LEARNING_RATE=0.2).

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-image-string-i-string","title":"--image <string> | -i <string>

      Image to use when creating the container for this Job

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-image-pull-policy-string","title":"--image-pull-policy <string>

      Pulling policy of the image when starting a container. Options are:

      • Always (default): force image pulling to check whether a local image already exists. If the image already exists locally and has the same digest, it will not be downloaded again.
      • IfNotPresent: the image is pulled only if it is not already present locally.
      • Never: the image is assumed to exist locally. No attempt is made to pull the image.

      For more information see Kubernetes documentation.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-l-label-stringarray","title":"-l | --label <stringArray>

      Set labels in the container.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-master-args-string-string","title":"--master-args string <string>

      Arguments to pass to the master pod container command. If used together with --command, overrides the image's entrypoint of the master pod container with the given command.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-master-environment-stringarray","title":"--master-environment <stringArray>

      Set environment variables in the master pod container. To prevent a worker environment variable from being set in the master, use the format: name=-.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-master-extended-resource-stringarray","title":"--master-extended-resource <stringArray>

      Request access to an extended resource in the master pod. Use the format: resource_name=quantity.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-master-gpu-float","title":"--master-gpu <float>

      GPU units to allocate for the master pod.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-master-no-pvcs","title":"--master-no-pvcs

      Do not mount any persistent volumes in the master pod.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-preferred-pod-topology-key-string","title":"--preferred-pod-topology-key <string>

      If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-required-pod-topology-key-string","title":"--required-pod-topology-key <string>

      Enforce scheduling pods of this job onto nodes that have a label with this key and identical values.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-stdin","title":"--stdin

      Keep stdin open for the container(s) in the pod, even if nothing is attached.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-t-tty","title":"-t | --tty

      Allocate a pseudo-TTY.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-working-dir-string","title":"--working-dir <string>

      Starts the container with the specified directory as the current directory.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#resource-allocation","title":"Resource Allocation","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-cpu-double","title":"--cpu <double>

      CPU units to allocate for the Job (0.5, 1, etc.). The Job will receive at least this amount of CPU. Note that the Job will not be scheduled unless the system can guarantee this amount of CPU to the Job.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-cpu-limit-double","title":"--cpu-limit <double>

      Limitations on the number of CPUs consumed by the Job (for example 0.5, 1). The system guarantees that this Job will not be able to consume more than this amount of CPUs.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-extended-resource","title":"--extended-resource `

      Request access to an extended resource. Syntax: <resource-name>=<resource-quantity>.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-g-gpu-float","title":"-g | --gpu <float>

      GPU units to allocate for the Job (0.5, 1).

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-gpu-memory","title":"--gpu-memory

      GPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of GPU memory to the Job.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-memory-string","title":"--memory <string>

      CPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive at least this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of memory to the Job.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-memory-limit","title":"--memory-limit `

      CPU memory to allocate for this Job (1G, 20M, etc.). The system guarantees that this Job will not be able to consume more than this amount of memory. The Job will receive an error when trying to allocate more memory than this limit.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-mig-profile-string-deprecated","title":"--mig-profile <string> (Deprecated)

      MIG profile to allocate for the job (1g.5gb, 2g.10gb, 3g.20gb, 4g.20gb, 7g.40gb)

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#job-lifecycle","title":"Job Lifecycle","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-backoff-limit-int","title":"--backoff-limit <int>

      The number of times the Job will be retried before failing. The default is 6.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-ttl-after-finish-duration","title":"--ttl-after-finish < duration >

      The duration, after which a finished job is automatically deleted (e.g. 5s, 2m, 3h).

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#storage","title":"Storage","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-git-sync-stringarray","title":"--git-sync <stringArray>

      Clone a git repository into the container running the Job. The parameter should follow the syntax: source=REPOSITORY,branch=BRANCH_NAME,rev=REVISION,username=USERNAME,password=PASSWORD,target=TARGET_DIRECTORY_TO_CLONE.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-large-shm","title":"--large-shm

      Mount a large /dev/shm device.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-mount-propagation","title":"--mount-propagation

      Enable HostToContainer mount propagation for all container volumes

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-nfs-server-string","title":"--nfs-server <string>

      Use this flag to specify a default NFS host for the --volume flag. Alternatively, you can specify an NFS host for each volume individually (see --volume for details).

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-pvc-storage_class_namesizecontainer_mount_pathro","title":"--pvc [Storage_Class_Name]:Size:Container_Mount_Path:[ro]","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-pvc-pvc_namecontainer_mount_pathro","title":"--pvc Pvc_Name:Container_Mount_Path:[ro]

      Mount a persistent volume claim into a container.

      Note

      This option is being deprecated from version 2.10 and above. To mount existing or newly created Persistent Volume Claim (PVC), use the parameters --pvc-exists and --pvc-new.

      The two syntax forms of this flag are mutually exclusive. You can use either the first or the second form, but not a mixture of both.

      Storage_Class_Name is a storage class name that can be obtained by running kubectl get storageclasses.storage.k8s.io. This parameter may be omitted if there is a single storage class in the system, or you are using the default storage class.

      Size is the volume size you want to allocate. See Kubernetes documentation for how to specify volume sizes

      Container_Mount_Path. A path internal to the container where the storage will be mounted

      Pvc_Name. The name of a pre-existing Persistent Volume Claim to mount into the container

      Examples:

      --pvc :3Gi:/tmp/john:ro - Allocate 3GB from the default Storage class. Mount it to /tmp/john as read-only

      --pvc my-storage:3Gi:/tmp/john:ro - Allocate 3GB from the my-storage storage class. Mount it to /tmp/john as read-only

      --pvc :3Gi:/tmp/john - Allocate 3GB from the default storage class. Mount it to /tmp/john as read-write

      --pvc my-pvc:/tmp/john - Use a Persistent Volume Claim named my-pvc. Mount it to /tmp/john as read-write

      --pvc my-pvc-2:/tmp/john:ro - Use a Persistent Volume Claim named my-pvc-2. Mount it to /tmp/john as read-only

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-pvc-exists-string","title":"--pvc-exists <string>

      Mount a persistent volume. You must include a claimname and path.

      • claim name\u2014The name of the persistent volume claim. Can be obtained by running

      kubectl get pvc

      • path\u2014The path internal to the container where the storage will be mounted.

      Use the format:

      claimname=<CLAIM_NAME>,path=<PATH>

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-pvc-new-string","title":"--pvc-new <string>

      Mount a persistent volume claim (PVC). If the PVC does not exist, it will be created based on the parameters entered. If a PVC exists, it will be used with its defined attributes and the parameters in the command will be ignored.

      • claim name\u2014The name of the persistent volume claim.
      • storage class\u2014A storage class name that can be obtained by running

      kubectl get storageclasses.storage.k8s.io.

      storageclass may be omitted if there is a single storage class in the system, or you are using the default storage class.

      • size\u2014The volume size you want to allocate for the PVC when creating it. See Kubernetes documentation to specify volume sizes.
      • accessmode\u2014The description of the desired volume capabilities for the PVC.
      • ro\u2014Mount the PVC with read-only access.
      • ephemeral\u2014The PVC will be created as volatile temporary storage which is only present during the running lifetime of the job.

      Use the format:

      storageclass=<storageclass>,size=<size>,path=<path>,ro,accessmode-rwm

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-s3-string","title":"--s3 <string>

      Mount an S3 compatible storage into the container running the job. The parameter should follow the syntax:

      bucket=BUCKET,key=KEY,secret=SECRET,url=URL,target=TARGET_PATH

      All the fields, except url=URL, are mandatory. Default for url is

      url=https://s3.amazon.com

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-v-volume-sourcecontainer_mount_pathronfs-host","title":"-v | --volume 'Source:Container_Mount_Path:[ro]:[nfs-host]'

      Volumes to mount into the container.

      Examples:

      -v /raid/public/john/data:/root/data:ro

      Mount the local path /raid/public/john/data to /root/data in the container for read-only access.

      -v /public/data:/root/data::nfs.example.com

      Mount the NFS path /public/data on NFS server nfs.example.com to /root/data in the container for read-write access.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-configmap-volume-namepath","title":"--configmap-volume name=,path= ...'

      Mount a ConfigMap object for use as a data volume.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#network","title":"Network","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-address-string","title":"--address <string>

      Comma-separated list of IP addresses to listen on when running with --service-type portforward (default: localhost).

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-host-ipc","title":"--host-ipc

      Use the host's ipc namespace. Controls whether the pod containers can share the host IPC namespace. IPC (POSIX/SysV IPC) namespace provides separation of named shared memory segments, semaphores, and message queues. Shared memory segments are used to accelerate inter-process communication at memory speed, rather than through pipes or the network stack.

      For further information see docker run reference documentation.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-host-network","title":"--host-network

      Use the host's network stack inside the container. For further information see the docker run reference documentation.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-port-stringarray","title":"--port <stringArray>

      Expose ports from the Job container.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-s-service-type-string","title":"-s | --service-type <string>

      External access type to interactive jobs. Options are: portforward, loadbalancer, nodeport, ingress.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#access-control","title":"Access Control","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-allow-privilege-escalation","title":"--allow-privilege-escalation

      Allow the job to gain additional privileges after start.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-run-as-user","title":"--run-as-user

      Run in the context of the current user running the Run:ai command rather than the root user. While the default container user is root (same as in Docker), this command allows you to submit a Job running under your Linux user. This would manifest itself in access to operating system resources, in the owner of new folders created under shared directories, etc. Alternatively, if your cluster is connected to Run:ai via SAML, you can map the container to use the Linux UID/GID which is stored in the organization's directory. For more information see non root containers.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#scheduling","title":"Scheduling","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-node-pools-string","title":"--node-pools <string>

      Instructs the scheduler to run this workload using a specific set of nodes which are part of a Node Pool. You can specify one or more node pools to form a prioritized list of node pools from which the scheduler will pick one that can satisfy the workload's specification. To use this feature, your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group, or use existing node labels, then create a node-pool and assign the label to it. This flag can be used in conjunction with node-type and Project-based affinity; in this case, the flag is used to refine the list of allowable node groups set from a node-pool. For more information see: Working with Projects.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-node-type-string","title":"--node-type <string>

      Allows defining specific Nodes (machines) or a group of Nodes on which the workload will run. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-toleration-string","title":"--toleration <string>

      Specify one or more toleration criteria, to ensure that the workload is not scheduled onto an inappropriate node. This is done by matching the workload tolerations to the taints defined for each node. For further details see Kubernetes Taints and Tolerations Guide.

      The format of the string:

      operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n
      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-loglevel-string","title":"--loglevel (string)

      Set the logging level. One of: debug | info | warn | error (default \"info\")

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-project-p-string","title":"--project | -p (string)

      Specify the Project to which the command applies. Run:ai Projects are used by the scheduler to calculate resource eligibility. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-help-h","title":"--help | -h

      Show help text.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#output","title":"Output","text":"

      The command will attempt to submit a distributed XGBoost Job. You can follow up on the Job by running runai list jobs or runai describe job <job-name>.

      "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#see-also","title":"See Also","text":"
      • See Quickstart document Running Distributed Training.
      "},{"location":"Researcher/cli-reference/runai-submit/","title":"Description","text":"

      Submit a Run:ai Job for execution.

      Syntax notes:

      • Flags of type stringArray mean that you can add multiple values. You can either separate values with a comma or add the flag twice.
      "},{"location":"Researcher/cli-reference/runai-submit/#examples","title":"Examples","text":"

      All examples assume a Run:ai Project has been setup using runai config project <project-name>.

      Start an interactive Job:

      runai submit -i ubuntu --interactive --attach -g 1\n

      Or

      runai submit --name build1 -i ubuntu -g 1 --interactive -- sleep infinity \n

      (see: build Quickstart).

      Externalize ports:

      runai submit --name build-remote -i rastasheep/ubuntu-sshd:14.04 --interactive \\\n   --service-type=nodeport --port 30022:22\n   -- /usr/sbin/sshd -D\n

      (see: build with ports Quickstart).

      Start a Training Job

      runai submit --name train1 -i runai.jfrog.io/demo/quickstart -g 1 \n

      (see: training Quickstart).

      Use GPU Fractions

      runai submit --name frac05 -i runai.jfrog.io/demo/quickstart -g 0.5\n

      (see: GPU fractions Quickstart).

      Submit a Job without a name (automatically generates a name)

      runai submit -i runai.jfrog.io/demo/quickstart -g 1 \n

      Submit a job using the system autogenerated name to an external URL:

      runai submit -i ubuntu --interactive --attach -g 1 --service-type=external-url --port 3745 --custom-url=<destination_url>\n

      Submit a job without a name to a system-generated URL:

      runai submit -i ubuntu --interactive --attach -g 1 --service-type=external-url --port 3745\n

      Submit a Job without a name with a pre-defined prefix and an incremental index suffix

      runai submit --job-name-prefix <prefix> -i runai.jfrog.io/demo/quickstart -g 1 \n
      "},{"location":"Researcher/cli-reference/runai-submit/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-submit/#job-type","title":"Job Type","text":""},{"location":"Researcher/cli-reference/runai-submit/#-interactive","title":"--interactive","text":"

      Mark this Job as interactive.

      "},{"location":"Researcher/cli-reference/runai-submit/#job-lifecycle","title":"Job Lifecycle","text":""},{"location":"Researcher/cli-reference/runai-submit/#-completions-int","title":"--completions < int >","text":"

      Number of successful pods required for this job to be completed. Used with HPO.

      "},{"location":"Researcher/cli-reference/runai-submit/#-parallelism-int","title":"--parallelism < int >","text":"

      Number of pods to run in parallel at any given time. Used with HPO.
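
      For illustration (the job name and image are placeholders), an HPO-style submission that requires 12 successful pods with at most 3 running in parallel might look like:

      runai submit --name <name> -i <image> -g 1 --completions 12 --parallelism 3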

      "},{"location":"Researcher/cli-reference/runai-submit/#-preemptible","title":"--preemptible","text":"

      Interactive preemptible jobs can be scheduled above guaranteed quota but may be reclaimed at any time.

      "},{"location":"Researcher/cli-reference/runai-submit/#-auto-deletion-time-after-completion","title":"--auto-deletion-time-after-completion","text":"

      The timeframe after which a completed or failed job is automatically deleted. Configured in seconds, minutes, or hours (for example 5s, 2m, or 3h). If set to 0, the job will be deleted immediately after completing or failing.
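
      For example (the job name and image are placeholders), to delete the job three hours after it completes or fails:

      runai submit --name <name> -i <image> -g 1 --auto-deletion-time-after-completion 3h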

      "},{"location":"Researcher/cli-reference/runai-submit/#naming-and-shortcuts","title":"Naming and Shortcuts","text":""},{"location":"Researcher/cli-reference/runai-submit/#-job-name-prefix-string","title":"--job-name-prefix <string>","text":"

      The prefix to use to automatically generate a Job name with an incremental index. When a Job name is omitted Run:ai will generate a Job name. The optional --job-name-prefix flag creates Job names with the provided prefix.

      "},{"location":"Researcher/cli-reference/runai-submit/#-name-string","title":"--name <string>","text":"

      The name of the Job.

      "},{"location":"Researcher/cli-reference/runai-submit/#-template-string","title":"--template <string>","text":"

      Load default values from a workload.

      "},{"location":"Researcher/cli-reference/runai-submit/#container-definition","title":"Container Definition","text":""},{"location":"Researcher/cli-reference/runai-submit/#-add-capability-stringarray","title":"--add-capability <stringArray>","text":"

      Add linux capabilities to the container.

      "},{"location":"Researcher/cli-reference/runai-submit/#-a-annotation-stringarray","title":"-a | --annotation <stringArray>","text":"

      Set annotations in the container.

      "},{"location":"Researcher/cli-reference/runai-submit/#-attach","title":"--attach","text":"

      Default is false. If set to true, wait for the Pod to start running. When the pod starts running, attach to the Pod. The flag is equivalent to the command runai attach.

      The --attach flag also sets --tty and --stdin to true.

      "},{"location":"Researcher/cli-reference/runai-submit/#-command","title":"--command","text":"

      Overrides the image's entry point with the command supplied after '--'. When not using the --command flag, the entry point will not be overridden and the string after -- will be appended as arguments to the entry point command.

      Example:

      --command -- run.sh 1 54 will start the container and run run.sh 1 54

      -- script.py 10000 will append script.py 10000 as arguments to the entry point command (e.g. python)

      "},{"location":"Researcher/cli-reference/runai-submit/#-create-home-dir","title":"--create-home-dir","text":"

      Create a temporary home directory for the user in the container. Data saved in this directory will not be saved when the container exits. For more information see non root containers.

      "},{"location":"Researcher/cli-reference/runai-submit/#-e-stringarray-environment-stringarray","title":"-e <stringArray> | --environment <stringArray>","text":"

      Define environment variables to be set in the container. To set multiple values add the flag multiple times (-e BATCH_SIZE=50 -e LEARNING_RATE=0.2).

      "},{"location":"Researcher/cli-reference/runai-submit/#-image-string-i-string","title":"--image <string> | -i <string>","text":"

      Image to use when creating the container for this Job

      "},{"location":"Researcher/cli-reference/runai-submit/#-image-pull-policy-string","title":"--image-pull-policy <string>","text":"

      Pulling policy of the image when starting a container. Options are:

      • Always (default): force image pulling to check whether a local image already exists. If the image already exists locally and has the same digest, it will not be downloaded again.
      • IfNotPresent: the image is pulled only if it is not already present locally.
      • Never: the image is assumed to exist locally. No attempt is made to pull the image.

      For more information see Kubernetes documentation.

      "},{"location":"Researcher/cli-reference/runai-submit/#-l-label-stringarray","title":"-l | --label <stringArray>","text":"

      Set labels in the container.

      "},{"location":"Researcher/cli-reference/runai-submit/#-preferred-pod-topology-key-string","title":"--preferred-pod-topology-key <string>","text":"

      If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values.

      "},{"location":"Researcher/cli-reference/runai-submit/#-required-pod-topology-key-string","title":"--required-pod-topology-key <string>","text":"

      Enforce scheduling pods of this job onto nodes that have a label with this key and identical values.

      "},{"location":"Researcher/cli-reference/runai-submit/#-stdin","title":"--stdin","text":"

      Keep stdin open for the container(s) in the pod, even if nothing is attached.

      "},{"location":"Researcher/cli-reference/runai-submit/#-t-tty","title":"-t | --tty","text":"

      Allocate a pseudo-TTY.

      "},{"location":"Researcher/cli-reference/runai-submit/#-working-dir-string","title":"--working-dir <string>","text":"

      Starts the container with the specified directory as the current directory.

      "},{"location":"Researcher/cli-reference/runai-submit/#resource-allocation","title":"Resource Allocation","text":""},{"location":"Researcher/cli-reference/runai-submit/#-cpu-double","title":"--cpu <double>","text":"

      CPU units to allocate for the Job (0.5, 1, etc.). The Job will receive at least this amount of CPU. Note that the Job will not be scheduled unless the system can guarantee this amount of CPU to the Job.

      "},{"location":"Researcher/cli-reference/runai-submit/#-cpu-limit-double","title":"--cpu-limit <double>","text":"

      Limitations on the number of CPUs consumed by the Job (for example 0.5, 1). The system guarantees that this Job will not be able to consume more than this amount of CPUs.

      "},{"location":"Researcher/cli-reference/runai-submit/#-extended-resource-stringarray","title":"--extended-resource <stringArray>","text":"

      Request access to an extended resource. Syntax: <resource-name>=<resource-quantity>.

      "},{"location":"Researcher/cli-reference/runai-submit/#-g-gpu-float","title":"-g | --gpu <float>","text":"

      GPU units to allocate for the Job (0.5, 1).

      "},{"location":"Researcher/cli-reference/runai-submit/#-gpu-memory","title":"--gpu-memory","text":"

      GPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of GPU memory to the Job.

      "},{"location":"Researcher/cli-reference/runai-submit/#-memory-string","title":"--memory <string>","text":"

      CPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive at least this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of memory to the Job.

      "},{"location":"Researcher/cli-reference/runai-submit/#-memory-limit-string","title":"--memory-limit <string>","text":"

      CPU memory to allocate for this Job (1G, 20M, etc.). The system guarantees that this Job will not be able to consume more than this amount of memory. The Job will receive an error when trying to allocate more memory than this limit.

      "},{"location":"Researcher/cli-reference/runai-submit/#-mig-profile-string-deprecated","title":"--mig-profile <string> (Deprecated)","text":"

      MIG profile to allocate for the job (1g.5gb, 2g.10gb, 3g.20gb, 4g.20gb, 7g.40gb)

      "},{"location":"Researcher/cli-reference/runai-submit/#job-lifecycle_1","title":"Job Lifecycle","text":""},{"location":"Researcher/cli-reference/runai-submit/#-backoff-limit-int","title":"--backoff-limit <int>","text":"

      The number of times the Job will be retried before failing. The default is 6. This flag will only work with training workloads (when the --interactive flag is not specified).

      "},{"location":"Researcher/cli-reference/runai-submit/#storage","title":"Storage","text":""},{"location":"Researcher/cli-reference/runai-submit/#-git-sync-stringarray","title":"--git-sync <stringArray>","text":"

      Clone a git repository into the container running the Job. The parameter should follow the syntax: source=REPOSITORY,branch=BRANCH_NAME,rev=REVISION,username=USERNAME,password=PASSWORD,target=TARGET_DIRECTORY_TO_CLONE.

      "},{"location":"Researcher/cli-reference/runai-submit/#-large-shm","title":"--large-shm","text":"

      Mount a large /dev/shm device.

      "},{"location":"Researcher/cli-reference/runai-submit/#-mount-propagation","title":"--mount-propagation","text":"

      Enable HostToContainer mount propagation for all container volumes

      "},{"location":"Researcher/cli-reference/runai-submit/#-nfs-server-string","title":"--nfs-server <string>","text":"

      Use this flag to specify a default NFS host for the --volume flag. Alternatively, you can specify an NFS host for each volume individually (see --volume for details).

      "},{"location":"Researcher/cli-reference/runai-submit/#-pvc-storage_class_namesizecontainer_mount_pathro","title":"--pvc [Storage_Class_Name]:Size:Container_Mount_Path:[ro]","text":""},{"location":"Researcher/cli-reference/runai-submit/#-pvc-pvc_namecontainer_mount_pathro","title":"--pvc Pvc_Name:Container_Mount_Path:[ro]","text":"

      Mount a persistent volume claim into a container.

      Note

      This option is being deprecated from version 2.10 and above. To mount existing or newly created Persistent Volume Claim (PVC), use the parameters --pvc-exists and --new-pvc.

      The two syntax forms of this flag are mutually exclusive. You can use either the first or the second form, but not a mixture of both.

      Storage_Class_Name is a storage class name that can be obtained by running kubectl get storageclasses.storage.k8s.io. This parameter may be omitted if there is a single storage class in the system, or you are using the default storage class.

      Size is the volume size you want to allocate. See Kubernetes documentation for how to specify volume sizes

      Container_Mount_Path. A path internal to the container where the storage will be mounted

      Pvc_Name. The name of a pre-existing Persistent Volume Claim to mount into the container

      Examples:

      --pvc :3Gi:/tmp/john:ro - Allocate 3GB from the default Storage class. Mount it to /tmp/john as read-only

      --pvc my-storage:3Gi:/tmp/john:ro - Allocate 3GB from the my-storage storage class. Mount it to /tmp/john as read-only

      --pvc :3Gi:/tmp/john - Allocate 3GB from the default storage class. Mount it to /tmp/john as read-write

      --pvc my-pvc:/tmp/john - Use a Persistent Volume Claim named my-pvc. Mount it to /tmp/john as read-write

      --pvc my-pvc-2:/tmp/john:ro - Use a Persistent Volume Claim named my-pvc-2. Mount it to /tmp/john as read-only

      "},{"location":"Researcher/cli-reference/runai-submit/#-pvc-exists-string","title":"--pvc-exists <string>","text":"

      Mount a persistent volume. You must include a claimname and path.

      • claim name\u2014The name of the persistent volume claim. Can be obtained by running

      kubectl get pvc

      • path\u2014The path internal to the container where the storage will be mounted.

      Use the format:

      claimname=<CLAIM_NAME>,path=<PATH>

      "},{"location":"Researcher/cli-reference/runai-submit/#-new-pvc-stringarray","title":"--new-pvc <stringArray>","text":"

      Mount a persistent volume claim (PVC). If the PVC does not exist, it will be created based on the parameters entered. If a PVC exists, it will be used with its defined attributes and the parameters in the command will be ignored.

      • claim name\u2014The name of the persistent volume claim.
      • storage class\u2014A storage class name that can be obtained by running

      kubectl get storageclasses.storage.k8s.io.

      storageclass may be omitted if there is a single storage class in the system, or you are using the default storage class.

      • size\u2014The volume size you want to allocate for the PVC when creating it. See Kubernetes documentation to specify volume sizes.
      • accessmode\u2014The description of the desired volume capabilities for the PVC.
      • ro\u2014Mount the PVC with read-only access.
      • ephemeral\u2014The PVC will be created as volatile temporary storage which is only present during the running lifetime of the job.

      Use the format:

      storageclass=<storageclass>,size=<size>,path=<path>,ro,accessmode-rwm

      "},{"location":"Researcher/cli-reference/runai-submit/#-s3-string","title":"--s3 <string>","text":"

      Mount an S3 compatible storage into the container running the job. The parameter should follow the syntax:

      bucket=BUCKET,key=KEY,secret=SECRET,url=URL,target=TARGET_PATH

      All the fields, except url=URL, are mandatory. Default for url is

      url=https://s3.amazon.com

      "},{"location":"Researcher/cli-reference/runai-submit/#-v-volume-sourcecontainer_mount_pathronfs-host","title":"-v | --volume 'Source:Container_Mount_Path:[ro]:[nfs-host]'","text":"

      Volumes to mount into the container.

      Examples:

      -v /raid/public/john/data:/root/data:ro

      Mount the local path /raid/public/john/data to /root/data in the container for read-only access.

      -v /public/data:/root/data::nfs.example.com

      Mount the NFS path /public/data on NFS server nfs.example.com to /root/data in the container for read-write access.

      "},{"location":"Researcher/cli-reference/runai-submit/#-configmap-volume-namepath","title":"--configmap-volume name=,path= ...'

      Mount a ConfigMap object for use as a data volume.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit/#network","title":"Network","text":""},{"location":"Researcher/cli-reference/runai-submit/#-host-ipc","title":"--host-ipc

      Use the host's ipc namespace. Controls whether the pod containers can share the host IPC namespace. IPC (POSIX/SysV IPC) namespace provides separation of named shared memory segments, semaphores, and message queues. Shared memory segments are used to accelerate inter-process communication at memory speed, rather than through pipes or the network stack.

      For further information see docker run reference documentation.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit/#-host-network","title":"--host-network

      Use the host's network stack inside the container. For further information see the docker run reference documentation.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit/#-s-service-type-string","title":"-s | --service-type <string>

      External access type to jobs. Options are:

      • nodeport - add one or more ports using --port.
      • external-url - add one port and an optional custom URL using --custom-url.

      For example:

      runai submit test-jup -p team-a -i runai.jfrog.io/demo/jupyter-tensorboard --service-type external-url --port 8888

      runai submit test-np -p team-a -i ubuntu --service-type nodeport --port 30000:7070

      This flag supports more than one service-type. Multiple service types are supported in CSV style using multiple instances of the same option and commas to separate the values for them.

      For example:

      runai submit test-np -p team-a -i ubuntu --service-type nodeport,port=30000:7070 --service-type external-url,port=30001

      runai submit test-np -p team-a -i ubuntu --service-type nodeport,port=30000:7070,port=9090 --service-type external-url,port=8080,custom-url=https://my.domain.com/url

      ","text":""},{"location":"Researcher/cli-reference/runai-submit/#-port-stringarray","title":"--port <stringArray>

      Expose ports from the Job container. You can use a port number (for example 9090) or the hostport:containerport format (for example, 30000:7070).

      ","text":""},{"location":"Researcher/cli-reference/runai-submit/#-custom-url-string","title":"--custom-url <string>

      An optional argument that specifies a custom URL when using the external-url service type. If not provided, the system will generate a URL automatically.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit/#access-control","title":"Access Control","text":""},{"location":"Researcher/cli-reference/runai-submit/#-allow-privilege-escalation","title":"--allow-privilege-escalation

      Allow the job to gain additional privileges after start.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit/#-run-as-user","title":"--run-as-user

      Run in the context of the current user running the Run:ai command rather than the root user. While the default container user is root (same as in Docker), this command allows you to submit a Job running under your Linux user. This would manifest itself in access to operating system resources, in the owner of new folders created under shared directories, etc. Alternatively, if your cluster is connected to Run:ai via SAML, you can map the container to use the Linux UID/GID which is stored in the organization's directory. For more information see non root containers.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit/#scheduling","title":"Scheduling","text":""},{"location":"Researcher/cli-reference/runai-submit/#-node-pools-string","title":"--node-pools <string>

      Instructs the scheduler to run this workload using a specific set of nodes that are part of a node pool. You can specify one or more node pools to form a prioritized list of node pools that the scheduler will use to find a node pool that can satisfy the workload's specification. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group, or use existing node labels, then create a node pool and assign the label to it. This flag can be used in conjunction with node-type and Project-based affinity. In this case, the flag is used to refine the list of allowable node groups set from a node pool. For more information see: Working with Projects.
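
      For example, a sketch prioritizing two hypothetical node pools, pool-a and pool-b (the exact list delimiter may vary by CLI version; shown here as a quoted, space-separated list):

      runai submit pool-job -p team-a -i ubuntu --node-pools "pool-a pool-b"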

      ","text":""},{"location":"Researcher/cli-reference/runai-submit/#-node-type-string","title":"--node-type <string>

      Allows defining specific Nodes (machines) or a group of Nodes on which the workload will run. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group.
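
      For example, a sketch assuming a hypothetical node group labeled dgx-a100:

      runai submit node-job -p team-a -i ubuntu --node-type dgx-a100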

      ","text":""},{"location":"Researcher/cli-reference/runai-submit/#-toleration-string","title":"--toleration <string>

      Specify one or more toleration criteria, to ensure that the workload is not scheduled onto an inappropriate node. This is done by matching the workload tolerations to the taints defined for each node. For further details see Kubernetes Taints and Tolerations Guide.

      The format of the string:

      operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n
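
      For example, a sketch following the format above and matching a hypothetical NoSchedule taint on the key gpu-type:

      runai submit tol-job -p team-a -i ubuntu --toleration "operator=Exists,key=gpu-type,effect=NoSchedule"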
      ","text":""},{"location":"Researcher/cli-reference/runai-submit/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-submit/#-loglevel-string","title":"--loglevel (string)

      Set the logging level. One of: debug | info | warn | error (default \"info\")

      ","text":""},{"location":"Researcher/cli-reference/runai-submit/#-project-p-string","title":"--project | -p (string)

      Specify the Project to which the command applies. Run:ai Projects are used by the scheduler to calculate resource eligibility. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit/#-help-h","title":"--help | -h

      Show help text.

      ","text":""},{"location":"Researcher/cli-reference/runai-submit/#output","title":"Output","text":"

      The command will attempt to submit a Job. You can follow up on the Job by running runai list jobs or runai describe job <job-name>.

      Note that the submit call may use a policy to provide defaults to any of the above flags.

      "},{"location":"Researcher/cli-reference/runai-submit/#see-also","title":"See Also","text":"
      • See any of the Quickstart documents here.
      • See policy configuration for a description on how policies work.
      "},{"location":"Researcher/cli-reference/runai-suspend/","title":"runai suspend","text":""},{"location":"Researcher/cli-reference/runai-suspend/#description","title":"Description","text":"

      Suspend a Job

      Suspending a Running Job will stop the Job and will not allow it to be scheduled until it is resumed using runai resume. This means that:

      • You will no longer be able to enter it via runai bash.
      • The Job logs will be deleted.
      • Any data saved on the container and not stored in a shared location will be lost.

      Technically, the command deletes the Kubernetes pods associated with the Job and marks the Job as suspended until it is manually released.

      Suspend and resume do not work with MPI and Inference workloads.
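
      For example, a minimal sketch suspending a single Job (the job and project names are placeholders):

      runai suspend my-job -p team-a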

      "},{"location":"Researcher/cli-reference/runai-suspend/#synopsis","title":"Synopsis","text":"
      runai suspend <job-name>\n    [--all | -A]\n\n    [--loglevel value]\n    [--project string | -p string]\n    [--help | -h]\n
      "},{"location":"Researcher/cli-reference/runai-suspend/#options","title":"Options","text":"

      <job-name> - The name of the Job to run the command with. Mandatory.

      "},{"location":"Researcher/cli-reference/runai-suspend/#-all-a","title":"--all | -A","text":"

      Suspend all Jobs in the current Project.

      "},{"location":"Researcher/cli-reference/runai-suspend/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-suspend/#-loglevel-string","title":"--loglevel (string)","text":"

      Set the logging level. One of: debug | info | warn | error (default \"info\").

      "},{"location":"Researcher/cli-reference/runai-suspend/#-project-p-string","title":"--project | -p (string)","text":"

      Specify the Project to which the command applies. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

      "},{"location":"Researcher/cli-reference/runai-suspend/#-help-h","title":"--help | -h","text":"

      Show help text.

      "},{"location":"Researcher/cli-reference/runai-suspend/#output","title":"Output","text":"
      • The Job will be suspended. When running runai list jobs the Job will be marked as Suspended.
      "},{"location":"Researcher/cli-reference/runai-suspend/#see-also","title":"See Also","text":"
      • Resuming Jobs: Resume.
      "},{"location":"Researcher/cli-reference/runai-top-node/","title":"runai top node","text":""},{"location":"Researcher/cli-reference/runai-top-node/#description","title":"Description","text":"

      Show a list of Nodes (machines), their capacity and utilization.

      "},{"location":"Researcher/cli-reference/runai-top-node/#synopsis","title":"Synopsis","text":"
      runai top node \n    [--help | -h]\n    [--details | -d]\n
      "},{"location":"Researcher/cli-reference/runai-top-node/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-top-node/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-top-node/#-loglevel-string","title":"--loglevel (string)","text":"

      Set the logging level. One of: debug | info | warn | error (default \"info\").

      "},{"location":"Researcher/cli-reference/runai-top-node/#-help-h","title":"--help | -h","text":"

      Show help text.

      "},{"location":"Researcher/cli-reference/runai-top-node/#-details-d","title":"--details | -d","text":"

      Show additional details.

      "},{"location":"Researcher/cli-reference/runai-top-node/#output","title":"Output","text":"

      Shows a list of Nodes with their capacity and utilization.

      "},{"location":"Researcher/cli-reference/runai-top-node/#see-also","title":"See Also","text":""},{"location":"Researcher/cli-reference/runai-update/","title":"runai update","text":""},{"location":"Researcher/cli-reference/runai-update/#description","title":"Description","text":"

      Find and install the latest version of the runai command-line utility. The command must be run with sudo permissions.

      sudo runai update\n
      "},{"location":"Researcher/cli-reference/runai-update/#synopsis","title":"Synopsis","text":"
      runai update \n    [--loglevel value] \n    [--help | -h]\n
      "},{"location":"Researcher/cli-reference/runai-update/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-update/#-loglevel-string","title":"--loglevel (string)","text":"

      Set the logging level. One of: debug | info | warn | error (default \"info\").

      "},{"location":"Researcher/cli-reference/runai-update/#-help-h","title":"--help | -h","text":"

      Show help text.

      "},{"location":"Researcher/cli-reference/runai-update/#output","title":"Output","text":"

      The Run:ai command-line interface is updated to the latest version.

      "},{"location":"Researcher/cli-reference/runai-update/#see-also","title":"See Also","text":""},{"location":"Researcher/cli-reference/runai-version/","title":"runai version","text":""},{"location":"Researcher/cli-reference/runai-version/#description","title":"Description","text":"

      Show the version of this utility.

      "},{"location":"Researcher/cli-reference/runai-version/#synopsis","title":"Synopsis","text":"
      runai version \n    [--loglevel value] \n    [--help | -h]\n
      "},{"location":"Researcher/cli-reference/runai-version/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-version/#-loglevel-string","title":"--loglevel (string)","text":"

      Set the logging level. One of: debug | info | warn | error (default \"info\").

      "},{"location":"Researcher/cli-reference/runai-version/#-help-h","title":"--help | -h","text":"

      Show help text.

      "},{"location":"Researcher/cli-reference/runai-version/#output","title":"Output","text":"

      The version of the Run:ai command-line interface.

      "},{"location":"Researcher/cli-reference/runai-version/#see-also","title":"See Also","text":""},{"location":"Researcher/cli-reference/runai-whoami/","title":"runai whoami","text":""},{"location":"Researcher/cli-reference/runai-whoami/#description","title":"Description","text":"

      Show the name of the user currently logged in.

      "},{"location":"Researcher/cli-reference/runai-whoami/#synopsis","title":"Synopsis","text":"
      runai whoami \n    [--loglevel value] \n    [--help | -h]\n
      "},{"location":"Researcher/cli-reference/runai-whoami/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-whoami/#-loglevel-string","title":"--loglevel (string)","text":"

      Set the logging level. One of: debug | info | warn | error (default \"info\").

      "},{"location":"Researcher/cli-reference/runai-whoami/#-help-h","title":"--help | -h","text":"

      Show help text.

      "},{"location":"Researcher/cli-reference/runai-whoami/#output","title":"Output","text":"

      The name of the User currently logged in with the Run:ai command-line interface.

      "},{"location":"Researcher/cli-reference/runai-whoami/#see-also","title":"See Also","text":""},{"location":"Researcher/cli-reference/new-cli/cli-examples/","title":"CLI Examples","text":"

      This article provides examples of popular use cases illustrating how to use the Command Line Interface (CLI).

      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#logging-in","title":"Logging in","text":""},{"location":"Researcher/cli-reference/new-cli/cli-examples/#logging-in-via-runai-sign-in-page-web","title":"Logging in via run:ai sign in page (web)","text":"

      You can log in from the UI if you are using SSO or credentials.

      runai login\n

      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#logging-in-via-terminal-credentials","title":"Logging in via terminal (credentials)","text":"
      runai login user -u john@acme.com -p \"password\"\n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#configuration","title":"Configuration","text":""},{"location":"Researcher/cli-reference/new-cli/cli-examples/#setting-a-default-project","title":"Setting a default project","text":"
      runai project set \"project-name\"\n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#submitting-a-workload","title":"Submitting a workload","text":""},{"location":"Researcher/cli-reference/new-cli/cli-examples/#naming-a-workload","title":"Naming a workload","text":"

      Use the commands below to provide a name for a workload.

      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#setting-a-the-workload-name-my_workload_name","title":"Setting a the workload name ( my_workload_name)","text":"
      runai workspace submit my-workload-name -p test -i ubuntu \n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#setting-a-random-name-with-prefix-prefixworkload-type","title":"Setting a random name with prefix (prefix=workload type)","text":"
      runai workspace submit -p test -i ubuntu \n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#setting-a-random-name-with-specific-prefix-prefix-determined-by-flag","title":"Setting a random name with specific prefix (prefix determined by flag)","text":"
      runai workspace submit --prefix-name my-prefix-workload-name -p test -i ubuntu \n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#labels-and-annotations","title":"Labels and annotations","text":""},{"location":"Researcher/cli-reference/new-cli/cli-examples/#labels","title":"Labels","text":"
      runai workspace submit -p test -i ubuntu --label name=value --label name2=value2\n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#annotations","title":"Annotations","text":"
      runai workspace submit -p test -i ubuntu --annotation name=value --annotation name2=value2\n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#containers-environment-variables","title":"Container's environment variables","text":"
      runai workspace submit -p test -i ubuntu -e name=value -e name2=value2\n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#requests-and-limits","title":"Requests and limits","text":"
      runai workspace submit  -p \"project-name\" -i runai.jfrog.io/demo/quickstart-demo   --cpu-core-request 0.3 --cpu-core-limit 1 --cpu-memory-request 50M --cpu-memory-limit 1G  --gpu-devices-request 1 --gpu-memory-request 1G\n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#submitting-and-attaching-to-process","title":"Submitting and attaching to process","text":"
      runai workspace submit  -p \"project-name\" -i python  --attach -- python3\n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#submitting-a-jupyter-notebook","title":"Submitting a jupyter notebook","text":"
      runai workspace submit --image jupyter/scipy-notebook -p \"project-name\" --gpu-devices-request 1 --external-url container=8888 --name-prefix jupyter --command -- start-notebook.sh --NotebookApp.base_url='/${RUNAI_PROJECT}/${RUNAI_JOB_NAME}' --NotebookApp.token=''\n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#submitting-distributed-training-workload-with-tensorflow","title":"Submitting distributed training workload with TensorFlow","text":"
      runai distributed submit -f TF --workers=5 --no-master -g 1 -i kubeflow/tf-mnist-with-summaries:latest -p \"project-name\" --command -- python /var/tf_mnist/mnist_with_summaries.py --max_steps 1000000\n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#submitting-a-multi-pod-workload","title":"Submitting a multi-pod workload","text":"
      runai training submit  -i alpine -p test --parallelism 2 --completions 2  -- sleep 100000\n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#submit-and-bash","title":"Submit and bash","text":""},{"location":"Researcher/cli-reference/new-cli/cli-examples/#submitting-a-workload-with-bash-command","title":"Submitting a workload with bash command","text":"
      runai training pytorch submit  -p \"project-name\" -i nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04 -g 1 --workers 3 --command -- bash -c 'trap : TERM INT; sleep infinity & wait'\n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#bashing-into-the-workload","title":"Bashing into the workload","text":"
      runai training pytorch bash pytorch-06027b585626 -p \"project-name\"\n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#submitting-distributed-training-workload-with-mpi","title":"Submitting distributed training workload with MPI","text":"
      runai  mpi submit dist1 --workers=2 -g 1 \\\n    -i runai.jfrog.io/demo/quickstart-distributed:v0.3.0 -e RUNAI_SLEEP_SECS=60 -p \"project-name\"\n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#submitting-with-pvc","title":"Submitting with PVC","text":""},{"location":"Researcher/cli-reference/new-cli/cli-examples/#new-pvc-bounded-to-the-workspace","title":"New PVC bounded to the workspace","text":"

      New PVCs will be deleted when the workload is deleted.

      runai workspace submit -i ubuntu --new-pvc claimname=yuval-3,size=10M,path=/tmp/test\n

      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#new-ephemeral-pvc","title":"New ephemeral PVC","text":"

      New ephemeral PVCs will be deleted when the workload is deleted or paused.

      runai workspace submit -i ubuntu --new-pvc claimname=yuval2,size=10M,path=/tmp/test,ephemeral\n

      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#existing-pvc","title":"Existing PVC","text":"

      Existing PVCs will not be deleted when the workload is deleted.

      runai workspace submit -i ubuntu --existing-pvc claimname=test-pvc-2-project-mn2xs,path=/home/test\n

      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#masterworker-configuration","title":"Master/Worker configuration","text":"

      The --command flag together with -- sets the command and arguments for both the leader (master) and the workers.

      The --master-args flag sets the arguments for the master only.

      The --master-command flag sets the master command together with its arguments.

      The --master-args and --master-command flags can be set together.

      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#overriding-both-the-leader-master-and-worker-images-arguments","title":"Overriding both the leader (master) and worker image's arguments","text":"
      runai pytorch submit -i ubuntu -- -a argument_a -b argument_b -c\n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#overriding-both-the-leader-master-and-worker-images-commands-with-arguments","title":"Overriding both the leader (master) and worker image's commands with arguments","text":"
      runai pytorch submit -i ubuntu --command -- python -m pip install\n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#overriding-arguments-of-the-leader-master-and-worker-images-arguments-with-different-values","title":"Overriding arguments of the leader (master) and worker image's arguments with different values","text":"
      runai pytorch submit -i ubuntu --master-args "-a master_arg_a -b master_arg_b" -- '-a worker_arg_a'\n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#overriding-command-with-arguments-of-the-leader-master-and-worker-images-arguments","title":"Overriding command with arguments of the leader (master) and worker image's arguments","text":"
      runai pytorch submit -i ubuntu --master-command "python_master -m pip install" --command -- 'python_worker -m pip install'\n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#listing-objects","title":"Listing objects","text":""},{"location":"Researcher/cli-reference/new-cli/cli-examples/#listing-all-workloads-in-the-users-scope","title":"Listing all workloads in the user's scope","text":"
      runai workload list -A\n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#listing-projects-in-a-yaml-format","title":"Listing projects in a YAML format","text":"
      runai project list --yaml\n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#listing-nodes-in-a-json-format","title":"Listing nodes in a JSON format","text":"
      runai node list --json\n
      "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#cli-reference","title":"CLI reference","text":"

      For the full guide to the CLI syntax, see the CLI reference.

      "},{"location":"Researcher/cli-reference/new-cli/overview/","title":"Run:ai V2 Command-line Interface","text":"

      The Run:ai Command-line Interface (CLI) is a tool for Researchers to send deep learning workloads, acquire GPU-based containers, list jobs, and access other features in the Run:ai platform.

      "},{"location":"Researcher/cli-reference/new-cli/overview/#the-new-v2-command-line-interface","title":"The new V2 Command-line interface","text":"

      This command-line interface is a complete revamp of the previous command-line interface. A few highlights:

      • The CLI internally uses the Control-plane API. This provides a single point of view on Workloads, removing dissimilarities between the user interface, the programming interface, and the command-line interface.
      • As such, it also removes the need to configure the Kubernetes API server for authentication.
      • The CLI is only available for Run:ai cluster version 2.18 and up.
      • The new V2 CLI is backward compatible with the older V1 CLI.
      "},{"location":"Researcher/cli-reference/new-cli/overview/#installing-the-improved-command-line-interface","title":"Installing the Improved Command Line Interface","text":"

      See installation instructions here.

      "},{"location":"Researcher/cli-reference/new-cli/overview/#reference","title":"Reference","text":"

      A list of all commands can be found here.

      "},{"location":"Researcher/cli-reference/new-cli/runai/","title":"CLI Reference","text":""},{"location":"Researcher/cli-reference/new-cli/runai/#runai","title":"runai","text":"

      Run:ai Command-line Interface

      "},{"location":"Researcher/cli-reference/new-cli/runai/#synopsis","title":"Synopsis","text":"

      runai - The Run:ai Researcher Command Line Interface

      Description: A tool for managing Run:ai workloads and monitoring available resources. It provides researchers with comprehensive control over their AI development environment.

      runai [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai/#options","title":"Options","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -h, --help                 help for runai\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai/#see-also","title":"SEE ALSO","text":"
      • runai cluster - cluster management
      • runai config - configuration management
      • runai kubeconfig - kubeconfig management
      • runai login - login to the control plane
      • runai logout - logout from control plane
      • runai mpi - alias for mpi management
      • runai node - node management
      • runai nodepool - node pool management
      • runai project - project management
      • runai pytorch - alias for pytorch management
      • runai report - [Experimental] report management
      • runai tensorflow - alias for tensorflow management
      • runai training - training management
      • runai upgrade - upgrades the CLI to the latest version
      • runai version - show the current version of the CLI
      • runai whoami - show the current logged in user
      • runai workload - workload management
      • runai workspace - workspace management
      • runai xgboost - alias for xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_attach/","title":"Runai attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_attach/#runai-attach","title":"runai attach","text":"

      [Deprecated] attach

      runai attach WORKLOAD_NAME [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_attach/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --type string                    The type of the workload (training, workspace)\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_attach/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      "},{"location":"Researcher/cli-reference/new-cli/runai_cluster/","title":"Runai cluster","text":""},{"location":"Researcher/cli-reference/new-cli/runai_cluster/#runai-cluster","title":"runai cluster","text":"

      cluster management

      runai cluster [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_cluster/#options","title":"Options","text":"
        -h, --help                 help for cluster\n      --interactive enable   set interactive mode (enabled|disabled)\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_cluster/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_cluster/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      • runai cluster list - cluster list command
      • runai cluster set - set cluster context
      "},{"location":"Researcher/cli-reference/new-cli/runai_cluster_list/","title":"Runai cluster list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_cluster_list/#runai-cluster-list","title":"runai cluster list","text":"

      cluster list command

      runai cluster list [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_cluster_list/#options","title":"Options","text":"
        -h, --help         help for list\n      --json         Output structure JSON\n      --no-headers   Output structure table without headers\n      --table        Output structure table\n      --yaml         Output structure YAML\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_cluster_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_cluster_list/#see-also","title":"SEE ALSO","text":"
      • runai cluster - cluster management
      "},{"location":"Researcher/cli-reference/new-cli/runai_cluster_set/","title":"Runai cluster set","text":""},{"location":"Researcher/cli-reference/new-cli/runai_cluster_set/#runai-cluster-set","title":"runai cluster set","text":"

      set cluster context

      runai cluster set [CLUSTER_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_cluster_set/#options","title":"Options","text":"
        -h, --help        help for set\n      --id string   set by cluster ID\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_cluster_set/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_cluster_set/#see-also","title":"SEE ALSO","text":"
      • runai cluster - cluster management
      "},{"location":"Researcher/cli-reference/new-cli/runai_config/","title":"Runai config","text":""},{"location":"Researcher/cli-reference/new-cli/runai_config/#runai-config","title":"runai config","text":"

      configuration management

      runai config [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_config/#options","title":"Options","text":"
        -h, --help                 help for config\n      --interactive enable   set interactive mode (enabled|disabled)\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_config/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_config/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      • runai config generate - generate config file
      • runai config set - Set configuration values
      "},{"location":"Researcher/cli-reference/new-cli/runai_config_generate/","title":"Runai config generate","text":""},{"location":"Researcher/cli-reference/new-cli/runai_config_generate/#runai-config-generate","title":"runai config generate","text":"

      generate config file

      runai config generate [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_config_generate/#options","title":"Options","text":"
            --file string   Output structure to file\n  -h, --help          help for generate\n      --json          Output structure JSON\n      --yaml          Output structure YAML\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_config_generate/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_config_generate/#see-also","title":"SEE ALSO","text":"
      • runai config - configuration management
      "},{"location":"Researcher/cli-reference/new-cli/runai_config_project/","title":"Runai config project","text":""},{"location":"Researcher/cli-reference/new-cli/runai_config_project/#runai-config-project","title":"runai config project","text":"

      [Deprecated] Configure a default project

      runai config project PROJECT_NAME [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_config_project/#options","title":"Options","text":"
        -h, --help   help for project\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_config_project/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_config_project/#see-also","title":"SEE ALSO","text":"
      • runai config - configuration management
      "},{"location":"Researcher/cli-reference/new-cli/runai_config_set/","title":"Runai config set","text":""},{"location":"Researcher/cli-reference/new-cli/runai_config_set/#runai-config-set","title":"runai config set","text":"

      Set configuration values

      runai config set [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_config_set/#examples","title":"Examples","text":"
      runai config set --status-timeout-duration 5s\nrunai config set --status-timeout-duration 300ms\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_config_set/#options","title":"Options","text":"
            --auth-url string                  set the authorization URL; most likely the same as the control plane URL\n      --cp-url string                    set the control plane URL\n  -h, --help                             help for set\n      --interactive enable               set interactive mode (enabled|disabled)\n      --output string                    set the default output type\n      --status-timeout-duration string   set cluster status call timeout duration value, the default is 3 second (\"3s\")\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_config_set/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_config_set/#see-also","title":"SEE ALSO","text":"
      • runai config - configuration management
      "},{"location":"Researcher/cli-reference/new-cli/runai_describe/","title":"Runai describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_describe/#runai-describe","title":"runai describe","text":"

      [Deprecated] Display detailed information about resources

      "},{"location":"Researcher/cli-reference/new-cli/runai_describe/#options","title":"Options","text":"
        -h, --help   help for describe\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_describe/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      • runai describe job - [Deprecated] Display details of a job
      • runai describe node - [Deprecated] Display detailed information about nodes in the cluster
      "},{"location":"Researcher/cli-reference/new-cli/runai_describe_job/","title":"Runai describe job","text":""},{"location":"Researcher/cli-reference/new-cli/runai_describe_job/#runai-describe-job","title":"runai describe job","text":"

      [Deprecated] Display details of a job

      runai describe job JOB_NAME [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_describe_job/#options","title":"Options","text":"
        -h, --help             help for job\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --type string      The type of the workload (training, workspace)\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_describe_job/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_describe_job/#see-also","title":"SEE ALSO","text":"
      • runai describe - [Deprecated] Display detailed information about resources
      "},{"location":"Researcher/cli-reference/new-cli/runai_describe_node/","title":"Runai describe node","text":""},{"location":"Researcher/cli-reference/new-cli/runai_describe_node/#runai-describe-node","title":"runai describe node","text":"

      [Deprecated] Display detailed information about nodes in the cluster

      runai describe node [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_describe_node/#options","title":"Options","text":"
        -h, --help   help for node\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_describe_node/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_describe_node/#see-also","title":"SEE ALSO","text":"
      • runai describe - [Deprecated] Display detailed information about resources
      "},{"location":"Researcher/cli-reference/new-cli/runai_exec/","title":"Runai exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_exec/#runai-exec","title":"runai exec","text":"

      [Deprecated] exec

      runai exec WORKLOAD_NAME [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_exec/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --type string                    The type of the workload (training, workspace)\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_exec/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      "},{"location":"Researcher/cli-reference/new-cli/runai_kubeconfig/","title":"Runai kubeconfig","text":""},{"location":"Researcher/cli-reference/new-cli/runai_kubeconfig/#runai-kubeconfig","title":"runai kubeconfig","text":"

      kubeconfig management

      "},{"location":"Researcher/cli-reference/new-cli/runai_kubeconfig/#options","title":"Options","text":"
        -h, --help   help for kubeconfig\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_kubeconfig/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_kubeconfig/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      • runai kubeconfig set - kubeconfig set login token
      "},{"location":"Researcher/cli-reference/new-cli/runai_kubeconfig_set/","title":"Runai kubeconfig set","text":""},{"location":"Researcher/cli-reference/new-cli/runai_kubeconfig_set/#runai-kubeconfig-set","title":"runai kubeconfig set","text":"

      kubeconfig set login token

      runai kubeconfig set [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_kubeconfig_set/#options","title":"Options","text":"
        -h, --help   help for set\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_kubeconfig_set/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_kubeconfig_set/#see-also","title":"SEE ALSO","text":"
      • runai kubeconfig - kubeconfig management
      "},{"location":"Researcher/cli-reference/new-cli/runai_list/","title":"Runai list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_list/#runai-list","title":"runai list","text":"

      [Deprecated] display resource list. By default displays the job list

      runai list [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_list/#options","title":"Options","text":"
        -A, --all-projects     list workloads from all projects\n  -h, --help             help for list\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_list/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      • runai list clusters - [Deprecated] list all available clusters
      • runai list jobs - [Deprecated] list all jobs
      • runai list nodes - [Deprecated] list all nodes
      • runai list projects - [Deprecated] list all available projects
      "},{"location":"Researcher/cli-reference/new-cli/runai_list_clusters/","title":"Runai list clusters","text":""},{"location":"Researcher/cli-reference/new-cli/runai_list_clusters/#runai-list-clusters","title":"runai list clusters","text":"

      [Deprecated] list all available clusters

      runai list clusters [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_list_clusters/#options","title":"Options","text":"
        -h, --help         help for clusters\n      --json         Output structure JSON\n      --no-headers   Output structure table without headers\n      --table        Output structure table\n      --yaml         Output structure YAML\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_list_clusters/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_list_clusters/#see-also","title":"SEE ALSO","text":"
      • runai list - [Deprecated] display resource list. By default displays the job list
      "},{"location":"Researcher/cli-reference/new-cli/runai_list_jobs/","title":"Runai list jobs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_list_jobs/#runai-list-jobs","title":"runai list jobs","text":"

      [Deprecated] list all jobs

      runai list jobs [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_list_jobs/#options","title":"Options","text":"
        -A, --all-projects     list workloads from all projects\n  -h, --help             help for jobs\n      --json             Output structure JSON\n      --no-headers       Output structure table without headers\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --table            Output structure table\n      --yaml             Output structure YAML\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_list_jobs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_list_jobs/#see-also","title":"SEE ALSO","text":"
      • runai list - [Deprecated] display resource list. By default displays the job list
      "},{"location":"Researcher/cli-reference/new-cli/runai_list_nodes/","title":"Runai list nodes","text":""},{"location":"Researcher/cli-reference/new-cli/runai_list_nodes/#runai-list-nodes","title":"runai list nodes","text":"

      [Deprecated] list all nodes

      runai list nodes [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_list_nodes/#options","title":"Options","text":"
        -h, --help         help for nodes\n      --json         Output structure JSON\n      --no-headers   Output structure table without headers\n      --table        Output structure table\n      --yaml         Output structure YAML\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_list_nodes/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_list_nodes/#see-also","title":"SEE ALSO","text":"
      • runai list - [Deprecated] display resource list. By default displays the job list
      "},{"location":"Researcher/cli-reference/new-cli/runai_list_projects/","title":"Runai list projects","text":""},{"location":"Researcher/cli-reference/new-cli/runai_list_projects/#runai-list-projects","title":"runai list projects","text":"

      [Deprecated] list all available projects

      runai list projects [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_list_projects/#options","title":"Options","text":"
        -h, --help         help for projects\n      --json         Output structure JSON\n      --no-headers   Output structure table without headers\n      --table        Output structure table\n      --yaml         Output structure YAML\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_list_projects/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_list_projects/#see-also","title":"SEE ALSO","text":"
      • runai list - [Deprecated] display resource list. By default displays the job list
      "},{"location":"Researcher/cli-reference/new-cli/runai_login/","title":"Runai login","text":""},{"location":"Researcher/cli-reference/new-cli/runai_login/#runai-login","title":"runai login","text":"

      login to the control plane

      runai login [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_login/#examples","title":"Examples","text":"
        # Login using browser\n  runai login\n\n  # Login using SSO with remote browser\n  runai login sso\n  runai login remote-browser\n\n  # Login using username and password without browser\n  runai login user -u <username> \n\n  # Login using browser with specific port and host\n  runai login --listen-port=43121 --listen-host=localhost\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_login/#options","title":"Options","text":"
        -h, --help                 help for login\n      --listen-host string   the host to listen on for the authentication callback (for browser mode only) (default \"localhost\")\n      --listen-port int      the port to listen on for the authentication callback (for browser mode only) (default 43121)\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_login/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_login/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      • runai login application - login as an application
      • runai login sso - login using sso without browser
      • runai login user - login for local user without browser
      "},{"location":"Researcher/cli-reference/new-cli/runai_login_application/","title":"Runai login application","text":""},{"location":"Researcher/cli-reference/new-cli/runai_login_application/#runai-login-application","title":"runai login application","text":"

      login as an application

      runai login application [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_login_application/#examples","title":"Examples","text":"
        # Login interactive using application credentials\n  runai login app\n\n  # Login using application credentials\n  login app --name=<app_name> --secret=<app_secret> --interactive=disabled\n\n  # Login and Save application credentials\n  login app --name=<app_name> --secret=<app_secret> --interactive=disabled --save\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_login_application/#options","title":"Options","text":"
        -h, --help                 help for application\n      --interactive enable   set interactive mode (enabled|disabled)\n      --name string          application name\n      --save                 save application credentials in config file\n      --secret string        application secret\n      --secret-file string   use application secret from file\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_login_application/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_login_application/#see-also","title":"SEE ALSO","text":"
      • runai login - login to the control plane
      "},{"location":"Researcher/cli-reference/new-cli/runai_login_sso/","title":"Runai login sso","text":""},{"location":"Researcher/cli-reference/new-cli/runai_login_sso/#runai-login-sso","title":"runai login sso","text":"

      login using sso without browser

      runai login sso [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_login_sso/#options","title":"Options","text":"
        -h, --help   help for sso\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_login_sso/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_login_sso/#see-also","title":"SEE ALSO","text":"
      • runai login - login to the control plane
      "},{"location":"Researcher/cli-reference/new-cli/runai_login_user/","title":"Runai login user","text":""},{"location":"Researcher/cli-reference/new-cli/runai_login_user/#runai-login-user","title":"runai login user","text":"

      login for local user without browser

      "},{"location":"Researcher/cli-reference/new-cli/runai_login_user/#synopsis","title":"Synopsis","text":"

      Login to the control plane using a local user without browser

      runai login user [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_login_user/#examples","title":"Examples","text":"
      # Login with a username. The password will be prompted via stdin afterward (recommended)\nrunai login user -u <username>\n\n# Login with a username and plain password (not recommended for security reasons)\nrunai login user --user=user --password=pass\n\n# Login with a username and password using the short flags (not recommended for security reasons)\nrunai login user -u=user -p=pass\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_login_user/#options","title":"Options","text":"
        -h, --help              help for user\n  -p, --password string   plaintext password of the given username. not recommended for security reasons\n  -u, --user string       the username to login with\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_login_user/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_login_user/#see-also","title":"SEE ALSO","text":"
      • runai login - login to the control plane
      "},{"location":"Researcher/cli-reference/new-cli/runai_logout/","title":"Runai logout","text":""},{"location":"Researcher/cli-reference/new-cli/runai_logout/#runai-logout","title":"runai logout","text":"

      logout from control plane

      runai logout [flags]\n
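      A minimal usage sketch; the command ends the current session against the configured control plane and takes no arguments:
      # Logout from the control plane\nrunai logout\n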
      "},{"location":"Researcher/cli-reference/new-cli/runai_logout/#options","title":"Options","text":"
        -h, --help   help for logout\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_logout/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_logout/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      "},{"location":"Researcher/cli-reference/new-cli/runai_logs/","title":"Runai logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_logs/#runai-logs","title":"runai logs","text":"

      [Deprecated] logs

      runai logs WORKLOAD_NAME [flags]\n
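      This command is deprecated in favor of the per-workload-type logs commands, but a minimal sketch using the flags listed below could look as follows (the workload name my-train and project team-a are placeholders):
      # Follow the last 100 lines of logs of a training workload\nrunai logs my-train --type training -p team-a --tail 100 --follow\n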
      "},{"location":"Researcher/cli-reference/new-cli/runai_logs/#options","title":"Options","text":"
        -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --type string             The type of the workload (training, workspace)\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_logs/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi/","title":"Runai mpi","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi/#runai-mpi","title":"runai mpi","text":"

      alias for mpi management

      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi/#options","title":"Options","text":"
        -h, --help   help for mpi\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      • runai mpi attach - attach to a running container in a mpi training job
      • runai mpi bash - open a bash shell in a mpi training job
      • runai mpi delete - delete mpi training workload
      • runai mpi describe - describe mpi training
      • runai mpi exec - execute a command in a mpi training job
      • runai mpi list - list mpi training
      • runai mpi logs - view logs of a mpi training job
      • runai mpi port-forward - forward one or more local ports to a mpi training job
      • runai mpi resume - resume mpi training
      • runai mpi submit - submit mpi training
      • runai mpi suspend - suspend mpi training
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_attach/","title":"Runai mpi attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_attach/#runai-mpi-attach","title":"runai mpi attach","text":"

      attach to a running container in a mpi training job

      runai mpi attach [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_attach/#examples","title":"Examples","text":"
      # Attaching to the main worker of a mpi training\nrunai training mpi attach mpi-01 --tty --stdin\n\n# Attaching to a specific pod of a mpi training\nrunai training mpi attach mpi-01 --pod mpi-01-worker-1 --tty --stdin\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_attach/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_attach/#see-also","title":"SEE ALSO","text":"
      • runai mpi - alias for mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_bash/","title":"Runai mpi bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_bash/#runai-mpi-bash","title":"runai mpi bash","text":"

      open a bash shell in a mpi training job

      runai mpi bash [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_bash/#examples","title":"Examples","text":"
      # Open a bash shell in the mpi training's main worker\nrunai training mpi bash mpi-01\n\n# Open a bash shell in a specific mpi training worker\nrunai training mpi bash mpi-01 --pod mpi-01-worker-1\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_bash/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_bash/#see-also","title":"SEE ALSO","text":"
      • runai mpi - alias for mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_delete/","title":"Runai mpi delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_delete/#runai-mpi-delete","title":"runai mpi delete","text":"

      delete mpi training workload

      runai mpi delete [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_delete/#examples","title":"Examples","text":"
      # Delete a mpi training workload with a default project\nrunai training mpi delete <mpi-name>\n\n# Delete a mpi training workload with a specific project\nrunai training mpi delete <mpi-name> -p <project_name>\n\n# Delete a mpi training workload by UUID\nrunai training mpi delete --uuid=<mpi_uuid> -p <project_name>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_delete/#options","title":"Options","text":"
        -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_delete/#see-also","title":"SEE ALSO","text":"
      • runai mpi - alias for mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_describe/","title":"Runai mpi describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_describe/#runai-mpi-describe","title":"runai mpi describe","text":"

      describe mpi training

      runai mpi describe [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_describe/#examples","title":"Examples","text":"
      # Describe a mpi training workload with a default project\nrunai training mpi describe <mpi-name>\n\n# Describe a mpi training workload in a specific project\nrunai training mpi describe <mpi-name> -p <project_name>\n\n# Describe a mpi training workload by UUID\nrunai training mpi describe --uuid=<mpi_uuid>\n\n# Describe a mpi training workload with specific output format\nrunai training mpi describe <mpi-name> -o json\n\n# Describe a mpi training workload with specific sections\nrunai training mpi describe <mpi-name> --general --compute --pods --events --networks\n\n# Describe a mpi training workload with container details and custom limits\nrunai training mpi describe <mpi-name> --containers --pod-limit 20 --event-limit 100\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_describe/#options","title":"Options","text":"
            --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_describe/#see-also","title":"SEE ALSO","text":"
      • runai mpi - alias for mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_exec/","title":"Runai mpi exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_exec/#runai-mpi-exec","title":"runai mpi exec","text":"

      execute a command in a mpi training job

      runai mpi exec [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_exec/#examples","title":"Examples","text":"
      # Execute bash in the mpi training's main worker\nrunai training mpi exec mpi-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the mpi training's main worker\nrunai training mpi exec mpi-01 -- ls\n\n# Execute a command in a specific mpi training worker\nrunai training mpi exec mpi-01 --pod mpi-01-worker-1 -- nvidia-smi\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_exec/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_exec/#see-also","title":"SEE ALSO","text":"
      • runai mpi - alias for mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_list/","title":"Runai mpi list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_list/#runai-mpi-list","title":"runai mpi list","text":"

      list mpi training

      runai mpi list [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_list/#examples","title":"Examples","text":"
      # List all mpi training workloads\nrunai training mpi list -A\n\n# List mpi training workloads with default project\nrunai training mpi list\n\n# List mpi training workloads in a specific project\nrunai training mpi list -p <project_name>\n\n# List all mpi training workloads with a specific output format\nrunai training mpi list -o wide\n\n# List mpi training workloads with pagination\nrunai training mpi list --limit 20 --offset 40\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_list/#options","title":"Options","text":"
        -A, --all              list workloads from all projects\n  -h, --help             help for list\n      --json             Output structure JSON\n      --limit int32      number of workloads to list (default 50)\n      --no-headers       Output structure table without headers\n      --offset int32     offset from which to start listing workloads, default 0 (first page)\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string    filter by workload state\n      --table            Output structure table\n      --yaml             Output structure YAML\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_list/#see-also","title":"SEE ALSO","text":"
      • runai mpi - alias for mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_logs/","title":"Runai mpi logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_logs/#runai-mpi-logs","title":"runai mpi logs","text":"

      view logs of a mpi training job

      runai mpi logs [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_logs/#examples","title":"Examples","text":"
      # Get logs for a mpi training\nrunai training mpi logs mpi-01\n\n# Get logs for a specific pod in a mpi training\nrunai training mpi logs mpi-01 --pod=mpi-01-worker-0\n\n# Get logs for a specific container in a mpi training\nrunai training mpi logs mpi-01 --container=mpi-worker\n\n# Get the last 100 lines of logs\nrunai training mpi logs mpi-01 --tail=100\n\n# Get logs with timestamps\nrunai training mpi logs mpi-01 --timestamps\n\n# Follow the logs\nrunai training mpi logs mpi-01 --follow\n\n# Get logs for the previous instance of the mpi training\nrunai training mpi logs mpi-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai training mpi logs mpi-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai training mpi logs mpi-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai training mpi logs mpi-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for mpi training to be ready for logs\nrunai training mpi logs mpi-01 --wait-timeout=30s\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_logs/#options","title":"Options","text":"
        -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_logs/#see-also","title":"SEE ALSO","text":"
      • runai mpi - alias for mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_port-forward/","title":"Runai mpi port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_port-forward/#runai-mpi-port-forward","title":"runai mpi port-forward","text":"

      forward one or more local ports to a mpi training job

      runai mpi port-forward [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_port-forward/#examples","title":"Examples","text":"
      # Forward connections from localhost:8080 to mpi training on port 8090:\nrunai training mpi port-forward mpi-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to mpi training on port 8080:\nrunai training mpi port-forward mpi-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to mpi training on port 8090 and from localhost:6443 to mpi training on port 443:\nrunai training mpi port-forward mpi-01 --port 8080:8090 --port 6443:443 --address localhost\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_port-forward/#options","title":"Options","text":"
            --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_port-forward/#see-also","title":"SEE ALSO","text":"
      • runai mpi - alias for mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_resume/","title":"Runai mpi resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_resume/#runai-mpi-resume","title":"runai mpi resume","text":"

      resume mpi training

      runai mpi resume [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_resume/#examples","title":"Examples","text":"
      # Resume a mpi training workload\nrunai training mpi resume <mpi-name>\n\n# Resume a mpi training workload in a specific project\nrunai training mpi resume <mpi-name> -p <project_name>\n\n# Resume a mpi training workload by UUID\nrunai training mpi resume --uuid=<mpi_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_resume/#options","title":"Options","text":"
        -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_resume/#see-also","title":"SEE ALSO","text":"
      • runai mpi - alias for mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_submit/","title":"Runai mpi submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_submit/#runai-mpi-submit","title":"runai mpi submit","text":"

      submit mpi training

      runai mpi submit [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_submit/#examples","title":"Examples","text":"
      # Submit a mpi training workload\nrunai training mpi submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a mpi training workload with arguments\nrunai training mpi submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a mpi training workload with a custom command\nrunai training mpi submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a mpi training master args with worker args\nrunai training mpi submit <name> -p <project_name> -i ubuntu --master-args \"-a master_arg_a -b master_arg_b\" -- '-a worker_arg_a'\n\n# Submit a mpi training master command with worker args\nrunai training mpi submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" -- '-a worker_arg_a'\n\n# Submit a mpi training master command with worker command\nrunai training mpi submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" --command -- echo -e 'worker command'\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_submit/#options","title":"Options","text":"
            --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n      --clean-pod-policy string                        Specifies which pods will be deleted when the workload reaches a terminal state (completed/failed)\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the format: name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. 
(Default: false)\n      --host-network                                   Whether to enable host networking. (Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --master-args string                             Specifies the arguments to pass to the master pod container command\n      --master-command string                          Specifies the command to run in the master pod container, overriding the image's default entrypoint. The command can include arguments following it.\n      --master-environment stringArray                 Set master environment variables in the container\n      --master-extended-resource stringArray           Request access to an extended resource. Use the format: resource_name=quantity\n      --master-no-pvcs                                 Do not mount any persistent volumes in the master pod\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    takes the uid, gid, and supplementary groups fields from the token, if all the fields do not exist, uses the local running terminal user credentials. 
      If any of the fields exist, only the existing fields are used\n      --s3 stringArray                                 s3 storage details. Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --slots-per-worker int32                         Number of slots to allocate for each worker\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma-separated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group IDs of the container, options: fromTheImage|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --workers int32                                  the number of workers that will be allocated for running the workload\n      --working-dir string                             Set the container's working directory\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_submit/#see-also","title":"SEE ALSO","text":"
      • runai mpi - alias for mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_suspend/","title":"Runai mpi suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_suspend/#runai-mpi-suspend","title":"runai mpi suspend","text":"

      suspend mpi training

      runai mpi suspend [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_suspend/#examples","title":"Examples","text":"
      # Suspend a mpi training workload\nrunai training mpi suspend <mpi-name>\n\n# Suspend a mpi training workload in a specific project\nrunai training mpi suspend <mpi-name> -p <project_name>\n\n# Suspend a mpi training workload by UUID\nrunai training mpi suspend --uuid=<mpi_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_suspend/#options","title":"Options","text":"
        -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_suspend/#see-also","title":"SEE ALSO","text":"
      • runai mpi - alias for mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_node/","title":"Runai node","text":""},{"location":"Researcher/cli-reference/new-cli/runai_node/#runai-node","title":"runai node","text":"

      node management

      "},{"location":"Researcher/cli-reference/new-cli/runai_node/#options","title":"Options","text":"
        -h, --help   help for node\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_node/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_node/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      • runai node list - List node
      "},{"location":"Researcher/cli-reference/new-cli/runai_node_list/","title":"Runai node list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_node_list/#runai-node-list","title":"runai node list","text":"

      List node

      runai node list [flags]\n
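      A minimal usage sketch using the output-format flags listed below:
      # List nodes as a table\nrunai node list\n\n# List nodes as JSON\nrunai node list --json\n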
      "},{"location":"Researcher/cli-reference/new-cli/runai_node_list/#options","title":"Options","text":"
        -h, --help         help for list\n      --json         Output structure JSON\n      --no-headers   Output structure table without headers\n      --table        Output structure table\n      --yaml         Output structure YAML\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_node_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_node_list/#see-also","title":"SEE ALSO","text":"
      • runai node - node management
      "},{"location":"Researcher/cli-reference/new-cli/runai_nodepool/","title":"Runai nodepool","text":""},{"location":"Researcher/cli-reference/new-cli/runai_nodepool/#runai-nodepool","title":"runai nodepool","text":"

      node pool management

      runai nodepool [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_nodepool/#options","title":"Options","text":"
        -h, --help   help for nodepool\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_nodepool/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_nodepool/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      • runai nodepool list - List node pool
      "},{"location":"Researcher/cli-reference/new-cli/runai_nodepool_list/","title":"Runai nodepool list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_nodepool_list/#runai-nodepool-list","title":"runai nodepool list","text":"

      List node pool

      runai nodepool list [flags]\n
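      A minimal sketch of listing node pools with the output-format flags listed below:
      # List node pools as a table\nrunai nodepool list\n\n# List node pools without table headers\nrunai nodepool list --no-headers\n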
      "},{"location":"Researcher/cli-reference/new-cli/runai_nodepool_list/#options","title":"Options","text":"
        -h, --help         help for list\n      --json         Output structure JSON\n      --no-headers   Output structure table without headers\n      --table        Output structure table\n      --yaml         Output structure YAML\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_nodepool_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_nodepool_list/#see-also","title":"SEE ALSO","text":"
      • runai nodepool - node pool management
      "},{"location":"Researcher/cli-reference/new-cli/runai_port-forward/","title":"Runai port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_port-forward/#runai-port-forward","title":"runai port-forward","text":"

      [Deprecated] port forward

      runai port-forward WORKLOAD_NAME [flags]\n
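      This command is deprecated in favor of the per-workload-type port-forward commands, but a minimal sketch using the flags listed below could look as follows (the workload name my-workspace and project team-a are placeholders):
      # Forward connections from localhost:8080 to port 8888 of the workload\nrunai port-forward my-workspace --type workspace -p team-a --port 8080:8888 --address localhost\n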
      "},{"location":"Researcher/cli-reference/new-cli/runai_port-forward/#options","title":"Options","text":"
            --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --type string                    The type of the workload (training, workspace)\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_port-forward/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      "},{"location":"Researcher/cli-reference/new-cli/runai_project/","title":"Runai project","text":""},{"location":"Researcher/cli-reference/new-cli/runai_project/#runai-project","title":"runai project","text":"

      project management

      runai project [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_project/#options","title":"Options","text":"
        -h, --help                 help for project\n      --interactive enable   set interactive mode (enabled|disabled)\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_project/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_project/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      • runai project list - list available project
      • runai project set - set default project name
      "},{"location":"Researcher/cli-reference/new-cli/runai_project_list/","title":"Runai project list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_project_list/#runai-project-list","title":"runai project list","text":"

      list available project

      runai project list [flags]\n
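      A minimal sketch of listing projects, using the output-format flags listed below:
      # List the projects available to the current user\nrunai project list\n\n# List projects as YAML\nrunai project list --yaml\n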
      "},{"location":"Researcher/cli-reference/new-cli/runai_project_list/#options","title":"Options","text":"
        -h, --help         help for list\n      --json         Output structure JSON\n      --no-headers   Output structure table without headers\n      --table        Output structure table\n      --yaml         Output structure YAML\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_project_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_project_list/#see-also","title":"SEE ALSO","text":"
      • runai project - project management
      "},{"location":"Researcher/cli-reference/new-cli/runai_project_set/","title":"Runai project set","text":""},{"location":"Researcher/cli-reference/new-cli/runai_project_set/#runai-project-set","title":"runai project set","text":"

      set default project name

      runai project set PROJECT_NAME [flags]\n
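      A minimal usage sketch (the project name team-a is a placeholder):
      # Set team-a as the default project for subsequent commands\nrunai project set team-a\n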
      "},{"location":"Researcher/cli-reference/new-cli/runai_project_set/#options","title":"Options","text":"
        -h, --help   help for set\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_project_set/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_project_set/#see-also","title":"SEE ALSO","text":"
      • runai project - project management
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch/","title":"Runai pytorch","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch/#runai-pytorch","title":"runai pytorch","text":"

      alias for pytorch management

      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch/#options","title":"Options","text":"
        -h, --help   help for pytorch\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      • runai pytorch attach - attach to a running container in a pytorch training job
      • runai pytorch bash - open a bash shell in a pytorch training job
      • runai pytorch delete - delete pytorch training workload
      • runai pytorch describe - describe pytorch training
      • runai pytorch exec - execute a command in a pytorch training job
      • runai pytorch list - list pytorch training
      • runai pytorch logs - view logs of a pytorch training job
      • runai pytorch port-forward - forward one or more local ports to a pytorch training job
      • runai pytorch resume - resume pytorch training
      • runai pytorch submit - submit pytorch training
      • runai pytorch suspend - suspend pytorch training
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_attach/","title":"Runai pytorch attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_attach/#runai-pytorch-attach","title":"runai pytorch attach","text":"

      attach to a running container in a pytorch training job

      runai pytorch attach [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_attach/#examples","title":"Examples","text":"
      # Attaching to the main worker of a pytorch training\nrunai training pytorch attach pytorch-01 --tty --stdin\n\n# Attaching to a specific pod of a pytorch training\nrunai training pytorch attach pytorch-01 --pod pytorch-01-worker-1 --tty --stdin\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_attach/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_attach/#see-also","title":"SEE ALSO","text":"
      • runai pytorch - alias for pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_bash/","title":"Runai pytorch bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_bash/#runai-pytorch-bash","title":"runai pytorch bash","text":"

      open a bash shell in a pytorch training job

      runai pytorch bash [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_bash/#examples","title":"Examples","text":"
      # Open a bash shell in the pytorch training's main worker\nrunai training pytorch bash pytorch-01\n\n# Open a bash shell in a specific pytorch training worker\nrunai training pytorch bash pytorch-01 --pod pytorch-01-worker-1\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_bash/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_bash/#see-also","title":"SEE ALSO","text":"
      • runai pytorch - alias for pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_delete/","title":"Runai pytorch delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_delete/#runai-pytorch-delete","title":"runai pytorch delete","text":"

      delete pytorch training workload

      runai pytorch delete [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_delete/#examples","title":"Examples","text":"
      # Delete a pytorch training workload with a default project\nrunai training pytorch delete <pytorch-name>\n\n# Delete a pytorch training workload with a specific project\nrunai training pytorch delete <pytorch-name> -p <project_name>\n\n# Delete a pytorch training workload by UUID\nrunai training pytorch delete --uuid=<pytorch_uuid> -p <project_name>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_delete/#options","title":"Options","text":"
        -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_delete/#see-also","title":"SEE ALSO","text":"
      • runai pytorch - alias for pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_describe/","title":"Runai pytorch describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_describe/#runai-pytorch-describe","title":"runai pytorch describe","text":"

      describe pytorch training

      runai pytorch describe [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_describe/#examples","title":"Examples","text":"
      # Describe a pytorch training workload with a default project\nrunai training pytorch describe <pytorch-name>\n\n# Describe a pytorch training workload in a specific project\nrunai training pytorch describe <pytorch-name> -p <project_name>\n\n# Describe a pytorch training workload by UUID\nrunai training pytorch describe --uuid=<pytorch_uuid>\n\n# Describe a pytorch training workload with specific output format\nrunai training pytorch describe <pytorch-name> -o json\n\n# Describe a pytorch training workload with specific sections\nrunai training pytorch describe <pytorch-name> --general --compute --pods --events --networks\n\n# Describe a pytorch training workload with container details and custom limits\nrunai training pytorch describe <pytorch-name> --containers --pod-limit 20 --event-limit 100\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_describe/#options","title":"Options","text":"
            --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_describe/#see-also","title":"SEE ALSO","text":"
      • runai pytorch - alias for pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_exec/","title":"Runai pytorch exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_exec/#runai-pytorch-exec","title":"runai pytorch exec","text":"

      execute a command in a pytorch training job

      runai pytorch exec [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_exec/#examples","title":"Examples","text":"
      # Execute bash in the pytorch training's main worker\nrunai training pytorch exec pytorch-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the pytorch training's main worker\nrunai training pytorch exec pytorch-01 -- ls\n\n# Execute a command in a specific pytorch training worker\nrunai training pytorch exec pytorch-01 --pod pytorch-01-worker-1 -- nvidia-smi\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_exec/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_exec/#see-also","title":"SEE ALSO","text":"
      • runai pytorch - alias for pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_list/","title":"Runai pytorch list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_list/#runai-pytorch-list","title":"runai pytorch list","text":"

      list pytorch training

      runai pytorch list [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_list/#examples","title":"Examples","text":"
      # List all pytorch training workloads\nrunai training pytorch list -A\n\n# List pytorch training workloads with default project\nrunai training pytorch list\n\n# List pytorch training workloads in a specific project\nrunai training pytorch list -p <project_name>\n\n# List all pytorch training workloads with a specific output format\nrunai training pytorch list -o wide\n\n# List pytorch training workloads with pagination\nrunai training pytorch list --limit 20 --offset 40\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_list/#options","title":"Options","text":"
        -A, --all              list workloads from all projects\n  -h, --help             help for list\n      --json             Output structure JSON\n      --limit int32      number of workloads in list (default 50)\n      --no-headers       Output structure table without headers\n      --offset int32     offset number of limit, default 0 (first offset)\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string    filter by workload state\n      --table            Output structure table\n      --yaml             Output structure YAML\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_list/#see-also","title":"SEE ALSO","text":"
      • runai pytorch - alias for pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_logs/","title":"Runai pytorch logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_logs/#runai-pytorch-logs","title":"runai pytorch logs","text":"

      view logs of a pytorch training job

      runai pytorch logs [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_logs/#examples","title":"Examples","text":"
      # Get logs for a pytorch training\nrunai training pytorch logs pytorch-01\n\n# Get logs for a specific pod in a pytorch training\nrunai training pytorch logs pytorch-01 --pod=pytorch-01-worker-0\n\n# Get logs for a specific container in a pytorch training\nrunai training pytorch logs pytorch-01 --container=pytorch-worker\n\n# Get the last 100 lines of logs\nrunai training pytorch logs pytorch-01 --tail=100\n\n# Get logs with timestamps\nrunai training pytorch logs pytorch-01 --timestamps\n\n# Follow the logs\nrunai training pytorch logs pytorch-01 --follow\n\n# Get logs for the previous instance of the pytorch training\nrunai training pytorch logs pytorch-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai training pytorch logs pytorch-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai training pytorch logs pytorch-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai training pytorch logs pytorch-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for pytorch training to be ready for logs\nrunai training pytorch logs pytorch-01 --wait-timeout=30s\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_logs/#options","title":"Options","text":"
        -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_logs/#see-also","title":"SEE ALSO","text":"
      • runai pytorch - alias for pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_port-forward/","title":"Runai pytorch port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_port-forward/#runai-pytorch-port-forward","title":"runai pytorch port-forward","text":"

      forward one or more local ports to a pytorch training job

      runai pytorch port-forward [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_port-forward/#examples","title":"Examples","text":"
      # Forward connections from localhost:8080 to pytorch training on port 8090:\nrunai training pytorch port-forward pytorch-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to pytorch training on port 8080:\nrunai training pytorch port-forward pytorch-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to pytorch training on port 8090 and from localhost:6443 to pytorch training on port 443:\nrunai training pytorch port-forward pytorch-01 --port 8080:8090 --port 6443:443 --address localhost\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_port-forward/#options","title":"Options","text":"
            --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_port-forward/#see-also","title":"SEE ALSO","text":"
      • runai pytorch - alias for pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_resume/","title":"Runai pytorch resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_resume/#runai-pytorch-resume","title":"runai pytorch resume","text":"

      resume pytorch training

      runai pytorch resume [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_resume/#examples","title":"Examples","text":"
      # Resume a pytorch training workload\nrunai training pytorch resume <pytorch-name>\n\n# Resume a pytorch training workload in a specific project\nrunai training pytorch resume <pytorch-name> -p <project_name>\n\n# Resume a pytorch training workload by UUID\nrunai training pytorch resume --uuid=<pytorch_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_resume/#options","title":"Options","text":"
        -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_resume/#see-also","title":"SEE ALSO","text":"
      • runai pytorch - alias for pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_submit/","title":"Runai pytorch submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_submit/#runai-pytorch-submit","title":"runai pytorch submit","text":"

      submit pytorch training

      runai pytorch submit [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_submit/#examples","title":"Examples","text":"
      # Submit a pytorch training workload\nrunai training pytorch submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a pytorch training workload with arguments\nrunai training pytorch submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a pytorch training workload with a custom command\nrunai training pytorch submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a pytorch training master args with worker args\nrunai training pytorch submit <name> -p <project_name> -i ubuntu --master-args \"-a master_arg_a -b master_arg_b\" -- '-a worker_arg_a'\n\n# Submit a pytorch training master command with worker args\nrunai training pytorch submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" -- '-a worker_arg_a'\n\n# Submit a pytorch training master command with worker command\nrunai training pytorch submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" --command -- echo -e 'worker command'\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_submit/#options","title":"Options","text":"
            --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n      --clean-pod-policy string                        Specifies which pods will be deleted when the workload reaches a terminal state (completed/failed)\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the format: name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. 
(Default: false)\n      --host-network                                   Whether to enable host networking. (Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --master-args string                             Specifies the arguments to pass to the master pod container command\n      --master-command string                          Specifies the command to run in the master pod container, overriding the image's default entrypoint. The command can include arguments following it.\n      --master-environment stringArray                 Set master environment variables in the container\n      --master-extended-resource stringArray           Request access to an extended resource. Use the format: resource_name=quantity\n      --master-gpu-devices-request int32               GPU units to allocate for the job (e.g. 1, 2)\n      --master-gpu-portion-limit float                 GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-gpu-portion-request float               GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-no-pvcs                                 Do not mount any persistent volumes in the master pod\n      --max-replicas int32                             Maximum number of replicas for an elastic PyTorch job\n      --min-replicas int32                             Minimum number of replicas for an elastic PyTorch job\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --no-master                                      Do not create a separate pod for the master\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. 
      To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    takes the uid, gid, and supplementary groups fields from the token, if all the fields do not exist, uses the local running terminal user credentials. if any of the fields exist take only the existing fields\n      --s3 stringArray                                 s3 storage details. Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma separated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group ids of the container, options: fromTheImage|fromIdpToken|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --workers int32                                  The number of workers that will be allocated for running the workload\n      --working-dir string                             Set the container's working directory\n
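      Beyond the examples above, the flags documented here can be combined in a single submission. The following is an illustrative sketch only, with placeholder names and arbitrary values, not a command taken from this reference:

      # Hypothetical: two workers, one GPU unit requested, an existing PVC mounted at /data, and an environment variable set
      runai training pytorch submit <name> -p <project_name> -i <image> \
        --workers 2 \
        -g 1 \
        --existing-pvc claimname=<claim_name>,path=/data \
        -e MY_ENV_VAR=my_value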
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_submit/#see-also","title":"SEE ALSO","text":"
      • runai pytorch - alias for pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_suspend/","title":"Runai pytorch suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_suspend/#runai-pytorch-suspend","title":"runai pytorch suspend","text":"

      suspend pytorch training

      runai pytorch suspend [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_suspend/#examples","title":"Examples","text":"
      # Suspend a pytorch training workload\nrunai training pytorch suspend <pytorch-name>\n\n# Suspend a pytorch training workload in a specific project\nrunai training pytorch suspend <pytorch-name> -p <project_name>\n\n# Suspend a pytorch training workload by UUID\nrunai training pytorch suspend --uuid=<pytorch_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_suspend/#options","title":"Options","text":"
        -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_suspend/#see-also","title":"SEE ALSO","text":"
      • runai pytorch - alias for pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_report/","title":"Runai report","text":""},{"location":"Researcher/cli-reference/new-cli/runai_report/#runai-report","title":"runai report","text":"

      [Experimental] report management

      "},{"location":"Researcher/cli-reference/new-cli/runai_report/#options","title":"Options","text":"
        -h, --help   help for report\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_report/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_report/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      • runai report metrics - [Experimental] metrics management
      "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics/","title":"Runai report metrics","text":""},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics/#runai-report-metrics","title":"runai report metrics","text":"

      [Experimental] metrics management

      "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics/#options","title":"Options","text":"
        -h, --help   help for metrics\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics/#see-also","title":"SEE ALSO","text":"
      • runai report - [Experimental] report management
      • runai report metrics clear - metrics logs deletion
      • runai report metrics config - metrics configuration
      • runai report metrics output - metrics logs output
      "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_clear/","title":"Runai report metrics clear","text":""},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_clear/#runai-report-metrics-clear","title":"runai report metrics clear","text":"

      metrics logs deletion

      runai report metrics clear [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_clear/#options","title":"Options","text":"
        -h, --help   help for clear\n
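      Other than --help and the inherited flags below, the command takes no options, so a minimal invocation is simply:

      # Delete the locally collected metrics logs
      runai report metrics clear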
      "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_clear/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_clear/#see-also","title":"SEE ALSO","text":"
      • runai report metrics - [Experimental] metrics management
      "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_config/","title":"Runai report metrics config","text":""},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_config/#runai-report-metrics-config","title":"runai report metrics config","text":"

      metrics configuration

      runai report metrics config [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_config/#options","title":"Options","text":"
            --age int          metrics max file age (default 14)\n      --files int        metrics max file number (default 30)\n  -h, --help             help for config\n      --metrics enable   metrics enable flag (enabled|disabled)\n      --size int         metrics max file size (default 10)\n      --type reporter    report generated type (none|logger|local)\n
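      An illustrative sketch using only the flags listed above; the values are arbitrary examples, not recommendations:

      # Enable metrics collection with the local reporter and tighten the file retention limits
      runai report metrics config --metrics enabled --type local --files 10 --size 5 --age 7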
      "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_config/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_config/#see-also","title":"SEE ALSO","text":"
      • runai report metrics - [Experimental] metrics management
      "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_output/","title":"Runai report metrics output","text":""},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_output/#runai-report-metrics-output","title":"runai report metrics output","text":"

      metrics logs output

      runai report metrics output [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_output/#options","title":"Options","text":"
        -h, --help       help for output\n      --tail int   number of tail metrics (default 100)\n
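      For example, to print fewer entries than the default of 100 (a minimal sketch):

      # Show only the last 20 collected metrics entries
      runai report metrics output --tail 20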
      "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_output/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_output/#see-also","title":"SEE ALSO","text":"
      • runai report metrics - [Experimental] metrics management
      "},{"location":"Researcher/cli-reference/new-cli/runai_submit/","title":"Runai submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_submit/#runai-submit","title":"runai submit","text":"

      [Deprecated] Submit a new workload

      runai submit [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_submit/#options","title":"Options","text":"
            --add-capability stringArray                     The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n      --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --completions int32                              Number of successful pods required for this job to be completed. Used with HPO\n      --configmap-volume stringArray                   Mount ConfigMap as a volume. Use the format: name=CONFIGMAP_NAME,path=PATH\n      --cpu float                                      CPU core request (e.g. 0.5, 1)\n      --cpu-limit float                                CPU core limit (e.g. 0.5, 1)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu float                                      GPU units to allocate for the job (e.g. 0.5, 1)\n      --gpu-memory string                              GPU memory to allocate for the job (e.g. 1G, 500M)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. (Default: false)\n      --host-network                                   Whether to enable host networking. (Default: false)\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. 
Defaults to Always (default \"Always\")\n      --interactive                                    Mark this job as interactive\n      --job-name-prefix string                         Set defined prefix for the workload name and add index as a suffix\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --memory string                                  CPU memory to allocate for the job (e.g. 1G, 500M)\n      --memory-limit string                            CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --mig-profile string                             [Deprecated] MIG profile to allocate for the job (1g.5gb, 2g.10gb, 3g.20gb, 4g.20gb, 7g.40gb)\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --parallelism int32                              Number of pods to run in parallel at any given time. Used with HPO\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preemptible                                    Workspace preemptible workloads can be scheduled above guaranteed quota but may be reclaimed at any time\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    takes the uid, gid, and supplementary groups fields from the token, if all the fields do not exist, uses the local running terminal user credentials. if any of the fields exist take only the existing fields\n      --s3 stringArray                                 s3 storage details. 
      Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma separated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group ids of the container, options: fromTheImage|fromIdpToken|fromIdpToken\n  -v, --volume stringArray                             Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --working-dir string                             Set the container's working directory\n
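      A minimal sketch of the deprecated form, using only flags documented above with placeholder values; for new workloads, prefer the runai training commands shown elsewhere in this reference:

      # Interactive job with half a GPU unit (illustrative only; runai submit is deprecated)
      runai submit -p <project_name> -i <image> -g 0.5 --interactive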
      "},{"location":"Researcher/cli-reference/new-cli/runai_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_submit/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow/","title":"Runai tensorflow","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow/#runai-tensorflow","title":"runai tensorflow","text":"

      alias for tensorflow management

      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow/#options","title":"Options","text":"
        -h, --help   help for tensorflow\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      • runai tensorflow attach - attach to a running container in a tf training job
      • runai tensorflow bash - open a bash shell in a tf training job
      • runai tensorflow delete - delete tf training workload
      • runai tensorflow describe - describe tf training
      • runai tensorflow exec - execute a command in a tf training job
      • runai tensorflow list - list tf training
      • runai tensorflow logs - view logs of a tf training job
      • runai tensorflow port-forward - forward one or more local ports to a tf training job
      • runai tensorflow resume - resume tf training
      • runai tensorflow submit - submit tf training
      • runai tensorflow suspend - suspend tf training
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_attach/","title":"Runai tensorflow attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_attach/#runai-tensorflow-attach","title":"runai tensorflow attach","text":"

      attach to a running container in a tf training job

      runai tensorflow attach [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_attach/#examples","title":"Examples","text":"
      # Attaching to the main worker of a tf training\nrunai training tf attach tf-01 --tty --stdin\n\n# Attaching to a specific pod of a tf training\nrunai training tf attach tf-01 --pod tf-01-worker-1 --tty --stdin\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_attach/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_attach/#see-also","title":"SEE ALSO","text":"
      • runai tensorflow - alias for tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_bash/","title":"Runai tensorflow bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_bash/#runai-tensorflow-bash","title":"runai tensorflow bash","text":"

      open a bash shell in a tf training job

      runai tensorflow bash [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_bash/#examples","title":"Examples","text":"
      # Open a bash shell in the tf training's main worker\nrunai training tf bash tf-01\n\n# Open a bash shell in a specific tf training worker\nrunai training tf bash tf-01 --pod tf-01-worker-1\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_bash/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_bash/#see-also","title":"SEE ALSO","text":"
      • runai tensorflow - alias for tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_delete/","title":"Runai tensorflow delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_delete/#runai-tensorflow-delete","title":"runai tensorflow delete","text":"

      delete tf training workload

      runai tensorflow delete [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_delete/#examples","title":"Examples","text":"
      # Delete a tf training workload with a default project\nrunai training tf delete <tf-name>\n\n# Delete a tf training workload with a specific project\nrunai training tf delete <tf-name> -p <project_name>\n\n# Delete a tf training workload by UUID\nrunai training tf delete --uuid=<tf_uuid> -p <project_name>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_delete/#options","title":"Options","text":"
        -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_delete/#see-also","title":"SEE ALSO","text":"
      • runai tensorflow - alias for tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_describe/","title":"Runai tensorflow describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_describe/#runai-tensorflow-describe","title":"runai tensorflow describe","text":"

      describe tf training

      runai tensorflow describe [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_describe/#examples","title":"Examples","text":"
      # Describe a tf training workload with a default project\nrunai training tf describe <tf-name>\n\n# Describe a tf training workload in a specific project\nrunai training tf describe <tf-name> -p <project_name>\n\n# Describe a tf training workload by UUID\nrunai training tf describe --uuid=<tf_uuid>\n\n# Describe a tf training workload with specific output format\nrunai training tf describe <tf-name> -o json\n\n# Describe a tf training workload with specific sections\nrunai training tf describe <tf-name> --general --compute --pods --events --networks\n\n# Describe a tf training workload with container details and custom limits\nrunai training tf describe <tf-name> --containers --pod-limit 20 --event-limit 100\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_describe/#options","title":"Options","text":"
            --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_describe/#see-also","title":"SEE ALSO","text":"
      • runai tensorflow - alias for tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_exec/","title":"Runai tensorflow exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_exec/#runai-tensorflow-exec","title":"runai tensorflow exec","text":"

      execute a command in a tf training job

      runai tensorflow exec [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_exec/#examples","title":"Examples","text":"
      # Execute bash in the tf training's main worker\nrunai training tf exec tf-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the tf training's main worker\nrunai training tf exec tf-01 -- ls\n\n# Execute a command in a specific tf training worker\nrunai training tf exec tf-01 --pod tf-01-worker-1 -- nvidia-smi\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_exec/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_exec/#see-also","title":"SEE ALSO","text":"
      • runai tensorflow - alias for tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_list/","title":"Runai tensorflow list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_list/#runai-tensorflow-list","title":"runai tensorflow list","text":"

      list tf training

      runai tensorflow list [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_list/#examples","title":"Examples","text":"
      # List all tf training workloads\nrunai training tf list -A\n\n# List tf training workloads with default project\nrunai training tf list\n\n# List tf training workloads in a specific project\nrunai training tf list -p <project_name>\n\n# List all tf training workloads with a specific output format\nrunai training tf list -o wide\n\n# List tf training workloads with pagination\nrunai training tf list --limit 20 --offset 40\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_list/#options","title":"Options","text":"
        -A, --all              list workloads from all projects\n  -h, --help             help for list\n      --json             Output structure JSON\n      --limit int32      number of workload in list (default 50)\n      --no-headers       Output structure table without headers\n      --offset int32     offset number of limit, default 0 (first offset)\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string    filter by workload state\n      --table            Output structure table\n      --yaml             Output structure YAML\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_list/#see-also","title":"SEE ALSO","text":"
      • runai tensorflow - alias for tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_logs/","title":"Runai tensorflow logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_logs/#runai-tensorflow-logs","title":"runai tensorflow logs","text":"

      view logs of a tf training job

      runai tensorflow logs [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_logs/#examples","title":"Examples","text":"
      # Get logs for a tf training\nrunai training tf logs tf-01\n\n# Get logs for a specific pod in a tf training\nrunai training tf logs tf-01 --pod=tf-01-worker-0\n\n# Get logs for a specific container in a tf training\nrunai training tf logs tf-01 --container=tf-worker\n\n# Get the last 100 lines of logs\nrunai training tf logs tf-01 --tail=100\n\n# Get logs with timestamps\nrunai training tf logs tf-01 --timestamps\n\n# Follow the logs\nrunai training tf logs tf-01 --follow\n\n# Get logs for the previous instance of the tf training\nrunai training tf logs tf-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai training tf logs tf-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai training tf logs tf-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai training tf logs tf-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for tf training to be ready for logs\nrunai training tf logs tf-01 --wait-timeout=30s\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_logs/#options","title":"Options","text":"
        -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_logs/#see-also","title":"SEE ALSO","text":"
      • runai tensorflow - alias for tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_port-forward/","title":"Runai tensorflow port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_port-forward/#runai-tensorflow-port-forward","title":"runai tensorflow port-forward","text":"

      forward one or more local ports to a tf training job

      runai tensorflow port-forward [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_port-forward/#examples","title":"Examples","text":"
      # Forward connections from localhost:8080 to tf training on port 8090:\nrunai training tf port-forward tf-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to tf training on port 8080:\nrunai training tf port-forward tf-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to tf training on port 8090 and from localhost:6443 to tf training on port 443:\nrunai training tf port-forward tf-01 --port 8080:8090 --port 6443:443 --address localhost\n
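      A further hedged example (the pod name is a placeholder) using the --pod flag documented under Options below to target a specific worker:
      # Forward connections from localhost:8080 to port 8080 of a specific tf training worker:\nrunai training tf port-forward tf-01 --port 8080 --pod tf-01-worker-1 --address localhost\n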
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_port-forward/#options","title":"Options","text":"
            --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_port-forward/#see-also","title":"SEE ALSO","text":"
      • runai tensorflow - alias for tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_resume/","title":"Runai tensorflow resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_resume/#runai-tensorflow-resume","title":"runai tensorflow resume","text":"

      resume tf training

      runai tensorflow resume [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_resume/#examples","title":"Examples","text":"
      # Resume a tf training workload\nrunai training tf resume <tf-name>\n\n# Resume a tf training workload in a specific project\nrunai training tf resume <tf-name> -p <project_name>\n\n# Resume a tf training workload by UUID\nrunai training tf resume --uuid=<tf_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_resume/#options","title":"Options","text":"
        -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_resume/#see-also","title":"SEE ALSO","text":"
      • runai tensorflow - alias for tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_submit/","title":"Runai tensorflow submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_submit/#runai-tensorflow-submit","title":"runai tensorflow submit","text":"

      submit tf training

      runai tensorflow submit [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_submit/#examples","title":"Examples","text":"
      # Submit a tf training workload\nrunai training tf submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a tf training workload with arguments\nrunai training tf submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a tf training workload with a custom command\nrunai training tf submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a tf training master args with worker args\nrunai training tf submit <name> -p <project_name> -i ubuntu --master-args \"-a master_arg_a -b master-arg_b'\" -- '-a worker_arg_a'\n\n# Submit a tf training master command with worker args\nrunai training tf submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" -- '-a worker_arg_a'\n\n# Submit a tf training master command with worker command\nrunai training tf submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" --command -- echo -e 'worker command'\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_submit/#options","title":"Options","text":"
            --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n      --clean-pod-policy string                        Specifies which pods will be deleted when the workload reaches a terminal state (completed/failed)\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the format: name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. 
(Default: false)\n      --host-network                                   Whether to enable host networking. (Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --master-args string                             Specifies the arguments to pass to the master pod container command\n      --master-command string                          Specifies the command to run in the master pod container, overriding the image's default entrypoint. The command can include arguments following it.\n      --master-environment stringArray                 Set master environment variables in the container\n      --master-extended-resource stringArray           Request access to an extended resource. Use the format: resource_name=quantity\n      --master-gpu-devices-request int32               GPU units to allocate for the job (e.g. 1, 2)\n      --master-gpu-portion-limit float                 GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-gpu-portion-request float               GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-no-pvcs                                 Do not mount any persistent volumes in the master pod\n      --max-replicas int32                             Maximum number of replicas for an elastic PyTorch job\n      --min-replicas int32                             Minimum number of replicas for an elastic PyTorch job\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --no-master                                      Do not create a separate pod for the master\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. 
To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    Takes the uid, gid, and supplementary groups fields from the token. If none of these fields exist, the local running terminal user's credentials are used; if only some exist, only the existing fields are taken\n      --s3 stringArray                                 S3 storage details. Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma-separated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group ids of the container, options: fromTheImage|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --workers int32                                  The number of workers that will be allocated for running the workload\n      --working-dir string                             Set the container's working directory\n
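      For orientation, a hedged sketch combining several of the flags above (the image and the master/worker scripts are placeholders, not taken from the original examples):
      # Submit a distributed tf training with 4 workers and 1 GPU device, overriding the master and worker commands\nrunai training tf submit <name> -p <project_name> -i <image> --workers 4 -g 1 --master-command \"python master.py\" --command -- python worker.py\n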
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_submit/#see-also","title":"SEE ALSO","text":"
      • runai tensorflow - alias for tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_suspend/","title":"Runai tensorflow suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_suspend/#runai-tensorflow-suspend","title":"runai tensorflow suspend","text":"

      suspend tf training

      runai tensorflow suspend [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_suspend/#examples","title":"Examples","text":"
      # Suspend a tf training workload\nrunai training tf suspend <tf-name>\n\n# Suspend a tf training workload in a specific project\nrunai training tf suspend <tf-name> -p <project_name>\n\n# Suspend a tf training workload by UUID\nrunai training tf suspend --uuid=<tf_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_suspend/#options","title":"Options","text":"
        -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_suspend/#see-also","title":"SEE ALSO","text":"
      • runai tensorflow - alias for tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training/","title":"Runai training","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training/#runai-training","title":"runai training","text":"

      training management
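      As a hedged orientation (workload names are placeholders), the framework subcommands listed below compose under this command:
      # List training workloads from all projects\nrunai training list -A\n\n# Operate on a workload through its framework subcommand\nrunai training tf logs tf-01\nrunai training mpi describe <mpi-name>\n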

      "},{"location":"Researcher/cli-reference/new-cli/runai_training/#options","title":"Options","text":"
        -h, --help   help for training\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      • runai training attach - attach to a running container in a standard training job
      • runai training bash - open a bash shell in a standard training job
      • runai training delete - delete standard training workload
      • runai training describe - describe standard training
      • runai training exec - execute a command in a standard training job
      • runai training list - list training workloads across all frameworks
      • runai training logs - view logs of a standard training job
      • runai training mpi - mpi management
      • runai training port-forward - forward one or more local ports to a standard training job
      • runai training pytorch - pytorch management
      • runai training resume - resume standard training
      • runai training standard - standard training management
      • runai training submit - submit standard training
      • runai training suspend - suspend standard training
      • runai training tensorflow - tensorflow management
      • runai training xgboost - xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_attach/","title":"Runai training attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_attach/#runai-training-attach","title":"runai training attach","text":"

      attach to a running container in a standard training job

      runai training attach [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_attach/#examples","title":"Examples","text":"
      # Attaching to the main worker of a standard training\nrunai training standard attach standard-01 --tty --stdin\n\n# Attaching to a specific pod of a standard training\nrunai training standard attach standard-01 --pod standard-01-worker-1 --tty --stdin\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_attach/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_attach/#see-also","title":"SEE ALSO","text":"
      • runai training - training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_bash/","title":"Runai training bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_bash/#runai-training-bash","title":"runai training bash","text":"

      open a bash shell in a standard training job

      runai training bash [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_bash/#examples","title":"Examples","text":"
      # Open a bash shell in the standard training's main worker\nrunai training standard bash standard-01\n\n# Open a bash shell in a specific standard training worker\nrunai training standard bash standard-01 --pod standard-01-worker-1\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_bash/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_bash/#see-also","title":"SEE ALSO","text":"
      • runai training - training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_delete/","title":"Runai training delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_delete/#runai-training-delete","title":"runai training delete","text":"

      delete standard training workload

      runai training delete [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_delete/#examples","title":"Examples","text":"
      # Delete a standard training workload with a default project\nrunai training standard delete <standard-name>\n\n# Delete a standard training workload with a specific project\nrunai training standard delete <standard-name> -p <project_name>\n\n# Delete a standard training workload by UUID\nrunai training standard delete --uuid=<standard_uuid> -p <project_name>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_delete/#options","title":"Options","text":"
        -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_delete/#see-also","title":"SEE ALSO","text":"
      • runai training - training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_describe/","title":"Runai training describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_describe/#runai-training-describe","title":"runai training describe","text":"

      describe standard training

      runai training describe [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_describe/#examples","title":"Examples","text":"
      # Describe a standard training workload with a default project\nrunai training standard describe <standard-name>\n\n# Describe a standard training workload in a specific project\nrunai training standard describe <standard-name> -p <project_name>\n\n# Describe a standard training workload by UUID\nrunai training standard describe --uuid=<standard_uuid>\n\n# Describe a standard training workload with specific output format\nrunai training standard describe <standard-name> -o json\n\n# Describe a standard training workload with specific sections\nrunai training standard describe <standard-name> --general --compute --pods --events --networks\n\n# Describe a standard training workload with container details and custom limits\nrunai training standard describe <standard-name> --containers --pod-limit 20 --event-limit 100\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_describe/#options","title":"Options","text":"
            --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_describe/#see-also","title":"SEE ALSO","text":"
      • runai training - training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_exec/","title":"Runai training exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_exec/#runai-training-exec","title":"runai training exec","text":"

      execute a command in a standard training job

      runai training exec [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_exec/#examples","title":"Examples","text":"
      # Execute bash in the standard training's main worker\nrunai training standard exec standard-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the standard training's main worker\nrunai training standard exec standard-01 -- ls\n\n# Execute a command in a specific standard training worker\nrunai training standard exec standard-01 --pod standard-01-worker-1 -- nvidia-smi\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_exec/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_exec/#see-also","title":"SEE ALSO","text":"
      • runai training - training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_list/","title":"Runai training list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_list/#runai-training-list","title":"runai training list","text":"

      list training workloads across all frameworks

      runai training list [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_list/#examples","title":"Examples","text":"
      runai training list -A\nrunai training list --status=<training_state> --limit=20\n
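      A further hedged example using the framework and project filters documented under Options below (values are placeholders):
      # List training workloads of a single framework in a specific project\nrunai training list --framework=<framework_name> -p <project_name>\n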
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_list/#options","title":"Options","text":"
        -A, --all                list workloads from all projects\n      --framework string   filter by workload framework\n  -h, --help               help for list\n      --json               Output structure JSON\n      --limit int32        number of workload in list (default 50)\n      --no-headers         Output structure table without headers\n      --offset int32       offset number of limit, default 0 (first offset)\n  -p, --project string     Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string      filter by workload state\n      --table              Output structure table\n      --yaml               Output structure YAML\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_list/#see-also","title":"SEE ALSO","text":"
      • runai training - training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_logs/","title":"Runai training logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_logs/#runai-training-logs","title":"runai training logs","text":"

      view logs of a standard training job

      runai training logs [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_logs/#examples","title":"Examples","text":"
      # Get logs for a standard training\nrunai training standard logs standard-01\n\n# Get logs for a specific pod in a standard training\nrunai training standard logs standard-01 --pod=standard-01-worker-0\n\n# Get logs for a specific container in a standard training\nrunai training standard logs standard-01 --container=standard-worker\n\n# Get the last 100 lines of logs\nrunai training standard logs standard-01 --tail=100\n\n# Get logs with timestamps\nrunai training standard logs standard-01 --timestamps\n\n# Follow the logs\nrunai training standard logs standard-01 --follow\n\n# Get logs for the previous instance of the standard training\nrunai training standard logs standard-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai training standard logs standard-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai training standard logs standard-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai training standard logs standard-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for standard training to be ready for logs\nrunai training standard logs standard-01 --wait-timeout=30s\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_logs/#options","title":"Options","text":"
        -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_logs/#see-also","title":"SEE ALSO","text":"
      • runai training - training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi/","title":"Runai training mpi","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi/#runai-training-mpi","title":"runai training mpi","text":"

      mpi management
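      As a hedged sketch (the workload name is a placeholder), a typical lifecycle using the subcommands listed below:
      # Suspend a running mpi training and resume it later\nrunai training mpi suspend <mpi-name>\nrunai training mpi resume <mpi-name>\n\n# Delete the mpi training when it is no longer needed\nrunai training mpi delete <mpi-name>\n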

      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi/#options","title":"Options","text":"
        -h, --help   help for mpi\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi/#see-also","title":"SEE ALSO","text":"
      • runai training - training management
      • runai training mpi attach - attach to a running container in a mpi training job
      • runai training mpi bash - open a bash shell in a mpi training job
      • runai training mpi delete - delete mpi training workload
      • runai training mpi describe - describe mpi training
      • runai training mpi exec - execute a command in a mpi training job
      • runai training mpi list - list mpi training
      • runai training mpi logs - view logs of a mpi training job
      • runai training mpi port-forward - forward one or more local ports to a mpi training job
      • runai training mpi resume - resume mpi training
      • runai training mpi submit - submit mpi training
      • runai training mpi suspend - suspend mpi training
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_attach/","title":"Runai training mpi attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_attach/#runai-training-mpi-attach","title":"runai training mpi attach","text":"

      attach to a running container in a mpi training job

      runai training mpi attach [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_attach/#examples","title":"Examples","text":"
      # Attaching to the main worker of a mpi training\nrunai training mpi attach mpi-01 --tty --stdin\n\n# Attaching to a specific pod of a mpi training\nrunai training mpi attach mpi-01 --pod mpi-01-worker-1 --tty --stdin\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_attach/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_attach/#see-also","title":"SEE ALSO","text":"
      • runai training mpi - mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_bash/","title":"Runai training mpi bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_bash/#runai-training-mpi-bash","title":"runai training mpi bash","text":"

      open a bash shell in a mpi training job

      runai training mpi bash [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_bash/#examples","title":"Examples","text":"
      # Open a bash shell in the mpi training's main worker\nrunai training mpi bash mpi-01\n\n# Open a bash shell in a specific mpi training worker\nrunai training mpi bash mpi-01 --pod mpi-01-worker-1\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_bash/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_bash/#see-also","title":"SEE ALSO","text":"
      • runai training mpi - mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_delete/","title":"Runai training mpi delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_delete/#runai-training-mpi-delete","title":"runai training mpi delete","text":"

      delete mpi training workload

      runai training mpi delete [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_delete/#examples","title":"Examples","text":"
      # Delete a mpi training workload with a default project\nrunai training mpi delete <mpi-name>\n\n# Delete a mpi training workload with a specific project\nrunai training mpi delete <mpi-name> -p <project_name>\n\n# Delete a mpi training workload by UUID\nrunai training mpi delete --uuid=<mpi_uuid> -p <project_name>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_delete/#options","title":"Options","text":"
        -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_delete/#see-also","title":"SEE ALSO","text":"
      • runai training mpi - mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_describe/","title":"Runai training mpi describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_describe/#runai-training-mpi-describe","title":"runai training mpi describe","text":"

      describe mpi training

      runai training mpi describe [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_describe/#examples","title":"Examples","text":"
      # Describe a mpi training workload with a default project\nrunai training mpi describe <mpi-name>\n\n# Describe a mpi training workload in a specific project\nrunai training mpi describe <mpi-name> -p <project_name>\n\n# Describe a mpi training workload by UUID\nrunai training mpi describe --uuid=<mpi_uuid>\n\n# Describe a mpi training workload with specific output format\nrunai training mpi describe <mpi-name> -o json\n\n# Describe a mpi training workload with specific sections\nrunai training mpi describe <mpi-name> --general --compute --pods --events --networks\n\n# Describe a mpi training workload with container details and custom limits\nrunai training mpi describe <mpi-name> --containers --pod-limit 20 --event-limit 100\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_describe/#options","title":"Options","text":"
            --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_describe/#see-also","title":"SEE ALSO","text":"
      • runai training mpi - mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_exec/","title":"Runai training mpi exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_exec/#runai-training-mpi-exec","title":"runai training mpi exec","text":"

      execute a command in a mpi training job

      runai training mpi exec [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_exec/#examples","title":"Examples","text":"
      # Execute bash in the mpi training's main worker\nrunai training mpi exec mpi-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the mpi training's main worker\nrunai training mpi exec mpi-01 -- ls\n\n# Execute a command in a specific mpi training worker\nrunai training mpi exec mpi-01 --pod mpi-01-worker-1 -- nvidia-smi\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_exec/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_exec/#see-also","title":"SEE ALSO","text":"
      • runai training mpi - mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_list/","title":"Runai training mpi list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_list/#runai-training-mpi-list","title":"runai training mpi list","text":"

      list mpi training

      runai training mpi list [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_list/#examples","title":"Examples","text":"
      # List all mpi training workloads\nrunai training mpi list -A\n\n# List mpi training workloads with default project\nrunai training mpi list\n\n# List mpi training workloads in a specific project\nrunai training mpi list -p <project_name>\n\n# List all mpi training workloads with a specific output format\nrunai training mpi list -o wide\n\n# List mpi training workloads with pagination\nrunai training mpi list --limit 20 --offset 40\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_list/#options","title":"Options","text":"
        -A, --all              List workloads from all projects\n  -h, --help             help for list\n      --json             Output in JSON format\n      --limit int32      Number of workloads to list (default 50)\n      --no-headers       Output a table without headers\n      --offset int32     Offset from the start of the list, default 0\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string    Filter by workload state\n      --table            Output in table format\n      --yaml             Output in YAML format\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_list/#see-also","title":"SEE ALSO","text":"
      • runai training mpi - mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_logs/","title":"Runai training mpi logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_logs/#runai-training-mpi-logs","title":"runai training mpi logs","text":"

      view logs of a mpi training job

      runai training mpi logs [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_logs/#examples","title":"Examples","text":"
      # Get logs for a mpi training\nrunai training mpi logs mpi-01\n\n# Get logs for a specific pod in a mpi training\nrunai training mpi logs mpi-01 --pod=mpi-01-worker-0\n\n# Get logs for a specific container in a mpi training\nrunai training mpi logs mpi-01 --container=mpi-worker\n\n# Get the last 100 lines of logs\nrunai training mpi logs mpi-01 --tail=100\n\n# Get logs with timestamps\nrunai training mpi logs mpi-01 --timestamps\n\n# Follow the logs\nrunai training mpi logs mpi-01 --follow\n\n# Get logs for the previous instance of the mpi training\nrunai training mpi logs mpi-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai training mpi logs mpi-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai training mpi logs mpi-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai training mpi logs mpi-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for mpi training to be ready for logs\nrunai training mpi logs mpi-01 --wait-timeout=30s\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_logs/#options","title":"Options","text":"
        -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_logs/#see-also","title":"SEE ALSO","text":"
      • runai training mpi - mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_port-forward/","title":"Runai training mpi port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_port-forward/#runai-training-mpi-port-forward","title":"runai training mpi port-forward","text":"

      forward one or more local ports to a mpi training job

      runai training mpi port-forward [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_port-forward/#examples","title":"Examples","text":"
      # Forward connections from localhost:8080 to mpi training on port 8090:\nrunai training mpi port-forward mpi-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to mpi training on port 8080:\nrunai training mpi port-forward mpi-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to mpi training on port 8090 and from localhost:6443 to mpi training on port 443:\nrunai training mpi port-forward mpi-01 --port 8080:8090 --port 6443:443 --address localhost\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_port-forward/#options","title":"Options","text":"
            --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_port-forward/#see-also","title":"SEE ALSO","text":"
      • runai training mpi - mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_resume/","title":"Runai training mpi resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_resume/#runai-training-mpi-resume","title":"runai training mpi resume","text":"

      resume mpi training

      runai training mpi resume [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_resume/#examples","title":"Examples","text":"
      # Resume a mpi training workload\nrunai training mpi resume <mpi-name>\n\n# Resume a mpi training workload in a specific project\nrunai training mpi resume <mpi-name> -p <project_name>\n\n# Resume a mpi training workload by UUID\nrunai training mpi resume --uuid=<mpi_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_resume/#options","title":"Options","text":"
        -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_resume/#see-also","title":"SEE ALSO","text":"
      • runai training mpi - mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_submit/","title":"Runai training mpi submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_submit/#runai-training-mpi-submit","title":"runai training mpi submit","text":"

      submit mpi training

      runai training mpi submit [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_submit/#examples","title":"Examples","text":"
      # Submit a mpi training workload\nrunai training mpi submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a mpi training workload with arguments\nrunai training mpi submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a mpi training workload with a custom command\nrunai training mpi submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a mpi training workload with master args and worker args\nrunai training mpi submit <name> -p <project_name> -i ubuntu --master-args \"-a master_arg_a -b master_arg_b\" -- '-a worker_arg_a'\n\n# Submit a mpi training workload with a master command and worker args\nrunai training mpi submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" -- '-a worker_arg_a'\n\n# Submit a mpi training workload with a master command and a worker command\nrunai training mpi submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" --command -- echo -e 'worker command'\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_submit/#options","title":"Options","text":"
            --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n      --clean-pod-policy string                        Specifies which pods will be deleted when the workload reaches a terminal state (completed/failed)\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the format: name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. 
(Default: false)\n      --host-network                                   Whether to enable host networking. (Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --master-args string                             Specifies the arguments to pass to the master pod container command\n      --master-command string                          Specifies the command to run in the master pod container, overriding the image's default entrypoint. The command can include arguments following it.\n      --master-environment stringArray                 Set master environment variables in the container\n      --master-extended-resource stringArray           Request access to an extended resource. Use the format: resource_name=quantity\n      --master-no-pvcs                                 Do not mount any persistent volumes in the master pod\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    takes the uid, gid, and supplementary groups fields from the token, if all the fields do not exist, uses the local running terminal user credentials. 
If any of the fields exist, only the existing fields are used\n      --s3 stringArray                                 s3 storage details. Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --slots-per-worker int32                         Number of slots to allocate for each worker\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma-separated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicates how the user and group IDs of the container are determined, options: fromTheImage|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --workers int32                                  The number of workers that will be allocated for running the workload\n      --working-dir string                             Set the container's working directory\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_submit/#see-also","title":"SEE ALSO","text":"
      • runai training mpi - mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_suspend/","title":"Runai training mpi suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_suspend/#runai-training-mpi-suspend","title":"runai training mpi suspend","text":"

      suspend mpi training

      runai training mpi suspend [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_suspend/#examples","title":"Examples","text":"
      # Suspend a mpi training workload\nrunai training mpi suspend <mpi-name>\n\n# Suspend a mpi training workload in a specific project\nrunai training mpi suspend <mpi-name> -p <project_name>\n\n# Suspend a mpi training workload by UUID\nrunai training mpi suspend --uuid=<mpi_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_suspend/#options","title":"Options","text":"
        -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_suspend/#see-also","title":"SEE ALSO","text":"
      • runai training mpi - mpi management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_port-forward/","title":"Runai training port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_port-forward/#runai-training-port-forward","title":"runai training port-forward","text":"

      forward one or more local ports to a standard training job

      runai training port-forward [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_port-forward/#examples","title":"Examples","text":"
      # Forward connections from localhost:8080 to standard training on port 8090:\nrunai training standard port-forward standard-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to standard training on port 8080:\nrunai training standard port-forward standard-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to standard training on port 8090 and from localhost:6443 to standard training on port 443:\nrunai training standard port-forward standard-01 --port 8080:8090 --port 6443:443 --address localhost\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_port-forward/#options","title":"Options","text":"
            --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_port-forward/#see-also","title":"SEE ALSO","text":"
      • runai training - training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch/","title":"Runai training pytorch","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch/#runai-training-pytorch","title":"runai training pytorch","text":"

      pytorch management

      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch/#options","title":"Options","text":"
        -h, --help   help for pytorch\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch/#see-also","title":"SEE ALSO","text":"
      • runai training - training management
      • runai training pytorch attach - attach to a running container in a pytorch training job
      • runai training pytorch bash - open a bash shell in a pytorch training job
      • runai training pytorch delete - delete pytorch training workload
      • runai training pytorch describe - describe pytorch training
      • runai training pytorch exec - execute a command in a pytorch training job
      • runai training pytorch list - list pytorch training
      • runai training pytorch logs - view logs of a pytorch training job
      • runai training pytorch port-forward - forward one or more local ports to a pytorch training job
      • runai training pytorch resume - resume pytorch training
      • runai training pytorch submit - submit pytorch training
      • runai training pytorch suspend - suspend pytorch training
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_attach/","title":"Runai training pytorch attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_attach/#runai-training-pytorch-attach","title":"runai training pytorch attach","text":"

      attach to a running container in a pytorch training job

      runai training pytorch attach [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_attach/#examples","title":"Examples","text":"
      # Attaching to the main worker of a pytorch training\nrunai training pytorch attach pytorch-01 --tty --stdin\n\n# Attaching to a specific pod of a pytorch training\nrunai training pytorch attach pytorch-01 --pod pytorch-01-worker-1 --tty --stdin\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_attach/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_attach/#see-also","title":"SEE ALSO","text":"
      • runai training pytorch - pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_bash/","title":"Runai training pytorch bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_bash/#runai-training-pytorch-bash","title":"runai training pytorch bash","text":"

      open a bash shell in a pytorch training job

      runai training pytorch bash [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_bash/#examples","title":"Examples","text":"
      # Open a bash shell in the pytorch training's main worker\nrunai training pytorch bash pytorch-01\n\n# Open a bash shell in a specific pytorch training worker\nrunai training pytorch bash pytorch-01 --pod pytorch-01-worker-1\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_bash/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_bash/#see-also","title":"SEE ALSO","text":"
      • runai training pytorch - pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_delete/","title":"Runai training pytorch delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_delete/#runai-training-pytorch-delete","title":"runai training pytorch delete","text":"

      delete pytorch training workload

      runai training pytorch delete [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_delete/#examples","title":"Examples","text":"
      # Delete a pytorch training workload with a default project\nrunai training pytorch delete <pytorch-name>\n\n# Delete a pytorch training workload with a specific project\nrunai training pytorch delete <pytorch-name> -p <project_name>\n\n# Delete a pytorch training workload by UUID\nrunai training pytorch delete --uuid=<pytorch_uuid> -p <project_name>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_delete/#options","title":"Options","text":"
        -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_delete/#see-also","title":"SEE ALSO","text":"
      • runai training pytorch - pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_describe/","title":"Runai training pytorch describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_describe/#runai-training-pytorch-describe","title":"runai training pytorch describe","text":"

      describe pytorch training

      runai training pytorch describe [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_describe/#examples","title":"Examples","text":"
      # Describe a pytorch training workload with a default project\nrunai training pytorch describe <pytorch-name>\n\n# Describe a pytorch training workload in a specific project\nrunai training pytorch describe <pytorch-name> -p <project_name>\n\n# Describe a pytorch training workload by UUID\nrunai training pytorch describe --uuid=<pytorch_uuid>\n\n# Describe a pytorch training workload with specific output format\nrunai training pytorch describe <pytorch-name> -o json\n\n# Describe a pytorch training workload with specific sections\nrunai training pytorch describe <pytorch-name> --general --compute --pods --events --networks\n\n# Describe a pytorch training workload with container details and custom limits\nrunai training pytorch describe <pytorch-name> --containers --pod-limit 20 --event-limit 100\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_describe/#options","title":"Options","text":"
            --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_describe/#see-also","title":"SEE ALSO","text":"
      • runai training pytorch - pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_exec/","title":"Runai training pytorch exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_exec/#runai-training-pytorch-exec","title":"runai training pytorch exec","text":"

      execute a command in a pytorch training job

      runai training pytorch exec [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_exec/#examples","title":"Examples","text":"
      # Execute bash in the pytorch training's main worker\nrunai training pytorch exec pytorch-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the pytorch training's main worker\nrunai training pytorch exec pytorch-01 -- ls\n\n# Execute a command in a specific pytorch training worker\nrunai training pytorch exec pytorch-01 --pod pytorch-01-worker-1 -- nvidia-smi\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_exec/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_exec/#see-also","title":"SEE ALSO","text":"
      • runai training pytorch - pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_list/","title":"Runai training pytorch list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_list/#runai-training-pytorch-list","title":"runai training pytorch list","text":"

      list pytorch training

      runai training pytorch list [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_list/#examples","title":"Examples","text":"
      # List all pytorch training workloads\nrunai training pytorch list -A\n\n# List pytorch training workloads with default project\nrunai training pytorch list\n\n# List pytorch training workloads in a specific project\nrunai training pytorch list -p <project_name>\n\n# List all pytorch training workloads with a specific output format\nrunai training pytorch list -o wide\n\n# List pytorch training workloads with pagination\nrunai training pytorch list --limit 20 --offset 40\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_list/#options","title":"Options","text":"
        -A, --all              List workloads from all projects\n  -h, --help             help for list\n      --json             Output in JSON format\n      --limit int32      Number of workloads to list (default 50)\n      --no-headers       Output a table without headers\n      --offset int32     Offset from the start of the list, default 0\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string    Filter by workload state\n      --table            Output in table format\n      --yaml             Output in YAML format\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_list/#see-also","title":"SEE ALSO","text":"
      • runai training pytorch - pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_logs/","title":"Runai training pytorch logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_logs/#runai-training-pytorch-logs","title":"runai training pytorch logs","text":"

      view logs of a pytorch training job

      runai training pytorch logs [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_logs/#examples","title":"Examples","text":"
      # Get logs for a pytorch training\nrunai training pytorch logs pytorch-01\n\n# Get logs for a specific pod in a pytorch training\nrunai training pytorch logs pytorch-01 --pod=pytorch-01-worker-0\n\n# Get logs for a specific container in a pytorch training\nrunai training pytorch logs pytorch-01 --container=pytorch-worker\n\n# Get the last 100 lines of logs\nrunai training pytorch logs pytorch-01 --tail=100\n\n# Get logs with timestamps\nrunai training pytorch logs pytorch-01 --timestamps\n\n# Follow the logs\nrunai training pytorch logs pytorch-01 --follow\n\n# Get logs for the previous instance of the pytorch training\nrunai training pytorch logs pytorch-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai training pytorch logs pytorch-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai training pytorch logs pytorch-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai training pytorch logs pytorch-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for pytorch training to be ready for logs\nrunai training pytorch logs pytorch-01 --wait-timeout=30s\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_logs/#options","title":"Options","text":"
        -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_logs/#see-also","title":"SEE ALSO","text":"
      • runai training pytorch - pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_port-forward/","title":"Runai training pytorch port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_port-forward/#runai-training-pytorch-port-forward","title":"runai training pytorch port-forward","text":"

      forward one or more local ports to a pytorch training job

      runai training pytorch port-forward [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_port-forward/#examples","title":"Examples","text":"
      # Forward connections from localhost:8080 to pytorch training on port 8090:\nrunai training pytorch port-forward pytorch-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to pytorch training on port 8080:\nrunai training pytorch port-forward pytorch-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to pytorch training on port 8090 and from localhost:6443 to pytorch training on port 443:\nrunai training pytorch port-forward pytorch-01 --port 8080:8090 --port 6443:443 --address localhost\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_port-forward/#options","title":"Options","text":"
            --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
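      A minimal sketch of forwarding to a single worker pod, assuming a workload named pytorch-01 whose pod pytorch-01-worker-1 exists (both names are hypothetical):
      # Forward localhost:9000 to port 9000 of a specific worker pod\nrunai training pytorch port-forward pytorch-01 --pod pytorch-01-worker-1 --port 9000 --address localhost\n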
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_port-forward/#see-also","title":"SEE ALSO","text":"
      • runai training pytorch - pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_resume/","title":"Runai training pytorch resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_resume/#runai-training-pytorch-resume","title":"runai training pytorch resume","text":"

      resume pytorch training

      runai training pytorch resume [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_resume/#examples","title":"Examples","text":"
      # Resume a pytorch training workload\nrunai training pytorch resume <pytorch-name>\n\n# Resume a pytorch training workload in a specific project\nrunai training pytorch resume <pytorch-name> -p <project_name>\n\n# Resume a pytorch training workload by UUID\nrunai training pytorch resume --uuid=<pytorch_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_resume/#options","title":"Options","text":"
        -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_resume/#see-also","title":"SEE ALSO","text":"
      • runai training pytorch - pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_submit/","title":"Runai training pytorch submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_submit/#runai-training-pytorch-submit","title":"runai training pytorch submit","text":"

      submit pytorch training

      runai training pytorch submit [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_submit/#examples","title":"Examples","text":"
      # Submit a pytorch training workload\nrunai training pytorch submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a pytorch training workload with arguments\nrunai training pytorch submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a pytorch training workload with a custom command\nrunai training pytorch submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a pytorch training with master args and worker args\nrunai training pytorch submit <name> -p <project_name> -i ubuntu --master-args \"-a master_arg_a -b master_arg_b\" -- '-a worker_arg_a'\n\n# Submit a pytorch training with a master command and worker args\nrunai training pytorch submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" -- '-a worker_arg_a'\n\n# Submit a pytorch training with a master command and a worker command\nrunai training pytorch submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" --command -- echo -e 'worker command'\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_submit/#options","title":"Options","text":"
            --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n      --clean-pod-policy string                        Specifies which pods will be deleted when the workload reaches a terminal state (completed/failed)\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the format: name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. 
(Default: false)\n      --host-network                                   Whether to enable host networking. (Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --master-args string                             Specifies the arguments to pass to the master pod container command\n      --master-command string                          Specifies the command to run in the master pod container, overriding the image's default entrypoint. The command can include arguments following it.\n      --master-environment stringArray                 Set master environment variables in the container\n      --master-extended-resource stringArray           Request access to an extended resource. Use the format: resource_name=quantity\n      --master-gpu-devices-request int32               GPU units to allocate for the job (e.g. 1, 2)\n      --master-gpu-portion-limit float                 GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-gpu-portion-request float               GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-no-pvcs                                 Do not mount any persistent volumes in the master pod\n      --max-replicas int32                             Maximum number of replicas for an elastic PyTorch job\n      --min-replicas int32                             Minimum number of replicas for an elastic PyTorch job\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --no-master                                      Do not create a separate pod for the master\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. 
To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    Takes the uid, gid, and supplementary groups from the token; if none of these fields exist, the local terminal user's credentials are used; if only some exist, only the existing fields are taken\n      --s3 stringArray                                 s3 storage details. Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma-separated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group ids of the container, options: fromTheImage|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --workers int32                                  The number of workers that will be allocated for running the workload\n      --working-dir string                             Set the container's working directory\n
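      A minimal sketch combining the elastic and storage flags above; the project, image, and repository are placeholders, and the optional git-sync keys (secret, rev) are assumed to be omittable:
      # Submit an elastic pytorch training with 2-4 workers, 1 GPU each, and a mounted Git repository\nrunai training pytorch submit elastic-demo -p team-a -i pytorch/pytorch --workers 2 --min-replicas 2 --max-replicas 4 -g 1 --git-sync name=code,repository=https://github.com/example/repo,path=/workspace/code\n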
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_submit/#see-also","title":"SEE ALSO","text":"
      • runai training pytorch - pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_suspend/","title":"Runai training pytorch suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_suspend/#runai-training-pytorch-suspend","title":"runai training pytorch suspend","text":"

      suspend pytorch training

      runai training pytorch suspend [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_suspend/#examples","title":"Examples","text":"
      # Suspend a pytorch training workload\nrunai training pytorch suspend <pytorch-name>\n\n# Suspend a pytorch training workload in a specific project\nrunai training pytorch suspend <pytorch-name> -p <project_name>\n\n# Suspend a pytorch training workload by UUID\nrunai training pytorch suspend --uuid=<pytorch_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_suspend/#options","title":"Options","text":"
        -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_suspend/#see-also","title":"SEE ALSO","text":"
      • runai training pytorch - pytorch management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_resume/","title":"Runai training resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_resume/#runai-training-resume","title":"runai training resume","text":"

      resume standard training

      runai training resume [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_resume/#examples","title":"Examples","text":"
      # Resume a standard training workload\nrunai training standard resume <standard-name>\n\n# Resume a standard training workload in a specific project\nrunai training standard resume <standard-name> -p <project_name>\n\n# Resume a standard training workload by UUID\nrunai training standard resume --uuid=<standard_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_resume/#options","title":"Options","text":"
        -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_resume/#see-also","title":"SEE ALSO","text":"
      • runai training - training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard/","title":"Runai training standard","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard/#runai-training-standard","title":"runai training standard","text":"

      standard training management

      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard/#options","title":"Options","text":"
        -h, --help   help for standard\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard/#see-also","title":"SEE ALSO","text":"
      • runai training - training management
      • runai training standard attach - attach to a running container in a standard training job
      • runai training standard bash - open a bash shell in a standard training job
      • runai training standard delete - delete standard training workload
      • runai training standard describe - describe standard training
      • runai training standard exec - execute a command in a standard training job
      • runai training standard list - list standard training
      • runai training standard logs - view logs of a standard training job
      • runai training standard port-forward - forward one or more local ports to a standard training job
      • runai training standard resume - resume standard training
      • runai training standard submit - submit standard training
      • runai training standard suspend - suspend standard training
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_attach/","title":"Runai training standard attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_attach/#runai-training-standard-attach","title":"runai training standard attach","text":"

      attach to a running container in a standard training job

      runai training standard attach [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_attach/#examples","title":"Examples","text":"
      # Attaching to the main worker of a standard training\nrunai training standard attach standard-01 --tty --stdin\n\n# Attaching to a specific pod of a standard training\nrunai training standard attach standard-01 --pod standard-01-worker-1 --tty --stdin\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_attach/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
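      A minimal sketch, assuming a workload named standard-01 that was just submitted and may not be running yet; the timeout value is arbitrary:
      # Attach an interactive terminal, waiting up to 2 minutes for the pod to become ready\nrunai training standard attach standard-01 --tty --stdin --wait-timeout 2m\n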
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_attach/#see-also","title":"SEE ALSO","text":"
      • runai training standard - standard training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_bash/","title":"Runai training standard bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_bash/#runai-training-standard-bash","title":"runai training standard bash","text":"

      open a bash shell in a standard training job

      runai training standard bash [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_bash/#examples","title":"Examples","text":"
      # Open a bash shell in the standard training's main worker\nrunai training standard bash standard-01\n\n# Open a bash shell in a specific standard training worker\nrunai training standard bash standard-01 --pod standard-01-worker-1\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_bash/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_bash/#see-also","title":"SEE ALSO","text":"
      • runai training standard - standard training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_delete/","title":"Runai training standard delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_delete/#runai-training-standard-delete","title":"runai training standard delete","text":"

      delete standard training workload

      runai training standard delete [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_delete/#examples","title":"Examples","text":"
      # Delete a standard training workload with a default project\nrunai training standard delete <standard-name>\n\n# Delete a standard training workload with a specific project\nrunai training standard delete <standard-name> -p <project_name>\n\n# Delete a standard training workload by UUID\nrunai training standard delete --uuid=<standard_uuid> -p <project_name>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_delete/#options","title":"Options","text":"
        -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_delete/#see-also","title":"SEE ALSO","text":"
      • runai training standard - standard training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_describe/","title":"Runai training standard describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_describe/#runai-training-standard-describe","title":"runai training standard describe","text":"

      describe standard training

      runai training standard describe [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_describe/#examples","title":"Examples","text":"
      # Describe a standard training workload with a default project\nrunai training standard describe <standard-name>\n\n# Describe a standard training workload in a specific project\nrunai training standard describe <standard-name> -p <project_name>\n\n# Describe a standard training workload by UUID\nrunai training standard describe --uuid=<standard_uuid>\n\n# Describe a standard training workload with specific output format\nrunai training standard describe <standard-name> -o json\n\n# Describe a standard training workload with specific sections\nrunai training standard describe <standard-name> --general --compute --pods --events --networks\n\n# Describe a standard training workload with container details and custom limits\nrunai training standard describe <standard-name> --containers --pod-limit 20 --event-limit 100\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_describe/#options","title":"Options","text":"
            --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
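      A minimal sketch for scripted use, assuming a workload named standard-01 in a hypothetical project team-a:
      # Emit the full description as JSON, including container details and an unlimited number of events\nrunai training standard describe standard-01 -p team-a -o json --containers --event-limit=-1\n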
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_describe/#see-also","title":"SEE ALSO","text":"
      • runai training standard - standard training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_exec/","title":"Runai training standard exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_exec/#runai-training-standard-exec","title":"runai training standard exec","text":"

      execute a command in a standard training job

      runai training standard exec [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_exec/#examples","title":"Examples","text":"
      # Execute bash in the standard training's main worker\nrunai training standard exec standard-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the standard training's main worker\nrunai training standard exec standard-01 -- ls\n\n# Execute a command in a specific standard training worker\nrunai training standard exec standard-01 --pod standard-01-worker-1 -- nvidia-smi\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_exec/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
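      A minimal sketch, assuming a workload named standard-01 whose pod has a container named main (the container name is hypothetical):
      # Print the container's environment, waiting up to 1 minute for it to be ready\nrunai training standard exec standard-01 --container main --wait-timeout 1m -- env\n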
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_exec/#see-also","title":"SEE ALSO","text":"
      • runai training standard - standard training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_list/","title":"Runai training standard list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_list/#runai-training-standard-list","title":"runai training standard list","text":"

      list standard training

      runai training standard list [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_list/#examples","title":"Examples","text":"
      # List all standard training workloads\nrunai training standard list -A\n\n# List standard training workloads with default project\nrunai training standard list\n\n# List standard training workloads in a specific project\nrunai training standard list -p <project_name>\n\n# List all standard training workloads with a specific output format\nrunai training standard list -o wide\n\n# List standard training workloads with pagination\nrunai training standard list --limit 20 --offset 40\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_list/#options","title":"Options","text":"
        -A, --all              list workloads from all projects\n  -h, --help             help for list\n      --json             Output structure JSON\n      --limit int32      number of workloads in list (default 50)\n      --no-headers       Output structure table without headers\n      --offset int32     number of workloads to skip before listing, default 0 (first page)\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string    filter by workload state\n      --table            Output structure table\n      --yaml             Output structure YAML\n
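      A minimal sketch for scripting, assuming the --status filter accepts a value such as Running (the accepted status strings are not listed in this help):
      # List only running workloads across all projects, without table headers\nrunai training standard list -A --status Running --no-headers\n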
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_list/#see-also","title":"SEE ALSO","text":"
      • runai training standard - standard training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_logs/","title":"Runai training standard logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_logs/#runai-training-standard-logs","title":"runai training standard logs","text":"

      view logs of a standard training job

      runai training standard logs [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_logs/#examples","title":"Examples","text":"
      # Get logs for a standard training\nrunai training standard logs standard-01\n\n# Get logs for a specific pod in a standard training\nrunai training standard logs standard-01 --pod=standard-01-worker-0\n\n# Get logs for a specific container in a standard training\nrunai training standard logs standard-01 --container=standard-worker\n\n# Get the last 100 lines of logs\nrunai training standard logs standard-01 --tail=100\n\n# Get logs with timestamps\nrunai training standard logs standard-01 --timestamps\n\n# Follow the logs\nrunai training standard logs standard-01 --follow\n\n# Get logs for the previous instance of the standard training\nrunai training standard logs standard-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai training standard logs standard-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai training standard logs standard-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai training standard logs standard-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for standard training to be ready for logs\nrunai training standard logs standard-01 --wait-timeout=30s\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_logs/#options","title":"Options","text":"
        -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
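      A minimal sketch of a time-bounded capture, assuming a workload named standard-01; the timestamp and byte limit are arbitrary:
      # Save logs written after a given time, capped at 1 MB, to a local file\nrunai training standard logs standard-01 --since-time=2023-05-30T10:00:00Z --limit-bytes=1048576 > standard-01.log\n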
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_logs/#see-also","title":"SEE ALSO","text":"
      • runai training standard - standard training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_port-forward/","title":"Runai training standard port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_port-forward/#runai-training-standard-port-forward","title":"runai training standard port-forward","text":"

      forward one or more local ports to a standard training job

      runai training standard port-forward [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_port-forward/#examples","title":"Examples","text":"
      # Forward connections from localhost:8080 to standard training on port 8090:\nrunai training standard port-forward standard-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to standard training on port 8080:\nrunai training standard port-forward standard-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to standard training on port 8090 and from localhost:6443 to standard training on port 443:\nrunai training standard port-forward standard-01 --port 8080:8090 --port 6443:443 --address localhost\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_port-forward/#options","title":"Options","text":"
            --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_port-forward/#see-also","title":"SEE ALSO","text":"
      • runai training standard - standard training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_resume/","title":"Runai training standard resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_resume/#runai-training-standard-resume","title":"runai training standard resume","text":"

      resume standard training

      runai training standard resume [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_resume/#examples","title":"Examples","text":"
      # Resume a standard training workload\nrunai training standard resume <standard-name>\n\n# Resume a standard training workload in a specific project\nrunai training standard resume <standard-name> -p <project_name>\n\n# Resume a standard training workload by UUID\nrunai training standard resume --uuid=<standard_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_resume/#options","title":"Options","text":"
        -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_resume/#see-also","title":"SEE ALSO","text":"
      • runai training standard - standard training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_submit/","title":"Runai training standard submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_submit/#runai-training-standard-submit","title":"runai training standard submit","text":"

      submit standard training

      runai training standard submit [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_submit/#examples","title":"Examples","text":"
      # Submit a standard training workload\nrunai training standard submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a standard training workload with arguments\nrunai training standard submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a standard training workload with a custom command\nrunai training standard submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a standard training Jupyter notebook\nrunai training standard submit <name> -p <project_name> -i jupyter/scipy-notebook --gpu-devices-request 1 --external-url container=8888 --name-prefix jupyter --command -- start-notebook.sh --NotebookApp.base_url='/${RUNAI_PROJECT}/${RUNAI_JOB_NAME}' --NotebookApp.token=''\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_submit/#options","title":"Options","text":"
            --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the format: name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. (Default: false)\n      --host-network                                   Whether to enable host networking. 
(Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --parallelism int32                              Specifies the maximum number of pods that should run in parallel at any given time\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    Takes the uid, gid, and supplementary groups from the token; if none of these fields exist, the local terminal user's credentials are used; if only some exist, only the existing fields are taken\n      --runs int32                                     Number of successful runs required for this workload to be considered completed\n      --s3 stringArray                                 s3 storage details. 
Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma-separated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group ids of the container, options: fromTheImage|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --working-dir string                             Set the container's working directory\n
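      A minimal sketch combining the batch and storage flags above; the project, image, storage class, and script path are placeholders:
      # Run the same job 5 times, at most 2 in parallel, with a freshly created 10Gi volume\nrunai training standard submit batch-demo -p team-a -i python:3.11 --runs 5 --parallelism 2 --new-pvc claimname=batch-data,storageclass=standard,size=10Gi,path=/data --command -- python /data/train.py\n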
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_submit/#see-also","title":"SEE ALSO","text":"
      • runai training standard - standard training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_suspend/","title":"Runai training standard suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_suspend/#runai-training-standard-suspend","title":"runai training standard suspend","text":"

      suspend standard training

      runai training standard suspend [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_suspend/#examples","title":"Examples","text":"
      # Suspend a standard training workload\nrunai training standard suspend <standard-name>\n\n# Suspend a standard training workload in a specific project\nrunai training standard suspend <standard-name> -p <project_name>\n\n# Suspend a standard training workload by UUID\nrunai training standard suspend --uuid=<standard_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_suspend/#options","title":"Options","text":"
        -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_suspend/#see-also","title":"SEE ALSO","text":"
      • runai training standard - standard training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_submit/","title":"Runai training submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_submit/#runai-training-submit","title":"runai training submit","text":"

      submit standard training

      runai training submit [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_submit/#examples","title":"Examples","text":"
# Submit a standard training workload\nrunai training standard submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a standard training workload with arguments\nrunai training standard submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a standard training workload with a custom command\nrunai training standard submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a standard training Jupyter notebook\nrunai training standard submit <name> -p <project_name> -i jupyter/scipy-notebook --gpu-devices-request 1 --external-url container=8888 --name-prefix jupyter --command -- start-notebook.sh --NotebookApp.base_url='/${RUNAI_PROJECT}/${RUNAI_JOB_NAME}' --NotebookApp.token=''\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_submit/#options","title":"Options","text":"
      --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the format: name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. (Default: false)\n      --host-network                                   Whether to enable host networking. 
(Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --parallelism int32                              Specifies the maximum number of pods that should run in parallel at any given time\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    takes the uid, gid, and supplementary groups fields from the token, if all the fields do not exist, uses the local running terminal user credentials. if any of the fields exist take only the existing fields\n      --runs int32                                     Number of successful runs required for this workload to be considered completed\n      --s3 stringArray                                 s3 storage details. 
Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma separated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group IDs of the container, options: fromTheImage|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --working-dir string                             Set the container's working directory\n
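      For illustration only (the project, image, variable, and storage names below are placeholders, not values from this reference), several of the resource and storage flags above can be combined in one submission:
      # Submit a training with 1 GPU, an environment variable, and a newly created PVC mounted at /data (illustrative values)\nrunai training submit <name> -p <project_name> -i <image> -g 1 -e MY_VAR=my_value --new-pvc claimname=my-claim,storageclass=standard,size=10G,path=/data\n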
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_submit/#see-also","title":"SEE ALSO","text":"
      • runai training - training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_suspend/","title":"Runai training suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_suspend/#runai-training-suspend","title":"runai training suspend","text":"

      suspend standard training

      runai training suspend [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_suspend/#examples","title":"Examples","text":"
      # Suspend a standard training workload\nrunai training standard suspend <standard-name>\n\n# Suspend a standard training workload in a specific project\nrunai training standard suspend <standard-name> -p <project_name>\n\n# Suspend a standard training workload by UUID\nrunai training standard suspend --uuid=<standard_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_suspend/#options","title":"Options","text":"
        -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_suspend/#see-also","title":"SEE ALSO","text":"
      • runai training - training management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow/","title":"Runai training tensorflow","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow/#runai-training-tensorflow","title":"runai training tensorflow","text":"

      tensorflow management

      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow/#options","title":"Options","text":"
        -h, --help   help for tensorflow\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow/#see-also","title":"SEE ALSO","text":"
      • runai training - training management
      • runai training tensorflow attach - attach to a running container in a tf training job
      • runai training tensorflow bash - open a bash shell in a tf training job
      • runai training tensorflow delete - delete tf training workload
      • runai training tensorflow describe - describe tf training
      • runai training tensorflow exec - execute a command in a tf training job
      • runai training tensorflow list - list tf training
      • runai training tensorflow logs - view logs of a tf training job
      • runai training tensorflow port-forward - forward one or more local ports to a tf training job
      • runai training tensorflow resume - resume tf training
      • runai training tensorflow submit - submit tf training
      • runai training tensorflow suspend - suspend tf training
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_attach/","title":"Runai training tensorflow attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_attach/#runai-training-tensorflow-attach","title":"runai training tensorflow attach","text":"

      attach to a running container in a tf training job

      runai training tensorflow attach [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_attach/#examples","title":"Examples","text":"
      # Attaching to the main worker of a tf training\nrunai training tf attach tf-01 --tty --stdin\n\n# Attaching to a specific pod of a tf training\nrunai training tf attach tf-01 --pod tf-01-worker-1 --tty --stdin\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_attach/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_attach/#see-also","title":"SEE ALSO","text":"
      • runai training tensorflow - tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_bash/","title":"Runai training tensorflow bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_bash/#runai-training-tensorflow-bash","title":"runai training tensorflow bash","text":"

      open a bash shell in a tf training job

      runai training tensorflow bash [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_bash/#examples","title":"Examples","text":"
      # Open a bash shell in the tf training's main worker\nrunai training tf bash tf-01\n\n# Open a bash shell in a specific tf training worker\nrunai training tf bash tf-01 --pod tf-01-worker-1\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_bash/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_bash/#see-also","title":"SEE ALSO","text":"
      • runai training tensorflow - tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_delete/","title":"Runai training tensorflow delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_delete/#runai-training-tensorflow-delete","title":"runai training tensorflow delete","text":"

      delete tf training workload

      runai training tensorflow delete [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_delete/#examples","title":"Examples","text":"
      # Delete a tf training workload with a default project\nrunai training tf delete <tf-name>\n\n# Delete a tf training workload with a specific project\nrunai training tf delete <tf-name> -p <project_name>\n\n# Delete a tf training workload by UUID\nrunai training tf delete --uuid=<tf_uuid> -p <project_name>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_delete/#options","title":"Options","text":"
        -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_delete/#see-also","title":"SEE ALSO","text":"
      • runai training tensorflow - tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_describe/","title":"Runai training tensorflow describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_describe/#runai-training-tensorflow-describe","title":"runai training tensorflow describe","text":"

      describe tf training

      runai training tensorflow describe [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_describe/#examples","title":"Examples","text":"
      # Describe a tf training workload with a default project\nrunai training tf describe <tf-name>\n\n# Describe a tf training workload in a specific project\nrunai training tf describe <tf-name> -p <project_name>\n\n# Describe a tf training workload by UUID\nrunai training tf describe --uuid=<tf_uuid>\n\n# Describe a tf training workload with specific output format\nrunai training tf describe <tf-name> -o json\n\n# Describe a tf training workload with specific sections\nrunai training tf describe <tf-name> --general --compute --pods --events --networks\n\n# Describe a tf training workload with container details and custom limits\nrunai training tf describe <tf-name> --containers --pod-limit 20 --event-limit 100\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_describe/#options","title":"Options","text":"
            --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
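      One further illustrative invocation (the workload name is a placeholder) combining the output-format and limit flags above:
      # Describe a tf training as YAML and cap the number of displayed events at 10\nrunai training tf describe <tf-name> -o yaml --event-limit 10\n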
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_describe/#see-also","title":"SEE ALSO","text":"
      • runai training tensorflow - tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_exec/","title":"Runai training tensorflow exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_exec/#runai-training-tensorflow-exec","title":"runai training tensorflow exec","text":"

      execute a command in a tf training job

      runai training tensorflow exec [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_exec/#examples","title":"Examples","text":"
      # Execute bash in the tf training's main worker\nrunai training tf exec tf-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the tf training's main worker\nrunai training tf exec tf-01 -- ls\n\n# Execute a command in a specific tf training worker\nrunai training tf exec tf-01 --pod tf-01-worker-1 -- nvidia-smi\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_exec/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_exec/#see-also","title":"SEE ALSO","text":"
      • runai training tensorflow - tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_list/","title":"Runai training tensorflow list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_list/#runai-training-tensorflow-list","title":"runai training tensorflow list","text":"

      list tf training

      runai training tensorflow list [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_list/#examples","title":"Examples","text":"
      # List all tf training workloads\nrunai training tf list -A\n\n# List tf training workloads with default project\nrunai training tf list\n\n# List tf training workloads in a specific project\nrunai training tf list -p <project_name>\n\n# List all tf training workloads with a specific output format\nrunai training tf list -o wide\n\n# List tf training workloads with pagination\nrunai training tf list --limit 20 --offset 40\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_list/#options","title":"Options","text":"
  -A, --all              list workloads from all projects\n  -h, --help             help for list\n      --json             Output structure JSON\n      --limit int32      number of workloads in list (default 50)\n      --no-headers       Output structure table without headers\n      --offset int32     offset number of limit, default 0 (first offset)\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string    filter by workload state\n      --table            Output structure table\n      --yaml             Output structure YAML\n
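      The filter and formatting flags above also compose; for example (the status value shown is an assumed workload state, not taken from this reference):
      # List running tf training workloads in the default project, without table headers\nrunai training tf list --status Running --no-headers\n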
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_list/#see-also","title":"SEE ALSO","text":"
      • runai training tensorflow - tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_logs/","title":"Runai training tensorflow logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_logs/#runai-training-tensorflow-logs","title":"runai training tensorflow logs","text":"

      view logs of a tf training job

      runai training tensorflow logs [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_logs/#examples","title":"Examples","text":"
      # Get logs for a tf training\nrunai training tf logs tf-01\n\n# Get logs for a specific pod in a tf training\nrunai training tf logs tf-01 --pod=tf-01-worker-0\n\n# Get logs for a specific container in a tf training\nrunai training tf logs tf-01 --container=tf-worker\n\n# Get the last 100 lines of logs\nrunai training tf logs tf-01 --tail=100\n\n# Get logs with timestamps\nrunai training tf logs tf-01 --timestamps\n\n# Follow the logs\nrunai training tf logs tf-01 --follow\n\n# Get logs for the previous instance of the tf training\nrunai training tf logs tf-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai training tf logs tf-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai training tf logs tf-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai training tf logs tf-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for tf training to be ready for logs\nrunai training tf logs tf-01 --wait-timeout=30s\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_logs/#options","title":"Options","text":"
  -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
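      These log flags can also be combined, for instance (workload name as in the examples above):
      # Follow the last 50 lines of a tf training's logs with timestamps\nrunai training tf logs tf-01 --follow --tail 50 --timestamps\n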
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_logs/#see-also","title":"SEE ALSO","text":"
      • runai training tensorflow - tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_port-forward/","title":"Runai training tensorflow port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_port-forward/#runai-training-tensorflow-port-forward","title":"runai training tensorflow port-forward","text":"

      forward one or more local ports to a tf training job

      runai training tensorflow port-forward [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_port-forward/#examples","title":"Examples","text":"
      # Forward connections from localhost:8080 to tf training on port 8090:\nrunai training tf port-forward tf-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to tf training on port 8080:\nrunai training tf port-forward tf-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to tf training on port 8090 and from localhost:6443 to tf training on port 443:\nrunai training tf port-forward tf-01 --port 8080:8090 --port 6443:443 --address localhost\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_port-forward/#options","title":"Options","text":"
            --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
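      As a further sketch (the pod name is illustrative), the --pod flag above targets a specific worker instead of the default master pod:
      # Forward localhost:8080 to port 8888 of a specific tf training worker\nrunai training tf port-forward tf-01 --port 8080:8888 --pod tf-01-worker-1 --address localhost\n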
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_port-forward/#see-also","title":"SEE ALSO","text":"
      • runai training tensorflow - tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_resume/","title":"Runai training tensorflow resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_resume/#runai-training-tensorflow-resume","title":"runai training tensorflow resume","text":"

      resume tf training

      runai training tensorflow resume [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_resume/#examples","title":"Examples","text":"
      # Resume a tf training workload\nrunai training tf resume <tf-name>\n\n# Resume a tf training workload in a specific project\nrunai training tf resume <tf-name> -p <project_name>\n\n# Resume a tf training workload by UUID\nrunai training tf resume --uuid=<tf_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_resume/#options","title":"Options","text":"
        -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_resume/#see-also","title":"SEE ALSO","text":"
      • runai training tensorflow - tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_submit/","title":"Runai training tensorflow submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_submit/#runai-training-tensorflow-submit","title":"runai training tensorflow submit","text":"

      submit tf training

      runai training tensorflow submit [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_submit/#examples","title":"Examples","text":"
      # Submit a tf training workload\nrunai training tf submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a tf training workload with arguments\nrunai training tf submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a tf training workload with a custom command\nrunai training tf submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a tf training master args with worker args\nrunai training tf submit <name> -p <project_name> -i ubuntu --master-args \"-a master_arg_a -b master-arg_b'\" -- '-a worker_arg_a'\n\n# Submit a tf training master command with worker args\nrunai training tf submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" -- '-a worker_arg_a'\n\n# Submit a tf training master command with worker command\nrunai training tf submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" --command -- echo -e 'worker command'\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_submit/#options","title":"Options","text":"
      --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n      --clean-pod-policy string                        Specifies which pods will be deleted when the workload reaches a terminal state (completed/failed)\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the format: name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. 
(Default: false)\n      --host-network                                   Whether to enable host networking. (Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --master-args string                             Specifies the arguments to pass to the master pod container command\n      --master-command string                          Specifies the command to run in the master pod container, overriding the image's default entrypoint. The command can include arguments following it.\n      --master-environment stringArray                 Set master environment variables in the container\n      --master-extended-resource stringArray           Request access to an extended resource. Use the format: resource_name=quantity\n      --master-gpu-devices-request int32               GPU units to allocate for the job (e.g. 1, 2)\n      --master-gpu-portion-limit float                 GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-gpu-portion-request float               GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-no-pvcs                                 Do not mount any persistent volumes in the master pod\n      --max-replicas int32                             Maximum number of replicas for an elastic PyTorch job\n      --min-replicas int32                             Minimum number of replicas for an elastic PyTorch job\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --no-master                                      Do not create a separate pod for the master\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. 
To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    takes the uid, gid, and supplementary groups fields from the token, if all the fields do not exist, uses the local running terminal user credentials. if any of the fields exist take only the existing fields\n      --s3 stringArray                                 s3 storage details. Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma separated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group IDs of the container, options: fromTheImage|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --workers int32                                  The number of workers that will be allocated for running the workload\n      --working-dir string                             Set the container's working directory\n
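      A minimal additional sketch (image, name, and commands are placeholders, not from this reference) exercising the worker-count and master flags listed above:
      # Submit a tf training with 3 workers, 1 GPU each, and a separate master command (illustrative values)\nrunai training tf submit <name> -p <project_name> -i <image> -g 1 --workers 3 --master-command 'python master.py' --command -- python worker.py\n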
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_submit/#see-also","title":"SEE ALSO","text":"
      • runai training tensorflow - tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_suspend/","title":"Runai training tensorflow suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_suspend/#runai-training-tensorflow-suspend","title":"runai training tensorflow suspend","text":"

      suspend tf training

      runai training tensorflow suspend [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_suspend/#examples","title":"Examples","text":"
      # Suspend a tf training workload\nrunai training tf suspend <tf-name>\n\n# Suspend a tf training workload in a specific project\nrunai training tf suspend <tf-name> -p <project_name>\n\n# Suspend a tf training workload by UUID\nrunai training tf suspend --uuid=<tf_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_suspend/#options","title":"Options","text":"
        -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_suspend/#see-also","title":"SEE ALSO","text":"
      • runai training tensorflow - tensorflow management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost/","title":"Runai training xgboost","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost/#runai-training-xgboost","title":"runai training xgboost","text":"

      xgboost management

      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost/#options","title":"Options","text":"
        -h, --help   help for xgboost\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost/#see-also","title":"SEE ALSO","text":"
      • runai training - training management
      • runai training xgboost attach - attach to a running container in a xgboost training job
      • runai training xgboost bash - open a bash shell in a xgboost training job
      • runai training xgboost delete - delete xgboost training workload
      • runai training xgboost describe - describe xgboost training
      • runai training xgboost exec - execute a command in a xgboost training job
      • runai training xgboost list - list xgboost training
      • runai training xgboost logs - view logs of a xgboost training job
      • runai training xgboost port-forward - forward one or more local ports to a xgboost training job
      • runai training xgboost resume - resume xgboost training
      • runai training xgboost submit - submit xgboost training
      • runai training xgboost suspend - suspend xgboost training
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_attach/","title":"Runai training xgboost attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_attach/#runai-training-xgboost-attach","title":"runai training xgboost attach","text":"

      attach to a running container in a xgboost training job

      runai training xgboost attach [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_attach/#examples","title":"Examples","text":"
      # Attaching to the main worker of a xgboost training\nrunai training xgboost attach xgboost-01 --tty --stdin\n\n# Attaching to a specific pod of a xgboost training\nrunai training xgboost attach xgboost-01 --pod xgboost-01-worker-1 --tty --stdin\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_attach/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
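      An additional illustrative attach (the pod and container names are assumptions) using the --container flag above to pick a specific container:
      # Attach to a named container in a specific xgboost training worker (illustrative names)\nrunai training xgboost attach xgboost-01 --pod xgboost-01-worker-1 --container xgboost --tty --stdin\n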
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_attach/#see-also","title":"SEE ALSO","text":"
      • runai training xgboost - xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_bash/","title":"Runai training xgboost bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_bash/#runai-training-xgboost-bash","title":"runai training xgboost bash","text":"

      open a bash shell in a xgboost training job

      runai training xgboost bash [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_bash/#examples","title":"Examples","text":"
      # Open a bash shell in the xgboost training's main worker\nrunai training xgboost bash xgboost-01\n\n# Open a bash shell in a specific xgboost training worker\nrunai training xgboost bash xgboost-01 --pod xgboost-01-worker-1\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_bash/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_bash/#see-also","title":"SEE ALSO","text":"
      • runai training xgboost - xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_delete/","title":"Runai training xgboost delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_delete/#runai-training-xgboost-delete","title":"runai training xgboost delete","text":"

      delete xgboost training workload

      runai training xgboost delete [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_delete/#examples","title":"Examples","text":"
      # Delete a xgboost training workload with a default project\nrunai training xgboost delete <xgboost-name>\n\n# Delete a xgboost training workload with a specific project\nrunai training xgboost delete <xgboost-name> -p <project_name>\n\n# Delete a xgboost training workload by UUID\nrunai training xgboost delete --uuid=<xgboost_uuid> -p <project_name>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_delete/#options","title":"Options","text":"
        -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_delete/#see-also","title":"SEE ALSO","text":"
      • runai training xgboost - xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_describe/","title":"Runai training xgboost describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_describe/#runai-training-xgboost-describe","title":"runai training xgboost describe","text":"

      describe xgboost training

      runai training xgboost describe [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_describe/#examples","title":"Examples","text":"
      # Describe a xgboost training workload with a default project\nrunai training xgboost describe <xgboost-name>\n\n# Describe a xgboost training workload in a specific project\nrunai training xgboost describe <xgboost-name> -p <project_name>\n\n# Describe a xgboost training workload by UUID\nrunai training xgboost describe --uuid=<xgboost_uuid>\n\n# Describe a xgboost training workload with specific output format\nrunai training xgboost describe <xgboost-name> -o json\n\n# Describe a xgboost training workload with specific sections\nrunai training xgboost describe <xgboost-name> --general --compute --pods --events --networks\n\n# Describe a xgboost training workload with container details and custom limits\nrunai training xgboost describe <xgboost-name> --containers --pod-limit 20 --event-limit 100\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_describe/#options","title":"Options","text":"
            --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_describe/#see-also","title":"SEE ALSO","text":"
      • runai training xgboost - xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_exec/","title":"Runai training xgboost exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_exec/#runai-training-xgboost-exec","title":"runai training xgboost exec","text":"

      execute a command in a xgboost training job

      runai training xgboost exec [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_exec/#examples","title":"Examples","text":"
      # Execute bash in the xgboost training's main worker\nrunai training xgboost exec xgboost-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the xgboost training's main worker\nrunai training xgboost exec xgboost-01 -- ls\n\n# Execute a command in a specific xgboost training worker\nrunai training xgboost exec xgboost-01 --pod xgboost-01-worker-1 -- nvidia-smi\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_exec/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_exec/#see-also","title":"SEE ALSO","text":"
      • runai training xgboost - xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_list/","title":"Runai training xgboost list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_list/#runai-training-xgboost-list","title":"runai training xgboost list","text":"

      list xgboost training

      runai training xgboost list [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_list/#examples","title":"Examples","text":"
      # List all xgboost training workloads\nrunai training xgboost list -A\n\n# List xgboost training workloads with default project\nrunai training xgboost list\n\n# List xgboost training workloads in a specific project\nrunai training xgboost list -p <project_name>\n\n# List all xgboost training workloads with a specific output format\nrunai training xgboost list -o wide\n\n# List xgboost training workloads with pagination\nrunai training xgboost list --limit 20 --offset 40\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_list/#options","title":"Options","text":"
        -A, --all              list workloads from all projects\n  -h, --help             help for list\n      --json             Output structure JSON\n      --limit int32      number of workload in list (default 50)\n      --no-headers       Output structure table without headers\n      --offset int32     offset number of limit, default 0 (first offset)\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string    filter by workload state\n      --table            Output structure table\n      --yaml             Output structure YAML\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_list/#see-also","title":"SEE ALSO","text":"
      • runai training xgboost - xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_logs/","title":"Runai training xgboost logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_logs/#runai-training-xgboost-logs","title":"runai training xgboost logs","text":"

      view logs of a xgboost training job

      runai training xgboost logs [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_logs/#examples","title":"Examples","text":"
      # Get logs for a xgboost training\nrunai training xgboost logs xgboost-01\n\n# Get logs for a specific pod in a xgboost training\nrunai training xgboost logs xgboost-01 --pod=xgboost-01-worker-0\n\n# Get logs for a specific container in a xgboost training\nrunai training xgboost logs xgboost-01 --container=xgboost-worker\n\n# Get the last 100 lines of logs\nrunai training xgboost logs xgboost-01 --tail=100\n\n# Get logs with timestamps\nrunai training xgboost logs xgboost-01 --timestamps\n\n# Follow the logs\nrunai training xgboost logs xgboost-01 --follow\n\n# Get logs for the previous instance of the xgboost training\nrunai training xgboost logs xgboost-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai training xgboost logs xgboost-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai training xgboost logs xgboost-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai training xgboost logs xgboost-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for xgboost training to be ready for logs\nrunai training xgboost logs xgboost-01 --wait-timeout=30s\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_logs/#options","title":"Options","text":"
        -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_logs/#see-also","title":"SEE ALSO","text":"
      • runai training xgboost - xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_port-forward/","title":"Runai training xgboost port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_port-forward/#runai-training-xgboost-port-forward","title":"runai training xgboost port-forward","text":"

      forward one or more local ports to a xgboost training job

      runai training xgboost port-forward [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_port-forward/#examples","title":"Examples","text":"
      # Forward connections from localhost:8080 to xgboost training on port 8090:\nrunai training xgboost port-forward xgboost-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to xgboost training on port 8080:\nrunai training xgboost port-forward xgboost-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to xgboost training on port 8090 and from localhost:6443 to xgboost training on port 443:\nrunai training xgboost port-forward xgboost-01 --port 8080:8090 --port 6443:443 --address localhost\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_port-forward/#options","title":"Options","text":"
            --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_port-forward/#see-also","title":"SEE ALSO","text":"
      • runai training xgboost - xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_resume/","title":"Runai training xgboost resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_resume/#runai-training-xgboost-resume","title":"runai training xgboost resume","text":"

      resume xgboost training

      runai training xgboost resume [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_resume/#examples","title":"Examples","text":"
      # Resume a xgboost training workload\nrunai training xgboost resume <xgboost-name>\n\n# Resume a xgboost training workload in a specific project\nrunai training xgboost resume <xgboost-name> -p <project_name>\n\n# Resume a xgboost training workload by UUID\nrunai training xgboost resume --uuid=<xgboost_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_resume/#options","title":"Options","text":"
        -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_resume/#see-also","title":"SEE ALSO","text":"
      • runai training xgboost - xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_submit/","title":"Runai training xgboost submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_submit/#runai-training-xgboost-submit","title":"runai training xgboost submit","text":"

      submit xgboost training

      runai training xgboost submit [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_submit/#examples","title":"Examples","text":"
      # Submit a xgboost training workload\nrunai training xgboost submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a xgboost training workload with arguments\nrunai training xgboost submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a xgboost training workload with a custom command\nrunai training xgboost submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a xgboost training with master args and worker args\nrunai training xgboost submit <name> -p <project_name> -i ubuntu --master-args \"-a master_arg_a -b master_arg_b\" -- '-a worker_arg_a'\n\n# Submit a xgboost training with a master command and worker args\nrunai training xgboost submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" -- '-a worker_arg_a'\n\n# Submit a xgboost training with a master command and a worker command\nrunai training xgboost submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" --command -- echo -e 'worker command'\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_submit/#options","title":"Options","text":"
            --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n      --clean-pod-policy string                        Specifies which pods will be deleted when the workload reaches a terminal state (completed/failed)\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the format name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. (Default: false)\n      --host-network                                   Whether to enable host networking. (Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --master-args string                             Specifies the arguments to pass to the master pod container command\n      --master-command string                          Specifies the command to run in the master pod container, overriding the image's default entrypoint. The command can include arguments following it.\n      --master-environment stringArray                 Set master environment variables in the container\n      --master-extended-resource stringArray           Request access to an extended resource. Use the format: resource_name=quantity\n      --master-gpu-devices-request int32               GPU units to allocate for the job (e.g. 1, 2)\n      --master-gpu-portion-limit float                 GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-gpu-portion-request float               GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-no-pvcs                                 Do not mount any persistent volumes in the master pod\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    Takes the uid, gid, and supplementary groups fields from the token; if none of the fields exist, uses the local running terminal user credentials. If any of the fields exist, takes only the existing fields\n      --s3 stringArray                                 s3 storage details. Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma-separated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group ids of the container, options: fromTheImage|fromIdpToken|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --workers int32                                  the number of workers that will be allocated for running the workload\n      --working-dir string                             Set the container's working directory\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_submit/#see-also","title":"SEE ALSO","text":"
      • runai training xgboost - xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_suspend/","title":"Runai training xgboost suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_suspend/#runai-training-xgboost-suspend","title":"runai training xgboost suspend","text":"

      suspend xgboost training

      runai training xgboost suspend [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_suspend/#examples","title":"Examples","text":"
      # Suspend a xgboost training workload\nrunai training xgboost suspend <xgboost-name>\n\n# Suspend a xgboost training workload in a specific project\nrunai training xgboost suspend <xgboost-name> -p <project_name>\n\n# Suspend a xgboost training workload by UUID\nrunai training xgboost suspend --uuid=<xgboost_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_suspend/#options","title":"Options","text":"
        -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_suspend/#see-also","title":"SEE ALSO","text":"
      • runai training xgboost - xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_upgrade/","title":"Runai upgrade","text":""},{"location":"Researcher/cli-reference/new-cli/runai_upgrade/#runai-upgrade","title":"runai upgrade","text":"

      upgrades the CLI to the latest version

      runai upgrade [flags]\n
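      The following usage sketch is illustrative only and relies solely on the flags documented under Options below:
      # Upgrade the CLI to the latest version\nrunai upgrade\n\n# Upgrade the CLI without checking for a new version\nrunai upgrade --force\n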
      "},{"location":"Researcher/cli-reference/new-cli/runai_upgrade/#options","title":"Options","text":"
            --force   upgrade CLI without checking for new version\n  -h, --help    help for upgrade\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_upgrade/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_upgrade/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      "},{"location":"Researcher/cli-reference/new-cli/runai_version/","title":"Runai version","text":""},{"location":"Researcher/cli-reference/new-cli/runai_version/#runai-version","title":"runai version","text":"

      show the current version of the CLI

      runai version [flags]\n
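      An illustrative usage sketch, using only the flags documented under Options below:
      # Show the current version of the CLI\nrunai version\n\n# Show full version details\nrunai version --wide\n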
      "},{"location":"Researcher/cli-reference/new-cli/runai_version/#options","title":"Options","text":"
        -h, --help   help for version\n      --wide   print full version details\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_version/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_version/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      "},{"location":"Researcher/cli-reference/new-cli/runai_whoami/","title":"Runai whoami","text":""},{"location":"Researcher/cli-reference/new-cli/runai_whoami/#runai-whoami","title":"runai whoami","text":"

      show the currently logged-in user

      runai whoami [flags]\n
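      An illustrative usage sketch; the command takes no arguments beyond the global flags:
      # Show the currently logged-in user\nrunai whoami\n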
      "},{"location":"Researcher/cli-reference/new-cli/runai_whoami/#options","title":"Options","text":"
        -h, --help   help for whoami\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_whoami/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_whoami/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload/","title":"Runai workload","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workload/#runai-workload","title":"runai workload","text":"

      workload management

      "},{"location":"Researcher/cli-reference/new-cli/runai_workload/#options","title":"Options","text":"
        -h, --help                 help for workload\n      --interactive enable   set interactive mode (enabled|disabled)\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      • runai workload describe - Describe a workload
      • runai workload list - List workloads
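      An illustrative sketch of how these subcommands compose; <project_name> and <workload-name> are placeholders, and the flags shown are the ones documented on the respective subcommand pages:
      # List workloads in a project, then describe one of them\nrunai workload list -p <project_name>\nrunai workload describe <workload-name> --type training -p <project_name>\n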
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_attach/","title":"Runai workload attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workload_attach/#runai-workload-attach","title":"runai workload attach","text":"

      Attach to a process that is already running inside an existing container.

      runai workload attach WORKLOAD_NAME [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_attach/#examples","title":"Examples","text":"
      # Attaching to ubuntu workspace \nrunai workload attach ubuntu-wl --type workspace --tty --stdin\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_attach/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --type string                    The type of the workload (training, workspace)\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_attach/#see-also","title":"SEE ALSO","text":"
      • runai workload - workload management
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_describe/","title":"Runai workload describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workload_describe/#runai-workload-describe","title":"runai workload describe","text":"

      Describe a workload

      runai workload describe WORKLOAD_NAME [flags]\n
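      An illustrative usage sketch; <workload-name> and <project_name> are placeholders, and the flags shown are documented under Options below:
      # Describe a training workload in the default project\nrunai workload describe <workload-name> --type training\n\n# Describe a workspace workload in a specific project, in JSON format\nrunai workload describe <workload-name> --type workspace -p <project_name> -o json\n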
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_describe/#options","title":"Options","text":"
            --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --framework string    filter by workload framework\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --type string         The type of the workload (training, workspace)\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_describe/#see-also","title":"SEE ALSO","text":"
      • runai workload - workload management
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_exec/","title":"Runai workload exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workload_exec/#runai-workload-exec","title":"runai workload exec","text":"

      exec management

      runai workload exec WORKLOAD_NAME [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_exec/#examples","title":"Examples","text":"
      # Execute bash in a workspace\nrunai workload exec jup --type workspace --tty --stdin -- /bin/bash \n\n# Execute ls in a workload\nrunai workload exec jup --type workspace -- ls\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_exec/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --type string                    The type of the workload (training, workspace)\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_exec/#see-also","title":"SEE ALSO","text":"
      • runai workload - workload management
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_list/","title":"Runai workload list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workload_list/#runai-workload-list","title":"runai workload list","text":"

      List workloads

      runai workload list [flags]\n
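      An illustrative usage sketch, using only the flags documented under Options below:
      # List workloads from all projects\nrunai workload list -A\n\n# List training workloads in a specific project, in YAML format\nrunai workload list --type training -p <project_name> --yaml\n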
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_list/#options","title":"Options","text":"
        -A, --all                list workloads from all projects\n      --framework string   filter by workload framework\n  -h, --help               help for list\n      --json               Output structure JSON\n      --limit int32        number of workload in list (default 50)\n      --no-headers         Output structure table without headers\n      --offset int32       offset number of limit, default 0 (first offset)\n  -p, --project string     Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string      filter by workload state\n      --table              Output structure table\n      --type string        filter by workload type\n      --yaml               Output structure YAML\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_list/#see-also","title":"SEE ALSO","text":"
      • runai workload - workload management
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_logs/","title":"Runai workload logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workload_logs/#runai-workload-logs","title":"runai workload logs","text":"

      logs management

      runai workload logs WORKLOAD_NAME [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_logs/#examples","title":"Examples","text":"
        # Get logs for a workspace\n  runai workload logs workspace-01 --type=workspace\n\n  # Get logs for a specific pod in a workspace\n  runai workload logs workspace-01 --type=workspace --pod=workspace-01-0\n\n  # Get logs for a specific container in a workspace\n  runai workload logs workspace-01 --type=workspace --container=container-01\n\n  # Get the last 100 lines of logs\n  runai workload logs workspace-01 --type=workspace --tail=100\n\n  # Get logs with timestamps\n  runai workload logs workspace-01 --type=workspace --timestamps\n\n  # Follow the logs\n  runai workload logs workspace-01 --type=workspace --follow\n\n  # Get logs for the previous instance of the workspace\n  runai workload logs workspace-01 --type=workspace --previous\n\n  # Limit the logs to 1024 bytes\n  runai workload logs workspace-01 --type=workspace --limit-bytes=1024\n\n  # Get logs since the last 5 minutes\n  runai workload logs workspace-01 --type=workspace --since=5m\n\n  # Get logs since a specific timestamp\n  runai workload logs workspace-01 --type=workspace --since-time=2023-05-30T10:00:00Z\n\n  # Wait up to 30 seconds for workload to be ready for logs\n  runai workload logs workspace-01 --type=workspace --wait-timeout=30s\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_logs/#options","title":"Options","text":"
        -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --type string             The type of the workload (training, workspace)\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_logs/#see-also","title":"SEE ALSO","text":"
      • runai workload - workload management
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_port-forward/","title":"Runai workload port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workload_port-forward/#runai-workload-port-forward","title":"runai workload port-forward","text":"

      port forward management

      runai workload port-forward WORKLOAD_NAME [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_port-forward/#examples","title":"Examples","text":"
      # Forward connections from localhost:8080 to <workload-name> on port 8090:\nrunai workload port-forward <workload-name> --type=<workload-type> --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to <job-name> on port 8080:\nrunai workload port-forward <workload-name> --type=<workload-type> --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to <workload-name> on port 8090 and from localhost:6443 to <workload-name> on port 443:\nrunai workload port-forward <workload-name> --type=<workload-type> --port 8080:8090 --port 6443:443 --address localhost\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_port-forward/#options","title":"Options","text":"
            --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --type string                    The type of the workload (training, workspace)\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workload_port-forward/#see-also","title":"SEE ALSO","text":"
      • runai workload - workload management
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace/","title":"Runai workspace","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace/#runai-workspace","title":"runai workspace","text":"

      workspace management

      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace/#options","title":"Options","text":"
        -h, --help   help for workspace\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      • runai workspace attach - attach to a running container in a workspace job
      • runai workspace bash - open a bash shell in a workspace job
      • runai workspace delete - delete workspace workload
      • runai workspace describe - describe workspace
      • runai workspace exec - execute a command in a workspace job
      • runai workspace list - list workspace
      • runai workspace logs - view logs of a workspace job
      • runai workspace port-forward - forward one or more local ports to a workspace job
      • runai workspace resume - resume workspace
      • runai workspace submit - submit workspace
      • runai workspace suspend - suspend workspace
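      An illustrative end-to-end sketch combining the subcommands listed above; all values are placeholders, and the flags are assumed to follow the conventions documented on the individual subcommand pages:
      # Submit a workspace, inspect it, then clean it up\nrunai workspace submit <workspace-name> -p <project_name> -i <image>\nrunai workspace list -p <project_name>\nrunai workspace logs <workspace-name> -p <project_name>\nrunai workspace delete <workspace-name> -p <project_name>\n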
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_attach/","title":"Runai workspace attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_attach/#runai-workspace-attach","title":"runai workspace attach","text":"

      attach to a running container in a workspace job

      runai workspace attach [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_attach/#examples","title":"Examples","text":"
      # Attaching to the main worker of a workspace\nrunai workspace attach workspace-01 --tty --stdin\n\n# Attaching to a specific pod of a workspace\nrunai workspace attach workspace-01 --pod workspace-01-worker-1 --tty --stdin\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_attach/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_attach/#see-also","title":"SEE ALSO","text":"
      • runai workspace - workspace management
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_bash/","title":"Runai workspace bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_bash/#runai-workspace-bash","title":"runai workspace bash","text":"

      open a bash shell in a workspace job

      runai workspace bash [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_bash/#examples","title":"Examples","text":"
      # Open a bash shell in the workspace's main worker\nrunai workspace bash workspace-01\n\n# Open a bash shell in a specific workspace worker\nrunai workspace bash workspace-01 --pod workspace-01-worker-1\n
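      An additional hedged example using the documented --project and --wait-timeout flags; the project name and timeout value are placeholders.
      # Open a bash shell in a workspace of a specific project, waiting up to 60 seconds for it to be ready\nrunai workspace bash workspace-01 -p <project_name> --wait-timeout 60s\n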
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_bash/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_bash/#see-also","title":"SEE ALSO","text":"
      • runai workspace - workspace management
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_delete/","title":"Runai workspace delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_delete/#runai-workspace-delete","title":"runai workspace delete","text":"

      delete workspace workload

      runai workspace delete [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_delete/#examples","title":"Examples","text":"
      # Delete a workspace workload with a default project\nrunai workspace delete <workspace-name>\n\n# Delete a workspace workload with a specific project\nrunai workspace delete <workspace-name> -p <project_name>\n\n# Delete a workspace workload by UUID\nrunai workspace delete --uuid=<workspace_uuid> -p <project_name>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_delete/#options","title":"Options","text":"
        -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_delete/#see-also","title":"SEE ALSO","text":"
      • runai workspace - workspace management
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_describe/","title":"Runai workspace describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_describe/#runai-workspace-describe","title":"runai workspace describe","text":"

      describe workspace

      runai workspace describe [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_describe/#examples","title":"Examples","text":"
      # Describe a workspace workload with a default project\nrunai workspace describe <workspace-name>\n\n# Describe a workspace workload in a specific project\nrunai workspace describe <workspace-name> -p <project_name>\n\n# Describe a workspace workload by UUID\nrunai workspace describe --uuid=<workspace_uuid>\n\n# Describe a workspace workload with specific output format\nrunai workspace describe <workspace-name> -o json\n\n# Describe a workspace workload with specific sections\nrunai workspace describe <workspace-name> --general --compute --pods --events --networks\n\n# Describe a workspace workload with container details and custom limits\nrunai workspace describe <workspace-name> --containers --pod-limit 20 --event-limit 100\n
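      One more illustrative combination of the documented output flags, shown as a sketch rather than canonical usage.
      # Describe a workspace workload as YAML and show all events (-1 removes the event limit)\nrunai workspace describe <workspace-name> -o yaml --event-limit -1\n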
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_describe/#options","title":"Options","text":"
            --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_describe/#see-also","title":"SEE ALSO","text":"
      • runai workspace - workspace management
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_exec/","title":"Runai workspace exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_exec/#runai-workspace-exec","title":"runai workspace exec","text":"

      execute a command in a workspace job

      runai workspace exec [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_exec/#examples","title":"Examples","text":"
      # Execute bash in the workspace's main worker\nrunai workspace exec workspace-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the workspace's main worker\nrunai workspace exec workspace-01 -- ls\n\n# Execute a command in a specific workspace worker\nrunai workspace exec workspace-01 --pod workspace-01-worker-1 -- nvidia-smi\n
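      A further hedged example; the project name, the 60-second wait, and the hostname command are placeholders chosen for illustration.
      # Run a short command in a workspace of a specific project, waiting up to 60 seconds for the workload to be ready\nrunai workspace exec workspace-01 -p <project_name> --wait-timeout 60s -- hostname\n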
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_exec/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_exec/#see-also","title":"SEE ALSO","text":"
      • runai workspace - workspace management
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_list/","title":"Runai workspace list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_list/#runai-workspace-list","title":"runai workspace list","text":"

      list workspace

      runai workspace list [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_list/#examples","title":"Examples","text":"
      # List all workspace workloads\nrunai workspace list -A\n\n# List workspace workloads with default project\nrunai workspace list\n\n# List workspace workloads in a specific project\nrunai workspace list -p <project_name>\n\n# List all workspace workloads with a specific output format\nrunai workspace list -o wide\n\n# List workspace workloads with pagination\nrunai workspace list --limit 20 --offset 40\n
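      A scripting-oriented sketch using the documented --status and --no-headers flags; the status value Running is an assumed example of a workload state.
      # List only running workspace workloads without table headers, e.g. for scripting\nrunai workspace list --status Running --no-headers\n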
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_list/#options","title":"Options","text":"
        -A, --all              list workloads from all projects\n  -h, --help             help for list\n      --json             Output structure JSON\n      --limit int32      number of workload in list (default 50)\n      --no-headers       Output structure table without headers\n      --offset int32     offset number of limit, default 0 (first offset)\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string    filter by workload state\n      --table            Output structure table\n      --yaml             Output structure YAML\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_list/#see-also","title":"SEE ALSO","text":"
      • runai workspace - workspace management
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_logs/","title":"Runai workspace logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_logs/#runai-workspace-logs","title":"runai workspace logs","text":"

      view logs of a workspace job

      runai workspace logs [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_logs/#examples","title":"Examples","text":"
      # Get logs for a workspace\nrunai workspace logs workspace-01\n\n# Get logs for a specific pod in a workspace\nrunai workspace logs workspace-01 --pod=workspace-01-worker-0\n\n# Get logs for a specific container in a workspace\nrunai workspace logs workspace-01 --container=workspace-worker\n\n# Get the last 100 lines of logs\nrunai workspace logs workspace-01 --tail=100\n\n# Get logs with timestamps\nrunai workspace logs workspace-01 --timestamps\n\n# Follow the logs\nrunai workspace logs workspace-01 --follow\n\n# Get logs for the previous instance of the workspace\nrunai workspace logs workspace-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai workspace logs workspace-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai workspace logs workspace-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai workspace logs workspace-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for workspace to be ready for logs\nrunai workspace logs workspace-01 --wait-timeout=30s\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_logs/#options","title":"Options","text":"
        -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_logs/#see-also","title":"SEE ALSO","text":"
      • runai workspace - workspace management
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_port-forward/","title":"Runai workspace port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_port-forward/#runai-workspace-port-forward","title":"runai workspace port-forward","text":"

      forward one or more local ports to a workspace job

      runai workspace port-forward [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_port-forward/#examples","title":"Examples","text":"
      # Forward connections from localhost:8080 to workspace on port 8090:\nrunai workspace port-forward workspace-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to workspace on port 8080:\nrunai workspace port-forward workspace-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to workspace on port 8090 and from localhost:6443 to workspace on port 443:\nrunai workspace port-forward workspace-01 --port 8080:8090 --port 6443:443 --address localhost\n
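      A further sketch that targets a specific pod with the documented --pod flag; the pod name and port 8888 are illustrative placeholders.
      # Forward connections from localhost:8888 to port 8888 of a specific workspace pod\nrunai workspace port-forward workspace-01 --pod workspace-01-worker-1 --port 8888:8888 --address localhost\n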
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_port-forward/#options","title":"Options","text":"
            --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_port-forward/#see-also","title":"SEE ALSO","text":"
      • runai workspace - workspace management
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_resume/","title":"Runai workspace resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_resume/#runai-workspace-resume","title":"runai workspace resume","text":"

      resume workspace

      runai workspace resume [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_resume/#examples","title":"Examples","text":"
      # Resume a workspace workload\nrunai workspace resume <workspace-name>\n\n# Resume a workspace workload in a specific project\nrunai workspace resume <workspace-name> -p <project_name>\n\n# Resume a workspace workload by UUID\nrunai workspace resume --uuid=<workspace_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_resume/#options","title":"Options","text":"
        -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_resume/#see-also","title":"SEE ALSO","text":"
      • runai workspace - workspace management
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_submit/","title":"Runai workspace submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_submit/#runai-workspace-submit","title":"runai workspace submit","text":"

      submit workspace

      runai workspace submit [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_submit/#examples","title":"Examples","text":"
      # Submit a workspace workload\nrunai workspace submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a workspace workload with arguments\nrunai workspace submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a workspace workload with a custom command\nrunai workspace submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a Jupyter notebook workspace\nrunai workspace submit <name> -p <project_name> -i jupyter/scipy-notebook --gpu-devices-request 1 --external-url container=8888 --name-prefix jupyter --command -- start-notebook.sh --NotebookApp.base_url='/${RUNAI_PROJECT}/${RUNAI_JOB_NAME}' --NotebookApp.token=''\n
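      The sketch below combines several of the resource and storage flags documented in the options that follow; the image, claim name, mount path, environment variable, and command are placeholders, not canonical values.
      # Submit a workspace with one GPU, an existing PVC mounted at /data and an environment variable\nrunai workspace submit <name> -p <project_name> -i python:3.11 --gpu-devices-request 1 --existing-pvc claimname=my-claim,path=/data -e MY_VAR=value --command -- sleep infinity\n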
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_submit/#options","title":"Options","text":"
            --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the fhe format name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. (Default: false)\n      --host-network                                   Whether to enable host networking. 
(Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preemptible                                    Workspace preemptible workloads can be scheduled above guaranteed quota but may be reclaimed at any time\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    takes the uid, gid, and supplementary groups fields from the token, if all the fields do not exist, uses the local running terminal user credentials. if any of the fields exist take only the existing fields\n      --s3 stringArray                                 s3 storage details. Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma seperated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. 
Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group ids of the container, options: fromTheImage|fromIdpToken|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --working-dir string                             Set the container's working directory\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_submit/#see-also","title":"SEE ALSO","text":"
      • runai workspace - workspace management
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_suspend/","title":"Runai workspace suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_suspend/#runai-workspace-suspend","title":"runai workspace suspend","text":"

      suspend workspace

      runai workspace suspend [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_suspend/#examples","title":"Examples","text":"
      # Suspend a workspace workload\nrunai workspace suspend <workspace-name>\n\n# Suspend a workspace workload in a specific project\nrunai workspace suspend <workspace-name> -p <project_name>\n\n# Suspend a workspace workload by UUID\nrunai workspace suspend --uuid=<workspace_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_suspend/#options","title":"Options","text":"
        -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_suspend/#see-also","title":"SEE ALSO","text":"
      • runai workspace - workspace management
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost/","title":"Runai xgboost","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost/#runai-xgboost","title":"runai xgboost","text":"

      alias for xgboost management

      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost/#options","title":"Options","text":"
        -h, --help   help for xgboost\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost/#see-also","title":"SEE ALSO","text":"
      • runai - Run:ai Command-line Interface
      • runai xgboost attach - attach to a running container in an xgboost training job
      • runai xgboost bash - open a bash shell in an xgboost training job
      • runai xgboost delete - delete xgboost training workload
      • runai xgboost describe - describe xgboost training
      • runai xgboost exec - execute a command in an xgboost training job
      • runai xgboost list - list xgboost training
      • runai xgboost logs - view logs of an xgboost training job
      • runai xgboost port-forward - forward one or more local ports to an xgboost training job
      • runai xgboost resume - resume xgboost training
      • runai xgboost submit - submit xgboost training
      • runai xgboost suspend - suspend xgboost training
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_attach/","title":"Runai xgboost attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_attach/#runai-xgboost-attach","title":"runai xgboost attach","text":"

      attach to a running container in an xgboost training job

      runai xgboost attach [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_attach/#examples","title":"Examples","text":"
      # Attaching to the main worker of an xgboost training\nrunai training xgboost attach xgboost-01 --tty --stdin\n\n# Attaching to a specific pod of an xgboost training\nrunai training xgboost attach xgboost-01 --pod xgboost-01-worker-1 --tty --stdin\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_attach/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_attach/#see-also","title":"SEE ALSO","text":"
      • runai xgboost - alias for xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_bash/","title":"Runai xgboost bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_bash/#runai-xgboost-bash","title":"runai xgboost bash","text":"

      open a bash shell in an xgboost training job

      runai xgboost bash [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_bash/#examples","title":"Examples","text":"
      # Open a bash shell in the xgboost training's main worker\nrunai training xgboost bash xgboost-01\n\n# Open a bash shell in a specific xgboost training worker\nrunai training xgboost bash xgboost-01 --pod xgboost-01-worker-1\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_bash/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_bash/#see-also","title":"SEE ALSO","text":"
      • runai xgboost - alias for xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_delete/","title":"Runai xgboost delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_delete/#runai-xgboost-delete","title":"runai xgboost delete","text":"

      delete xgboost training workload

      runai xgboost delete [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_delete/#examples","title":"Examples","text":"
      # Delete an xgboost training workload with a default project\nrunai training xgboost delete <xgboost-name>\n\n# Delete an xgboost training workload with a specific project\nrunai training xgboost delete <xgboost-name> -p <project_name>\n\n# Delete an xgboost training workload by UUID\nrunai training xgboost delete --uuid=<xgboost_uuid> -p <project_name>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_delete/#options","title":"Options","text":"
        -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_delete/#see-also","title":"SEE ALSO","text":"
      • runai xgboost - alias for xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_describe/","title":"Runai xgboost describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_describe/#runai-xgboost-describe","title":"runai xgboost describe","text":"

      describe xgboost training

      runai xgboost describe [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_describe/#examples","title":"Examples","text":"
      # Describe an xgboost training workload with a default project\nrunai training xgboost describe <xgboost-name>\n\n# Describe an xgboost training workload in a specific project\nrunai training xgboost describe <xgboost-name> -p <project_name>\n\n# Describe an xgboost training workload by UUID\nrunai training xgboost describe --uuid=<xgboost_uuid>\n\n# Describe an xgboost training workload with specific output format\nrunai training xgboost describe <xgboost-name> -o json\n\n# Describe an xgboost training workload with specific sections\nrunai training xgboost describe <xgboost-name> --general --compute --pods --events --networks\n\n# Describe an xgboost training workload with container details and custom limits\nrunai training xgboost describe <xgboost-name> --containers --pod-limit 20 --event-limit 100\n
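      An additional hedged combination of the documented output flags.
      # Describe an xgboost training workload as YAML, including per-container details\nrunai training xgboost describe <xgboost-name> -o yaml --containers\n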
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_describe/#options","title":"Options","text":"
            --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_describe/#see-also","title":"SEE ALSO","text":"
      • runai xgboost - alias for xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_exec/","title":"Runai xgboost exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_exec/#runai-xgboost-exec","title":"runai xgboost exec","text":"

      execute a command in an xgboost training job

      runai xgboost exec [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_exec/#examples","title":"Examples","text":"
      # Execute bash in the xgboost training's main worker\nrunai training xgboost exec xgboost-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the xgboost training's main worker\nrunai training xgboost exec xgboost-01 -- ls\n\n# Execute a command in a specific xgboost training worker\nrunai training xgboost exec xgboost-01 --pod xgboost-01-worker-1 -- nvidia-smi\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_exec/#options","title":"Options","text":"
        -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_exec/#see-also","title":"SEE ALSO","text":"
      • runai xgboost - alias for xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_list/","title":"Runai xgboost list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_list/#runai-xgboost-list","title":"runai xgboost list","text":"

      list xgboost training

      runai xgboost list [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_list/#examples","title":"Examples","text":"
      # List all xgboost training workloads\nrunai training xgboost list -A\n\n# List xgboost training workloads with default project\nrunai training xgboost list\n\n# List xgboost training workloads in a specific project\nrunai training xgboost list -p <project_name>\n\n# List all xgboost training workloads with a specific output format\nrunai training xgboost list -o wide\n\n# List xgboost training workloads with pagination\nrunai training xgboost list --limit 20 --offset 40\n
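      A scripting-oriented sketch using the documented --json output flag together with -A.
      # List xgboost training workloads across all projects as JSON, e.g. for piping into other tools\nrunai training xgboost list -A --json\n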
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_list/#options","title":"Options","text":"
        -A, --all              list workloads from all projects\n  -h, --help             help for list\n      --json             Output structure JSON\n      --limit int32      number of workload in list (default 50)\n      --no-headers       Output structure table without headers\n      --offset int32     offset number of limit, default 0 (first offset)\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string    filter by workload state\n      --table            Output structure table\n      --yaml             Output structure YAML\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_list/#see-also","title":"SEE ALSO","text":"
      • runai xgboost - alias for xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_logs/","title":"Runai xgboost logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_logs/#runai-xgboost-logs","title":"runai xgboost logs","text":"

      view logs of an xgboost training job

      runai xgboost logs [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_logs/#examples","title":"Examples","text":"
      # Get logs for an xgboost training\nrunai training xgboost logs xgboost-01\n\n# Get logs for a specific pod in an xgboost training\nrunai training xgboost logs xgboost-01 --pod=xgboost-01-worker-0\n\n# Get logs for a specific container in an xgboost training\nrunai training xgboost logs xgboost-01 --container=xgboost-worker\n\n# Get the last 100 lines of logs\nrunai training xgboost logs xgboost-01 --tail=100\n\n# Get logs with timestamps\nrunai training xgboost logs xgboost-01 --timestamps\n\n# Follow the logs\nrunai training xgboost logs xgboost-01 --follow\n\n# Get logs for the previous instance of the xgboost training\nrunai training xgboost logs xgboost-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai training xgboost logs xgboost-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai training xgboost logs xgboost-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai training xgboost logs xgboost-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for xgboost training to be ready for logs\nrunai training xgboost logs xgboost-01 --wait-timeout=30s\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_logs/#options","title":"Options","text":"
        -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_logs/#see-also","title":"SEE ALSO","text":"
      • runai xgboost - alias for xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_port-forward/","title":"Runai xgboost port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_port-forward/#runai-xgboost-port-forward","title":"runai xgboost port-forward","text":"

      forward one or more local ports to an xgboost training job

      runai xgboost port-forward [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_port-forward/#examples","title":"Examples","text":"
      # Forward connections from localhost:8080 to xgboost training on port 8090:\nrunai training xgboost port-forward xgboost-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to xgboost training on port 8080:\nrunai training xgboost port-forward xgboost-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to xgboost training on port 8090 and from localhost:6443 to xgboost training on port 443:\nrunai training xgboost port-forward xgboost-01 --port 8080:8090 --port 6443:443 --address localhost\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_port-forward/#options","title":"Options","text":"
            --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_port-forward/#see-also","title":"SEE ALSO","text":"
      • runai xgboost - alias for xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_resume/","title":"Runai xgboost resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_resume/#runai-xgboost-resume","title":"runai xgboost resume","text":"

      resume xgboost training

      runai xgboost resume [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_resume/#examples","title":"Examples","text":"
      # Resume an xgboost training workload\nrunai training xgboost resume <xgboost-name>\n\n# Resume an xgboost training workload in a specific project\nrunai training xgboost resume <xgboost-name> -p <project_name>\n\n# Resume an xgboost training workload by UUID\nrunai training xgboost resume --uuid=<xgboost_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_resume/#options","title":"Options","text":"
        -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_resume/#see-also","title":"SEE ALSO","text":"
      • runai xgboost - alias for xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_submit/","title":"Runai xgboost submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_submit/#runai-xgboost-submit","title":"runai xgboost submit","text":"

      submit xgboost training

      runai xgboost submit [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_submit/#examples","title":"Examples","text":"
      # Submit a xgboost training workload\nrunai training xgboost submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a xgboost training workload with arguments\nrunai training xgboost submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a xgboost training workload with a custom command\nrunai training xgboost submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a xgboost training master args with worker args\nrunai training xgboost submit <name> -p <project_name> -i ubuntu --master-args \"-a master_arg_a -b master-arg_b'\" -- '-a worker_arg_a'\n\n# Submit a xgboost training master command with worker args\nrunai training xgboost submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" -- '-a worker_arg_a'\n\n# Submit a xgboost training master command with worker command\nrunai training xgboost submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" --command -- echo -e 'worker command'\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_submit/#options","title":"Options","text":"
            --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n      --clean-pod-policy string                        Specifies which pods will be deleted when the workload reaches a terminal state (completed/failed)\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the fhe format name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. 
(Default: false)\n      --host-network                                   Whether to enable host networking. (Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --master-args string                             Specifies the arguments to pass to the master pod container command\n      --master-command string                          Specifies the command to run in the master pod container, overriding the image's default entrypoint. The command can include arguments following it.\n      --master-environment stringArray                 Set master environment variables in the container\n      --master-extended-resource stringArray           Request access to an extended resource. Use the format: resource_name=quantity\n      --master-gpu-devices-request int32               GPU units to allocate for the job (e.g. 1, 2)\n      --master-gpu-portion-limit float                 GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-gpu-portion-request float               GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-no-pvcs                                 Do not mount any persistent volumes in the master pod\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. 
To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    takes the uid, gid, and supplementary groups fields from the token, if all the fields do not exist, uses the local running terminal user credentials. if any of the fields exist take only the existing fields\n      --s3 stringArray                                 s3 storage details. Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma seperated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group ids of the container, options: fromTheImage|fromIdpToken|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --workers int32                                  the number of workers that will be allocated for running the workload\n      --working-dir string                             Set the container's working directory\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_submit/#see-also","title":"SEE ALSO","text":"
      • runai xgboost - alias for xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_suspend/","title":"Runai xgboost suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_suspend/#runai-xgboost-suspend","title":"runai xgboost suspend","text":"

      suspend xgboost training

      runai xgboost suspend [WORKLOAD_NAME] [flags]\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_suspend/#examples","title":"Examples","text":"
      # Suspend a xgboost training workload\nrunai training xgboost suspend <xgboost-name>\n\n# Suspend a xgboost training workload in a specific project\nrunai training xgboost suspend <xgboost-name> -p <project_name>\n\n# Suspend a xgboost training workload by UUID\nrunai training xgboost suspend --uuid=<xgboost_uuid>\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_suspend/#options","title":"Options","text":"
        -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
            --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
      "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_suspend/#see-also","title":"SEE ALSO","text":"
      • runai xgboost - alias for xgboost management
      "},{"location":"Researcher/cli-reference/new-cli/guides/set-kubeconfig-with-oidc-parameters/","title":"Add Run:ai authorization to kubeconfig","text":"

The runai kubeconfig set command allows users to configure their kubeconfig file with a Run:ai authorization token. This setup enables users to gain seamless access to the Kubernetes (k8s) cluster.

      Note

      Setting kubeconfig is not required in order to use the CLI. This command is used to enable third-party workloads under Run:ai authorization.

      "},{"location":"Researcher/cli-reference/new-cli/guides/set-kubeconfig-with-oidc-parameters/#usage","title":"Usage","text":"

      To set the token (will be fetched automatically) inside the kubeconfig file, run the following command:

      runai kubeconfig set\n
      "},{"location":"Researcher/cli-reference/new-cli/guides/set-kubeconfig-with-oidc-parameters/#prerequisites","title":"Prerequisites","text":"

      Before executing the command, ensure that

      1. Cluster authentication is configured and enabled.
      2. The user has a kubeconfig file configured.
      3. The user is logged in (use the runai login command).
      "},{"location":"Researcher/cli-reference/new-cli/guides/set-kubeconfig-with-oidc-parameters/#cluster-configuration","title":"Cluster configuration","text":"

To enable cluster authentication, add the following flags to the Kubernetes API server of each cluster:

spec:\n  containers:\n  - command:\n    ...\n    - --oidc-client-id=<OIDC_CLIENT_ID>\n    - --oidc-issuer-url=https://<HOST>/auth/realms/<REALM>\n    - --oidc-username-prefix=-\n
      "},{"location":"Researcher/cli-reference/new-cli/guides/set-kubeconfig-with-oidc-parameters/#user-kubeconfig-configuration","title":"User Kubeconfig configuration","text":"

Add the following to the Kubernetes client configuration file (~/.kube/config). For the full command reference, see kubeconfig set.

      • Make sure to replace values with the actual cluster information and user credentials.
      • There can be multiple contexts in the kubeconfig file. The command will configure the current context.
      apiVersion: v1\nkind: Config\npreferences:\n  colors: true\ncurrent-context: <CONTEXT_NAME>\ncontexts:\n- context:\n    cluster: <CLUSTER_NAME>\n    user: <USER_NAME>\n  name: <CONTEXT_NAME>\nclusters:\n- cluster:\n    server: <CLUSTER_URL>\n    certificate-authority-data: <CLUSTER_CERT>\n  name: <CLUSTER_NAME>\nusers:\n- name: <USER_NAME>\n
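As a quick verification, and assuming the prerequisites above are met, you can set the token and then confirm cluster access with a standard kubectl call. This is a minimal sketch; the namespace is an assumption (Run:ai project namespaces are typically prefixed with runai-):

runai kubeconfig set
kubectl get pods -n runai-<project_name>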
      "},{"location":"Researcher/scheduling/GPU-time-slicing-scheduler/","title":"GPU Time Slicing Scheduler","text":""},{"location":"Researcher/scheduling/GPU-time-slicing-scheduler/#new-time-slicing-scheduler-by-runai","title":"New Time-slicing scheduler by Run:ai","text":"

      To provide customers with predictable and accurate GPU compute resources scheduling, Run:ai is introducing a new feature called Time-slicing GPU scheduler which adds fractional compute capabilities on top of other existing Run:ai memory fractions capabilities. Unlike the default NVIDIA GPU orchestrator which doesn\u2019t provide the ability to split or limit the runtime of each workload, Run:ai created a new mechanism that gives each workload exclusive access to the full GPU for a limited amount of time (lease time) in each scheduling cycle (plan time). This cycle repeats itself for the lifetime of the workload.

      Using the GPU runtime this way guarantees a workload is granted its requested GPU compute resources proportionally to its requested GPU fraction.

      Run:ai offers two new Time-slicing modes:

1. Strict\u2014each workload gets its precise GPU compute fraction, which equals its requested GPU (memory) fraction. In terms of official Kubernetes resource specification, this means:
      gpu-compute-request = gpu-compute-limit = gpu-(memory-)fraction\n
2. Fair\u2014each workload is guaranteed at least its GPU compute fraction, but at the same time can also use additional GPU runtime compute slices that are not used by other idle workloads. Those excess time slices are divided equally between all workloads running on that GPU (after each got at least its requested GPU compute fraction). In terms of official Kubernetes resource specification, this means:
      gpu-compute-request = gpu-(memory-)fraction\n\ngpu-compute-limit = 1.0\n
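As a short worked example using the notation above, a workload submitted with a GPU fraction of 0.5 is translated as follows in each mode:

Strict: gpu-compute-request = gpu-compute-limit = 0.5 (exactly half of every plan cycle)
Fair: gpu-compute-request = 0.5, gpu-compute-limit = 1.0 (at least half of every plan cycle, plus any idle slices)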

      The figure below illustrates how Strict time-slicing mode is using the GPU from Lease (slice) and Plan (cycle) perspective:

      The figure below illustrates how Fair time-slicing mode is using the GPU from Lease (slice) and Plan (cycle) perspective:

      "},{"location":"Researcher/scheduling/GPU-time-slicing-scheduler/#setting-the-time-slicing-scheduler-policy","title":"Setting the Time-slicing scheduler policy","text":"

      Time-slicing is a cluster flag which changes the default behavior of Run:ai GPU fractions feature.

      Enable time-slicing by setting the following cluster flag in the runaiconfig file:

      global: \n    core: \n        timeSlicing: \n            mode: fair/strict\n

If the timeSlicing flag is not set, the system continues to use the default NVIDIA GPU orchestrator to maintain backward compatibility.
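If you prefer to apply the flag from a terminal instead of editing the file, the following is a minimal sketch that follows the same runaiconfig patch pattern used elsewhere in this documentation; it assumes the cluster runs in the runai namespace and that timeSlicing sits under spec.global.core like the other flags shown here:

kubectl patch -n runai runaiconfigs.run.ai/runai --type='merge' --patch '{"spec":{"global":{"core":{"timeSlicing":{"mode":"fair"}}}}}'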

      "},{"location":"Researcher/scheduling/GPU-time-slicing-scheduler/#time-slicing-plan-and-lease-times","title":"Time-slicing Plan and Lease Times","text":"

Each GPU scheduling cycle is a plan. The plan time is determined by the lease time and the granularity (precision). By default, the basic lease time is 250ms with 5% granularity (precision), which means the plan (cycle) time is: 250 / 0.05 = 5000ms (5 seconds). Using these values, a workload that asked to get gpu-fraction=0.5 gets 2.5s of runtime out of the 5s cycle time.

Different workloads require different SLAs and precision, so it is also possible to tune the lease time and precision to customize the time-slicing capabilities for your cluster.

      Note

Decreasing the lease time makes time-slicing less accurate. Increasing the lease time makes the system more accurate, but each workload is less responsive.

Once timeSlicing is enabled, all submitted GPU fraction or GPU memory workloads will have their gpu-compute-request\\limit set automatically by the system, depending on the annotation used and the timeSlicing mode:

      "},{"location":"Researcher/scheduling/GPU-time-slicing-scheduler/#strict-compute-resources","title":"Strict Compute Resources","text":"Annotation Value GPU Compute Request GPU Compute Limit gpu-fraction x x x gpu-memory x 0 1.0"},{"location":"Researcher/scheduling/GPU-time-slicing-scheduler/#fair-compute-resources","title":"Fair Compute Resources","text":"Annotation Value GPU Compute Request GPU Compute Limit gpu-fraction x x 1.0 gpu-memory x 0 1.0

      Note

The above tables show that when submitting a workload using the gpu-memory annotation, the system splits the GPU compute time between the different workloads running on that GPU. This means the workload can get anywhere from very little compute time (>0) to the full GPU compute time (1.0).

      "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/","title":"Introduction","text":"

      When we discuss the allocation of deep learning compute resources, the discussion tends to focus on GPUs as the most critical resource. But two additional resources are no less important:

      • CPUs. Mostly needed for preprocessing and postprocessing tasks during a deep learning training run.
      • Memory. Has a direct influence on the quantities of data a training run can process in batches.

      GPU servers tend to come installed with a significant amount of memory and CPUs.

      "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#requesting-cpu-memory","title":"Requesting CPU & Memory","text":"

      When submitting a Job, you can request a guaranteed amount of CPUs and memory by using the --cpu and --memory flags in the runai submit command. For example:

      runai submit job1 -i ubuntu --gpu 2 --cpu 12 --memory 1G\n

      The system guarantees that if the Job is scheduled, you will be able to receive this amount of CPU and memory.

      For further details on these flags see: runai submit

      "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#cpu-over-allocation","title":"CPU over allocation","text":"

      The number of CPUs your Job will receive is guaranteed to be the number defined using the --cpu flag. In practice, however, you may receive more CPUs than you have asked for:

      • If you are currently alone on a node, you will receive all the node CPUs until such time when another workload has joined.
• However, when a second workload joins, each workload will receive a number of CPUs proportional to the number requested via the --cpu flag. For example, if the first workload asked for 1 CPU and the second for 3 CPUs, then on a node with 40 CPUs, the workloads will receive 10 and 30 CPUs respectively. If the flag --cpu is not specified, it will be taken from the cluster default (see the section below).
      "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#memory-over-allocation","title":"Memory over allocation","text":"

      The amount of Memory your Job will receive is guaranteed to be the number defined using the --memory flag. In practice, however, you may receive more memory than you have asked for. This is along the same lines as described with CPU over allocation above.

      It is important to note, however, that if you have used this memory over-allocation, and new workloads have joined, your Job may receive an out-of-memory exception and terminate.

      "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#cpu-and-memory-limits","title":"CPU and Memory limits","text":"

      You can limit your Job's allocation of CPU and memory by using the --cpu-limit and --memory-limit flags in the runai submit command. For example:

      runai submit job1 -i ubuntu --gpu 2 --cpu 12 --cpu-limit 24 \\\n    --memory 1G --memory-limit 4G\n

      The limit behavior is different for CPUs and memory.

      • Your Job will never be allocated with more than the amount stated in the --cpu-limit flag
      • If your Job tries to allocate more than the amount stated in the --memory-limit flag it will receive an out-of-memory exception.

      The limit (for both CPU and memory) overrides the cluster default described in the section below

      For further details on these flags see: runai submit

      "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#flag-defaults","title":"Flag Defaults","text":""},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#defaults-for-cpu-flag","title":"Defaults for --cpu flag","text":"

      If your Job has not specified --cpu, the system will use a default. The default is cluster-wide and is defined as a ratio of GPUs to CPUs.

      If, for example, the default has been defined as 1:6 and your Job has specified --gpu 2 and has not specified --cpu, then the implied --cpu flag value is 12 CPUs.

      The system comes with a cluster-wide default of 1:1. To change the ratio see below.

If you did not request any GPUs for your Job and have not specified --cpu, the default is defined as a ratio of the CPU limit to the CPU request.

      If, for example, the default has been defined as 1:0.2 and your Job has specified --cpu-limit 10 and has not specified --cpu, then the implied --cpu flag value is 2 CPUs.

      The system comes with a cluster-wide default of 1:0.1. To change the ratio see below.

      "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#defaults-for-memory-flag","title":"Defaults for --memory flag","text":"

      If your Job has not specified --memory, the system will use a default. The default is cluster-wide and is proportional to the number of requested GPUs.

      The system comes with a cluster-wide default of 100MiB of allocated CPU memory per GPU. To change the ratio see below.

If you did not request any GPUs for your Job and have not specified --memory, the default is defined as a ratio of the CPU Memory limit to the CPU Memory request.

      The system comes with a cluster-wide default of 1:0.1. To change the ratio see below.

      "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#defaults-for-cpu-limit-flag","title":"Defaults for --cpu-limit flag","text":"

      If your Job has not specified --cpu-limit, then by default, the system will not set a limit. You can set a cluster-wide limit as a ratio of GPUs to CPUs. See below on how to change the ratio.

      "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#defaults-for-memory-limit-flag","title":"Defaults for --memory-limit flag","text":"

      If your Job has not specified --memory-limit, then by default, the system will not set a limit. You can set a cluster-wide limit as a ratio of GPUs to Memory. See below on how to change the ratio.

      "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#changing-the-ratios","title":"Changing the ratios","text":"

To change the cluster-wide ratio, use the following process. The example shows:

      • a CPU request with a default ratio of 2:1 CPUs to GPUs.
      • a CPU Memory request with a default ratio of 200MB per GPU.
      • a CPU limit with a default ratio of 4:1 CPU to GPU.
      • a Memory limit with a default ratio of 2GB per GPU.
      • a CPU request with a default ratio of 0.1 CPUs per 1 CPU limit.
      • a CPU Memory request with a default ratio of 0.1:1 request per CPU Memory limit.

      You must edit the cluster installation values file:

      • When installing the Run:ai cluster, edit the values file.
      • On an existing installation, use the upgrade cluster instructions to modify the values file.
      • You must specify at least the first 4 values as follows:
      runai-operator:\n  config:\n    limitRange:\n      cpuDefaultRequestGpuFactor: 2\n      memoryDefaultRequestGpuFactor: 200Mi\n      cpuDefaultLimitGpuFactor: 4\n      memoryDefaultLimitGpuFactor: 2Gi\n      cpuDefaultRequestCpuLimitFactorNoGpu: 0.1\n      memoryDefaultRequestMemoryLimitFactorNoGpu: 0.1\n
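For illustration only, if your cluster was installed with Helm, applying the modified values file could look like the sketch below. The release name, chart reference, and namespace are assumptions and should be taken from your own installation and the upgrade cluster instructions:

helm upgrade runai-cluster runai/runai-cluster -n runai -f values.yaml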
      "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#validating-cpu-memory-allocations","title":"Validating CPU & Memory Allocations","text":"

      To review CPU & Memory allocations you need to look into Kubernetes. A Run:ai Job creates a Kubernetes pod. The pod declares its resource requests and limits. To see the memory and CPU consumption in Kubernetes:

      • Get the pod name for the Job by running:

      runai describe job <JOB_NAME>

      the pod will appear under the PODS category.

      • Run:

      kubectl describe pod <POD_NAME>

      The information will appear under Requests and Limits. For example:

      Limits:\n    nvidia.com/gpu:  2\nRequests:\n    cpu:             1\n    memory:          104857600\n    nvidia.com/gpu:  2\n
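To print only the requests and limits without the full describe output, you can use standard kubectl output formatting. This is a sketch; the namespace is an assumption (Run:ai project namespaces are typically prefixed with runai-):

kubectl get pod <POD_NAME> -n runai-<project_name> -o jsonpath='{.spec.containers[0].resources}'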
      "},{"location":"Researcher/scheduling/dynamic-gpu-fractions/","title":"Dynamic GPU Fractions","text":""},{"location":"Researcher/scheduling/dynamic-gpu-fractions/#introduction","title":"Introduction","text":"

Many AI workloads use GPU resources intermittently, and sometimes not at all. These workloads need GPU resources while they are running AI applications or debugging a model in development. Other workloads, such as Inference, might use GPU resources at a lower utilization rate than requested, and may suddenly ask for higher guaranteed resources at peak utilization times.

      This pattern of resource request vs. actual resource utilization causes lower utilization of GPUs. This mainly happens if there are many workloads requesting resources to match their peak demand, even though the majority of the time they operate far below that peak.

Run:ai introduced Dynamic GPU fractions in v2.15 to cope with this gap between resource requests and actual resource utilization, enabling users to optimize GPU resource usage.

      Dynamic GPU fractions is part of Run:ai's core capabilities to enable workloads to optimize the use of GPU resources. This works by providing the ability to specify and consume GPU memory and compute resources dynamically by leveraging Kubernetes Request and Limit notations.

      Dynamic GPU fractions allow a workload to request a guaranteed fraction of GPU memory or GPU compute resource (similar to a Kubernetes request), and at the same time also request the ability to grow beyond that guaranteed request up to a specific limit (similar to a Kubernetes limit), if the resources are available.

      For example, with Dynamic GPU Fractions, a user can specify a workload with a GPU fraction Request of 0.25 GPU, and add the parameter gpu-fraction-limit of up to 0.80 GPU. The cluster/node-pool scheduler schedules the workload to a node that can provide the GPU fraction request (0.25), and then assigns the workload to a GPU. The GPU scheduler monitors the workload and allows it to occupy memory between 0 to 0.80 of the GPU memory (based on the parameter gpu-fraction-limit), where only 0.25 of the GPU memory is guaranteed to that workload. The rest of the memory (from 0.25 to 0.8) is \u201cloaned\u201d to the workload, as long as it is not needed by other workloads.
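As a hedged sketch of the same 0.25/0.80 example using the CLI, assuming the training submit command accepts the same GPU portion flags listed for the other submit commands in this reference (the workload name, project, and image are placeholders):

runai training submit frac-workload -p <project_name> -i <image> --gpu-portion-request 0.25 --gpu-portion-limit 0.8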

Run:ai automatically manages the state changes between Request and Limit, as well as the reverse (when the balance needs to be \"returned\"), updating the metrics and the workloads' states and graphs.

      "},{"location":"Researcher/scheduling/dynamic-gpu-fractions/#setting-fractional-gpu-memory-limit","title":"Setting Fractional GPU Memory Limit","text":"

      With the fractional GPU memory limit, users can submit workloads using GPU fraction Request and Limit.

      You can either:

      1. Use a GPU Fraction parameter (use the gpu-fraction annotation)

        or

      2. Use an absolute GPU Memory parameter (gpu-memory annotation)

      When setting a GPU memory limit either as GPU fraction, or GPU memory size, the Limit must be equal or greater than the GPU fraction memory request.

      Both GPU fraction and GPU memory are translated into the actual requested memory size of the Request (guaranteed resources) and the Limit (burstable resources).

      To guarantee fair quality of service between different workloads using the same GPU, Run:ai developed an extendable GPU OOMKiller (Out Of Memory Killer) component that guarantees the quality of service using Kubernetes semantics for resources Request and Limit.

      The OOMKiller capability requires adding CAP_KILL capabilities to the Dynamic GPU fraction and to the Run:ai core scheduling module (toolkit daemon). This capability is disabled by default.

      To change the state of Dynamic GPU Fraction in the cluster, edit the runaiconfig file and set:

      spec: \n  global: \n    core: \n      dynamicFraction: \n        enabled: true # Boolean field default is true.\n
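Alternatively, a minimal terminal sketch following the same patch pattern used for other runaiconfig flags in this documentation (the runai namespace is assumed):

kubectl patch -n runai runaiconfigs.run.ai/runai --type='merge' --patch '{"spec":{"global":{"core":{"dynamicFraction":{"enabled": true}}}}}'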

      To set the gpu memory limit per workload, add the RUNAI_GPU_MEMORY_LIMIT environment variable to the first container in the pod. This is the GPU consuming container.

      To use RUNAI_GPU_MEMORY_LIMIT environment variable:

      1. Submit a workload yaml directly, and set the RUNAI_GPU_MEMORY_LIMIT environment variable.

      2. Create a policy, per Project or globally. For example, set all Interactive workloads of Project=research_vision1 to always set the environment variable of RUNAI_GPU_MEMORY_LIMIT to 1.

      3. Pass the environment variable through the CLI or the UI.

      The supported values depend on the label used. You can use them in either the UI or the CLI. Use only one of the variables in the following table (they cannot be mixed):

Variable | Input format
gpu-fraction | A fraction value (for example: 0.25, 0.75).
gpu-memory | A Kubernetes resource quantity, which must be larger than the gpu-memory request. For example, 500000000, 2500M, 4G. NOTE: The gpu-memory label values are always in MB, unlike the env variable.
"},{"location":"Researcher/scheduling/dynamic-gpu-fractions/#compute-resources-ui-with-dynamic-fractions-support","title":"Compute Resources UI with Dynamic Fractions support","text":"

To enable the UI elements for Dynamic Fractions, press Settings, General, then open the Resources pane and toggle GPU Resource Optimization. This enables all the UI features related to GPU Resource Optimization for the whole tenant. There are other per-cluster or per-node-pool configurations that must be set in order to use the capabilities of \u2018GPU Resource Optimization\u2019; see the documentation for each of these features. Once the \u2018GPU Resource Optimization\u2019 feature is enabled, you can create Compute Resources with the GPU Portion (Fraction) Limit and GPU Memory Limit. In addition, you can view the workloads\u2019 utilization vs. Request and Limit parameters in the Metrics pane for each workload.

      Note

When setting a workload with Dynamic Fractions (for example, when using it with GPU Request or GPU memory Limits), you effectively make the workload burstable. This means it can use memory that is not guaranteed for that workload and is susceptible to an \u2018OOM Kill\u2019 signal if the actual owner of that memory requires it back. This applies to non-preemptive workloads as well. For that reason, it is recommended that you use Dynamic Fractions with Interactive workloads running Notebooks. Notebook pods are not evicted when their GPU process is OOM Kill\u2019ed. This behavior is the same as standard Kubernetes burstable CPU workloads.

      "},{"location":"Researcher/scheduling/dynamic-gpu-fractions/#multi-gpu-dynamic-fractions","title":"Multi-GPU Dynamic Fractions","text":"

Run:ai also supports workload submission using multi-GPU dynamic fractions. Multi-GPU dynamic fractions work similarly to dynamic fractions on a single GPU workload, however, instead of a single GPU device, the Run:ai Scheduler allocates the same dynamic fraction pair (Request and Limit) on multiple GPU devices within the same node. For example, if practitioners develop a new model that uses 8 GPUs and requires 40GB of memory per GPU, but may want to burst out and consume up to the full GPU memory, they can allocate 8\u00d740GB with multi-GPU fractions and a limit of 80GB (e.g. H100 GPU) instead of reserving the full memory of each GPU (e.g. 80GB). This leaves 40GB of GPU memory available on each of the 8 GPUs for other workloads within that node. This is useful during model development, where memory requirements are usually lower due to experimentation with smaller models or configurations.

      This approach significantly improves GPU utilization and availability, enabling more precise and often smaller quota requirements for the end user. Time sharing where single GPUs can serve multiple workloads with dynamic fractions remains unchanged, only now, it serves multiple workloads using multi-GPU per workload.

      "},{"location":"Researcher/scheduling/dynamic-gpu-fractions/#configuring-multi-gpu-dynamic-fractions","title":"Configuring Multi-GPU Dynamic Fractions","text":"

      You can configure multi-GPU dynamic fractions as follows:

      • Using the compute resources asset, you can define the compute requirement to run multiple GPU devices, by specifying either a fraction (percentage) of the overall memory or specifying the memory request (GB, MB), both with Request and Limit parameters:

      • You can submit a workload with dynamic fractions using the CLI V2:
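For illustration, a hedged CLI V2 sketch matching the 8x40GB request with an 80GB limit described above; the workload name, project, and image are placeholders, and the exact flag combination is an assumption based on the submit flags listed in this reference:

runai training submit multi-gpu-frac -p <project_name> -i <image> --gpu-devices-request 8 --gpu-memory-request 40G --gpu-memory-limit 80G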

      "},{"location":"Researcher/scheduling/fractions/","title":"Allocation of GPU Fractions","text":""},{"location":"Researcher/scheduling/fractions/#introduction","title":"Introduction","text":"

      A single GPU has a significant amount of memory. Ranging from a couple of gigabytes in older generations and up to 80GB per GPU in the later models of the latest NVIDIA GPU technology. A single GPU also has a vast amount of computing power.

      This amount of memory and computing power is important for processing large amounts of data, such as in training deep learning models. However, there are quite a few applications that do not need this power. Examples can be inference workloads and the model-creation phase. It would thus be convenient if we could divide up a GPU between various workloads, thus achieving better GPU utilization.

      This article describes a Run:ai technology called Fractions that allow the division of GPUs and how to use them with Run:ai.

      "},{"location":"Researcher/scheduling/fractions/#runai-fractions","title":"Run:ai Fractions","text":"

      Run:ai provides the capability to allocate a container with a specific amount of GPU RAM. As a researcher, if you know that your code needs 4GB of RAM. You can submit a job using the flag --gpu-memory 4G to specify the exact portion of the GPU memory that you need. Run:ai will allocate your container that specific amount of GPU RAM. Attempting to reach beyond your allotted RAM will result in an out-of-memory exception.

      You can also use the flag --gpu 0.2 to get 20% of the GPU memory on the GPU assigned for you.
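For example, both request styles can be expressed with the runai submit command shown earlier in this documentation (job names and image are placeholders):

runai submit frac1 -i ubuntu --gpu-memory 4G
runai submit frac2 -i ubuntu --gpu 0.2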

      For more details on Run:ai fractions see the fractions quickstart.

      Limitation

With the fraction technology, all running workloads that utilize the GPU share the compute in parallel and, on average, get an even share of the compute. For example, assuming two containers, one with a 0.25 GPU workload and the other with a 0.75 GPU workload, both will get (on average) an equal part of the computation power. If one of the workloads does not utilize the GPU, the other workload will get the entire GPU's compute power.

      Info

      For interoperability with other Kubernetes schedulers, Run:ai creates special reservation pods. Once a workload has been allocated a fraction of a GPU, Run:ai will create a pod in a dedicated runai-reservation namespace with the full GPU as a resource. This would cause other schedulers to understand that the GPU is reserved.

      "},{"location":"Researcher/scheduling/fractions/#multi-gpu-fractions","title":"Multi-GPU Fractions","text":"

Run:ai also supports workload submission using multi-GPU fractions. Multi-GPU fractions work similarly to fractional single GPU workloads, however, the Run:ai Scheduler allocates the same fraction size on multiple GPU devices within the same node. For example, if practitioners develop a new model that uses 8 GPUs and requires 40GB of memory per GPU, they can allocate 8\u00d740GB with multi-GPU fractions instead of reserving the full memory of each GPU (e.g. 80GB). This leaves 40GB of GPU memory available on each of the 8 GPUs for other workloads within that node. This is useful during model development, where memory requirements are usually lower due to experimentation with smaller models or configurations.

      This approach significantly improves GPU utilization and availability, enabling more precise and often smaller quota requirements for the end user. Time sharing where single GPUs can serve multiple workloads with fractions remains unchanged, only now, it serves multiple workloads using multi-GPU per workload, single GPU per workload, or a mix of both.

      "},{"location":"Researcher/scheduling/fractions/#configuring-multi-gpu-fractions","title":"Configuring Multi-GPU Fractions","text":"

      Using the compute resources asset, you can define the compute requirement to run multiple GPU devices, by specifying either a fraction (percentage) of the overall memory or specifying the memory request (GB, MB):

      "},{"location":"Researcher/scheduling/fractions/#see-also","title":"See Also","text":"
      • Fractions quickstart.
      "},{"location":"Researcher/scheduling/gpu-memory-swap/","title":"GPU Memory SWAP","text":""},{"location":"Researcher/scheduling/gpu-memory-swap/#introduction","title":"Introduction","text":"

      To ensure efficient and effective usage of an organization\u2019s resources, Run:ai provides multiple features on multiple layers to help administrators and practitioners maximize their existing GPUs resource utilization.

      Run:ai\u2019s GPU memory swap feature helps administrators and AI practitioners to further increase the utilization of existing GPU hardware by improving GPU sharing between AI initiatives and stakeholders. This is done by expanding the GPU physical memory to the CPU memory which is typically an order of magnitude larger than that of the GPU.

      Expanding the GPU physical memory, helps the Run:ai system to put more workloads on the same GPU physical hardware, and to provide a smooth workload context switching between GPU memory and CPU memory, eliminating the need to kill workloads when the memory requirement is larger than what the GPU physical memory can provide.

      "},{"location":"Researcher/scheduling/gpu-memory-swap/#benefits-of-gpu-memory-swap","title":"Benefits of GPU memory swap","text":"

      There are several use cases where GPU memory swap can benefit and improve the user experience and the system's overall utilization:

      "},{"location":"Researcher/scheduling/gpu-memory-swap/#sharing-a-gpu-between-multiple-interactive-workloads-notebooks","title":"Sharing a GPU between multiple interactive workloads (notebooks)","text":"

AI practitioners use notebooks to develop and test new AI models and to improve existing AI models. While developing or testing an AI model, notebooks use GPU resources intermittently, yet the required GPU resources are pre-allocated by the notebook and cannot be used by other workloads once a notebook has reserved them. To overcome this inefficiency, Run:ai introduced Dynamic Fractions and Node Level Scheduler.

      When one or more workloads require more than their requested GPU resources, there\u2019s a high probability not all workloads can run on a single GPU because the total memory required is larger than the physical size of the GPU memory.

With GPU memory swap, several workloads can run on the same GPU, even if the sum of their used memory is larger than the size of the physical GPU memory. GPU memory swap can swap workloads in and out interchangeably, allowing multiple workloads to each use the full amount of GPU memory. The most common scenario is for one workload to run on the GPU (for example, an interactive notebook), while other notebooks are either idle or using the CPU to develop new code (while not using the GPU). From a user experience point of view, the swap in and out is a smooth process since the notebooks do not notice that they are being swapped in and out of the GPU memory. On rare occasions, when multiple notebooks need to access the GPU simultaneously, slower workload execution may be experienced.

      Notebooks typically use the GPU intermittently, therefore with high probability, only one workload (for example, an interactive notebook), will use the GPU at a time. The more notebooks the system puts on a single GPU, the higher the chances are that there will be more than one notebook requiring the GPU resources at the same time. Admins have a significant role here in fine tuning the number of notebooks running on the same GPU, based on specific use patterns and required SLAs. Using \u2018Node Level Scheduler\u2019 reduces GPU access contention between different interactive notebooks running on the same node.

      "},{"location":"Researcher/scheduling/gpu-memory-swap/#sharing-a-gpu-between-inferenceinteractive-workloads-and-training-workloads","title":"Sharing a GPU between inference/interactive workloads and training workloads","text":"

      A single GPU can be shared between an interactive or inference workload (for example, a Jupyter notebook, image recognition services, or an LLM service), and a training workload that is not time-sensitive or delay-sensitive. At times when the inference/interactive workload uses the GPU, both training and inference/interactive workloads share the GPU resources, each running part of the time swapped-in to the GPU memory, and swapped-out into the CPU memory the rest of the time.

      Whenever the inference/interactive workload stops using the GPU, the swap mechanism swaps out the inference/interactive workload GPU data to the CPU memory. Kubernetes wise, the POD is still alive and running using the CPU. This allows the training workload to run faster when the inference/interactive workload is not using the GPU, and slower when it does, thus sharing the same resource between multiple workloads, fully utilizing the GPU at all times, and maintaining uninterrupted service for both workloads.

      "},{"location":"Researcher/scheduling/gpu-memory-swap/#serving-inference-warm-models-with-gpu-memory-swap","title":"Serving inference warm models with GPU memory swap","text":"

      Running multiple inference models is a demanding task and you will need to ensure that your SLA is met. You need to provide high performance and low latency, while maximizing GPU utilization. This becomes even more challenging when the exact model usage patterns are unpredictable. You must plan for the agility of inference services and strive to keep models on standby in a ready state rather than an idle state.

Run:ai\u2019s GPU memory swap feature enables you to load multiple models to a single GPU, where each can use up to the full amount of GPU memory. Using an application load balancer, the administrator can control to which server each inference request is sent. Then the GPU can be loaded with multiple models, where the model in use is loaded into the GPU memory and the rest of the models are swapped out to the CPU memory. The swapped models are stored as ready models to be loaded when required. GPU memory swap always maintains the context of the workload (model) on the GPU so it can easily and quickly switch between models. This is unlike industry-standard model servers that load models from scratch into the GPU whenever required.

      "},{"location":"Researcher/scheduling/gpu-memory-swap/#configuring-memory-swap","title":"Configuring memory swap","text":"

Prerequisites\u2014before configuring GPU memory swap, the administrator must configure the Dynamic Fractions feature, and optionally configure the Node Level Scheduler feature.

      The first enables you to make your workloads burstable, and both features will maximize your workloads\u2019 performance and GPU utilization within a single node.

To enable GPU memory swap in a Run:ai cluster, the administrator must update the runaiconfig file with the following parameters:

      spec: \n global: \n   core: \n     swap:\n       enabled: true\n       limits:\n         cpuRam: 100Gi\n

      The example above uses 100Gi as the size of the swap memory.

      You can also use the patch command from your terminal:

      kubectl patch -n runai runaiconfigs.run.ai/runai --type='merge' --patch '{\"spec\":{\"global\":{\"core\":{\"swap\":{\"enabled\": true, \"limits\": {\"cpuRam\": \"100Gi\"}}}}}}'\n

      To make a workload swappable, a number of conditions must be met:

      1. The workload MUST use Dynamic Fractions. This means the workload\u2019s memory request is less than a full GPU, but it may add a GPU memory limit to allow the workload to effectively use the full GPU memory.

2. The administrator must label each node that should provide GPU memory swap with run.ai/swap-enabled=true; this enables the feature on that node (see the example command after this list). Enabling the feature reserves CPU memory to serve the swapped GPU memory from all GPUs on that node. The administrator sets the size of the reserved CPU RAM using the runaiconfig file.

      3. Optionally, configure Node Level Scheduler. Using node level scheduler can help in the following ways:

        • The Node Level Scheduler automatically spreads workloads between the different GPUs on a node, ensuring maximum workload performance and GPU utilization.
        • In scenarios where Interactive notebooks are involved, if the CPU reserved memory for the GPU swap is full, the Node Level Scheduler preempts the GPU process of that workload and potentially routes the workload to another GPU to run.
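The node labeling mentioned in step 2 above can be done with a standard kubectl command, for example (the node name is a placeholder):

kubectl label node <NODE_NAME> run.ai/swap-enabled=true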
      "},{"location":"Researcher/scheduling/gpu-memory-swap/#configure-system-reserved-gpu-resources","title":"Configure system reserved GPU Resources","text":"

Swappable workloads require reserving a small part of the GPU for non-swappable allocations like binaries and GPU context. To avoid out-of-memory (OOM) errors due to non-swappable memory regions, the system reserves 2GiB of GPU RAM by default, effectively truncating the total size of the GPU memory. For example, a 16GiB T4 will appear as 14GiB on a swap-enabled node. The exact reserved size is application-dependent, and 2GiB is a safe assumption for 2-3 applications sharing and swapping on a GPU. This value can be changed by editing the runaiconfig specification as follows:

      spec: \n global: \n   core: \n     swap:\n       limits:\n         reservedGpuRam: 2Gi\n

      You can also use the patch command from your terminal:

      kubectl patch -n runai runaiconfigs.run.ai/runai --type='merge' --patch '{\"spec\":{\"global\":{\"core\":{\"swap\":{\"limits\":{\"reservedGpuRam\": <quantity>}}}}}}'\n

      This configuration is in addition to the Dynamic Fractions configuration, and optional Node Level Scheduler configuration.

      "},{"location":"Researcher/scheduling/gpu-memory-swap/#preventing-your-workloads-from-getting-swapped","title":"Preventing your workloads from getting swapped","text":"

If you prefer that your workloads not be swapped into CPU memory, you can specify an anti-affinity to the run.ai/swap-enabled=true node label when submitting your workloads, and the Scheduler will avoid swap-enabled nodes.

      "},{"location":"Researcher/scheduling/gpu-memory-swap/#known-limitations","title":"Known Limitations","text":"
• A pod created before the GPU memory swap feature was enabled in the cluster cannot be scheduled to a swap-enabled node. An event is generated if no matching node is found. Users must re-submit those pods to make them swap-enabled.
      • GPU memory swap cannot be enabled if fairshare time-slicing or strict time-slicing is used, GPU memory swap can only be used with the default time-slicing mechanism.
      • CPU RAM size cannot be decreased once GPU memory swap is enabled.
      "},{"location":"Researcher/scheduling/gpu-memory-swap/#what-happens-when-the-cpu-reserved-memory-for-gpu-swap-is-exhausted","title":"What happens when the CPU reserved memory for GPU swap is exhausted?","text":"

CPU memory is limited, and a single CPU typically serves between 2 and 8 GPUs on a node. For example, with 80GB of GPU memory per GPU and each GPU shared between 2-4 workloads, each swapped workload may consume up to 80GB of CPU RAM (though it may use less). In this example, you can see how the swap memory can become very large. Therefore, administrators are given a way to limit the size of the CPU reserved memory for swapped GPU memory on each swap-enabled node.

Limiting the CPU reserved memory means that there may be scenarios where the GPU memory cannot be swapped out to the CPU reserved RAM. Whenever the CPU reserved memory for swapped GPU memory is exhausted, the workloads currently running are not swapped out to the CPU reserved RAM; instead, Node Level Scheduler logic takes over and provides GPU resource optimization. See Node Level Scheduler.

      "},{"location":"Researcher/scheduling/gpu-memory-swap/#multi-gpu-memory-swap","title":"Multi-GPU Memory Swap","text":"

      Run:ai also supports workload submission using multi-GPU memory swap. Multi-GPU memory swap works similarly to single GPU memory swap, but instead of swapping memory for a single GPU workload, it swaps memory for workloads across multiple GPUs simultaneously and synchronously.

      The Run:ai Scheduler allocates the same dynamic fraction pair (Request and Limit) on multiple GPU devices in the same node. For example, if you want to run two LLM models, each consuming 8 GPUs that are not used simultaneously, you can use GPU memory swap to share their GPUs. This approach allows multiple models to be stacked on the same node.

      The following outlines the advantages of stacking multiple models on the same node:

      • Maximizes GPU utilization: Efficiently uses available GPU resources by enabling multiple workloads to share GPUs.
• Improves cold start times: Loading large LLM models to a node and its GPUs can take several minutes during a \u201ccold start\u201d. Using memory swap turns this process into a \u201cwarm start\u201d that takes only a fraction of a second to a few seconds (depending on the model size and the GPU model).
      • Increases GPU availability: Frees up and maximizes GPU availability for additional workloads (and users), enabling better resource sharing.
      • Smaller quota requirements: Enables more precise and often smaller quota requirements for the end user.
      "},{"location":"Researcher/scheduling/gpu-memory-swap/#configuring-multi-gpu-memory-swap","title":"Configuring multi-GPU memory swap","text":"

      You can configure multi-GPU memory swapping as follows:

      • Using the compute resources asset, you can define the compute requirement to run multiple GPU devices, by specifying either a fraction (percentage) of the overall memory or specifying the memory request (GB, MB). Both options require defining the Request and Limit parameters, Workloads can then be scheduled to nodes or node pools where memory swap is enabled.

      • You can submit a workload with dynamic fractions using the CLI V2:
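As a hedged illustration of the two-LLM example above, each model could be submitted with the same fraction pair across 8 GPUs. The names and image are placeholders, and the exact flag combination is an assumption based on the submit flags listed in this reference:

runai training submit llm-a -p <project_name> -i <image> --gpu-devices-request 8 --gpu-portion-request 0.5 --gpu-portion-limit 1.0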

      "},{"location":"Researcher/scheduling/node-level-scheduler/","title":"Optimize performance with Node Level Scheduler","text":"

The Node Level Scheduler optimizes the performance of your pods and maximizes the utilization of GPUs by making optimal local decisions on GPU allocation to your pods. While the Cluster Scheduler chooses the specific node for a pod but has no visibility into the node\u2019s internal GPU state, the Node Level Scheduler is aware of the local GPU states and makes optimal local decisions, optimizing both GPU utilization and the performance of the pods running on the node\u2019s GPUs.

      Node Level Scheduler applies to all workload types, but will best optimize the performance of burstable workloads, giving those more GPU memory than requested and up to the limit specified. Be aware, burstable workloads are always susceptible to an OOM Kill signal if the owner of the excess memory requires it back. This means that using the Node Level Scheduler with Inference or Training workloads may cause pod preemption. Interactive workloads that are using notebooks behave differently since the OOM Kill signal will cause the Notebooks' GPU process to exit but not the notebook itself. This keeps the Interactive pod running and retrying to attach a GPU again. This makes Interactive workloads with notebooks a great use case for burstable workloads and Node Level Scheduler.

      "},{"location":"Researcher/scheduling/node-level-scheduler/#interactive-notebooks-use-case","title":"Interactive Notebooks Use Case","text":"

      Consider the following example of a node with 2 GPUs and 2 interactive pods that are submitted and want GPU resources.

      The Scheduler instructs the node to put the two pods on a single GPU, bin-packing one GPU and leaving the other free for a workload that might need a full GPU or more than half a GPU. However, that would mean GPU#2 is idle while the two notebooks can only use up to half a GPU, even if they temporarily need more.

      However, with Node Level Scheduler enabled, the local decision is to spread those two pods across two GPUs and allow them to maximize both pods\u2019 performance and the GPUs\u2019 utilization by bursting up to the full GPU memory and GPU compute resources.

      The Cluster Scheduler still sees a node with a fully empty GPU. When a 3rd pod is scheduled and requires a full GPU (or more than 0.5 GPU), the scheduler sends it to that node, and the Node Level Scheduler moves one of the Interactive workloads to run alongside the other pod on GPU#1, as was the Cluster Scheduler\u2019s initial plan.

      This is an example of one scenario that shows how Node Level Scheduler locally optimizes and maximizes GPU utilization and pods\u2019 performance.

      "},{"location":"Researcher/scheduling/node-level-scheduler/#how-to-configure-node-level-scheduler","title":"How to configure Node Level Scheduler","text":"

      Node Level Scheduler can be enabled per Node-Pool, giving the Administrator the option to decide which Node-Pools will be used with this new feature.

      To use the Node Level Scheduler, the Administrator should follow these steps:

      1. Enable Node Level Scheduler at the cluster level (per cluster): edit the runaiconfig file and set:

        spec: \n  global: \n      core: \n        nodeScheduler:\n          enabled: true\n

        The Administrator can also use this patch command to perform the change:

        kubectl patch -n runai runaiconfigs.run.ai/runai --type='merge' --patch '{\"spec\":{\"global\":{\"core\":{\"nodeScheduler\":{\"enabled\": true}}}}}'\n
      2. To enable \u2018GPU resource optimization\u2019 for your tenant, go to your tenant\u2019s UI and press General settings, then open the Resources pane and toggle GPU Resource Optimization to on.

      3. To enable \u2018Node Level Scheduler\u2019 on any of the Node Pools you want to use this feature, go to the tenant\u2019s UI \u2018Node Pools\u2019 tab (under \u2018Nodes\u2019), and either create a new Node-Pool or edit an existing Node-Pool. In the Node-Pool\u2019s form, under the \u2018Resource Utilization Optimization\u2019 tab, change the \u2018Number of workloads on each GPU\u2019 to any value other than \u2018Not Enforced\u2019 (i.e. 2, 3, 4, 5).

      The Node Level Scheduler is now ready to be used on that Node-Pool.
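
      To verify the cluster-level setting, you can read the flag back from the runaiconfig object (a minimal check based on the patch command above):

      kubectl get runaiconfigs.run.ai runai -n runai -o jsonpath='{.spec.global.core.nodeScheduler.enabled}'\n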

      "},{"location":"Researcher/scheduling/schedule-to-aws-groups/","title":"Scheduling workloads to AWS placement groups","text":"

      Run:ai supports AWS placement groups when building and submitting a job. AWS Placement Groups are used to maximize throughput and performance of distributed training workloads.

      To enable and configure this feature:

      1. Press Jobs | New job.
      2. In Scheduling and lifecycle, enable Topology aware scheduling.
      3. In Topology key, enter the node label that represents the topology (placement group) of the node; see the example after this list.
      4. In Scheduling rule choose Required or Preferred from the drop down.

        • Required\u2014when selected, all pods must be scheduled to the same placement group.
        • Preferred\u2014when selected, the scheduler makes a best-effort attempt to place as many pods as possible in the same placement group.
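
      To choose a suitable Topology key, inspect the labels already present on your nodes. A minimal example (the label name placement-group is a placeholder; the actual label depends on how your nodes were provisioned):

      kubectl get nodes --show-labels\n# show a specific label as a column (placeholder label name)\nkubectl get nodes -L placement-group\n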
      "},{"location":"Researcher/scheduling/the-runai-scheduler/","title":"The Run:ai Scheduler","text":"

      Each time a user submits a workload via the Run:ai platform, through a 3rd party framework, or directly to Kubernetes APIs, the submitted workload goes to the selected Kubernetes cluster, and is handled by the Run:ai Scheduler.

      The Scheduler\u2019s main role is to find the best-suited node or nodes for each submitted workload. The nodes must match the resources and other characteristics requested by the workload, while adhering to the quota and fairness principles of the Run:ai platform. A workload can be a single pod running on a single node, or a distributed workload using multiple pods, each running on a node (or part of a node). It is not rare to find large training workloads using 128 nodes and even more, or inference workloads using many pods (replicas) and nodes. There are numerous types of workloads, some are Kubernetes native and some are 3rd party extensions on top of Kubernetes native pods. The Run:ai Scheduler schedules any Kubernetes native workloads, Run:ai workloads, or any other type of 3rd party workload.

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#scheduler-basics","title":"Scheduler basics","text":"

      Set out below are some basic terms and information regarding the Run:ai Scheduler.

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#terminology","title":"Terminology","text":"

      This section describes the terminology and building blocks of the Run:ai scheduler; it also explains some of the scheduling principles the scheduler uses.

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#workloads-and-pod-groups","title":"Workloads and Pod-Groups","text":"

      The Run:ai scheduler attaches any newly created pod to a pod-group. A pod-group may contain one or more pods representing a workload. For example, if the submitted workload is a PyTorch distributed training with 32 workers, a single pod-group is created for the entire workload, and all pods are then attached to the pod-group with certain rules that may apply to the pod-group itself, for example, gang scheduling.

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#scheduling-queue","title":"Scheduling queue","text":"

      A scheduling queue (or simply a queue) represents a scheduler primitive that manages the scheduling of workloads based on different parameters. A queue is created for each project/node pool pair and department/node pool pair. The Run:ai scheduler supports hierarchical queueing: project queues are bound to department queues, per node pool. This allows an organization to manage quota, over-quota, and other characteristics for projects and their associated departments.

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#priority-and-preemption","title":"Priority and Preemption","text":"

      Run:ai supports scheduling workloads using different priorities and preemption policies. In the Run:ai scheduling system, higher priority workloads (pods) may preempt lower priority workloads (pods) within the same scheduling queue (project), according to their Preemption policy. Run:ai Scheduler implicitly assumes any PriorityClass >= 100 is non-preemptible and any PriorityClass < 100 is preemptible.

      Cross project and cross department workload preemptions are referred to as Resource reclaim and are based on fairness between queues rather than the priority of the workloads.

      To make it easier for users to submit AI workloads, Run:ai preconfigures several Kubernetes PriorityClass objects. The Run:ai preset PriorityClass objects have their preemptionPolicy always set to PreemptLowerPriority, regardless of their actual Run:ai preemption policy within the Run:ai platform.
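
      You can list these preset PriorityClass objects directly in the cluster (a minimal check; the exact object names may differ in letter case from the table below):

      kubectl get priorityclasses\n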

      PriorityClass Name PriorityClass Run:ai preemption policy K8S preemption policy Inference 125 Non-preemptible PreemptLowerPriority Build 100 Non-preemptible PreemptLowerPriority Interactive-preemptible 75 Preemptible PreemptLowerPriority Train 50 Preemptible PreemptLowerPriority"},{"location":"Researcher/scheduling/the-runai-scheduler/#quota","title":"Quota","text":"

      Each project and department includes a set of guaranteed resource quotas per node pool per resource type. For example, Project LLM-Train/Node Pool NV-H100 quota parameters specify the number of GPUs, CPUs(cores), and the amount of CPU memory that this project guarantees for that node pool.

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#over-quota","title":"Over-quota","text":"

      Projects and departments can have a share in the unused resources of any node pool, beyond their quota of resources. These resources are referred to as over-quota resources. The admin configures the over-quota parameters per node pool for each project and department.

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#over-quota-weight","title":"Over quota weight","text":"

      When the over-quota weight setting is enabled, projects can receive a share of the cluster/node pool unused resources. The share each project receives depends on its over-quota weight value relative to the total weights of all other projects\u2019 over-quota weights. The admin configures the over-quota weight parameters per node pool for each project and department.

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#fairshare-and-fairshare-balancing","title":"Fairshare and fairshare balancing","text":"

      Run:ai Scheduler calculates a numerical value per project (or department) for each node-pool, representing the project\u2019s (department\u2019s) sum of guaranteed resources plus the portion of non-guaranteed resources in that node pool. We name this value fairshare.

      The scheduler strives to provide each project (or department) with the resources it deserves using two main parameters - deserved quota and deserved fairshare (i.e. quota + over-quota resources); this is done per node pool. If one project\u2019s node pool queue is below fairshare and another project\u2019s node pool queue is above fairshare, the scheduler shifts resources between queues to balance fairness; this may result in the preemption of some over-quota preemptible workloads.

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#over-subscription","title":"Over-subscription","text":"

      Over-subscription is a scenario where the sum of all guaranteed resource quotas surpasses the physical resources of the cluster or node pool. In this case, there may be scenarios in which the scheduler cannot find matching nodes to all workload requests, even if those requests were within the resource quota of their associated projects.

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#gang-scheduling","title":"Gang scheduling","text":"

      Gang scheduling describes a scheduling principle where a workload composed of multiple pods is either fully scheduled (i.e. all pods are scheduled and running) or fully pending (i.e. all pods are not running). Gang scheduling refers to a single pod group.

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#fairness-fair-resource-distribution","title":"Fairness (fair resource distribution)","text":"

      Fairness is a major principle within the Run:ai scheduling system. In essence, it means that the Run:ai Scheduler always respects certain resource splitting rules (fairness) between projects and between departments.

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#preemption-of-lower-priority-workloads-within-a-project","title":"Preemption of lower priority workloads within a project","text":"

      Workload priority is always respected within a project. This means higher priority workloads are scheduled before lower priority workloads; it also means that higher priority workloads may preempt lower priority workloads within the same project if the lower priority workloads are preemptible.

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#reclaim-of-resources-between-projects-and-departments","title":"Reclaim of resources between projects and departments","text":"

      Reclaim is an inter-project (and inter-department) scheduling action that takes resources back from one project (or department) that has used them as over-quota and returns them to a project (or department) that deserves those resources as part of its guaranteed quota, or uses them to balance fairness between projects, each up to its fairshare (i.e. a fair share of the unused resources).

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#multi-level-quota-system","title":"Multi-Level quota system","text":"

      Each project has a set of guaranteed resource quotas (GPUs, CPUs, and CPU memory) per node pool. Projects can go over-quota and get a share of the unused resources (over-quota) in a node pool beyond their guaranteed quota in that node pool. The same applies to Departments. The Scheduler balances the amount of over quota between departments, and then between projects.

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#placement-strategy-bin-pack-and-spread","title":"Placement strategy - bin-pack and spread","text":"

      The admin can set the scheduler\u2019s placement strategy per node pool, separately for GPU-based workloads and for CPU-only workloads.

      Each type\u2019s strategy can be either bin-pack or spread.

      GPU workloads:

      • Bin-pack means the Scheduler places as many workloads as possible in each GPU and each node to use fewer resources and maximize GPU and node vacancy.
      • Spread means the Scheduler spreads workloads across as many GPUs and nodes as possible to minimize the load and maximize the available resources per workload.
      • GPU workloads are workloads that request both GPU and CPU resources.

      CPU workloads:

      • Bin-pack means the scheduler places as many workloads as possible in each CPU and node to use fewer resources and maximize CPU and node vacancy.
      • Spread means the scheduler spreads workloads across as many CPUs and nodes as possible to minimize the load and maximize the available resources per workload.
      • CPU workloads are workloads that request only CPU resources.
      "},{"location":"Researcher/scheduling/the-runai-scheduler/#scheduler-deep-dive","title":"Scheduler deep dive","text":""},{"location":"Researcher/scheduling/the-runai-scheduler/#allocation","title":"Allocation","text":"

      When a user submits a workload, the workload controller creates a pod or pods (for distributed training workloads or deployment-based inference). When the scheduler gets a submit request with the first pod, it creates a pod group and allocates all the relevant building blocks of that workload. The next pods of the same workload are attached to the same pod group.

      A workload, with its associated pod group, is queued in the appropriate queue. In every scheduling cycle, the Scheduler ranks the order of queues by calculating their precedence for scheduling.

      The next step is for the scheduler to find nodes for those pods, assign the pods to their nodes (bind operation), and bind other building blocks of the pods such as storage, ingress etc.

      If the pod-group has a gang scheduling rule attached to it, the scheduler either allocates and binds all pods together, or puts all of them into the pending state. It retries to schedule them all together in the next scheduling cycle.

      The scheduler also updates the status of the pods and their associated pod group; users can track the workload submission process in either the CLI or the Run:ai UI.
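
      To see the pod groups the scheduler is tracking, you can query the cluster directly. This is a sketch and assumes the Run:ai pod-group CRD is installed under the plural name podgroups; verify the resource name in your cluster first:

      kubectl api-resources | grep -i podgroup\nkubectl get podgroups -A\n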

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#preemption","title":"Preemption","text":"

      If the scheduler cannot find resources for a submitted workload (and all of its associated pods), and the workload deserves resources either because it is under its queue quota or under its queue fairshare, the scheduler tries to reclaim resources from other queues; if this doesn\u2019t solve the resource issue, the scheduler tries to preempt lower priority preemptible workloads within the same queue.

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#reclaim-preemption-between-projects-and-departments","title":"Reclaim preemption between projects (and departments)","text":"

      Reclaim is an inter-project (and inter-department) resource balancing action that takes resources back from one project (or department) that has used them as over-quota and returns them to a project (or department) that deserves those resources as part of its deserved quota, or uses them to balance fairness between projects (or departments), so a project (or department) doesn\u2019t exceed its fairshare (portion of the unused resources).

      This mode of operation means that a lower priority workload submitted in one project (e.g. training) can reclaim resources from a project that runs a higher priority, preemptible workload (e.g. a preemptible workspace) if fairness balancing is required.

      Note

      Only preemptible workloads can go over-quota, as they are susceptible to reclaim (cross-project preemption) of the over-quota resources they are using. The amount of over-quota resources a project can gain depends on the over-quota weight or quota (if over-quota weight is disabled). A department\u2019s over-quota is always proportional to its quota.

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#priority-preemption-within-a-project","title":"Priority preemption within a project","text":"

      Higher priority workloads may preempt lower priority preemptible workloads within the same project/node pool queue. For example, in a project that runs a training workload that exceeds the project quota for a certain node pool, a newly submitted workspace within the same project/node pool may stop (preempt) the training workload if there are not enough over-quota resources for the project within that node pool to run both workloads (e.g. workspace using in-quota resources and training using over-quota resources).

      There is no priority notion between workloads of different projects.

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#quota-over-quota-and-fairshare","title":"Quota, over-quota, and fairshare","text":"

      The Run:ai scheduler strives to ensure fairness between projects and between departments. This means each department and project always strives to get its deserved quota, and unused resources are split between projects according to known rules (e.g. over-quota weights).

      If a project needs more resources even beyond its fairshare, and the scheduler finds unused resources that no other project needs, this project can consume resources even beyond its fairshare.

      Some scenarios can prevent the scheduler from fully providing the deserved quota and fairness promise, such as fragmentation or other scheduling constraints like affinities, taints etc.

      The example below illustrates a split of quota between different projects and departments, using several node pools:

      Legend:

      • OQW = Over-quota weight
      • OQ = Over-quota

      The example below illustrates how fairshare is calculated per project/node pool and per department/node pool for the above example:

      The Over quota (OQ) portion of each Project (per node pool) is calculated as:

      [(OQ-Weight) / (\u03a3 Projects OQ-Weights)] x (Unused Resource per node pool)\n

      Fairshare (FS) is calculated as the sum of Quota + Over-quota.

      Let\u2019s see how Project 2\u2019s over-quota and fairshare are calculated:

      For this example, we assume that out of the 40 available GPUs in node pool A, 20 GPUs are currently unused (unused means either not part of any project\u2019s quota, or part of a project\u2019s quota but not used by any workloads of that project).

      Project 2 over quota share:

      [(Project 2 OQ-Weight) / (\u03a3 all Projects OQ-Weights)] x (Unused Resource within node pool A)\n\n[(3) / (2 + 3 + 1)] x (20) = (3/6) x 20 = 10 GPUs\n

      Fairshare = deserved quota + over-quota = 6 + 10 = 16 GPUs

      Similarly, fairshare is also calculated for CPU and CPU memory.
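
      The arithmetic above can be reproduced from a shell, for example (plain bc, with the values taken from the example):

      # Project 2 over-quota share: (OQ-weight / sum of OQ-weights) x unused GPUs\necho \"(3 / (2 + 3 + 1)) * 20\" | bc -l   # => 10\n# Fairshare = deserved quota + over-quota share\necho \"6 + 10\" | bc                      # => 16\n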

      The scheduler can grant a project more resources than its fairshare if the scheduler finds resources not required by other projects that may deserve those resources.

      One can also see in the above illustration that Project 3 has no guaranteed quota, but it still has a share of the excess resources in node pool A. Run:ai Scheduler ensures that Project 3 receives its part of the unused resources for over quota, even if this results in reclaiming resources from other projects and preempting preemptible workloads.

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#fairshare-balancing","title":"Fairshare balancing","text":"

      The Scheduler constantly re-calculates the fairshare of each project and department (per node pool, represented in the scheduler as queues), resulting in the re-balancing of resources between projects and between departments. This means that a preemptible workload that was granted resources to run in one scheduling cycle, can find itself preempted and go back to the pending state waiting for resources on the next cycle.

      A queue, representing a scheduler-managed object for each Project or Department per node pool, can be in one of 3 states:

      • In-quota: The queue\u2019s allocated resources \u2264 the queue\u2019s deserved quota
      • Over-quota (but below fairshare): The queue\u2019s deserved quota < the queue\u2019s allocated resources \u2264 the queue\u2019s fairshare
      • Over-fairshare (and over-quota): The queue\u2019s fairshare < the queue\u2019s allocated resources

      The scheduler\u2019s first priority is to ensure each queue (representing a project/node pool or department/node pool scheduler object) receives its deserved quota. Then the scheduler tries to find and allocate more resources to queues that need resources beyond their deserved quota, up to their fairshare. Finally, the scheduler tries to allocate resources to queues that need even more resources, beyond their fairshare.

      When re-balancing resources between queues of different projects and departments, the scheduler goes in the opposite direction, i.e. it first takes resources from over-fairshare queues, then from over-quota queues, and finally, in some scenarios, even from queues that are below their deserved quota.

      "},{"location":"Researcher/scheduling/the-runai-scheduler/#summary","title":"Summary","text":"

      The scheduler\u2019s role is to bind any submitted pod to a node that satisfies the pod\u2019s requirements and constraints while adhering to the Run:ai quota and fairness system. In some scenarios, the scheduler finds a node for a pod (or nodes for a group of pods) immediately. In other scenarios, the scheduler has to preempt an already running workload to \u201cmake room\u201d, while sometimes a workload becomes pending until resources are released by other workloads (e.g. wait for other workloads to terminate), and only then it is scheduled and run.

      Except for scenarios where the requested resources or other constraints cannot be met within the cluster, either because the resources physically don\u2019t exist (e.g. a node with 16 GPUs, or a GPU with 200GB of memory) or because a combination of constraints cannot be matched (e.g. a GPU with 80GB of memory together with a node with a specific label or storage type), the scheduler eventually finds matching nodes for every workload, although this process may take some time.

      The Run:ai scheduler adheres to Kubernetes standard rules, but it also adds a layer of fairness between queues, queue hierarchy, node pools, and many more features, making the scheduling and Quota management more sophisticated, granular, and robust. The combination of these scheduler capabilities results in higher efficiency, scale, and maximization of cluster utilization.

      "},{"location":"Researcher/tools/dev-jupyter/","title":"Use a Jupyter Notebook with a Run:ai Job","text":"

      See the Jupyter Notebook Quickstart here.

      "},{"location":"Researcher/tools/dev-pycharm/","title":"Use PyCharm with a Run:ai Job","text":"

      Once you launch a workload using Run:ai, you will want to connect to it. You can do so via the command line or via other tools such as a Jupyter Notebook.

      This document is about accessing the remote container created by Run:ai from JetBrains PyCharm.

      "},{"location":"Researcher/tools/dev-pycharm/#submit-a-workload","title":"Submit a Workload","text":"

      You will need your image to run an SSH server (e.g. OpenSSH). For the purposes of this document, we have created an image named runai.jfrog.io/demo/pycharm-demo. The image runs both Python and SSH. Details on how to create the image are here. The image is configured to use the root user and password for SSH.

      Run the following command to connect to the container as if it were running locally:

      runai submit build-remote -i runai.jfrog.io/demo/pycharm-demo --interactive  \\\n        --service-type=portforward --port 2222:22\n

      The terminal will show the connection:

      The job 'build-remote' has been submitted successfully\nYou can run `runai describe job build-remote -p team-a` to check the job status\nINFO[0007] Waiting for job to start\nWaiting for job to start\nWaiting for job to start\nWaiting for job to start\nINFO[0045] Job started\nOpen access point(s) to service from localhost:2222\nForwarding from [::1]:2222 -> 22\n
      • The Job starts an sshd server on port 22.
      • The connection is redirected to the local machine (127.0.0.1) on port 2222

      Note

      It is possible to connect to the container using a remote IP address. However, this would be less convenient as you will need to maintain port numbers manually and change them when accessing remotely from the development tool. As an example, run:\n\n```\nrunai submit build-remote -i runai.jfrog.io/demo/pycharm-demo -g 1 --interactive --service-type=nodeport --port 30022:22\n```\n\n* The Job starts an sshd server on port 22.\n* The Job redirects the external port 30022 to port 22 and uses a [Node Port](https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types){target=_blank} service type.\n* Run: `runai list jobs`\n\n* Next to the Job, under the \"Service URL\" column you will find the IP address and port. The port is 30022.\n
      "},{"location":"Researcher/tools/dev-pycharm/#pycharm","title":"PyCharm","text":"
      • Under PyCharm | Preferences go to: Project | Python Interpreter
      • Add a new SSH Interpreter.
      • As Host, use the IP address from above. Change the port to the one from above and use the username root.
      • You will be prompted for a password. Enter root.
      • Apply settings and run the code via this interpreter. You will see your project uploaded to the container and running remotely.
      "},{"location":"Researcher/tools/dev-tensorboard/","title":"Connecting to TensorBoard","text":"

      Once you launch a Deep Learning workload using Run:ai, you may want to view its progress. A popular tool for viewing progress is TensorBoard.

      The document below explains how to use TensorBoard to view the progress of a Run:ai Job.

      "},{"location":"Researcher/tools/dev-tensorboard/#emitting-tensorboard-logs","title":"Emitting TensorBoard Logs","text":"

      When you submit a workload, your workload must save TensorBoard logs which can later be viewed. Follow this document on how to do this. You can also view the Run:ai sample code here.

      The code shows:

      • A reference to a log directory:
      log_dir = \"logs/fit/\" + datetime.datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n
      • A registered Keras callback for TensorBoard:
      tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)\n\nmodel.fit(x_train, y_train,\n        ....\n        callbacks=[..., tensorboard_callback])\n

      The logs directory must be saved on a Network File Server such that it can be accessed by the TensorBoard Job. For example, by running the Job as follows:

      runai submit train-with-logs -i tensorflow/tensorflow:1.14.0-gpu-py3 \\\n  -v /mnt/nfs_share/john:/mydir -g 1  --working-dir /mydir --command -- ./startup.sh\n

      Note the volume flag (-v) and working directory flag (--working-dir). The logs directory will be created on /mnt/nfs_share/john/logs/fit.

      "},{"location":"Researcher/tools/dev-tensorboard/#submit-a-tensorboard-workload","title":"Submit a TensorBoard Workload","text":"

      There are two ways to submit a TensorBoard workload: via the user interface or via the command-line interface (CLI V1).

      User Interface / CLI V1

      Browse to the provided Run:ai user interface and log in with your credentials.

      • In the Run:ai UI select Workloads
      • Select New Workload and then Workspace
      • You should already have Cluster, Project and a start from scratch Template selected. Enter tb as the name and press CONTINUE.
      • Under Environment, select jupyter-tensorboard.
      • Under Compute Resource, select one-gpu.
      • Select CREATE WORKSPACE.
      • In the workload list, add a column of Connections
      • When the workspace is running, you will see two connections:
        1. Jupyter
        2. TensorBoard

      Alternatively, using the CLI V1, run the following:

      runai submit tb -i tensorflow/tensorflow:latest --interactive --service-type=portforward --port 8888:8888  --working-dir /mydir  -v /mnt/nfs_share/john:/mydir  -- tensorboard --logdir logs/fit --port 8888 --host 0.0.0.0\n

      The terminal will show the following:

      The job 'tb' has been submitted successfully\nYou can run `runai describe job tb -p team-a` to check the job status\nINFO[0006] Waiting for job to start\nWaiting for job to start\nINFO[0014] Job started\nOpen access point(s) to service from localhost:8888\nForwarding from 127.0.0.1:8888 -> 8888\nForwarding from [::1]:8888 -> 8888\n

      Browse to http://localhost:8888/ to view TensorBoard.

      Note

      A single TensorBoard Job can be used to view multiple deep learning Jobs, provided it has access to the logs directory for these Jobs.

      "},{"location":"Researcher/tools/dev-vscode/","title":"Use Visual Studio Code with a Run:ai Job","text":"

      Once you launch a workload using Run:ai, you will want to connect to it. You can do so via the command line or via other tools such as a Jupyter Notebook.

      Important

      This document is about accessing the remote container created by Run:ai, from the installed version of Visual Studio Code. If you want to use Visual Studio Code for web, please see Visual Studio Code Web Quickstart.

      "},{"location":"Researcher/tools/dev-vscode/#submit-a-workload","title":"Submit a Workload","text":"

      You will need your image to run an SSH server (e.g. OpenSSH). For this document, we have created an image named runai.jfrog.io/demo/pycharm-demo. The image runs both Python and SSH. Details on how to create the image are here. The image is configured to use the root user and password for SSH.

      Run the following command to connect to the container as if it were running locally:

      runai submit build-remote -i runai.jfrog.io/demo/pycharm-demo --interactive  \\\n        --service-type=portforward --port 2222:22\n

      The terminal will show the connection:

      The job 'build-remote' has been submitted successfully\nYou can run `runai describe job build-remote -p team-a` to check the job status\nINFO[0007] Waiting for job to start\nWaiting for job to start\nWaiting for job to start\nWaiting for job to start\nINFO[0045] Job started\nOpen access point(s) to service from localhost:2222\nForwarding from [::1]:2222 -> 22\n
      • The Job starts an sshd server on port 22.
      • The connection is redirected to the local machine (127.0.0.1) on port 2222

      Note

      It is possible to connect to the container using a remote IP address. However, this would be less convenient as you will need to maintain port numbers manually and change them when accessing remotely from the development tool. As an example, run:

      runai submit build-remote -i runai.jfrog.io/demo/pycharm-demo -g 1 --interactive --service-type=nodeport --port 30022:22\n
      • The Job starts an sshd server on port 22.
      • The Job redirects the external port 30022 to port 22 and uses a Node Port service type.
      • Run: runai list jobs

      • Next to the Job, under the \"Service URL\" column you will find the IP address and port. The port is 30022.

      "},{"location":"Researcher/tools/dev-vscode/#visual-studio-code","title":"Visual Studio Code","text":"
      • Under Visual Studio Code, install the Remote SSH extension.
      • Create an SSH entry for the service by editing the .ssh/config file, or use the command Remote-SSH: Connect to Host... from the Command Palette. Enter the IP address and port from above (e.g. ssh root@35.34.212.12 -p 30022 or ssh root@127.0.0.1 -p 2222). The user and password are root.
      • Using VS Code, install the Python extension on the remote machine
      • Write your first Python code and run it remotely.
      "},{"location":"Researcher/tools/dev-x11forward-pycharm/","title":"Use PyCharm with X11 Forwarding and Run:ai","text":"

      X11 is a window system for Unix operating systems. X11 forwarding allows executing a program remotely through an SSH connection, meaning the executable file itself is hosted on a different machine from the one where the graphical interface is displayed. The graphical windows are forwarded to your local machine through the SSH connection.

      This section is about setting up X11 forwarding from a Run:ai-based container to a PyCharm IDE on a remote machine.

      "},{"location":"Researcher/tools/dev-x11forward-pycharm/#submit-a-workload","title":"Submit a Workload","text":"

      You will need your image to run an SSH server (e.g. OpenSSH). For the purposes of this document, we have created an image named runai.jfrog.io/demo/quickstart-x-forwarding. The image runs:

      • Python
      • SSH Daemon configured for X11Forwarding
      • OpenCV python library for image handling

      Details on how to create the image are here. The image is configured to use the root user and password for SSH.

      Run the following command to connect to the container as if it were running locally:

      runai submit xforward-remote -i runai.jfrog.io/demo/quickstart-x-forwarding --interactive  \\\n        --service-type=portforward --port 2222:22\n

      The terminal will show the connection:

      The job 'xforward-remote' has been submitted successfully\nYou can run `runai describe job xforward-remote -p team-a` to check the job status\nINFO[0007] Waiting for job to start\nWaiting for job to start\nWaiting for job to start\nWaiting for job to start\nINFO[0045] Job started\nOpen access point(s) to service from localhost:2222\nForwarding from [::1]:2222 -> 22\n
      • The Job starts an sshd server on port 22.
      • The connection is redirected to the local machine (127.0.0.1) on port 2222
      "},{"location":"Researcher/tools/dev-x11forward-pycharm/#setup-the-x11-forwarding-tunnel","title":"Setup the X11 Forwarding Tunnel","text":"

      Connect to the new Job by running:

      ssh -X root@127.0.0.1 -p 2222\n

      Note the -X flag.

      Run:

      echo $DISPLAY\n
      Copy the value. It will be used as a PyCharm environment variable.

      Important

      The ssh terminal should remain active throughout the session.

      "},{"location":"Researcher/tools/dev-x11forward-pycharm/#pycharm","title":"PyCharm","text":"
      • Under PyCharm | Preferences go to: Project | Python Interpreter
      • Add a new SSH Interpreter.
      • As Host, use localhost. Change the port to the above (2222) and use the Username root.
      • You will be prompted for a password. Enter root.
      • Make sure to set the correct path of the Python binary. In our case it's /usr/local/bin/python.
      • Apply your settings.

      • Under PyCharm configuration set the following environment variables:

        1. DISPLAY - set to the value of the environment variable you copied earlier
        2. HOME - In our case it's /root. This is required for the X11 authentication to work.

      Run your code. You can use our sample code here.

      "},{"location":"Researcher/workloads/assets/compute/","title":"Compute Resources","text":"

      This article explains what compute resources are and how to create and use them.

      Compute resources are one type of workload asset. A compute resource is a template that simplifies how workloads are submitted and can be used by AI practitioners when they submit their workloads.

      A compute resource asset is a preconfigured building block that encapsulates all the specifications of compute requirements for the workload including:

      • GPU devices and GPU memory
      • CPU memory and CPU compute
      "},{"location":"Researcher/workloads/assets/compute/#compute-resource-table","title":"Compute resource table","text":"

      The Compute resource table can be found under Workload manager in the Run:ai UI.

      The Compute resource table provides a list of all the compute resources defined in the platform and allows you to manage them.

      The Compute resource table consists of the following columns:

      Column Description Compute resource The name of the compute resource Description A description of the essence of the compute resource GPU devices request per pod The number of requested physical devices per pod of the workload that uses this compute resource GPU memory request per device The amount of GPU memory per requested device that is granted to each pod of the workload that uses this compute resource CPU memory request The minimum amount of CPU memory per pod of the workload that uses this compute resource CPU memory limit The maximum amount of CPU memory per pod of the workload that uses this compute resource CPU compute request The minimum number of CPU cores per pod of the workload that uses this compute resource CPU compute limit The maximum number of CPU cores per pod of the workload that uses this compute resource Scope The scope of this compute resource within the organizational tree. Click the name of the scope to view the organizational tree diagram Workload(s) The list of workloads associated with the compute resource Template(s) The list of workload templates that use this compute resource Created by The name of the user who created the compute resource Creation time The timestamp of when the compute resource was created Last updated The timestamp of when the compute resource was last updated Cluster The cluster that the compute resource is associated with"},{"location":"Researcher/workloads/assets/compute/#workloads-associated-with-the-compute-resource","title":"Workloads associated with the compute resource","text":"

      Click one of the values in the Workload(s) column to view the list of workloads and their parameters.

      Column Description Workload The workload that uses the compute resource Type Workspace/Training/Inference Status Represents the workload lifecycle. See the full list of workload status."},{"location":"Researcher/workloads/assets/compute/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      "},{"location":"Researcher/workloads/assets/compute/#adding-new-compute-resource","title":"Adding new compute resource","text":"

      To add a new compute resource:

      1. Go to the Compute resource table
      2. Click +NEW COMPUTE RESOURCE
      3. Select under which cluster to create the compute resource
      4. Select a scope
      5. Enter a name for the compute resource. The name must be unique.
      6. Optional: Provide a description of the essence of the compute resource
      7. Set the resource types needed within a single node (the Run:ai scheduler tries to match a single node that complies with the compute resource for each of the workload\u2019s pods)

        • GPU

          • GPU devices per pod The number of devices (physical GPUs) per pod (for example, if you requested 3 devices per pod and the running workload using this compute resource consists of 3 pods, there are 9 physical GPU devices used in total)

          Note

          • When setting it to zero, the workload using this compute resource neither requests nor uses GPU resources while running
          • You can set any number of GPU devices and specify the memory requirement to any portion size (1..100), or memory size value using GB or MB units per device
          • GPU memory per device
            • Select the memory request format
              • % (of device) - Fraction of a GPU device\u2019s memory
              • MB (memory size) - An explicit GPU memory unit
              • GB (memory size) - An explicit GPU memory unit
            • Set the memory Request - The minimum amount of GPU memory that is provisioned per device. This means that any pod of a running workload that uses this compute resource receives this amount of GPU memory for each device the pod utilizes
            • Optional: Set the memory Limit - The maximum amount of GPU memory that is provisioned per device. This means that any pod of a running workload that uses this compute resource receives at most this amount of GPU memory for each device the pod utilizes. To set a Limit, first enable the limit toggle. The limit value must be equal to or higher than the request.

          Note

          • GPU memory limit is disabled by default. If you cannot see the Limit toggle in the compute resource form, then it must be enabled by your Administrator, under General settings \u2192 Resources \u2192 GPU resource optimization
          • When a Limit is set and is bigger than the Request, the scheduler allows each pod to reach the maximum amount of GPU memory in an opportunistic manner (only upon availability).
          • If the GPU memory Limit is bigger than the Request, the pod is prone to being killed by the Run:ai toolkit (out-of-memory signal). The greater the difference between the GPU memory used and the request, the higher the risk of being killed
          • If GPU resource optimization is turned off, the minimum and maximum are in fact equal
        • CPU

          • CPU compute per pod
            • Select the units for the CPU compute (Cores / Millicores)
            • Set the CPU compute Request - The minimum amount of CPU compute that is provisioned per pod. This means that any pod of a running workload that uses this compute resource receives this amount of CPU compute.
            • Optional: Set the CPU compute Limit - The maximum amount of CPU compute that is provisioned per pod. This means that any pod of a running workload that uses this compute resource receives at most this amount of CPU compute. To set a Limit, first enable the limit toggle. The limit value must be equal to or higher than the request. By default, the limit is set to \u201cUnlimited\u201d, which means that the pod may consume all of the node's free CPU compute resources.
          • CPU memory per pod
            • Select the units for the CPU memory (MB / GB)
            • Set the CPU memory Request - The minimum amount of CPU memory that is provisioned per pod. This means that any pod of a running workload that uses this compute resource receives this amount of CPU memory.
            • Optional: Set the CPU memory Limit - The maximum amount of CPU memory that is provisioned per pod. This means that any pod of a running workload that uses this compute resource receives at most this amount of CPU memory. To set a Limit, first enable the limit toggle. The limit value must be equal to or higher than the request. By default, the limit is set to \u201cUnlimited\u201d, which means that the pod may consume all of the node's free CPU memory resources.

          Note

          If the CPU memory Limit is bigger than the Request, the pod is prone to being killed by the operating system (out-of-memory signal). The greater the difference between the CPU memory used and the request, the higher the risk of being killed.

      8. Optional: More settings

        • Increase shared memory size When enabled, the shared memory size available to the pod is increased from the default 64MB to the node's total available memory or the CPU memory limit, if set above.
        • Set extended resource(s) Click +EXTENDED RESOURCES to add resource/quantity pairs. For more information on how to set extended resources, see the Extended resources and Quantity guides
      9. Click CREATE COMPUTE RESOURCE

        Note

        It is also possible to add compute resources directly when creating a specific Workspace, training or inference workload.

      "},{"location":"Researcher/workloads/assets/compute/#editing-a-compute-resource","title":"Editing a compute resource","text":"

      To edit a compute resource:

      1. Select the compute resource you want to edit
      2. Click Edit
      3. Click SAVE COMPUTE RESOURCE

      Note

      Workloads that are already bound to and using this asset are not affected.

      "},{"location":"Researcher/workloads/assets/compute/#copying-a-compute-resource","title":"Copying a compute resource","text":"

      To make a copy of an existing compute resource:

      1. Select the compute resource you want to copy
      2. Click MAKE A COPY
      3. Enter a name for the compute resource. The name must be unique.
      4. Update the compute resource as needed
      5. Click CREATE COMPUTE RESOURCE
      "},{"location":"Researcher/workloads/assets/compute/#deleting-a-compute-resource","title":"Deleting a compute resource","text":"
      1. Select the compute resource you want to delete
      2. Click DELETE
      3. On the dialog, click DELETE to confirm

      Note

      Workloads that are already bound to and using this asset are not affected.

      "},{"location":"Researcher/workloads/assets/compute/#using-api","title":"Using API","text":"

      Go to the Compute resources API reference to view the available actions

      "},{"location":"Researcher/workloads/assets/credentials/","title":"Credentials","text":"

      This article explains what credentials are and how to create and use them.

      Credentials are a workload asset that simplify the complexities of Kubernetes secrets. They store and mask sensitive access information, such as passwords, tokens, and access keys, which are necessary for gaining access to various resources.

      Credentials are crucial for the security of AI workloads and the resources they require, as they restrict access to authorized users, verify identities, and ensure secure interactions. By enforcing the protection of sensitive data, credentials help organizations comply with industry regulations, fostering a secure environment overall.

      Essentially, credentials enable AI practitioners to access relevant protected resources, such as private data sources and Docker images, thereby streamlining the workload submission process.

      "},{"location":"Researcher/workloads/assets/credentials/#credentials-table","title":"Credentials table","text":"

      The Credentials table can be found under Workload manager in the Run:ai User interface.

      The Credentials table provides a list of all the credentials defined in the platform and allows you to manage them.

      The Credentials table comprises the following columns:

      Column Description Credentials The name of the credentials Description A description of the credentials Type The type of credentials, e.g., Docker registry Status The different lifecycle phases and representation of the credentials\u2019 condition Scope The scope of this compute resource within the organizational tree. Click the name of the scope to view the organizational tree diagram Kubernetes name The unique name of the credentials Kubernetes name as it appears in the cluster Environment(s) The environment(s) that are associated with the credentials Data source(s) The private data source(s) that are accessed using the credentials Created by The user who created the credentials Creation time The timestamp of when the credentials were created Cluster The cluster with which the credentials are associated"},{"location":"Researcher/workloads/assets/credentials/#credentials-status","title":"Credentials status","text":"

      The following table describes the credentials\u2019 condition and whether they were created successfully for the selected scope.

      Status Description No issues found No issues were found while creating the credentials (this status may change while propagating the credentials to the selected scope) Issues found Issues found while propagating the credentials Issues found Failed to access the cluster Creating\u2026 Credentials are being created Deleting\u2026 Credentials are being deleted No status When the credentials\u2019 scope is an account, or the current version of the cluster is not up to date, the status cannot be displayed"},{"location":"Researcher/workloads/assets/credentials/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then click \u2018Download as CSV\u2019. Export to CSV is limited to 20,000 rows.
      • Refresh - Click REFRESH to update the table with the latest data
      "},{"location":"Researcher/workloads/assets/credentials/#adding-new-credentials","title":"Adding new credentials","text":"

      Creating credentials is limited to specific roles.

      To add a new credential:

      1. Go to the Credentials table:
      2. Click +NEW CREDENTIALS
      3. Select the credential type from the list. Follow the step-by-step guide for each credential type:
      "},{"location":"Researcher/workloads/assets/credentials/#docker-registry","title":"Docker registry","text":"

      These credentials allow users to authenticate and pull images from a Docker registry, enabling access to containerized applications and services.

      After the credentials are created, they are used automatically when pulling images.

      1. Select a scope.
      2. Enter a name for the credential. The name must be unique.
      3. Optional: Provide a description of the credentials
      4. Set how the credential is created
        • Existing secret (in the cluster) This option applies when the purpose is to create credentials based on an existing secret
          • Select a secret from the list (The list is empty if no secrets were created in advance)
        • New secret (recommended) A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
          • Enter the username, password, and Docker registry URL
      5. Click CREATE CREDENTIALS

      After the credentials are created, check their status to monitor their proper creation across the selected scope.

      "},{"location":"Researcher/workloads/assets/credentials/#access-key","title":"Access key","text":"

      These credentials are unique identifiers used to authenticate and authorize access to cloud services or APIs, ensuring secure communication between applications. They typically consist of two parts:

      • An access key ID
      • A secret access key

      The purpose of this credential type is to allow access to restricted data.

      1. Select a scope.
      2. Enter a name for the credential. The name must be unique.
      3. Optional: Provide a description of the credential
      4. Set how the credential is created
        • Existing secret (in the cluster) This option applies when the purpose is to create credentials based on an existing secret
          • Select a secret from the list (The list is empty if no secrets were created in advance)
        • New secret (recommended) A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
          • Enter the Access key and Access secret
      5. Click CREATE CREDENTIALS

      After the credentials are created, check their status to monitor their proper creation across the selected scope.

      "},{"location":"Researcher/workloads/assets/credentials/#username-password","title":"Username & password","text":"

      These credentials require a username and corresponding password to access various resources, ensuring that only authorized users can log in.

      The purpose of this credential type is to allow access to restricted data.

      1. Select a scope
      2. Enter a name for the credential. The name must be unique.
      3. Optional: Provide a description of the credentials
      4. Set how the credential is created
        • Existing secret (in the cluster) This option applies when the purpose is to create credentials based on an existing secret
          • Select a secret from the list (The list is empty if no secrets were created in advance)
        • New secret (recommended) A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
          • Enter the username and password
      5. Click CREATE CREDENTIALS

      After the credentials are created, check their status to monitor their proper creation across the selected scope.

      "},{"location":"Researcher/workloads/assets/credentials/#generic-secret","title":"Generic secret","text":"

      These credentials are a flexible option that consists of multiple keys & values and can store various sensitive information, such as API keys or configuration data, to be used securely within applications.

      The purpose of this credential type is to allow access to restricted data.

      1. Select a scope
      2. Enter a name for the credential. The name must be unique.
      3. Optional: Provide a description of the credentials
      4. Set how the credential is created
        • Existing secret (in the cluster) This option applies when the purpose is to create credentials based on an existing secret
          • Select a secret from the list (The list is empty if no secrets were created in advance)
        • New secret (recommended) A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
          • Click +KEY & VALUE - to add key/value pairs to store in the new secret
      5. Click CREATE CREDENTIALS
      "},{"location":"Researcher/workloads/assets/credentials/#editing-credentials","title":"Editing credentials","text":"

      To rename a credential:

      1. Select the credential from the table
      2. Click Rename to edit its name and description
      "},{"location":"Researcher/workloads/assets/credentials/#deleting-credentials","title":"Deleting credentials","text":"

      To delete a credential:

      1. Select the credential you want to delete
      2. Click DELETE
      3. In the dialog, click DELETE to confirm

      Note

      Credentials cannot be deleted if they are being used by a workload or a template.

      "},{"location":"Researcher/workloads/assets/credentials/#using-credentials","title":"Using credentials","text":"

      You can use credentials (secrets) in various ways within the system

      "},{"location":"Researcher/workloads/assets/credentials/#access-private-data-sources","title":"Access private data sources","text":"

      To access private data sources, attach credentials to data sources of the following types: Git, S3 Bucket

      "},{"location":"Researcher/workloads/assets/credentials/#use-directly-within-the-container","title":"Use directly within the container","text":"

      To use the secret directly from within the container, you can choose between the following options

      1. Get the secret mounted to the file system by using the Generic secret data source
      2. Get the secret as an environment variable injected into the container. There are two equivalent ways to inject the environment variable.

        a. By adding it to the Environment asset.
        b. By adding it ad-hoc as part of the workload.

      "},{"location":"Researcher/workloads/assets/credentials/#creating-secrets-in-advance","title":"Creating secrets in advance","text":"

      Add secrets in advance to be used when creating credentials via the Run:ai UI.

      Follow the steps below for each required scope:

      Cluster scope:

      1. Create the secret in the Run:ai namespace (runai)
      2. To authorize Run:ai to use the secret, label it: run.ai/cluster-wide: \"true\"
      3. Label the secret with the correct credential type:
        1. Docker registry - run.ai/resource: \"docker-registry\"
        2. Access key - run.ai/resource: \"access-key\"
        3. Username and password - run.ai/resource: \"password\"
        4. Generic secret - run.ai/resource: \"generic\"

      Department scope:

      1. Create the secret in the Run:ai namespace (runai)
      2. To authorize Run:ai to use the secret, label it: run.ai/department: \"<department id>\"
      3. Label the secret with the correct credential type:
        1. Docker registry - run.ai/resource: \"docker-registry\"
        2. Access key - run.ai/resource: \"access-key\"
        3. Username and password - run.ai/resource: \"password\"
        4. Generic secret - run.ai/resource: \"generic\"

      Project scope:

      1. Create the secret in the project\u2019s namespace
      2. Label the secret with the correct credential type:
        1. Docker registry - run.ai/resource: \"docker-registry\"
        2. Access key - run.ai/resource: \"access-key\"
        3. Username and password - run.ai/resource: \"password\"
        4. Generic secret - run.ai/resource: \"generic\"

      The secret is now displayed for that scope in the list of existing secrets.
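
      For example, the following kubectl commands sketch the flow for the cluster and project scopes. The secret names, registry details, and namespaces below are placeholders and must be replaced with your own values.

# Cluster scope (placeholders): create the secret in the Run:ai namespace,
# authorize Run:ai to use it cluster-wide, and mark its credential type.
kubectl create secret docker-registry my-registry-creds \
  --docker-server=registry.example.com \
  --docker-username=<username> \
  --docker-password=<password> \
  -n runai

kubectl label secret my-registry-creds -n runai \
  run.ai/cluster-wide="true" \
  run.ai/resource="docker-registry"

# Project scope (placeholders): create a generic secret in the project's
# namespace and mark its credential type; no scope label is needed.
kubectl create secret generic my-generic-creds \
  --from-literal=API_KEY=<value> \
  -n <project-namespace>

kubectl label secret my-generic-creds -n <project-namespace> \
  run.ai/resource="generic"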

      "},{"location":"Researcher/workloads/assets/credentials/#using-api","title":"Using API","text":"

      To view the available actions, go to the Credentials API reference

      "},{"location":"Researcher/workloads/assets/data-volumes/","title":"Data Volumes","text":"

      Data volumes offer a powerful solution for storing, managing, and sharing AI training data within the Run:ai platform. They promote collaboration, simplify data access control, and streamline the AI development lifecycle.

      Data volumes are snapshots of datasets stored in Kubernetes Persistent Volume Claims (PVCs). They act as a central repository for training data.

      "},{"location":"Researcher/workloads/assets/data-volumes/#why-use-a-data-volume","title":"Why use a data volume?","text":"
      1. Sharing with multiple scopes - Unlike other Run:ai data sources, data volumes can be shared across projects, departments, or clusters, encouraging data reuse and collaboration within the organization.
      2. Storage saving - A single copy of the data can be used across multiple scopes
      "},{"location":"Researcher/workloads/assets/data-volumes/#typical-use-cases","title":"Typical use cases","text":"
      1. Sharing large data sets - In large organizations, the data is often stored in a remote location, which can be a barrier for large model training. Even if the data is transferred into the cluster, sharing it easily with multiple users is still challenging. Data volumes can help share the data seamlessly, with maximum security and control.
      2. Sharing data with colleagues - When training results, generated data sets, or other artifacts need to be shared with team members, data volumes can help make the data easily available.
      "},{"location":"Researcher/workloads/assets/data-volumes/#prerequisites","title":"Prerequisites","text":"

      To create a data volume, there must be a project with a PVC in its namespace.
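
      As a quick check of this prerequisite, you can list the PVCs in the project's namespace with kubectl (the namespace below is a placeholder):

# List the PVCs available in the project's namespace; a data volume is
# created from one of these PVCs via the Data volumes API.
kubectl get pvc -n <project-namespace>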

      Working with data volumes is currently available using the API. To view the available actions, go to the Data volumes API reference.

      "},{"location":"Researcher/workloads/assets/data-volumes/#adding-a-new-data-volume","title":"Adding a new data volume","text":"

      Data volume creation is limited to specific roles

      "},{"location":"Researcher/workloads/assets/data-volumes/#adding-scopes-for-a-data-volume","title":"Adding scopes for a data volume","text":"

      Data volume sharing (adding scopes) is limited to specific roles

      Once created, the data volume is available to its originating project (see the prerequisites above).

      Data volumes can be shared with additional scopes in the organization.

      "},{"location":"Researcher/workloads/assets/data-volumes/#who-can-use-a-data-volume","title":"Who can use a data volume?","text":"

      Data volumes are used when submitting workloads. Any user, application or SSO group with a role that has permissions to create workloads can also use data volumes.

      Researchers can list available data volumes within their permitted scopes for easy selection.

      "},{"location":"Researcher/workloads/assets/datasources/","title":"Data Sources","text":"

      This article explains what data sources are and how to create and use them.

      Data sources are a type of workload asset and represent a location where data is actually stored. They may represent a remote data location, such as NFS, Git, or S3, or a Kubernetes local resource, such as PVC, ConfigMap, HostPath, or Secret.

      This configuration simplifies the mapping of the data into the workload\u2019s file system and handles the mounting process during workload creation for reading and writing. These data sources are reusable and can be easily integrated and used by AI practitioners while submitting workloads across various scopes.

      "},{"location":"Researcher/workloads/assets/datasources/#data-sources-table","title":"Data sources table","text":"

      The data sources table can be found under Workload manager in the Run:ai platform.

      The data sources table provides a list of all the data sources defined in the platform and allows you to manage them.

      The data sources table comprises the following columns:

      • Data source - The name of the data source
      • Description - A description of the data source
      • Type - The type of data source connected \u2013 e.g., S3 bucket, PVC, or others
      • Status - The different lifecycle phases and representation of the data source condition
      • Scope - The scope of the data source within the organizational tree. Click the scope name to view the organizational tree diagram
      • Kubernetes name - The data source\u2019s unique Kubernetes name as it appears in the cluster
      • Workload(s) - The list of existing workloads that use the data source
      • Template(s) - The list of workload templates that use the data source
      • Created by - The user who created the data source
      • Creation time - The timestamp for when the data source was created
      • Cluster - The cluster that the data source is associated with

      "},{"location":"Researcher/workloads/assets/datasources/#data-sources-status","title":"Data sources status","text":"

      The following table describes the data sources' condition and whether they were created successfully for the selected scope.

      • No issues found - No issues were found while creating the data source
      • Issues found - Issues were found while propagating the data source credentials
      • Issues found - The data source couldn\u2019t be created at the cluster
      • Creating\u2026 - The data source is being created
      • No status / \u201c-\u201d - When the data source\u2019s scope is an account, the current version of the cluster is not up to date, or the asset is not a cluster-syncing entity, the status can\u2019t be displayed

      "},{"location":"Researcher/workloads/assets/datasources/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then click \u2018Download as CSV\u2019
      • Refresh - Click REFRESH to update the table with the latest data
      "},{"location":"Researcher/workloads/assets/datasources/#adding-a-new-data-source","title":"Adding a new data source","text":"

      To create a new data source:

      1. Click +NEW DATA SOURCE
      2. Select the data source type from the list. Follow the step-by-step guide for each data source type:
      "},{"location":"Researcher/workloads/assets/datasources/#nfs","title":"NFS","text":"

      A Network File System (NFS) is a Kubernetes concept used for sharing storage in the cluster among different pods. Like a PVC, the NFS volume\u2019s content remains preserved, even outside the lifecycle of a single pod. However, unlike PVCs, which abstract storage management, NFS provides a method for network-based file sharing. The NFS volume can be pre-populated with data and can be mounted by multiple pod writers simultaneously. At Run:ai, an NFS-type data source is an abstraction that is mapped directly to a Kubernetes NFS volume. This integration allows multiple workloads under various scopes to mount and present the NFS data source.

      1. Select the cluster under which to create this data source
      2. Select a scope
      3. Enter a name for the data source. The name must be unique.
      4. Optional: Provide a description of the data source
      5. Set the data origin
        • Enter the NFS server (host name or host IP)
        • Enter the NFS path
      6. Set the data target location
        • Container path
      7. Optional: Restrictions
        • Prevent data modification - When enabled, the data will be mounted with read-only permissions
      8. Click CREATE DATA SOURCE
      "},{"location":"Researcher/workloads/assets/datasources/#pvc","title":"PVC","text":"

      A Persistent Volume Claim (PVC) is a Kubernetes concept used for managing storage in the cluster, which can be provisioned by an administrator or dynamically by Kubernetes using a StorageClass. PVCs allow users to request specific sizes and access modes (read/write once, read-only many). Run:ai ensures that data remains consistent and accessible across various scopes and workloads, beyond the lifecycle of individual pods, which is efficient while working with large datasets typically associated with AI projects.

      1. Select the cluster under which to create this data source
      2. Select a scope
      3. Enter a name for the data source. The name must be unique.
      4. Optional: Provide a description of the data source
      5. Select PVC:
        • Existing PVC This option is relevant when the purpose is to create a PVC-type data source based on an existing PVC in the cluster
          • Select a PVC from the list - (The list is empty if no existing PVCs were created in advance)
        • New PVC - creates a new PVC in the cluster. New PVCs are not added to the Existing PVCs list. When creating a PVC-type data source and selecting the \u2018New PVC\u2019 option, the PVC is immediately created in the cluster (even if no workload has requested this PVC).
      6. Select the storage class
        • None - Proceed without defining a storage class
        • Custom storage class - This option applies when selecting a storage class based on existing storage classes. To add new storage classes to the storage class list, and for additional information, check Kubernetes storage classes
      7. Select the access mode(s) (multiple modes can be selected)
        • Read-write by one node - The volume can be mounted as read-write by a single node.
        • Read-only by many nodes - The volume can be mounted as read-only by many nodes.
        • Read-write by many nodes - The volume can be mounted as read-write by many nodes.
      8. Set the claim size and its units
      9. Select the volume mode
        • File system (default) - allows the volume to be mounted as a filesystem, enabling the usage of directories and files.
        • Block - exposes the volume as a block storage, which can be formatted or used by applications directly without a filesystem.
      10. Set the data target location
        • container path
      11. Optional: Prevent data modification - When enabled, the data will be mounted with read-only permission.
      12. Click CREATE DATA SOURCE

      After the data source is created, check its status to monitor its proper creation across the selected scope.
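
      For reference, a PVC created in advance (so that it can be selected under Existing PVC) might look like the following sketch. The name, namespace, storage class, and size are placeholders; the access mode and volume mode fields correspond to the selections described above.

# Placeholder PVC manifest - adjust names and sizes to your environment.
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: training-data
  namespace: <project-namespace>
spec:
  accessModes:
    - ReadWriteMany            # "Read-write by many nodes"
  volumeMode: Filesystem       # or Block
  storageClassName: <storage-class>
  resources:
    requests:
      storage: 100Gi
EOF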

      "},{"location":"Researcher/workloads/assets/datasources/#s3-bucket","title":"S3 Bucket","text":"

      The S3 bucket data source enables the mapping of a remote S3 bucket into the workload\u2019s file system. Similar to a PVC, this mapping remains accessible across different workload executions, extending beyond the lifecycle of individual pods. However, unlike PVCs, data stored in an S3 bucket resides remotely, which may lead to decreased performance during the execution of heavy machine learning workloads. As part of the Run:ai connection to the S3 bucket, you can create credentials in order to access and map private buckets.

      1. Select the cluster under which to create this data source
      2. Select a scope
      3. Enter a name for the data source. The name must be unique.
      4. Optional: Provide a description of the data source
      5. Set the data origin
        • Set the S3 service URL
        • Select the credentials
          • None - for public buckets
          • Credential names - This option is relevant for private buckets based on existing credentials that were created for the scope. To add new credentials to the credentials list, and for additional information, check the Credentials article.
        • Enter the bucket name
      6. Set the data target location
        • container path
      7. Click CREATE DATA SOURCE

      After a private data source is created, check its status to monitor its proper creation across the selected scope.

      "},{"location":"Researcher/workloads/assets/datasources/#git","title":"Git","text":"

      A Git-type data source is a Run:ai integration that enables code to be copied from a Git branch into a dedicated folder in the container. It is mainly used to provide the workload with the latest code repository. As part of the integration with Git, in order to access private repositories, you can add predefined credentials to the data source mapping.

      1. Select the cluster under which to create this data source
      2. Select a scope
      3. Enter a name for the data source. The name must be unique.
      4. Optional: Provide a description of the data source
      5. Set the data origin
        • Set the Repository URL
        • Set the Revision (branch, tag, or hash) - If left empty, it will use the 'HEAD' (latest)
        • Select the credentials
          • None - for public repositories
          • Credential names - This option applies to private repositories based on existing credentials that were created for the scope. To add new credentials to the credentials list, and for additional information, check the Credentials article.
      6. Set the data target location
        • container path
      7. Click CREATE DATA SOURCE

      After a private data source is created, check its status to monitor its proper creation across the selected scope.

      "},{"location":"Researcher/workloads/assets/datasources/#host-path","title":"Host path","text":"

      A Host path volume is a Kubernetes concept that enables mounting a host path file or a directory on the workload\u2019s file system. Like a PVC, the host path volume\u2019s data persists across workloads under various scopes. It also enables data serving from the hosting node.

      1. Select the cluster under which to create this data source
      2. Select a scope
      3. Enter a name for the data source. The name must be unique.
      4. Optional: Provide a description of the data source
      5. Set the data origin
        • host path
      6. Set the data target location
        • container path
      7. Optional: Prevent data modification - When enabled, the data will be mounted with read-only permissions.
      8. Click CREATE DATA SOURCE
      "},{"location":"Researcher/workloads/assets/datasources/#configmap","title":"ConfigMap","text":"

      A ConfigMap data source is a Run:ai abstraction for the Kubernetes ConfigMap concept. The ConfigMap is used mainly for storage that can be mounted on the workload container for non-confidential data. It is usually represented in key-value pairs (e.g., environment variables, command-line arguments etc.). It allows you to decouple environment-specific system configurations from your container images, so that your applications are easily portable. ConfigMaps must be created on the cluster prior to being used within the Run:ai system.
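
      For example, a ConfigMap could be created in advance with kubectl (the name, namespace, and keys below are placeholders):

# Create a ConfigMap so it appears in the ConfigMap list when adding the data source.
kubectl create configmap app-settings \
  --from-literal=LOG_LEVEL=info \
  --from-literal=BATCH_SIZE=32 \
  -n <project-namespace>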

      1. Select the cluster under which to create this data source
      2. Select a scope
      3. Enter a name for the data source. The name must be unique.
      4. Optional: Provide a description of the data source
      5. Set the data origin
        • Select the ConfigMap name (The list is empty if no existing ConfigMaps were created in advance).
      6. Set the data target location
        • container path
      7. Click CREATE DATA SOURCE
      "},{"location":"Researcher/workloads/assets/datasources/#secret","title":"Secret","text":"

      A secret-type data source enables the mapping of a credential into the workload\u2019s file system. Credentials are a workload asset that simplify the complexities of Kubernetes Secrets. The credentials mask sensitive access information, such as passwords, tokens, and access keys, which are necessary for gaining access to various resources.

      1. Select the cluster under which to create this data source
      2. Select a scope
      3. Enter a name for the data source. The name must be unique.
      4. Optional: Provide a description of the data source
      5. Set the data origin
        • Select the credentials To add new credentials, and for additional information, check the Credentials article.
      6. Set the data target location
        • container path
      7. Click CREATE DATA SOURCE

      After the data source is created, check its status to monitor its proper creation across the selected scope.

      Note

      It is also possible to add data sources directly when creating a specific workspace, training or inference workload

      "},{"location":"Researcher/workloads/assets/datasources/#editing-a-data-source","title":"Editing a data source","text":"

      To edit a data source:

      1. Select the data source from the table
      2. Click Rename to provide it with a new name
      3. Click Copy & Edit to make any changes to the data source
      "},{"location":"Researcher/workloads/assets/datasources/#deleting-a-data-source","title":"Deleting a data source","text":"

      To delete a data source:

      1. Select the data source you want to delete
      2. Click DELETE
      3. Confirm you want to delete the data source

      Note

      It is not possible to delete a data source that is being used by an existing workload or template.

      "},{"location":"Researcher/workloads/assets/datasources/#using-api","title":"Using API","text":"

      To view the available actions, go to the Data sources API reference.

      "},{"location":"Researcher/workloads/assets/environments/","title":"Environments","text":"

      This article explains what environments are and how to create and use them.

      Environments are one type of workload asset. An environment consists of a configuration that simplifies how workloads are submitted and can be used by AI practitioners when they submit their workloads.

      An environment asset is a preconfigured building block that encapsulates aspects for the workload such as:

      • Container image and container configuration
      • Tools and connections
      • The type of workload it serves
      "},{"location":"Researcher/workloads/assets/environments/#environments-table","title":"Environments table","text":"

      The Environments table can be found under Workload manager in the Run:ai platform.

      The Environments table provides a list of all the environments defined in the platform and allows you to manage them.

      The Environments table consists of the following columns:

      • Environment - The name of the environment
      • Description - A description of the environment
      • Scope - The scope of this environment within the organizational tree. Click the name of the scope to view the organizational tree diagram
      • Image - The application or service to be run by the workload
      • Workload Architecture - This can be either standard for running workloads on a single node or distributed for running distributed workloads on multiple nodes
      • Tool(s) - The tools and connection types the environment exposes
      • Workload(s) - The list of existing workloads that use the environment
      • Workload types - The workload types that can use the environment (Workspace / Training / Inference)
      • Template(s) - The list of workload templates that use this environment
      • Created by - The user who created the environment. By default, the Run:ai UI comes with preinstalled environments created by Run:ai
      • Creation time - The timestamp of when the environment was created
      • Last updated - The timestamp of when the environment was last updated
      • Cluster - The cluster with which the environment is associated

      "},{"location":"Researcher/workloads/assets/environments/#tools-associated-with-the-environment","title":"Tools associated with the environment","text":"

      Click one of the values in the tools column to view the list of tools and their connection type.

      • Tool name - The name of the tool or application the AI practitioner can set up within the environment
      • Connection type - The method by which you can access and interact with the running workload. It's essentially the \"doorway\" through which you can reach and use the tools the workload provides (e.g., node port, external URL, etc.)

      "},{"location":"Researcher/workloads/assets/environments/#workloads-associated-with-the-environment","title":"Workloads associated with the environment","text":"

      Click one of the values in the Workload(s) column to view the list of workloads and their parameters.

      • Workload - The workload that uses the environment
      • Type - The workload type (Workspace/Training/Inference)
      • Status - Represents the workload lifecycle. See the full list of workload status

      "},{"location":"Researcher/workloads/assets/environments/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then Click Download as CSV. Export to CSV is limited to 20,000 rows.
      "},{"location":"Researcher/workloads/assets/environments/#environments-created-by-runai","title":"Environments created by Run:ai","text":"

      When installing Run:ai, you automatically get the environments created by Run:ai to ease the onboarding process and support different use cases out of the box. These environments are created at the scope of the account.

      • jupyter-lab - jupyter/scipy-notebook
      • jupyter-tensorboard - gcr.io/run-ai-demo/jupyter-tensorboard
      • tensorboard - tensorflow/tensorflow:latest
      • llm-server - runai.jfrog.io/core-llm/runai-vllm:v0.6.4-0.10.0
      • chatbot-ui - runai.jfrog.io/core-llm/llm-app
      • gpt2 - runai.jfrog.io/core-llm/quickstart-inference:gpt2-cpu

      "},{"location":"Researcher/workloads/assets/environments/#adding-a-new-environment","title":"Adding a new environment","text":"

      Environment creation is limited to specific roles

      To add a new environment:

      1. Go to the Environments table
      2. Click +NEW ENVIRONMENT
      3. Select under which cluster to create the environment
      4. Select a scope
      5. Enter a name for the environment. The name must be unique.
      6. Optional: Provide a description of the essence of the environment
      7. Enter the Image URL. If a token or secret is required to pull the image, it is possible to create it via credentials of type docker registry. These credentials are automatically used once the image is pulled (which happens when the workload is submitted)
      8. Set the image pull policy - the condition for when to pull the image from the registry
      9. Set the workload architecture:
        • Standard Only standard workloads can use the environment. A standard workload consists of a single process.
        • Distributed Only distributed workloads can use the environment. A distributed workload consists of multiple processes working together. These processes can run on different nodes.
        • Select a framework from the list.
      10. Set the workload type:
        • Workspace
        • Training
        • Inference
        • When inference is selected, define the endpoint of the model by providing both the protocol and the container\u2019s serving port
      11. Optional: Set the connection for your tool(s). The tools must be configured in the image. When submitting a workload using the environment, it is possible to connect to these tools
        • Select the tool from the list (the available tools vary from IDEs to experiment tracking and more, including a custom tool of your choice)
        • Select the connection type
          • External URL
            • Auto generate A unique URL is automatically created for each workload using the environment
            • Custom URL The URL is set manually
          • Node port
            • Auto generate A unique port is automatically exposed for each workload using the environment
            • Custom port Set the port manually
          • Set the container port
      12. Optional: Set a command and arguments for the container running the pod
        • When no command is added, the default command of the image is used (the image entrypoint)
        • The command can be modified while submitting a workload using the environment
        • The argument(s) can be modified while submitting a workload using the environment
      13. Optional: Set the environment variable(s)
        • Click +ENVIRONMENT VARIABLE
        • Enter a name
        • Select the source for the environment variable
        • Custom
          • Enter a value
          • Leave empty
          • Add instructions for the expected value if any
        • Credentials - Select existing credentials as the environment variable
          • Select a credential name To add new credentials to the credentials list, and for additional information, see Credentials.
          • Select a secret key
        • The environment variables can be modified and new variables can be added while submitting a workload using the environment
      14. Optional: Set the container\u2019s working directory to define where the container\u2019s process starts running. When left empty, the default directory is used.
      15. Optional: Set where the UID, GID and supplementary groups are taken from. This can be:
        • From the image
        • From the IdP token (only available in SSO installations)
        • Custom (manually set) - decide whether the submitter can modify these values upon submission.
        • Set the User ID (UID), Group ID (GID) and the supplementary groups that can run commands in the container
          • Enter UID
          • Enter GID
          • Add Supplementary groups (multiple groups can be added, separated by commas)
          • Disable 'Allow the values above to be modified within the workload' if you want the values above to be used as the default
      16. Optional: Select Linux capabilities - Grant certain privileges to a container without granting all the privileges of the root user.
      17. Click CREATE ENVIRONMENT

      Note

      It is also possible to add environments directly when creating a specific workspace, training or inference workload.

      "},{"location":"Researcher/workloads/assets/environments/#editing-an-environment","title":"Editing an environment","text":"

      To edit an environment:

      1. Select the environment you want to edit
      2. Click Edit
      3. Click SAVE ENVIRONMENT

      Note

      • Workloads that are already using this asset will not be affected.
      • llm-server and chatbot-ui environments cannot be edited.
      "},{"location":"Researcher/workloads/assets/environments/#copying-an-environment","title":"Copying an environment","text":"

      To make a copy of an existing environment:

      1. Select the environment you want to copy
      2. Click MAKE A COPY
      3. Enter a name for the environment. The name must be unique.
      4. Update the environment
      5. Click CREATE ENVIRONMENT
      "},{"location":"Researcher/workloads/assets/environments/#deleting-an-environment","title":"Deleting an environment","text":"

      To delete an environment:

      1. Select the environment you want to delete
      2. Click DELETE
      3. On the dialog, click DELETE to confirm

      Note

      Workloads that are already using this asset will not be affected.

      "},{"location":"Researcher/workloads/assets/environments/#using-api","title":"Using API","text":"

      Go to the Environment API reference to view the available actions

      "},{"location":"Researcher/workloads/assets/overview/","title":"Overview","text":"

      Workload assets enable organizations to:

      • Create and reuse preconfigured setup for code, data, storage and resources to be used by AI practitioners to simplify the process of submitting workloads
      • Share the preconfigured setup with a wide audience of AI practitioners with similar needs

      Note

      • The creation of assets is possible only via API and the Run:ai UI
      • The submission of workloads using assets is possible only via the Run:ai UI
      "},{"location":"Researcher/workloads/assets/overview/#workload-asset-types","title":"Workload asset types","text":"

      There are four workload asset types used by the workload:

      • Environments The container image, tools and connections for the workload
      • Data sources The type of data, its origin and the target storage location such as PVCs or cloud storage buckets where datasets are stored
      • Compute resources The compute specification, including GPU and CPU compute and memory
      • Credentials The secrets to be used to access sensitive data, services, and applications such as docker registry or S3 buckets
      "},{"location":"Researcher/workloads/assets/overview/#asset-scope","title":"Asset scope","text":"

      When a workload asset is created, a scope is required. The scope defines who in the organization can view and/or use the asset.

      Note

      When an asset is created via API, the scope can be the entire account. This is currently an experimental feature.

      "},{"location":"Researcher/workloads/assets/overview/#who-can-create-an-asset","title":"Who can create an asset?","text":"

      Any subject (user, application, or SSO group) with a role that has permissions to Create an asset, can do so within their scope.

      "},{"location":"Researcher/workloads/assets/overview/#who-can-use-an-asset","title":"Who can use an asset?","text":"

      Assets are used when submitting workloads. Any subject (user, application or SSO group) with a role that has permissions to Create workloads, can also use assets.

      "},{"location":"Researcher/workloads/assets/overview/#who-can-view-an-asset","title":"Who can view an asset?","text":"

      Any subject (user, application, or SSO group) with a role that has permission to View an asset, can do so within their scope.

      "},{"location":"Researcher/workloads/assets/templates/","title":"Workspace Templates","text":"

      This article explains the procedure to manage templates.

      A template is a pre-set configuration that is used to quickly configure and submit workloads using existing assets. A template consists of all the assets a workload needs, allowing researchers to submit a workload in a single click, or make subtle adjustments to differentiate them from each other.

      "},{"location":"Researcher/workloads/assets/templates/#workspace-templates-table","title":"Workspace templates table","text":"

      The Templates table can be found under Workload manager in the Run:ai User interface.

      The Templates table provides a list of all the templates defined in the platform, and allows you to manage them.

      Flexible Management

      It is also possible to manage templates directly for a specific user, application, project, or department.

      The Templates table consists of the following columns:

      • Scope - The scope to which the subject has access. Click the name of the scope to see the scope and its subordinates
      • Environment - The name of the environment related to the workspace template
      • Compute resource - The name of the compute resource connected to the workspace template
      • Data source(s) - The name of the data source(s) connected to the workspace template
      • Created by - The subject that created the template
      • Creation time - The timestamp for when the template was created
      • Cluster - The cluster name containing the template

      "},{"location":"Researcher/workloads/assets/templates/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
      • Refresh (optional) - Click REFRESH to update the table with the latest data
      • Show/Hide details (optional) - Click to view additional information on the selected row
      "},{"location":"Researcher/workloads/assets/templates/#adding-a-new-workspace-template","title":"Adding a new workspace template","text":"

      To add a new template:

      1. Click +NEW TEMPLATE
      2. Set the scope for the template
      3. Enter a name for the template
      4. Select the environment for your workload
      5. Select the node resources needed to run your workload - or - Click +NEW COMPUTE RESOURCE

      6. Set the volume needed for your workload

      7. Create a new data source
      8. Set auto-deletion, annotations and labels, as required
      9. Click CREATE TEMPLATE
      "},{"location":"Researcher/workloads/assets/templates/#editing-a-template","title":"Editing a template","text":"

      To edit a template:

      1. Select the template from the table
      2. Click Rename to provide it with a new name
      3. Click Copy & Edit to make any changes to the template
      "},{"location":"Researcher/workloads/assets/templates/#deleting-a-template","title":"Deleting a template","text":"

      To delete a template:

      1. Select the template you want to delete
      2. Click DELETE
      3. Confirm you want to delete the template
      "},{"location":"Researcher/workloads/assets/templates/#using-api","title":"Using API**","text":"

      Go to the Workload template API reference to view the available actions

      "},{"location":"Researcher/workloads/inference/custom-inference/","title":"Deploy a custom inference workload","text":"

      This article explains how to create a custom inference workload via the Run:ai UI.

      An inference workload provides the setup and configuration needed to deploy your trained model for real-time or batch predictions. It includes specifications for the container image, data sets, network settings, and resource requests required to serve your models.

      The inference workload is assigned to a project and is affected by the project\u2019s quota.

      To learn more about the inference workload type in Run:ai and determine that it is the most suitable workload type for your goals, see Workload types.

      "},{"location":"Researcher/workloads/inference/custom-inference/#creating-a-custom-inference-workload","title":"Creating a custom inference workload","text":"

      Before you start, make sure you have a project.

      To add a new custom inference workload:

      1. Go to the Workload manager \u2192 Workloads
      2. Click +NEW WORKLOAD and select Inference Within the new inference form:
      3. Select under which cluster to create the inference workload
      4. Select the project in which your inference will run
      5. Select custom inference from Inference type

        Note

        Selecting the Inference type is disabled by default. If you cannot see it in the menu, then it must be enabled by your Administrator, under General settings \u2192 Workloads \u2192 Models.

      6. Enter a unique name for the inference workload (if the name already exists in the project, you will be requested to submit a different name)

      7. Click CONTINUE In the next step:
      8. Select the environment for your inference workload

        • Select an environment or click +NEW ENVIRONMENT to add a new environment to the gallery. For a step-by-step guide on adding environments to the gallery, see Environments. Once created, the new environment will be automatically selected.
        • Set an inference serving endpoint. The connection protocol and the container port are defined within the environment

          • Optional: Modify who can access the endpoint

            • Public (default)

              Everyone within the network can access the endpoint with no authentication

            • All authenticated users

              Everyone within the organization\u2019s account that can log in (to Run:ai or SSO)

            • Specific group(s)

              • Click +GROUP
              • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the endpoint.
            • Specific user(s)

              • Click +USER
              • Enter a valid email address or username. If you remove yourself, you will lose access to the endpoint.
        • Set the connection for your tool(s). The tools are configured as part of the environment.

          • External URL
            • Custom URL
              • Set the URL
            • Optional: Modify who can access the tool:
              • All authenticated users (default) Everyone within the organization\u2019s account
              • Specific group(s)
                • Click +GROUP
                • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the tool.
              • Specific user(s)
                • Click +USER
                • Enter a valid email address or username. If you remove yourself, you will lose access to the tool.
          • Node port
            • Custom port
              • Set the node port (enter a port between 30000 and 32767; if the node port is already in use, the workload will fail and display an error message)
        • Optional: Set the command and arguments for the container running the workload If no command is added, the container will use the image\u2019s default command (entry-point).
          • Modify the existing command or click +COMMAND & ARGUMENTS to add a new command.
          • Set multiple arguments separated by spaces, using the following format (e.g.: --arg1=val1).
        • Set the environment variable(s)
          • Modify the existing environment variable(s). The existing environment variables may include instructions to guide you with entering the correct values.
          • (Optional) Add new variables
            • Click +ENVIRONMENT VARIABLE
            • Enter a name
            • Select the source for the environment variable
              • Custom
                • Enter a value according to the provided instructions
              • Credentials - Select existing credentials as the environment variable
                • Select a credential name To add new credentials to the credentials list, and for additional information, see Credentials.
                • Select a secret key
      9. Select the compute resource for your inference workload

        • Select a compute resource or click +NEW COMPUTE RESOURCE to add a new compute resource to the gallery. For a step-by-step guide on adding compute resources to the gallery, see compute resources. Once created, the new compute resource will be automatically selected.
        • Optional: Set the minimum and maximum number of replicas to be scaled up and down to meet the changing demands of inference services.
        • If the minimum and maximum number of replicas are different, autoscaling will be triggered and you'll need to set conditions for creating a new replica. A replica will be created every time a condition is met. When a condition is no longer met after a replica was created, the replica will be automatically deleted to save resources.

          • Select a variable - The variable's values will be monitored via the container's port.
            • Latency (milliseconds)
            • Throughput (Requests/sec)
            • Concurrency (Requests)
          • Set a value - This value is the threshold at which autoscaling is triggered.
        • Optional: Set when the replicas should be automatically scaled down to zero. This allows compute resources to be freed up when the model is inactive (i.e., there are no requests being sent). When automatic scaling to zero is enabled, the minimum number of replicas set in the previous step automatically changes to 0

        • Optional: Set the order of priority for the node pools on which the scheduler tries to run the workload. When a workload is created, the scheduler will try to run it on the first node pool on the list. If the node pool doesn't have free resources, the scheduler will move on to the next one until it finds one that is available.
          • Drag and drop them to change the order, remove unwanted ones, or reset to the default order defined in the project.
          • Click +NODE POOL to add a new node pool from the list of node pools that were defined on the cluster. To configure a new node pool and for additional information, see node pools.
        • Select a node affinity to schedule the workload on a specific node type. If the administrator added a \u2018node type (affinity)\u2019 scheduling rule to the project/department, then this field is mandatory. Otherwise, entering a node type (affinity) is optional. Nodes must be tagged with a label that matches the node type key and value.
        • Optional: Set toleration(s) to allow the workload to be scheduled on a node with a matching taint (a sketch of the equivalent Kubernetes toleration appears after these steps)

          Note

          Tolerations are disabled by default. If you cannot see Tolerations in the menu, then it must be enabled by your Administrator, under General settings \u2192 Workloads \u2192 Tolerations

          • Click +TOLERATION
          • Enter a key
          • Select the operator
            • Exists - If the key exists on the node, the effect will be applied.
            • Equals - If the key and the value set below match the value on the node, the effect will be applied
              • Enter a value matching the value on the node
          • Select the effect for the toleration
            • NoExecute - Pods that do not tolerate this taint are evicted immediately.
            • NoSchedule - No new pods will be scheduled on the tainted node unless they have a matching toleration. Pods currently running on the node will not be evicted.
            • PreferNoSchedule - The control plane will try to avoid placing a pod that does not tolerate the taint on the node, but it is not guaranteed.
            • Any - All effects above match.
          • Optional: Select data sources for your inference workload. Select a data source or click +NEW DATA SOURCE to add a new data source to the gallery. If there are issues with the connectivity to the cluster, or issues while creating the data source, the data source won't be available for selection. For a step-by-step guide on adding data sources to the gallery, see data sources. Once created, the new data source will be automatically selected.
            • Optional: Modify the data target location for the selected data source(s).
      10. Optional - General settings:

        • Set the timeframe for auto-deletion after workload completion or failure. The time after which a completed or failed workload is deleted; if this field is set to 0 seconds, the workload will be deleted automatically.
        • Set annotation(s). Kubernetes annotations are key-value pairs attached to the workload. They are used for storing additional descriptive metadata to enable documentation, monitoring and automation.
          • Click +ANNOTATION
          • Enter a name
          • Enter a value
        • Set label(s). Kubernetes labels are key-value pairs attached to the workload. They are used to categorize workloads and enable querying.
          • Enter a name
          • Enter a value
      11. Click CREATE INFERENCE
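
      For reference, the toleration fields described in the steps above presumably map to a standard Kubernetes toleration on the workload's pods. The sketch below uses a placeholder taint (dedicated=inference) and node name; note that in the Kubernetes API the operator shown as Equals in the UI is spelled Equal.

# Taint a node (placeholder node name, key, and value) ...
kubectl taint nodes <node-name> dedicated=inference:NoSchedule

# ... then a pod needs a matching toleration to be scheduled on it.
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: Pod
metadata:
  name: toleration-example
spec:
  containers:
  - name: app
    image: busybox
    command: ["sleep", "3600"]
  tolerations:
  - key: "dedicated"
    operator: "Equal"          # the UI's "Equals"; use "Exists" to match on the key alone
    value: "inference"
    effect: "NoSchedule"
EOF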
      "},{"location":"Researcher/workloads/inference/custom-inference/#managing-and-monitoring","title":"Managing and monitoring","text":"

      After the inference workload is created, it is added to the Workloads table, where it can be managed and monitored.

      "},{"location":"Researcher/workloads/inference/custom-inference/#using-api","title":"Using API","text":"

      To view the available actions, see the Inferences API reference.

      "},{"location":"Researcher/workloads/inference/hugging-face-inference/","title":"Deploy inference workloads from Hugging Face","text":"

      This article explains how to create an inference workload via the Run:ai UI using Hugging Face inference models.

      An inference workload provides the setup and configuration needed to deploy your trained model for real-time or batch predictions. It includes specifications for the container image, data sets, network settings, and resource requests required to serve your models.

      The inference workload is assigned to a project and is affected by the project\u2019s quota.

      To learn more about the inference workload type in Run:ai and determine that it is the most suitable workload type for your goals, see Workload types.

      "},{"location":"Researcher/workloads/inference/hugging-face-inference/#creating-a-hugging-face-inference-workload","title":"Creating a Hugging Face inference workload","text":"

      Before you start, make sure you have a project.

      To add a new inference workload:

      1. Go to the Workload manager \u2192 Workloads
      2. Click +NEW WORKLOAD and select Inference Within the new inference form:
      3. Select under which cluster to create the inference workload
      4. Select the project in which your inference will run
      5. Select Hugging Face from Inference type

        Note

        Selecting the Inference type is disabled by default. If you cannot see it in the menu, then it must be enabled by your Administrator, under General settings \u2192 Workloads \u2192 Models.

      6. Enter a unique name for the inference workload (if the name already exists in the project, you will be requested to submit a different name)

      7. Click CONTINUE In the next step:
      8. Set the model and how to access it

        • Set the model name as displayed in Hugging Face. The model must be supported by vLLM version 0.6.4.
          • Enter a name
        • Set how to access Hugging Face

          • Provide a token
            • Access token
              • Enter a token
          • Select credentials
            • Select existing credentials. Make sure the existing credentials contain an HF_TOKEN key
            • Add new credentials with an HF_TOKEN

              Within the new credentials form:

              • Enter a name for the credential. The name must be unique.
              • Optional: Provide a description of the credentials
              • Set how the credential is created

                • Existing secret (in the cluster)

                  This option applies when the purpose is to create credentials based on an existing secret

                  • Select a secret from the list (the list is empty if no secrets were created in advance)
                • New secret

                  A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.

                  • Enter a key
                  • Enter the HF_TOKEN as the value
        • Optional: Modify who can access the inference serving endpoint

          • Public (default)

            Everyone within the network can access the endpoint with no authentication

          • All authenticated users

            Everyone within the organization\u2019s account that can log in (to Run:ai or SSO)

          • Specific group(s)

            • Click +GROUP
            • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the endpoint.
          • Specific user(s)

            • Click +USER
            • Enter a valid email address or username. If you remove yourself, you will lose access to the endpoint.
      9. Select the compute resource for your inference workload

        • Select a compute resource or click +NEW COMPUTE RESOURCE to add a new compute resource to the gallery. For a step-by-step guide on adding compute resources to the gallery, see compute resources. Once created, the new compute resource will be automatically selected.
        • Optional: Set the minimum and maximum number of replicas to be scaled up and down to meet the changing demands of inference services.
        • If the minimum and maximum number of replicas are different, autoscaling will be triggered and you'll need to set conditions for creating a new replica. A replica will be created every time a condition is met. When a condition is no longer met after a replica was created, the replica will be automatically deleted to save resources.

          • Select a variable - The variable's values will be monitored via the container's port.
            • Latency (milliseconds)
            • Throughput (Requests/sec)
            • Concurrency (Requests)
          • Set a value - This value is the threshold at which autoscaling is triggered.
        • Optional: Set when the replicas should be automatically scaled down to zero. This allows the compute resources to be freed up when the model is inactive (i.e., there are no requests being sent). When automatic scaling to zero is enabled, the minimum number of replicas set in the previous step automatically changes to 0

        • Optional: Set the order of priority for the node pools on which the scheduler tries to run the workload. When a workload is created, the scheduler will try to run it on the first node pool on the list. If the node pool doesn't have free resources, the scheduler will move on to the next one until it finds one that is available.
          • Drag and drop them to change the order, remove unwanted ones, or reset to the default order defined in the project.
          • Click +NODE POOL to add a new node pool from the list of node pools that were defined on the cluster. To configure a new node pool and for additional information, see node pools.
        • Select a node affinity to schedule the workload on a specific node type. If the administrator added a \u2018node type (affinity)\u2019 scheduling rule to the project/department, then this field is mandatory. Otherwise, entering a node type (affinity) is optional. Nodes must be tagged with a label that matches the node type key and value.
        • Optional: Set toleration(s) to allow the workload to be scheduled on a node with a matching taint

          Note

          Tolerations are disabled by default. If you cannot see Tolerations in the menu, then it must be enabled by your Administrator, under General settings \u2192 Workloads \u2192 Tolerations

          • Click +TOLERATION
          • Enter a key
          • Select the operator
            • Exists - If the key exists on the node, the effect will be applied.
            • Equals - If the key and the value set below match the value on the node, the effect will be applied
              • Enter a value matching the value on the node
          • Select the effect for the toleration
            • NoExecute - Pods that do not tolerate this taint are evicted immediately.
            • NoSchedule - No new pods will be scheduled on the tainted node unless they have a matching toleration. Pods currently running on the node will not be evicted.
            • PreferNoSchedule - The control plane will try to avoid placing a pod that does not tolerate the taint on the node, but it is not guaranteed.
            • Any - All effects above match.
      10. Optional - General settings:

        • Set the timeframe for auto-deletion after workload completion or failure. The time after which a completed or failed workload is deleted; if this field is set to 0 seconds, the workload will be deleted automatically.
        • Set annotation(s). Kubernetes annotations are key-value pairs attached to the workload. They are used for storing additional descriptive metadata to enable documentation, monitoring and automation.
          • Click +ANNOTATION
          • Enter a name
          • Enter a value
        • Set label(s). Kubernetes labels are key-value pairs attached to the workload. They are used to categorize workloads and enable querying.
          • Enter a name
          • Enter a value
      11. Click CREATE INFERENCE
      "},{"location":"Researcher/workloads/inference/hugging-face-inference/#managing-and-monitoring","title":"Managing and monitoring","text":"

      After the inference workload is created, it is added to the Workloads table, where it can be managed and monitored.

      "},{"location":"Researcher/workloads/inference/inference-overview/","title":"Inference overview","text":""},{"location":"Researcher/workloads/inference/inference-overview/#what-is-inference","title":"What is Inference","text":"

      Machine learning (ML) inference is the process of running live data points into a machine-learning algorithm to calculate an output.

      With Inference workloads, you are taking a trained Model and deploying it into a production environment. The deployment must align with the organization's production standards, such as average and 95th-percentile response time, as well as uptime.

      "},{"location":"Researcher/workloads/inference/inference-overview/#inference-and-gpus","title":"Inference and GPUs","text":"

      The Inference process is a subset of the original Training algorithm on a single datum (e.g. one sentence or one image), or a small batch. As such, GPU memory requirements are typically smaller than a full-blown Training process.

      Given that, Inference lends itself nicely to the usage of Run:ai Fractions. You can, for example, run 4 instances of an Inference server on a single GPU, each employing a fourth of the memory.

      "},{"location":"Researcher/workloads/inference/inference-overview/#inference-runai","title":"Inference @Run:ai","text":"

      Run:ai provides Inference services as an equal part together with the other two Workload types: Train and Build.

      • Inference is considered a high-priority workload as it is customer-facing. Running an Inference workload (within the Project's quota) will preempt any Run:ai Workload marked as Training.

      • Inference workloads will receive priority over Train and Build workloads during scheduling.

      • Inference is implemented as a Kubernetes Deployment object with a defined number of replicas. The replicas are load-balanced by Kubernetes so adding more replicas will improve the overall throughput of the system.

      • Multiple replicas will appear in Run:ai as a single Inference workload. The workload will appear in all Run:ai dashboards and views as well as the Command-line interface.

      • Inference workloads can be submitted via Run:ai user interface as well as Run:ai API. Internally, spawning an Inference workload also creates a Kubernetes Service. The service is an end-point to which clients can connect.

      "},{"location":"Researcher/workloads/inference/inference-overview/#autoscaling","title":"Autoscaling","text":"

      To meet SLAs, Inference workloads are typically set with autoscaling. Autoscaling is the ability to add more computing power (Kubernetes pods) when the load increases and shrink allocated resources when the system is idle. There are several ways to trigger autoscaling. Run:ai supports the following metrics:

      | Metric | Units |
      |---|---|
      | Latency | Milliseconds |
      | Throughput | Requests/sec |
      | Concurrency | Requests |

      The Minimum and Maximum number of replicas can be configured as part of the autoscaling configuration.

      Autoscaling also supports a scale-to-zero policy with Throughput and Concurrency metrics, meaning that given enough time under the target threshold, the number of replicas will be scaled down to 0.

      This has the benefit of conserving resources at the risk of a delay from \"cold starting\" the model when traffic resumes.
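      For illustration, an autoscaling section of an inference API request could look like the fragment below, scaling between 0 and 4 replicas on a Concurrency target of 10 requests. The field names (minReplicas, maxReplicas, metric, metricThreshold) are assumptions to confirm against the Inference API reference.

      \"autoscaling\": { \n    \"minReplicas\": 0, \n    \"maxReplicas\": 4, \n    \"metric\": \"concurrency\", \n    \"metricThreshold\": 10 \n}\n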

      "},{"location":"Researcher/workloads/inference/inference-overview/#rolling-inference-updates","title":"Rolling inference updates","text":"

      When deploying models and running inference workloads, you may need to update the workload configuration live, without disrupting the services the workload provides.

      This means you can submit updates to an existing inference workload whether it is currently running, pending, or any other status.

      The following are a few examples of updates that can be implemented:

      • Changing the container image to deploy a new version of the model
      • Changing different parameters (such as environment variables)
      • Changing compute resources to improve performance
      • Changing the number of replicas and scale plan to adapt to requirement changes and scales

      During the update and until its successful completion, the service that the workload provides is not jeopardized as these are production-grade workloads. Hence, consumers can continue using the model (interact with the LLM) during the update process.

      During the update process of an inference workload, a new revision of pod(s) is created; this revision is the new desired specification of the workload. Several updates can be submitted consecutively, even before the previous update completes. The target is always the most recently submitted update, and earlier, unfinished updates are superseded.

      Once the new revision is fully created and up and running, all request traffic is routed to the new revision, the original revision is terminated, and its resources are returned to the shared pool. Only then is the update process considered complete.
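      As a sketch only, an update that swaps just the container image could be submitted as below. The HTTP method, path, and payload shape are assumptions; consult the Inference API reference for the exact update call and the list of updatable fields noted below.

      curl -L -X PUT 'https://<COMPANY-URL>/api/v1/workloads/inferences/<WORKLOAD-ID>' \\\n-H 'Content-Type: application/json' \\\n-H 'Authorization: Bearer <TOKEN>' \\\n-d '{ \"spec\": { \"image\": \"<NEW-IMAGE>\" } }'\n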

      It is important to note that:

      • To finish the inference workload update successfully, the project must have sufficient free GPU quota to accommodate the update. For example:

        • The existing workload uses 3 replicas: A running inference workload with 3 replicas, assuming that each replica is equal to 1 GPU, means the project is already using 3 GPUs of its quota. For the sake of simplicity, we will refer to this revision as revision #1.

        • The workload is updated to use 8 replicas: This means that, to complete the update, an additional 8 GPUs of free quota are needed. Only when the update is complete are the 3 GPUs used by the initial revision (revision #1) reclaimed.

      • In the UI, the Workloads table displays the configuration of the latest submitted update. For example, if you change the container image, the image column in the running / requested pods will display the name of the updated image. The status of the workload continues to reflect the operational state of the service the workload exposes. For instance, during an update, the workload status remains \"Running\" if the service is still being delivered to consumers. Hovering over the workload's status in the grid displays the phase message for the update, offering additional insight into its state.

      • The submission of inference updates is currently possible only via API. The following are the API fields that can be updated:

        • Command
        • Args
        • Image
        • imagePullPolicy
        • workingDir
        • createHomeDir
        • Probes
        • environmentVariables
        • Autoscaling
      • As long as the update process is not completed, GPUs are not allocated to the replicas of the new revision. This prevents allocating GPUs that would sit idle, so that other workloads are not deprived of them.

      • If the update process is not completed within the default time limit of 10 minutes, it will automatically stop. At that point, all replicas of the new revision will be removed, and the original revision will continue to run normally.
      • The default time limit for updates is configurable. Consider setting a longer duration if your workload requires extended time to pull the image due to its size, if the workload takes additional time to reach a 'READY' state due to a long initialization process, or if your cluster depends on autoscaling to allocate resources for new replicas. For example, to set the time limit to 30 minutes, you can run the following command:
        kubectl patch ConfigMap config-deployment -n knative-serving --type='merge' -p '{\"data\": {\"progress-deadline\": \"1800s\"}}'\n
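        To confirm the new value was applied, you can read it back from the same ConfigMap, for example:
        kubectl get ConfigMap config-deployment -n knative-serving -o jsonpath='{.data.progress-deadline}'\n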
      "},{"location":"Researcher/workloads/inference/inference-overview/#inference-workloads-with-knative-new-behavior-in-v219","title":"Inference workloads with Knative new behavior in v2.19","text":"

      Starting in version 2.19, all pods of a single Knative revision are grouped under a single Pod-Group. This means that when a new Knative revision is created:

      • It either succeeds in allocating the minimum number of pods; or
      • It fails and moves into a pending state, to retry later to allocate all pods with their resources.

      The resources (GPUs, CPUs) are not occupied by a new Knative revision until it succeeds in allocating all pods. The older revision pods are then terminated and release their resources (GPUs, CPUs) back to the cluster to be used by other workloads.
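      To see which revisions of an inference workload exist and which one is ready and serving traffic, you can list the Knative revisions in the project's namespace, for example as below; the runai-<project-name> namespace is an assumption based on Run:ai's per-project namespace convention.

      kubectl get revisions -n runai-<project-name>\n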

      "},{"location":"Researcher/workloads/inference/inference-overview/#see-also","title":"See Also","text":"
      • To set up Inference, see Cluster installation prerequisites.
      • For running Inference see Inference quick-start.
      • To run Inference using API see Workload overview.
      "},{"location":"Researcher/workloads/inference/nim-inference/","title":"Deploy inference workloads with NVIDIA NIM","text":"

      This article explains how to deploy a GenAI model from NVIDIA NIM as an inference workload via the Run:ai UI.

      An inference workload provides the setup and configuration needed to deploy your trained model for real-time or batch predictions. It includes specifications for the container image, data sets, network settings, and resource requests required to serve your models.

      The inference workload is assigned to a project and is affected by the project\u2019s quota.

      To learn more about the inference workload type in Run:ai and determine that it is the most suitable workload type for your goals, see Workload types.

      "},{"location":"Researcher/workloads/inference/nim-inference/#creating-a-nim-inference-workload","title":"Creating a NIM inference workload","text":"

      Before you start, make sure you have a project.

      To add a new inference workload:

      1. Go to the Workload manager \u2192 Workloads
      2. Click +NEW WORKLOAD and select Inference. Within the new inference form:
      3. Select under which cluster to create the inference workload
      4. Select the project in which your inference will run
      5. Select NIM from Inference type

        Note

        Selecting the Inference type is disabled by default. If you cannot see it in the menu, then it must be enabled by your Administrator, under General settings \u2192 Workloads \u2192 Models.

      6. Enter a unique name for the inference workload (if the name already exists in the project, you will be requested to submit a different name)

      7. Click CONTINUE. In the next step:
      8. Select the NIM model and set how to access it

        • Set the model name by selecting a model or entering the model name as displayed in NIM
        • Set how the model profile should be selected

          A NIM model profile sets compatible model engines and criteria for engine selection, such as precision, latency, throughput optimization, and GPU requirements. Profiles are optimized to balance either latency or throughput, with quantized profiles (e.g., fp8) preferred to reduce memory usage and enhance performance.

          • Automatically (recommended) NIM is designed to automatically select the most suitable profile from the list of compatible profiles based on the detected hardware. Each profile consists of different parameters that influence the selection process.
          • Manually
            • Enter profile name or hash
        • Optional: Modify who can access the inference serving endpoint

          • Public (default)

            Everyone within the network can access the endpoint with no authentication

          • All authenticated users

            Everyone within the organization\u2019s account that can log in (to Run:ai or SSO)

          • Specific group(s)

            • Click +GROUP
            • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the endpoint.
          • Specific user(s)

            • Click +USER
            • Enter a valid email address or username. If you remove yourself, you will lose access to the endpoint.
      9. Select how to access the model store

        • From NVIDIA NGC - The model is downloaded when the workload starts running
          • Set the NVIDIA NGC API key
            • Enter a key
            • (Optional) Click Storage - When downloading a model from NVIDIA NGC, selecting storage is recommended. Select a data source where the model is already cached to reduce loading time or click +NEW DATA SOURCE to add a new data source to the gallery. This will cache the model and reduce loading time for future use. If there are issues with the connectivity to the cluster, or issues while creating the data source, the data source won't be available for selection. For a step-by-step guide on adding data sources to the gallery, see data sources. Once created, the new data source will be automatically selected.
        • From storage - The model is accessed directly and without being downloaded
          • Storage - Set where to load the model Select a data source or click +NEW DATA SOURCE to add a new data source to the gallery. If there are issues with the connectivity to the cluster, or issues while creating the data source, the data source won't be available for selection. For a step-by-step guide on adding data sources to the gallery, see data sources. Once created, the new data source will be automatically selected.
      10. Select the compute resource for your inference workload

        • Select a compute resource or click +NEW COMPUTE RESOURCE to add a new compute resource to the gallery. For a step-by-step guide on adding compute resources to the gallery, see compute resources. Once created, the new compute resource will be automatically selected.
        • Optional: Set the minimum and maximum number of replicas to be scaled up and down to meet the changing demands of inference services.
        • If the minimum and maximum number of replicas differ, autoscaling will be triggered and you'll need to set conditions for creating a new replica. A replica will be created every time a condition is met. When a condition is no longer met after a replica was created, the replica will be automatically deleted to save resources.

          • Select a variable - The variable's values will be monitored via the container's port.
            • Latency (milliseconds)
            • Throughput (Requests/sec)
            • Concurrency (Requests)
          • Set a value - This value is the threshold at which autoscaling is triggered
        • Optional: Set when the replicas should be automatically scaled down to zero. This allows the compute resources to be freed up when the model is inactive (i.e., there are no requests being sent). When automatic scaling to zero is enabled, the minimum number of replicas set in the previous step automatically changes to 0

        • Optional: Set the order of priority for the node pools on which the scheduler tries to run the workload. When a workload is created, the scheduler will try to run it on the first node pool on the list. If the node pool doesn't have free resources, the scheduler will move on to the next one until it finds one that is available.
          • Drag and drop them to change the order, remove unwanted ones, or reset to the default order defined in the project.
          • Click +NODE POOL to add a new node pool from the list of node pools that were defined on the cluster. To configure a new node pool and for additional information, see node pools.
        • Select a node affinity to schedule the workload on a specific node type. If the administrator added a \u2018node type (affinity)\u2019 scheduling rule to the project/department, then this field is mandatory. Otherwise, entering a node type (affinity) is optional. Nodes must be tagged with a label that matches the node type key and value.
        • Optional: Set toleration(s) to allow the workload to be scheduled on a node with a matching taint

          Note

          Tolerations are disabled, by default. If you cannot see Tolerations in the menu, then it must be enabled by your Administrator, under General settings \u2192 Workloads \u2192 Tolerations

          • Click +TOLERATION
          • Enter a key
          • Select the operator
            • Exists - If the key exists on the node, the effect will be applied.
            • Equals - If the key and the value set below match the value on the node, the effect will be applied
              • Enter a value matching the value on the node
          • Select the effect for the toleration
            • NoExecute - Pods that do not tolerate this taint are evicted immediately.
            • NoSchedule - No new pods will be scheduled on the tainted node unless they have a matching toleration. Pods currently running on the node will not be evicted.
            • PreferNoSchedule - The control plane will try to avoid placing a pod that does not tolerate the taint on the node, but it is not guaranteed.
            • Any - All effects above match.
      11. Optional - General settings:

        • Set the timeframe for auto-deletion after workload completion or failure. The time after which a completed or failed workload is deleted; if this field is set to 0 seconds, the workload will be deleted automatically.
        • Set annotation(s) Kubernetes annotations are key-value pairs attached to the workload. They are used for storing additional descriptive metadata to enable documentation, monitoring and automation.
          • Click +ANNOTATION
          • Enter a name
          • Enter a value
        • Set label(s) Kubernetes labels are key-value pairs attached to the workload. They are used for categorizing to enable querying.
          • Enter a name
          • Enter a value
      12. Click CREATE INFERENCE
      "},{"location":"Researcher/workloads/inference/nim-inference/#managing-and-monitoring","title":"Managing and monitoring","text":"

      After the inference workload is created, it is added to the Workloads table, where it can be managed and monitored.

      "},{"location":"Researcher/workloads/overviews/introduction-to-workloads/","title":"Introduction to Workloads","text":"

      Run:ai enhances visibility and simplifies management by monitoring, presenting and orchestrating all AI workloads in the clusters it is installed on. Workloads are the fundamental building blocks for consuming resources, enabling AI practitioners such as researchers, data scientists and engineers to efficiently support the entire life cycle of an AI initiative.

      "},{"location":"Researcher/workloads/overviews/introduction-to-workloads/#workloads-across-the-ai-lifecycle","title":"Workloads across the AI lifecycle","text":"

      A typical AI initiative progresses through several key stages, each with distinct workloads and objectives. With Run:ai, research and engineering teams can host and manage all these workloads to achieve the following:

      • Data preparation: Aggregating, cleaning, normalizing, and labeling data to prepare for training.
      • Training: Conducting resource-intensive model development and iterative performance optimization.
      • Fine-tuning: Adapting pre-trained models to domain-specific data sets while balancing efficiency and performance.
      • Inference: Deploying models for real-time or batch predictions with a focus on low latency and high throughput.
      • Monitoring and optimization: Ensuring ongoing performance by addressing data drift, usage patterns, and retraining as needed.
      "},{"location":"Researcher/workloads/overviews/introduction-to-workloads/#what-is-a-workload","title":"What is a workload?","text":"

      A workload runs in the cluster, is associated with a namespace, and operates to fulfill its targets, whether that is running to completion for a batch job, allocating resources for experimentation in an integrated development environment (IDE)/notebook, or serving inference requests in production.

      The workload, defined by the AI practitioner, consists of:

      • Container images: This includes the application, its dependencies, and the runtime environment.
      • Compute resources: CPU, GPU, and RAM to execute efficiently and address the workload\u2019s needs.
      • Data sets: The data needed for processing, such as training data sets or input from external databases.
      • Credentials: The access to certain data sources or external services, ensuring proper authentication and authorization.
      "},{"location":"Researcher/workloads/overviews/introduction-to-workloads/#workload-scheduling-and-orchestration","title":"Workload scheduling and orchestration","text":"

      Run:ai\u2019s core mission is to optimize AI resource usage at scale. This is achieved through efficient scheduling and orchestrating of all cluster workloads using the Run:ai Scheduler. The Scheduler allows the prioritization of workloads across different departments and projects within the organization at large scales, based on the resource distribution set by the system administrator.

      "},{"location":"Researcher/workloads/overviews/introduction-to-workloads/#runai-and-third-party-workloads","title":"Run:ai and third-party workloads","text":"
      • Run:ai workloads: These workloads are submitted via the Run:ai platform. They are represented by Kubernetes Custom Resource Definitions (CRDs) and APIs. When using Run:ai workloads, a complete Workload and Scheduling Policy solution is offered for administrators to ensure optimizations, governance and security standards are applied.
      • Third-party workloads: These workloads are submitted via third-party applications that use the Run:ai Scheduler. The Run:ai platform manages and monitors these workloads. They enable seamless integrations with external tools, allowing teams and individuals flexibility.
      "},{"location":"Researcher/workloads/overviews/introduction-to-workloads/#levels-of-support","title":"Levels of support","text":"

      Different types of workloads have different levels of support. It is important to understand which capabilities are needed before selecting the workload type to work with. The table below details the level of support for each workload type in Run:ai. Run:ai workloads are fully supported with all of Run:ai's advanced features and capabilities, while third-party workloads are only partially supported. The list of capabilities can change between different Run:ai versions.

      | Functionality | Run:ai workloads: Training - Standard | Run:ai workloads: Workspace | Run:ai workloads: Inference | Run:ai workloads: Training - distributed | Third-party workloads |
      |---|---|---|---|---|---|
      | Fairness | v | v | v | v | v |
      | Priority and preemption | v | v | v | v | v |
      | Over quota | v | v | v | v | v |
      | Node pools | v | v | v | v | v |
      | Bin packing / Spread | v | v | v | v | v |
      | Multi-GPU fractions | v | v | v | v | v |
      | Multi-GPU dynamic fractions | v | v | v | v | v |
      | Node level scheduler | v | v | v | v | v |
      | Multi-GPU memory swap | v | v | v | v | v |
      | Elastic scaling | NA | NA | v | v | v |
      | Gang scheduling | v | v | v | v | v |
      | Monitoring | v | v | v | v | v |
      | RBAC | v | v | v | v | |
      | Workload awareness | v | v | v | v | |
      | Workload submission | v | v | v | v | |
      | Workload actions (stop/run) | v | v | v | v | |
      | Workload Policies | v | v | v | v | |
      | Scheduling rules | v | v | v | v | |

      Note

      Workload awareness

      Specific workload-aware visibility, so that different pods are identified and treated as a single workload (for example GPU utilization, workload view, dashboards).

      "},{"location":"Researcher/workloads/overviews/managing-workloads/","title":"Workloads","text":"

      This article explains the procedure for managing workloads.

      "},{"location":"Researcher/workloads/overviews/managing-workloads/#workloads-table","title":"Workloads table","text":"

      The Workloads table can be found under Workload manager in the Run:ai platform.

      The workloads table provides a list of all the workloads scheduled on the Run:ai Scheduler and allows you to manage them.

      The Workloads table consists of the following columns:

      | Column | Description |
      |---|---|
      | Workload | The name of the workload |
      | Type | The workload type |
      | Preemptible | Is the workload preemptible |
      | Status | The different phases in a workload life cycle |
      | Project | The project in which the workload runs |
      | Department | The department that the workload is associated with. This column is visible only if the department toggle is enabled by your administrator. |
      | Created by | The user who created the workload |
      | Running/requested pods | The number of running pods out of the requested |
      | Creation time | The timestamp for when the workload was created |
      | Completion time | The timestamp the workload reached a terminal state (failed/completed) |
      | Connection(s) | The method by which you can access and interact with the running workload. It's essentially the \"doorway\" through which you can reach and use the tools the workload provides (e.g., node port, external URL). Click one of the values in the column to view the list of connections and their parameters |
      | Data source(s) | Data resources used by the workload |
      | Environment | The environment used by the workload |
      | Workload architecture | Standard or distributed. A standard workload consists of a single process. A distributed workload consists of multiple processes working together. These processes can run on different nodes. |
      | GPU compute request | Amount of GPU devices requested |
      | GPU compute allocation | Amount of GPU devices allocated |
      | GPU memory request | Amount of GPU memory requested |
      | GPU memory allocation | Amount of GPU memory allocated |
      | Idle GPU devices | The number of allocated GPU devices that have been idle for more than 5 minutes |
      | CPU compute request | Amount of CPU cores requested |
      | CPU compute allocation | Amount of CPU cores allocated |
      | CPU memory request | Amount of CPU memory requested |
      | CPU memory allocation | Amount of CPU memory allocated |
      | Cluster | The cluster that the workload is associated with |
      "},{"location":"Researcher/workloads/overviews/managing-workloads/#workload-status","title":"Workload status","text":"

      The following table describes the different phases in a workload life cycle. The UI provides additional details for some of the below workload statuses which can be viewed by clicking the icon next to the status.

      | Status | Description | Entry Condition | Exit Condition |
      |---|---|---|---|
      | Creating | Workload setup is initiated in the cluster. Resources and pods are now provisioning. | A workload is submitted. | A multi-pod group is created. |
      | Pending | Workload is queued and awaiting resource allocation. | A pod group exists. | All pods are scheduled. |
      | Initializing | Workload is retrieving images, starting containers, and preparing pods. | All pods are scheduled. | All pods are initialized or a failure to initialize is detected. |
      | Running | Workload is currently in progress with all pods operational. | All pods initialized (all containers in pods are ready). | Workload completion or failure. |
      | Degraded | Pods may not align with specifications, network services might be incomplete, or persistent volumes may be detached. Check your logs for specific details. | Pending - All pods are running but have issues. Running - All pods are running with no issues. | Running - All resources are OK. Completed - Workload finished with fewer resources. Failed - Workload failure or user-defined rules. |
      | Deleting | Workload and its associated resources are being decommissioned from the cluster. | Deleting the workload. | Resources are fully deleted. |
      | Stopped | Workload is on hold and resources are intact but inactive. | Stopping the workload without deleting resources. | Transitioning back to the initializing phase or proceeding to deleting the workload. |
      | Failed | Image retrieval failed or containers experienced a crash. Check your logs for specific details. | An error occurs preventing the successful completion of the workload. | Terminal state. |
      | Completed | Workload has successfully finished its execution. | The workload has finished processing without errors. | Terminal state. |
      "},{"location":"Researcher/workloads/overviews/managing-workloads/#pods-associated-with-workload","title":"Pods Associated with Workload","text":"

      Click one of the values in the Running/requested pods column, to view the list of pods and their parameters.

      | Column | Description |
      |---|---|
      | Pod | Pod name |
      | Status | Pod lifecycle stages |
      | Node | The node on which the pod resides |
      | Node pool | The node pool in which the pod resides (applicable if node pools are enabled) |
      | Image | The pod's main image |
      | GPU compute allocation | Amount of GPU devices allocated for the pod |
      | GPU memory allocation | Amount of GPU memory allocated for the pod |
      "},{"location":"Researcher/workloads/overviews/managing-workloads/#connections-associated-with-workload","title":"Connections Associated with Workload","text":"

      A connection refers to the method by which you can access and interact with the running workloads. It is essentially the \"doorway\" through which you can reach and use the applications (tools) these workloads provide.

      Click one of the values in the Connection(s) column, to view the list of connections and their parameters. Connections are network interfaces that communicate with the application running in the workload. Connections are either the URL the application exposes or the IP and the port of the node that the workload is running on.

      | Column | Description |
      |---|---|
      | Name | The name of the application running on the workload |
      | Connection type | The network connection type selected for the workload |
      | Access | Who is authorized to use this connection (everyone, specific groups/users) |
      | Address | The connection URL |
      | Copy button | Copy URL to clipboard |
      | Connect button | Enabled only for supported tools |
      "},{"location":"Researcher/workloads/overviews/managing-workloads/#data-sources-associated-with-workload","title":"Data Sources Associated with Workload","text":"

      Click one of the values in the Data source(s) column, to view the list of data sources and their parameters.

      | Column | Description |
      |---|---|
      | Data source | The name of the data source mounted to the workload |
      | Type | The data source type |
      "},{"location":"Researcher/workloads/overviews/managing-workloads/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
      • Refresh - Click REFRESH to update the table with the latest data
      • Show/Hide details - Click to view additional information on the selected row
      "},{"location":"Researcher/workloads/overviews/managing-workloads/#showhide-details","title":"Show/Hide details","text":"

      Click a row in the Workloads table and then click the SHOW DETAILS button at the upper-right side of the action bar. The details pane appears, presenting the following tabs:

      "},{"location":"Researcher/workloads/overviews/managing-workloads/#event-history","title":"Event History","text":"

      Displays the workload status over time. It displays events describing the workload lifecycle and alerts on notable events. Use the filter to search through the history for specific events.

      "},{"location":"Researcher/workloads/overviews/managing-workloads/#metrics","title":"Metrics","text":"
      • GPU utilization - A per-GPU graph and an average of all GPUs, shown on the same chart over an adjustable period, let you see the trends of GPU compute utilization (percentage of GPU compute) in this node.
      • GPU memory utilization - A per-GPU graph and an average of all GPUs, shown on the same chart over an adjustable period, let you see the trends of GPU memory usage (percentage of GPU memory) in this node.
      • CPU compute utilization - A graph of the average compute utilization of all CPU cores, over an adjustable period, lets you see the trends of CPU compute utilization (percentage of CPU compute) in this node.
      • CPU memory utilization - A single graph of the utilization of all CPU memory, over an adjustable period, lets you see the trends of CPU memory utilization (percentage of CPU memory) in this node.
      • CPU memory usage - A single graph of the usage of all CPU memory, over an adjustable period, lets you see the trends of CPU memory usage (in GB or MB of CPU memory) in this node.

      • For GPU charts - Click the GPU legend on the right-hand side of the chart to activate or deactivate any of the GPU lines.

      • You can click the date picker to change the presented period
      • You can use your mouse to mark a sub-period in the graph for zooming in, and use Reset zoom to go back to the preset period
      • Changes in the period affect all graphs on this screen.
      "},{"location":"Researcher/workloads/overviews/managing-workloads/#logs","title":"Logs","text":"

      Workload events are listed in chronological order. The logs contain events from the workload's lifecycle to help monitor and debug issues.

      "},{"location":"Researcher/workloads/overviews/managing-workloads/#adding-new-workload","title":"Adding new workload","text":"

      Before starting, make sure you have created a project or have one created for you to work with workloads.

      To create a new workload:

      1. Click +NEW WORKLOAD
      2. Select a workload type - Follow the links below to view the step-by-step guide for each workload type:
        • Workspace. Used for data preparation and model-building tasks.
        • Training. Used for standard training tasks of all sorts
        • Distributed Training. Used for distributed tasks of all sorts
        • Inference. Used for inference and serving tasks
        • Job (legacy). This type is displayed only if enabled by your Administrator, under General settings \u2192 Workloads \u2192 Workload policies
      3. Click CREATE WORKLOAD
      "},{"location":"Researcher/workloads/overviews/managing-workloads/#stopping-a-workload","title":"Stopping a workload","text":"

      Stopping a workload kills the workload pods and releases the workload resources.

      1. Select the workload you want to stop
      2. Click STOP
      "},{"location":"Researcher/workloads/overviews/managing-workloads/#running-a-workload","title":"Running a workload","text":"

      Running a workload spins up new pods and resumes the workload's work after it was stopped.

      1. Select the workload you want to run again
      2. Click RUN
      "},{"location":"Researcher/workloads/overviews/managing-workloads/#connecting-to-a-workload","title":"Connecting to a workload","text":"

      To connect to an application running in the workload (for example, Jupyter Notebook)

      1. Select the workload you want to connect
      2. Click CONNECT
      3. Select the tool from the drop-down list
      4. The selected tool is opened in a new tab on your browser
      "},{"location":"Researcher/workloads/overviews/managing-workloads/#deleting-a-workload","title":"Deleting a workload","text":"
      1. Select the workload you want to delete
      2. Click DELETE
      3. On the dialog, click DELETE to confirm the deletion

      Note

      Once a workload is deleted you can view it in the Deleted tab in the workloads view. This tab is displayed only if enabled by your Administrator, under General settings \u2192 Workloads \u2192 Deleted workloads

      "},{"location":"Researcher/workloads/overviews/managing-workloads/#copy-edit-a-workload","title":"Copy & Edit a workload","text":"
      1. Select the workload you want to copy and edit
      2. Click COPY & EDIT
      3. Update the workload and click CREATE WORKLOAD
      "},{"location":"Researcher/workloads/overviews/managing-workloads/#using-api","title":"Using API","text":"

      Go to the Workloads API reference to view the available actions
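      For example, the workloads shown in this table can also be listed programmatically. The sketch below assumes the GET /api/v1/workloads endpoint documented in the Workloads API reference:

      curl -L 'https://<COMPANY-URL>/api/v1/workloads' \\\n-H 'Authorization: Bearer <TOKEN>'\n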

      "},{"location":"Researcher/workloads/overviews/managing-workloads/#troubleshooting","title":"Troubleshooting","text":"

      To understand the condition of the workload, review the workload status in the Workloads table. For more information, check the workload's event history.

      Listed below are a number of known issues when working with workloads and how to fix them:

      | Issue | Mediation |
      |---|---|
      | Cluster connectivity issues (there are issues with your connection to the cluster error message) | Verify that you are on a network that has been granted access to the cluster. Reach out to your cluster admin for instructions on verifying this. If you are an admin, see the troubleshooting section in the cluster documentation |
      | Workload in \u201cInitializing\u201d status for some time | Check that you have access to the Container image registry. Check the statuses of the pods in the pods\u2019 modal. Check the event history for more details |
      | Workload has been pending for some time | Check that you have the required quota. Check the project\u2019s available quota in the project dialog. Check that all services needed to run are bound to the workload. Check the event history for more details. |
      | PVCs created using the K8s API or kubectl are not visible or mountable in Run:ai | This is by design. - Create a new data source of type PVC in the Run:ai UI - In the Data mount section, select Existing PVC - Select the PVC you created via the K8s API. You are now able to select and mount this PVC in your Run:ai submitted workloads. |
      | Workload is not visible in the UI | Check that the workload hasn\u2019t been deleted. See the \u201cDeleted\u201d tab in the workloads view |
      "},{"location":"Researcher/workloads/overviews/workload-types/","title":"Run:ai Workload Types","text":"

      In the world of machine learning (ML), the journey from raw data to actionable insights is a complex process that spans multiple stages. Each stage of the AI lifecycle requires different tools, resources, and frameworks to ensure optimal performance. Run:ai simplifies this process by offering specialized workload types tailored to each phase, facilitating a smooth transition across various stages of the ML workflows.

      The ML lifecycle usually begins with the experimental work on data and exploration of different modeling techniques to identify the best approach for accurate predictions. At this stage, resource consumption is usually moderate as experimentation is done on a smaller scale. As confidence grows in the model's potential and its accuracy, the demand for compute resources increases. This is especially true during the training phase, where vast amounts of data need to be processed, particularly with complex models such as large language models (LLMs), with their huge parameter sizes, that often require distributed training across multiple GPUs to handle the intensive computational load.

      Finally, once the model is ready, it moves to the inference stage, where it is deployed to make predictions on new, unseen data. Run:ai's workload types are designed to correspond with the natural stages of this lifecycle. They are structured to align with the specific resource and framework requirements of each phase, ensuring that AI researchers and data scientists can focus on advancing their models without worrying about infrastructure management.

      Run:ai offers three workload types that correspond to a specific phase of the researcher\u2019s work:

      • Workspaces \u2013 For experimentation with data and models.
      • Training \u2013 For resource-intensive tasks such as model training and data preparation.
      • Inference \u2013 For deploying and serving the trained model.
      "},{"location":"Researcher/workloads/overviews/workload-types/#workspaces-the-experimentation-phase","title":"Workspaces: the experimentation phase","text":"

      The Workspace is where data scientists conduct initial research, experiment with different data sets, and test various algorithms. This is the most flexible stage in the ML lifecycle, where models and data are explored, tuned, and refined. The value of workspaces lies in the flexibility they offer, allowing the researcher to iterate quickly without being constrained by rigid infrastructure.

      • Framework flexibility

        Workspaces support a variety of machine learning frameworks, as researchers need to experiment with different tools and methods.

      • Resource requirements

        Workspaces are often lighter on resources compared to the training phase, but they still require significant computational power for data processing, analysis, and model iteration.

        Hence, by default, Run:ai schedules workspaces so that they cannot be preempted once their resources have been allocated. However, this non-preemptible state does not allow the workload to use resources beyond the project's deserved quota. A sketch of submitting a workspace from the CLI follows below.
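      As a sketch only, and assuming the CLI v2 workspace command together with the resource flags shown in the distributed-training quick start later in this documentation, a workspace could be submitted roughly as follows (the image is just an example):

      runai project set \"project-name\"\nrunai workspace submit \"my-workspace\" -i jupyter/scipy-notebook \\\n   --gpu-request-type portion --gpu-portion-request 0.5 --gpu-devices-request 1\n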

      See Running workspaces to learn more about how to submit a workspace via the Run:ai platform. For quick starts, see Running Jupyter Notebook using workspaces.

      "},{"location":"Researcher/workloads/overviews/workload-types/#training-scaling-resources-for-model-development","title":"Training: scaling resources for model development","text":"

      As models mature and the need for more robust data processing and model training increases, Run:ai facilitates this shift through the Training workload. This phase is resource-intensive, often requiring distributed computing and high-performance clusters to process vast data sets and train models.

      • Training architecture

        For training workloads, Run:ai allows you to specify the architecture - standard or distributed. The distributed architecture is relevant for larger data sets and more complex models that require multiple nodes. For the distributed architecture, Run:ai allows you to specify different configurations for the master and workers and to select which framework to use - PyTorch, XGBoost, MPI, or TensorFlow. In addition, as part of the distributed configuration, Run:ai enables researchers to schedule their distributed workloads on nodes within the same region, zone, placement group, or any other topology.

      • Resource requirements

        Training tasks demand high memory, compute power, and storage. Run:ai ensures that the allocated resources match the scale of the task and allows these workloads to use more compute resources than the project's deserved quota. If you do not want your training workload to be preempted, make sure you request a number of GPUs that is within your project's quota (a sketch of such a submission follows below).
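      For instance, a standard (non-distributed) training workload that stays within a 1-GPU quota could be submitted roughly as in the sketch below, reusing the CLI v2 flags shown in the quick start later in this documentation; treat it as an illustration rather than the exact command set for your version:

      runai project set \"project-name\"\nrunai training submit \"my-training\" -i <IMAGE> --gpu-devices-request 1\n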

      See Standard training and Distributed training to learn more about how to submit a training workload via the Run:ai UI. For quick starts, see Run your first standard training and Run your first distributed training.

      "},{"location":"Researcher/workloads/overviews/workload-types/#inference-deploying-and-serving-models","title":"Inference: deploying and serving models","text":"

      Once a model is trained and validated, it moves to the Inference stage, where it is deployed to make predictions (usually in a production environment). This phase is all about efficiency and responsiveness, as the model needs to serve real-time or batch predictions to end-users or other systems.

      • Inference-specific use cases

        Naturally, inference workloads must change and adapt to ever-changing demand in order to meet SLAs. For example, additional replicas may be deployed, manually or automatically, to increase compute resources as part of a horizontal scaling approach, or a new version of the deployment may need to be rolled out without affecting the running services.

      • Resource requirements

        Inference models differ in size and purpose, leading to varying computational requirements. For example, small OCR models can run efficiently on CPUs, whereas LLMs typically require significant GPU memory for deployment and serving. Inference workloads are considered production-critical and are given the highest priority to ensure compliance with SLAs. Additionally, Run:ai ensures that inference workloads cannot be preempted, maintaining consistent performance and reliability.

      See Deploy a custom inference workload to learn more about how to submit an inference workload via the Run:ai UI.

      "},{"location":"Researcher/workloads/training/distributed-training/distributed-training/","title":"Train models using a distributed training workload","text":"

      This article explains how to create a distributed training workload via the Run:ai UI.

      A training workload contains the setup and configuration needed for building your model, including the container, images, data sets, and resource requests, as well as the required tools for the research, all in a single place.

      The distributed training workload is assigned to a project and is affected by the project\u2019s quota.

      To learn more about the distributed training workload type in Run:ai and determine that it is the most suitable workload type for your goals, see Workload types.

      "},{"location":"Researcher/workloads/training/distributed-training/distributed-training/#creating-a-distributed-training-workload","title":"Creating a distributed training workload","text":"

      Before you start, make sure you have a project.

      To add a new distributed training workload:

      1. Go to the Workload manager \u2192 Workloads
      2. Click +NEW WORKLOAD and select Training. Within the new training form:
      3. Select under which cluster to create the training workload
      4. Select the project in which your training will run
      5. Set the training workload architecture to Distributed. A distributed workload consists of multiple processes working together; these processes can run on different nodes. This workload uses environments that support distributed training workloads only.

        • Set the framework for the distributed workload. Select from -

          • PyTorch
          • TensorFlow
          • XGBoost
          • MPI

          If one of the above frameworks is not enabled, see Distributed training prerequisites for details on how to enable it.

        • Set the distributed workload configuration that defines how distributed training workloads are divided across multiple machines or processes. Choose a configuration based on your training requirements and infrastructure -

          • Workers & master
          • Workers only
      6. Select a preconfigured template or select Start from scratch to launch a new training workload quickly

      7. Enter a unique name for the training workload (if the name already exists in the project, you will be requested to submit a different name)
      8. Click CONTINUE. In the next step:
      9. Select the environment for your training workload
        • Select an environment or click +NEW ENVIRONMENT to add a new environment to the gallery. For a step-by-step guide on adding environments to the gallery, see Environments. Once created, the new environment will be automatically selected.
        • Set the connection for your tool(s). The tools are configured as part of the environment.
          • External URL
            • Custom URL
              • Set the URL
            • Optional: Modify who can access the tool:
              • All authenticated users (default) Everyone within the organization\u2019s account
              • Specific group(s)
                • Click +GROUP
                • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the tool.
              • Specific user(s)
                • Click +USER
                • Enter a valid email address or username. If you remove yourself, you will lose access to the tool.
          • Node port
            • Custom port
              • Set the node port (enter a port between 30000 and 32767; if the node port is already in use, the workload will fail and display an error message)
        • Set the User ID (UID), Group ID (GID) and the supplementary groups that can run commands in the container
          • Enter UID
          • Enter GID
          • Add Supplementary groups (multiple groups can be added, separated by commas).
        • Optional: Set the command and arguments for the container running the workload. If no command is added, the container will use the image's default command (entry-point).
          • Modify the existing command or click +COMMAND & ARGUMENTS to add a new command.
          • Set multiple arguments separated by spaces, using the following format (e.g.: --arg1=val1).
        • Set the environment variable(s)
          • Modify the existing environment variable(s). The existing environment variables may include instructions to guide you with entering the correct values.
          • (Optional) Add new variables
            • Click +ENVIRONMENT VARIABLE
            • Enter a name
            • Select the source for the environment variable
              • Custom
                • Enter a value according to the provided instructions
              • Credentials - Select existing credentials as the environment variable
                • Select a credential name To add new credentials to the credentials list, and for additional information, see Credentials.
                • Select a secret key
      10. Select the compute resource for your training workload

        • Set the number of workers for your workload
        • Select a compute resource or click +NEW COMPUTE RESOURCE to add a new compute resource to the gallery. For a step-by-step guide on adding compute resources to the gallery, see compute resources. Once created, the new compute resource will be automatically selected.
        • Optional: Set the order of priority for the node pools on which the scheduler tries to run the workload. When a workload is created, the scheduler will try to run it on the first node pool on the list. If the node pool doesn't have free resources, the scheduler will move on to the next one until it finds one that is available.
          • Drag and drop them to change the order, remove unwanted ones, or reset to the default order defined in the project.
          • Click +NODE POOL to add a new node pool from the list of node pools that were defined on the cluster. To configure a new node pool and for additional information, see node pools.
        • Select a node affinity to schedule the workload on a specific node type. If the administrator added a \u2018node type (affinity)\u2019 scheduling rule to the project/department, then this field is mandatory. Otherwise, entering a node type (affinity) is optional. Nodes must be tagged with a label that matches the node type key and value.
        • Optional: Set toleration(s) to allow the workload to be scheduled on a node with a matching taint

          Note

          Tolerations are disabled, by default. If you cannot see Tolerations in the menu, then it must be enabled by your Administrator, under General settings \u2192 Workloads \u2192 Tolerations

          • Click +TOLERATION
          • Enter a key
          • Select the operator
            • Exists - If the key exists on the node, the effect will be applied.
            • Equals - If the key and the value set below match the value on the node, the effect will be applied
              • Enter a value matching the value on the node
          • Select the effect for the toleration
            • NoExecute - Pods that do not tolerate this taint are evicted immediately.
            • NoSchedule - No new pods will be scheduled on the tainted node unless they have a matching toleration. Pods currently running on the node will not be evicted.
            • PreferNoSchedule - The control plane will try to avoid placing a pod that does not tolerate the taint on the node, but it is not guaranteed.
            • Any - All effects above match.
        • Optional: Set topology to have the workload scheduled on nodes with a matching topology. Topology allows the workload to be scheduled on nodes within the same region, zone, placement group, or any other topology you define.

          Note

          Setting topology is disabled, by default. If you cannot see Topology in the menu, then it must be enabled by your Administrator, under General settings \u2192 Workloads \u2192 Topology

          • Click +TOPOLOGY
          • Enter a key
          • Select the operator
            • Required - If the scheduler can\u2019t schedule all pods within the same topology, the workload will be pending.
            • Preferred - The scheduler will try to schedule all pods within the same topology but may schedule some pods on nodes that are not part of the same topology.
      11. Optional: Set the volume needed for your workload A volume allocates storage space to your workload that is persistent across restarts.

        • Click +VOLUME
        • Select the storage class
          • None - Proceed without defining a storage class.
          • Custom storage class - This option applies when selecting a storage class based on existing storage classes. To add new storage classes to the storage class list, and for additional information, see Kubernetes storage classes.
        • Select the access mode(s) (multiple modes can be selected)
          • Read-write by one node - The volume can be mounted as read-write by a single node.
          • Read-only by many nodes - The volume can be mounted as read-only by many nodes.
          • Read-write by many nodes - The volume can be mounted as read-write by many nodes.
        • Set the claim size and its units
        • Select the volume mode
          • File system (default) - This allows the volume to be mounted as a file system, enabling the usage of directories and files.
          • Block - This exposes the volume as a block storage, which can be formatted or used directly by applications without a file system.
        • Set the Container path with the volume target location
        • Set the volume persistency
          • Persistent - The volume and its data will be deleted only when the workload is deleted.
          • Ephemeral - The volume and its data will be deleted every time the workload\u2019s status changes to \u201cStopped.\u201d
      12. Optional: Select data sources for your training workload

        Select a data source or click +NEW DATA SOURCE to add a new data source to the gallery. If there are issues with the connectivity to the cluster, or issues while creating the data source, the data source won't be available for selection. For a step-by-step guide on adding data sources to the gallery, see data sources. Once created, the new data source will be automatically selected.

        • Optional: Modify the data target location for the selected data source(s).
      13. Optional - General settings:

        • Set the grace period for workload preemption. This is a buffer that allows a preempted workload to reach a safe checkpoint before it is forcibly preempted. Enter a timeframe between 0 sec and 5 min.
        • Set the backoff limit before workload failure. The backoff limit is the maximum number of retry attempts for failed workloads. After reaching the limit, the workload status will change to \"Failed.\" Enter a value between 1 and 100.
        • Set the timeframe for auto-deletion after workload completion or failure. The time after which a completed or failed workload is deleted; if this field is set to 0 seconds, the workload will be deleted automatically.
        • Set annotation(s) Kubernetes annotations are key-value pairs attached to the workload. They are used for storing additional descriptive metadata to enable documentation, monitoring and automation.
          • Click +ANNOTATION
          • Enter a name
          • Enter a value
        • Set label(s) Kubernetes labels are key-value pairs attached to the workload. They are used for categorizing to enable querying.
          • Enter a name
          • Enter a value
      14. Click CONTINUE
      15. Decide if you wish to define a different setup between the Workers and the Master via the toggle. When disabled, the master's setup inherits the workers' setup.

        • If a different setup is required, repeat steps 9-13 above with the necessary changes.
      16. Click CREATE TRAINING

      "},{"location":"Researcher/workloads/training/distributed-training/distributed-training/#workload-policies","title":"Workload Policies","text":"

      When creating a new workload, fields and assets may have limitations or defaults. These rules and defaults are derived from a policy your administrator set.

      Policies allow you to control, standardize, and simplify the workload submission process. For additional information, see Workload Policies and Rules.

      The effects of the policy are reflected in the training creation form:

      • Defaults derived from the policy will be displayed automatically for specific fields.
      • Some actions may be disabled, or values may be restricted to a certain range.
      • Rules and defaults for entire sections (such as environments, compute resources, or data sources) may prevent selection and will appear on the entire library card with an option for additional information via an external modal.
      "},{"location":"Researcher/workloads/training/distributed-training/distributed-training/#managing-and-monitoring","title":"Managing and monitoring","text":"

      After the training workload is created, it is added to the Workloads table, where it can be managed and monitored.

      "},{"location":"Researcher/workloads/training/distributed-training/distributed-training/#using-cli","title":"Using CLI","text":"

      To view the available actions, see all possible distributed training workloads in the CLI v2 reference or the CLI v1 reference.

      "},{"location":"Researcher/workloads/training/distributed-training/distributed-training/#using-api","title":"Using API","text":"

      To view the available actions, see the Distributed workload API reference.

      "},{"location":"Researcher/workloads/training/distributed-training/quickstart-distributed-training/","title":"Run your First Distributed Training","text":"

      This article provides a step-by-step walkthrough for running a PyTorch distributed training workload.

      Distributed training is the ability to split the training of a model among multiple processors. Each processor is called a worker. Worker nodes work in parallel to speed up model training. There is also a master which coordinates the workers.

      "},{"location":"Researcher/workloads/training/distributed-training/quickstart-distributed-training/#prerequisites","title":"Prerequisites","text":"

      Before you start, make sure:

      • You have created a project or have one created for you.
      • The project has an assigned quota of at least 1 GPU.
      "},{"location":"Researcher/workloads/training/distributed-training/quickstart-distributed-training/#step-1-logging-in","title":"Step 1: Logging in","text":"User InterfaceCLI V1CLI V2API

      Browse to the provided Run:ai user interface and log in with your credentials.

      Log in using the following command. You will be prompted to enter your username and password:

      runai login\n

      Run the below --help command to obtain the login options and log in according to your setup:

      runai login --help  \n

      To use the API, you will need to obtain a token. Please follow the API authentication article.
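      As a sketch, obtaining a token for an application typically looks like the call below; the exact endpoint path and payload field names may differ, so follow the API authentication article for the authoritative request:

      curl -L 'https://<COMPANY-URL>/api/v1/token' \\\n-H 'Content-Type: application/json' \\\n-d '{ \"grantType\": \"app_token\", \"AppId\": \"<APP-ID>\", \"AppSecret\": \"<APP-SECRET>\" }'\n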

      "},{"location":"Researcher/workloads/training/distributed-training/quickstart-distributed-training/#step-2-submitting-a-distributed-training-workload","title":"Step 2: Submitting a distributed training workload","text":"User InterfaceCLI V1CLI V2API
      1. Go to the Workload manager \u2192 Workloads
      2. Click +NEW WORKLOAD and select Training
      3. Select under which cluster to create the workload
      4. Select the project in which your workload will run
      5. Under Workload architecture, select Distributed and choose PyTorch. Set the distributed training configuration to Worker & master
      6. Select a preconfigured template, or select Start from scratch to quickly launch a new workload
      7. Enter a name for the distributed training workload (if the name already exists in the project, you will be requested to submit a different name)
      8. Click CONTINUE
      9. Click +NEW ENVIRONMENT

        a. Enter pytorch-dt as the name

        b. Enter kubeflow/pytorch-dist-mnist:latest as the Image URL

        c. Click CREATE ENVIRONMENT

      10. When the previous screen reappears, set the number of workers to 2 and select \u2018small-fraction\u2019 as the compute resource for your workload

        • If the \u2018small-fraction\u2019 is not displayed in the gallery, follow the step-by-step guide:
        Create a small-fraction compute resource
        1. Click +NEW COMPUTE RESOURCE
        2. Select under which cluster to create the compute resource
        3. Select a scope
        4. Enter a name for the compute resource. The name must be unique.
        5. Set GPU devices per pod - 1
        6. Set GPU memory per device

          • Select % (of device) - Fraction of a GPU device\u2019s memory
        7. Optional: set the CPU compute per pod - 0.1 cores (default)

        8. Optional: set the CPU memory per pod - 100 MB (default)
        9. Click CREATE COMPUTE RESOURCE
        • The newly created small-fraction compute resource will be selected automatically
      11. Click CONTINUE

      12. Click CREATE TRAINING

        After the distributed training workload is created, it is added to the workloads table.

      Copy the following command to your terminal. Make sure to update the below with the name of your project and workload:

      runai config project \"project-name\"  \nrunai submit-dist pytorch \"workload-name\" --workers=2 -g 0.1 \\\n   -i kubeflow/pytorch-dist-mnist:latest\n

      This would start a distributed training workload based on kubeflow/pytorch-dist-mnist:latest with one master and two workers.

      Copy the following command to your terminal. Make sure to update the below with the name of your project and workload:

      runai project set \"project-name\"\nrunai distributed submit \"workload-name\" --framework PyTorch \\\n   -i kubeflow/pytorch-dist-mnist:latest --workers 2 \\\n   --gpu-request-type portion --gpu-portion-request 0.1 --gpu-devices-request 1 --cpu-memory-request 100M\n

      This would start a distributed training workload based on kubeflow/pytorch-dist-mnist:latest with one master and two workers.

      Copy the following command to your terminal. Make sure to update the below parameters according to the comments. For more details, see Distributed API reference:

      curl -L 'https://<COMPANY-URL>/api/v1/workloads/distributed' \\ # (1)\n-H 'Content-Type: application/json' \\\n-H 'Authorization: Bearer <TOKEN>' \\ # (2)\n-d '{ \n    \"name\": \"workload-name\", \n    \"projectId\": \"<PROJECT-ID>\", # (3)\n    \"clusterId\": \"<CLUSTER-UUID>\", # (4)\n    \"spec\": {  \n        \"compute\": { \n            \"cpuCoreRequest\": 0.1,\n            \"gpuRequestType\": \"portion\",\n            \"cpuMemoryRequest\": \"100M\",\n            \"gpuDevicesRequest\": 1,\n            \"gpuPortionRequest\": 0.1 \n        },\n        \"image\": \"kubeflow/pytorch-dist-mnist:latest\",\n        \"numWorkers\": 2,\n        \"distributedFramework\": \"PyTorch\"\n    } \n}'\n
      1. <COMPANY-URL> is the link to the Run:ai user interface.
      2. <TOKEN> is the API access token obtained in Step 1.
      3. <PROJECT-ID> is the ID of the Project that the workload runs in. You can get the Project ID via the Get Projects API.
      4. <CLUSTER-UUID> is the unique identifier of the Cluster. You can get the Cluster UUID by adding the \"Cluster ID\" column to the Clusters view.

      This would start a distributed training workload based on kubeflow/pytorch-dist-mnist:latest with one master and two workers.

      Note

      The above API snippet works only with Run:ai clusters version 2.18 and above. For older clusters, use the now-deprecated Cluster API.

      "},{"location":"Researcher/workloads/training/distributed-training/quickstart-distributed-training/#next-steps","title":"Next Steps","text":"
      • Manage and monitor your newly created workload using the workloads table.
      • After validating your training performance and results, deploy your model using inference.
      "},{"location":"Researcher/workloads/training/standard-training/quickstart-standard-training/","title":"Run your First Standard Training","text":"

      This article provides a step-by-step walkthrough for running a standard training workload.

      A training workload contains the setup and configuration needed for building your model, including the container, images, data sets, and resource requests, as well as the required tools for the research, all in a single place.

      "},{"location":"Researcher/workloads/training/standard-training/quickstart-standard-training/#prerequisites","title":"Prerequisites","text":"

      Before you start, make sure:

      • You have created a project or have one created for you.
      • The project has an assigned quota of at least 1 GPU.
      "},{"location":"Researcher/workloads/training/standard-training/quickstart-standard-training/#step-1-logging-in","title":"Step 1: Logging in","text":"User InterfaceCLI V1CLI V2API

      Browse to the provided Run:ai user interface and log in with your credentials.

      Log in using the following command. You will be prompted to enter your username and password:

      runai login\n

      Run the following --help command to view the login options, then log in according to your setup:

      runai login --help  \n

      To use the API, you will need to obtain a token. Please follow the API authentication article.

      "},{"location":"Researcher/workloads/training/standard-training/quickstart-standard-training/#step-2-submitting-a-standard-training-workload","title":"Step 2: Submitting a standard training workload","text":"User InterfaceCLI V1CLI V2API
      1. Go to the Workload manager \u2192 Workloads
      2. Click +NEW WORKLOAD and select Training
      3. Select under which cluster to create the workload
      4. Select the project in which your workload will run
      5. Under Workload architecture, select Standard
      6. Select a preconfigured template, or select Start from scratch to quickly launch a new workload
      7. Enter a name for the standard training workload (if the name already exists in the project, you will be requested to submit a different name)
      8. Click CONTINUE
      9. Click +NEW ENVIRONMENT

        a. Enter quickstart as the name

        b. Enter runai.jfrog.io/demo/quickstart as the Image URL

        c. Click CREATE ENVIRONMENT

      10. Select the \u2018one-gpu\u2019 compute resource for your workload (GPU devices: 1)

        • If the \u2018one-gpu\u2019 is not displayed in the gallery, follow the step-by-step guide:
        Create a one-gpu compute resource
        1. Click +NEW COMPUTE RESOURCE
        2. Select under which cluster to create the compute resource
        3. Select a scope
        4. Enter a name for the compute resource. The name must be unique.
        5. Set GPU devices per pod - 1
        6. Set GPU memory per device

          • Select % (of device) - Fraction of a GPU device\u2019s memory
          • Set the memory Request - 100 (The workload will allocate 100% of the GPU memory)
        7. Optional: set the CPU compute per pod - 0.1 cores (default)

        8. Optional: set the CPU memory per pod - 100 MB (default)
        9. Click CREATE COMPUTE RESOURCE
        • The newly created one-gpu compute resource will be selected automatically
      11. Click CONTINUE

      12. Click CREATE TRAINING

        After the standard training workload is created, it is added to the workloads table.

      Copy the following command to your terminal. Make sure to update the below with the name of your project:

      runai config project \"project-name\"  \nrunai submit \"workload-name\" -i runai.jfrog.io/demo/quickstart -g 1\n

      This would start a standard training workload based on a sample docker image, runai.jfrog.io/demo/quickstart, with one GPU allocated.

      Copy the following command to your terminal. Make sure to update the below with the name of your project and workload:

      runai project set \"project-name\"\nrunai training submit \"workload-name\" -i runai.jfrog.io/demo/quickstart -g 1\n

      This would start a standard training workload based on a sample docker image, runai.jfrog.io/demo/quickstart, with one GPU allocated.

      Copy the following command to your terminal. Make sure to update the below parameters according to the comments. For more details, see Trainings API reference:

      curl -L 'https://<COMPANY-URL>/api/v1/workloads/trainings' \\ # (1)\n-H 'Content-Type: application/json' \\\n-H 'Authorization: Bearer <TOKEN>' \\ # (2)\n-d '{ \n    \"name\": \"workload-name\", \n    \"projectId\": \"<PROJECT-ID>\", # (3)\n    \"clusterId\": \"<CLUSTER-UUID>\", # (4)\n    \"spec\": {  \n       \"image\": \"runai.jfrog.io/demo/quickstart\", \n       \"compute\": { \n           \"gpuDevicesRequest\": 1\n       }\n    } \n}'\n
      1. <COMPANY-URL> is the link to the Run:ai user interface.
      2. <TOKEN> is the API access token obtained in Step 1.
      3. <PROJECT-ID> is the ID of the Project that the workload runs in. You can get the Project ID via the Get Projects API.
      4. <CLUSTER-UUID> is the unique identifier of the Cluster. You can get the Cluster UUID by adding the \"Cluster ID\" column to the Clusters view.

      This would start a standard training workload based on a sample docker image, runai.jfrog.io/demo/quickstart, with one GPU allocated.

      Note

      The above API snippet works only with Run:ai clusters version 2.18 and above. For older clusters, use the now-deprecated Cluster API.

      "},{"location":"Researcher/workloads/training/standard-training/quickstart-standard-training/#next-steps","title":"Next Steps","text":"
      • Manage and monitor your newly created workload using the workloads table.
      • After validating your training performance and results, deploy your model using inference.
      "},{"location":"Researcher/workloads/training/standard-training/trainings-v2/","title":"Train models using a standard training workload","text":"

      This article explains how to create a standard training workload via the Run:ai UI.

      A training workload contains the setup and configuration needed for building your model, including the container, images, data sets, and resource requests, as well as the required tools for the research, all in a single place.

      The training workload is assigned to a project and is affected by the project\u2019s quota.

      To learn more about the training workload type in Run:ai and determine that it is the most suitable workload type for your goals, see Workload types.

      "},{"location":"Researcher/workloads/training/standard-training/trainings-v2/#creating-a-standard-training-workload","title":"Creating a standard training workload","text":"

      Before you start, make sure you have a project.

      To add a new training workload:

      1. Go to the Workload manager \u2192 Workloads
      2. Click +NEW WORKLOAD and select Training. Within the new training form:
      3. Select under which cluster to create the training workload
      4. Select the project in which your training will run
      5. Set the training workload architecture as standard, which consists of a single main running process. This workload uses environments that support standard training workloads only.
      6. Select a preconfigured template or select Start from scratch to launch a new training workload quickly
      7. Enter a unique name for the training workload (if the name already exists in the project, you will be requested to submit a different name)
      8. Click CONTINUE. In the next step:
      9. Select the environment for your training workload
        • Select an environment or click +NEW ENVIRONMENT to add a new environment to the gallery. For a step-by-step guide on adding environments to the gallery, see Environments. Once created, the new environment will be automatically selected.
        • Set the connection for your tool(s). The tools are configured as part of the environment.
          • External URL
            • Custom URL
              • Set the URL
            • Optional: Modify who can access the tool:
              • All authenticated users (default) - Everyone within the organization\u2019s account
              • Specific group(s)
                • Click +GROUP
                • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the tool.
              • Specific user(s)
                • Click +USER
                • Enter a valid email address or username. If you remove yourself, you will lose access to the tool.
          • Node port
            • Custom port
              • Set the node port (enter a port between 30000 and 32767; if the node port is already in use, the workload will fail and display an error message)
        • Set the User ID (UID), Group ID (GID) and the supplementary groups that can run commands in the container
          • Enter UID
          • Enter GID
          • Add Supplementary groups (multiple groups can be added, separated by commas).
        • Optional: Set the command and arguments for the container running the workload. If no command is added, the container will use the image\u2019s default command (entry-point).
          • Modify the existing command or click +COMMAND & ARGUMENTS to add a new command.
          • Set multiple arguments separated by spaces, using the following format (e.g.: --arg1=val1).
        • Set the environment variable(s)
          • Modify the existing environment variable(s). The existing environment variables may include instructions to guide you with entering the correct values.
          • (Optional) Add new variables
            • Click +ENVIRONMENT VARIABLE
            • Enter a name
            • Select the source for the environment variable
              • Custom
                • Enter a value according to the provided instructions
              • Credentials - Select existing credentials as the environment variable
                • Select a credential name. To add new credentials to the credentials list, and for additional information, see Credentials.
                • Select a secret key
      10. Select the compute resource for your training workload

        • Select a compute resource or click +NEW COMPUTE RESOURCE to add a new compute resource to the gallery. For a step-by-step guide on adding compute resources to the gallery, see compute resources. Once created, the new compute resource will be automatically selected.
        • Optional: Set the order of priority for the node pools on which the scheduler tries to run the workload. When a workload is created, the scheduler will try to run it on the first node pool on the list. If the node pool doesn't have free resources, the scheduler will move on to the next one until it finds one that is available.
          • Drag and drop them to change the order, remove unwanted ones, or reset to the default order defined in the project.
          • Click +NODE POOL to add a new node pool from the list of node pools that were defined on the cluster. To configure a new node pool and for additional information, see node pools.
        • Select a node affinity to schedule the workload on a specific node type. If the administrator added a \u2018node type (affinity)\u2019 scheduling rule to the project/department, then this field is mandatory. Otherwise, entering a node type (affinity) is optional. Nodes must be tagged with a label that matches the node type key and value.
        • Optional: Set toleration(s) to allow the workload to be scheduled on a node with a matching taint

          Note

          Tolerations are disabled by default. If you cannot see Tolerations in the menu, they must be enabled by your Administrator under General settings \u2192 Workloads \u2192 Tolerations

          • Click +TOLERATION
          • Enter a key
          • Select the operator
            • Exists - If the key exists on the node, the effect will be applied.
            • Equals - If the key and the value set below match the key and value on the node, the effect will be applied
              • Enter a value matching the value on the node
          • Select the effect for the toleration
            • NoExecute - Pods that do not tolerate this taint are evicted immediately.
            • NoSchedule - No new pods will be scheduled on the tainted node unless they have a matching toleration. Pods currently running on the node will not be evicted.
            • PreferNoSchedule - The control plane will try to avoid placing a pod that does not tolerate the taint on the node, but it is not guaranteed.
            • Any - The toleration matches taints with any of the above effects.
      11. Optional: Set the volume needed for your workload. A volume allocates storage space to your workload that is persistent across restarts.

        • Click +VOLUME
        • Select the storage class
          • None - Proceed without defining a storage class.
          • Custom storage class - This option applies when selecting a storage class based on existing storage classes. To add new storage classes to the storage class list, and for additional information, see Kubernetes storage classes.
        • Select the access mode(s) (multiple modes can be selected)
          • Read-write by one node - The volume can be mounted as read-write by a single node.
          • Read-only by many nodes - The volume can be mounted as read-only by many nodes.
          • Read-write by many nodes - The volume can be mounted as read-write by many nodes.
        • Set the claim size and its units
        • Select the volume mode
          • File system (default) - This allows the volume to be mounted as a file system, enabling the usage of directories and files.
          • Block - This exposes the volume as a block storage, which can be formatted or used directly by applications without a file system.
        • Set the Container path with the volume target location
        • Set the volume persistency
          • Persistent - The volume and its data will be deleted only when the workload is deleted.
          • Ephemeral - The volume and its data will be deleted every time the workload\u2019s status changes to \u201cStopped.\u201d
      12. Optional: Select data sources for your training workload

        Select a data source or click +NEW DATA SOURCE to add a new data source to the gallery. If there are issues with the connectivity to the cluster, or issues while creating the data source, the data source won't be available for selection. For a step-by-step guide on adding data sources to the gallery, see data sources. Once created, the new data source will be automatically selected.

        • Optional: Modify the data target location for the selected data source(s).
      13. Optional - General settings:

        • Set the grace period for workload preemption. This is a buffer that allows a preempted workload to reach a safe checkpoint before it is forcibly preempted. Enter a timeframe between 0 sec and 5 min.
        • Set the number of runs the workload must finish to be considered complete. Multiple runs enhance the reliability and validity of the training results.
        • If the number of runs is above 1, enter a value under Parallelism to specify how many runs may be scheduled in parallel. The value must be less than or equal to the number of runs.
        • Set the backoff limit before workload failure. The backoff limit is the maximum number of retry attempts for failed workloads. After reaching the limit, the workload status will change to \"Failed.\" Enter a value between 1 and 100.
        • Set the timeframe for auto-deletion after workload completion or failure. This is the time after which a completed or failed workload is deleted; if this field is set to 0 seconds, the workload is deleted immediately after it completes or fails.
        • Set annotation(s). Kubernetes annotations are key-value pairs attached to the workload. They are used for storing additional descriptive metadata to enable documentation, monitoring and automation.
          • Click +ANNOTATION
          • Enter a name
          • Enter a value
        • Set label(s). Kubernetes labels are key-value pairs attached to the workload. They are used for categorizing, to enable querying.
          • Enter a name
          • Enter a value
      14. Click CREATE TRAINING
      "},{"location":"Researcher/workloads/training/standard-training/trainings-v2/#workload-policies","title":"Workload Policies","text":"

      When creating a new workload, fields and assets may have limitations or defaults. These rules and defaults are derived from a policy your administrator set.

      Policies allow you to control, standardize, and simplify the workload submission process. For additional information, see Workload Policies and Rules.

      The effects of the policy are reflected in the training creation form:

      • Defaults derived from the policy will be displayed automatically for specific fields.
      • Actions may be disabled, or values may be limited to a specific range.
      • Rules and defaults for entire sections (such as environments, compute resources, or data sources) may prevent selection and will appear on the entire library card with an option for additional information via an external modal.
      "},{"location":"Researcher/workloads/training/standard-training/trainings-v2/#managing-and-monitoring","title":"Managing and monitoring","text":"

      After the training workload is created, it is added to the Workloads table, where it can be managed and monitored.

      "},{"location":"Researcher/workloads/training/standard-training/trainings-v2/#using-cli","title":"Using CLI","text":"

      To view the available actions, see the training workload CLI v2 reference or the CLI v1 reference.
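
      For example, a typical CLI v2 management flow for the standard training workload created above might look like this (the workload name is a placeholder):

      runai training list\nrunai training describe \"workload-name\"\nrunai training suspend \"workload-name\"\nrunai training resume \"workload-name\"\nrunai training delete \"workload-name\"\n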

      "},{"location":"Researcher/workloads/training/standard-training/trainings-v2/#using-api","title":"Using API","text":"

      To view the available actions, see the Trainings workload API reference.
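
      As an illustration, assuming the per-type endpoints mirror the submission endpoint used above (the exact paths must be verified in the API reference), suspending and then deleting a training might look like:

      curl -L -X POST 'https://<COMPANY-URL>/api/v1/workloads/trainings/<WORKLOAD-ID>/suspend' \\\n-H 'Authorization: Bearer <TOKEN>' # path is an assumption\ncurl -L -X DELETE 'https://<COMPANY-URL>/api/v1/workloads/trainings/<WORKLOAD-ID>' \\\n-H 'Authorization: Bearer <TOKEN>' # path is an assumption\n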

      "},{"location":"Researcher/workloads/workspaces/quickstart-jupyter/","title":"Running Jupyter Notebook Using Workspaces","text":"

      This guide provides a step-by-step walkthrough for running a Jupyter Notebook using workspaces.

      A workspace contains the setup and configuration needed for building your model, including the container, images, data sets, and resource requests, as well as the required tools for the research, all in one place. See Running workspaces for more information.

      "},{"location":"Researcher/workloads/workspaces/quickstart-jupyter/#prerequisites","title":"Prerequisites","text":"

      Before you start, make sure:

      • You have created a project or have one created for you.
      • The project has an assigned quota of at least 1 GPU.
      "},{"location":"Researcher/workloads/workspaces/quickstart-jupyter/#step-1-logging-in","title":"Step 1: Logging in","text":"User InterfaceCLI V1CLI V2API

      Browse to the provided Run:ai user interface and log in with your credentials.

      Log in using the following command. You will be prompted to enter your username and password:

      runai login\n

      Run the following --help command to view the login options, then log in according to your setup:

      runai login --help  \n

      To use the API, you will need to obtain a token. Please follow the API authentication article.

      "},{"location":"Researcher/workloads/workspaces/quickstart-jupyter/#step-2-submitting-a-workspace","title":"Step 2: Submitting a workspace","text":"User InterfaceCLI V1CLI V2API
      1. Go to the Workload manager \u2192 Workloads
      2. Select +NEW WORKLOAD and then Workspace
      3. Select under which cluster to create the workload
      4. Select the project in which your workspace will run
      5. Select a preconfigured template, or select Start from scratch to quickly launch a new workspace
      6. Enter a name for the workspace (If the name already exists in the project, you will be requested to submit a different name)
      7. Click CONTINUE
      8. Select the \u2018jupyter-lab\u2019 environment for your workspace (Image URL: jupyter/scipy-notebook)

        • If the \u2018jupyter-lab\u2019 is not displayed in the gallery, follow the step-by-step guide:
        Create a jupyter-lab environment
        1. Click +NEW ENVIRONMENT
        2. Select under which cluster to create the environment
        3. Select a scope.
        4. Enter a name for the environment. The name must be unique.
        5. Enter the jupyter-lab Image URL - jupyter/scipy-notebook
        6. Tools - Set the connection for your tool

          • Click +TOOL
          • Select Jupyter tool from the list
        7. Set the runtime settings for the environment

          • Click +COMMAND
          • Enter command - start-notebook.sh
          • Enter arguments - --NotebookApp.base_url=/${RUNAI_PROJECT}/${RUNAI_JOB_NAME} --NotebookApp.token=''

        Note

        If host-based routing is enabled on the cluster, enter the argument --NotebookApp.token='' only.

        8. Click CREATE ENVIRONMENT
        • The newly created jupyter-lab will be selected automatically
      9. Select the \u2018one-gpu\u2019 compute resource for your workspace (GPU devices: 1)

        • If the \u2018one-gpu\u2019 is not displayed in the gallery, follow the step-by-step guide:
        Create a one-gpu compute resource
        1. Click +NEW COMPUTE RESOURCE
        2. Select under which cluster to create the compute resource
        3. Select a scope
        4. Enter a name for the compute resource. The name must be unique.
        5. Set GPU devices per pod - 1
        6. Set GPU memory per device

          • Select % (of device) - Fraction of a GPU device\u2019s memory
          • Set the memory Request - 100 (The workload will allocate 100% of the GPU memory)
        7. Optional: set the CPU compute per pod - 0.1 cores (default)

        8. Optional: set the CPU memory per pod - 100 MB (default)
        9. Click CREATE COMPUTE RESOURCE
        • The newly created one-gpu compute resource will be selected automatically
      10. Click CREATE WORKSPACE

        After the workspace is created, it is added to the workloads table.

      Copy the following command to your terminal. Make sure to update the below with the name of your project and workload:

      runai config project \"project-name\"  \nrunai submit \"workload-name\" --jupyter -g 1\n

      This would start a workspace with a pre-configured Jupyter image with one GPU allocated.

      Copy the following command to your terminal. Make sure to update the below with the name of your project and workload:

      runai project set \"project-name\"\nrunai workspace submit \"workload-name\"  --image jupyter/scipy-notebook --gpu-devices-request 1 \\\n    --external-url container=8888  --command start-notebook.sh  \\\n    -- --NotebookApp.base_url=/\\${RUNAI_PROJECT}/\\${RUNAI_JOB_NAME} --NotebookApp.token=''\n

      Copy the following command to your terminal. Make sure to update the below parameters according to the comments. For more details, see Workspaces API reference:

      curl -L 'https://<COMPANY-URL>/api/v1/workloads/workspaces' \\ # (1)\n-H 'Content-Type: application/json' \\\n-H 'Authorization: Bearer <TOKEN>' \\ # (2)\n-d '{ \n    \"name\": \"workload-name\", \n    \"projectId\": \"<PROJECT-ID>\", # (3)\n    \"clusterId\": \"<CLUSTER-UUID>\", # (4)\n    \"spec\": {\n        \"command\" : \"start-notebook.sh\",\n        \"args\" : \"--NotebookApp.base_url=/${RUNAI_PROJECT}/${RUNAI_JOB_NAME} --NotebookApp.token=''\",\n        \"image\": \"jupyter/scipy-notebook\",\n        \"compute\": {\n            \"gpuDevicesRequest\": 1\n        },\n        \"exposedUrls\" : [\n            { \n                \"container\" : 8888,\n                \"toolType\": \"jupyter-notebook\", # (5)\n                \"toolName\": \"Jupyter\" # (6)\n            }\n        ]\n    }\n}'\n
      1. <COMPANY-URL> is the link to the Run:ai user interface.
      2. <TOKEN> is the API access token obtained in Step 1.
      3. <PROJECT-ID> is the ID of the Project that the workspace runs in. You can get the Project ID via the Get Projects API.
      4. <CLUSTER-UUID> is the unique identifier of the Cluster. You can get the Cluster UUID by adding the \"Cluster ID\" column to the Clusters view.
      5. toolType will show the Jupyter icon when connecting to the Jupyter tool via the user interface.
      6. toolName text will show when connecting to the Jupyter tool via the user interface.

      Note

      The above API snippet works only with Run:ai clusters version 2.18 and above. For older clusters, use the now-deprecated Cluster API.

      "},{"location":"Researcher/workloads/workspaces/quickstart-jupyter/#step-3-connecting-to-the-jupyter-notebook","title":"Step 3: Connecting to the Jupyter Notebook","text":"User InterfaceCLI V1CLI V1API
      1. Select the newly created workspace with the Jupyter application that you want to connect to
      2. Click CONNECT
      3. Select the Jupyter tool
      4. The selected tool is opened in a new tab on your browser

      To connect to the Jupyter Notebook, browse directly to https://<COMPANY-URL>/<PROJECT-NAME>/<WORKLOAD-NAME>.

      To connect to the Jupyter Notebook, browse directly to https://<COMPANY-URL>/<PROJECT-NAME>/<WORKLOAD-NAME>.

      To connect to the Jupyter Notebook, browse directly to https://<COMPANY-URL>/<PROJECT-NAME>/<WORKLOAD-NAME>.

      "},{"location":"Researcher/workloads/workspaces/quickstart-jupyter/#next-steps","title":"Next Steps","text":"

      Manage and monitor your newly created workspace using the workloads table.

      "},{"location":"Researcher/workloads/workspaces/workspace-v2/","title":"Running Workspaces","text":"

      This article explains how to create a workspace via the Run:ai UI.

      A workspace contains the setup and configuration needed for building your model, including the container, images, data sets, and resource requests, as well as the required tools for the research, all in a single place.

      To learn more about the workspace workload type in Run:ai and determine that it is the most suitable workload type for your goals, see Workload types.

      "},{"location":"Researcher/workloads/workspaces/workspace-v2/#creating-a-new-workspace","title":"Creating a new Workspace","text":"

      Before you start, make sure you have a project.

      To add a new workspace:

      1. Go to the Workload manager \u2192 Workloads
      2. Click +NEW WORKLOAD and select Workspace. Within the new workspace form:
      3. Select under which cluster to create the workload
      4. Select the project in which your workspace will run
      5. Select a preconfigured template or select Start from scratch to launch a new workspace quickly
      6. Enter a unique name for the workspace (if the name already exists in the project, you will be requested to submit a different name)
      7. Click CONTINUE. In the next step:
      8. Select the environment for your workspace

        • Select an environment or click +NEW ENVIRONMENT to add a new environment to the gallery. For a step-by-step guide on adding environments to the gallery, see Environments. Once created, the new environment will be automatically selected.
        • Set the connection for your tool(s). The tools are configured as part of the environment.
          • External URL
            • Custom URL
              • Set the URL
            • Optional: Modify who can access the tool:
              • All authenticated users (default) - Everyone within the organization\u2019s account
              • Specific group(s)
                • Click +GROUP
                • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the tool.
              • Specific user(s)
                • Click +USER
                • Enter a valid email address or username. If you remove yourself, you will lose access to the tool.
          • Node port
            • Custom port
              • Set the node port (enter a port between 30000 and 32767; if the node port is already in use, the workload will fail and display an error message)
        • Set the User ID (UID), Group ID (GID) and the supplementary groups that can run commands in the container
          • Enter UID
          • Enter GID
          • Add Supplementary groups (multiple groups can be added, separated by commas).
        • Optional: Set the command and arguments for the container running the workload. If no command is added, the container will use the image\u2019s default command (entry-point).
          • Modify the existing command or click +COMMAND & ARGUMENTS to add a new command.
          • Set multiple arguments separated by spaces, using the following format (e.g.: --arg1=val1).
        • Set the environment variable(s)
          • Modify the existing environment variable(s). The existing environment variables may include instructions to guide you with entering the correct values.
          • (Optional) Add new variables
          • Click +ENVIRONMENT VARIABLE
            • Enter a name
            • Select the source for the environment variable

              • Custom
                • Enter a value according to the provided instructions
              • Credentials - Select existing credentials as the environment variable
                • Select a credential name. To add new credentials to the credentials list, and for additional information, see Credentials.
                • Select a secret key
      9. Select the compute resource for your workspace

        • Select a compute resource or click +NEW COMPUTE RESOURCE to add a new compute resource to the gallery. For a step-by-step guide on adding compute resources to the gallery, see compute resources. Once created, the new compute resource will be automatically selected.
        • Optional: Set the order of priority for the node pools on which the scheduler tries to run the workload. When a workload is created, the scheduler will try to run it on the first node pool on the list. If the node pool doesn't have free resources, the scheduler will move on to the next one until it finds one that is available.
          • Drag and drop them to change the order, remove unwanted ones, or reset to the default order defined in the project.
          • Click +NODE POOL to add a new node pool from the list of node pools that were defined on the cluster. To configure a new node pool and for additional information, see node pools.
        • Select a node affinity to schedule the workload on a specific node type. If the administrator added a \u2018node type (affinity)\u2019 scheduling rule to the project/department, then this field is mandatory. Otherwise, entering a node type (affinity) is optional. Nodes must be tagged with a label that matches the node type key and value.
        • Optional: Set toleration(s) to allow the workload to be scheduled on a node with a matching taint

          Note

          Tolerations are disabled by default. If you cannot see Tolerations in the menu, they must be enabled by your Administrator under General settings \u2192 Workloads \u2192 Tolerations

          • Click +TOLERATION
          • Enter a key
          • Select the operator
            • Exists - If the key exists on the node, the effect will be applied.
            • Equals - If the key and the value set below match the key and value on the node, the effect will be applied
              • Enter a value matching the value on the node
          • Select the effect for the toleration
            • NoExecute - Pods that do not tolerate this taint are evicted immediately.
            • NoSchedule - No new pods will be scheduled on the tainted node unless they have a matching toleration. Pods currently running on the node will not be evicted.
            • PreferNoSchedule - The control plane will try to avoid placing a pod that does not tolerate the taint on the node, but it is not guaranteed.
            • Any - The toleration matches taints with any of the above effects.
      10. Optional: Set the volume needed for your workload. A volume allocates storage space to your workload that is persistent across restarts.

        • Click +VOLUME
        • Select the storage class
          • None - Proceed without defining a storage class.
          • Custom storage class - This option applies when selecting a storage class based on existing storage classes. To add new storage classes to the storage class list, and for additional information, see Kubernetes storage classes
        • Select the access mode(s) (multiple modes can be selected)
          • Read-write by one node - The volume can be mounted as read-write by a single node.
          • Read-only by many nodes - The volume can be mounted as read-only by many nodes.
          • Read-write by many nodes - The volume can be mounted as read-write by many nodes.
        • Set the claim size and its units
        • Select the volume mode
          • File system (default) - This allows the volume to be mounted as a file system, enabling the usage of directories and files.
          • Block - This exposes the volume as a block storage, which can be formatted or used directly by applications without a file system.
        • Set the Container path with the volume target location
        • Set the volume persistency
          • Persistent - The volume and its data will be deleted only when the workload is deleted.
          • Ephemeral - The volume and its data will be deleted every time the workload\u2019s status changes to \u201cStopped.\u201d
      11. Optional: Select data sources for your workspace. Select a data source or click +NEW DATA SOURCE to add a new data source to the gallery. If there are issues with the connectivity to the cluster, or issues while creating the data source, the data source won't be available for selection. For a step-by-step guide on adding data sources to the gallery, see data sources. Once created, the new data source will be automatically selected.
        • Optional: Modify the data target location for the selected data source(s).
      12. Optional - General settings:
        • Allow the workload to exceed the project quota. Workloads running over quota may be preempted and stopped at any time.
        • Set the backoff limit before workload failure. The backoff limit is the maximum number of retry attempts for failed workloads. After reaching the limit, the workload status will change to \"Failed.\" Enter a value between 1 and 100.
        • Set the timeframe for auto-deletion after workload completion or failure. This is the time after which a completed or failed workload is deleted; if this field is set to 0 seconds, the workload is deleted immediately after it completes or fails.
        • Set annotation(s). Kubernetes annotations are key-value pairs attached to the workload. They are used for storing additional descriptive metadata to enable documentation, monitoring and automation.
          • Click +ANNOTATION
          • Enter a name
          • Enter a value
        • Set label(s). Kubernetes labels are key-value pairs attached to the workload. They are used for categorizing, to enable querying.
          • Enter a name
          • Enter a value
      13. Click CREATE WORKSPACE
      "},{"location":"Researcher/workloads/workspaces/workspace-v2/#workload-policies","title":"Workload Policies","text":"

      When creating a new workload, fields and assets may have limitations or defaults. These rules and defaults are derived from a policy your administrator set.

      Policies allow you to control, standardize, and simplify the workload submission process. For additional information, see Workload Policies and Rules.

      The effects of the policy are reflected in the workspace creation form:

      • Defaults derived from the policy will be displayed automatically for specific fields.
      • Actions may be disabled, or values may be limited to a specific range.
      • Rules and defaults for entire sections (such as environments, compute resources, or data sources) may prevent selection and will appear on the entire library card with an option for additional information via an external modal.
      "},{"location":"Researcher/workloads/workspaces/workspace-v2/#managing-and-monitoring","title":"Managing and monitoring","text":"

      After the workspace is created, it is added to the Workloads table, where it can be managed and monitored.

      "},{"location":"Researcher/workloads/workspaces/workspace-v2/#using-cli","title":"Using CLI","text":"

      To view the available actions on workspaces, see the Workspaces CLI v2 reference or the CLI v1 reference.
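
      For instance, assuming the workspace command group mirrors the other workload types shown earlier (confirm the subcommand names in the CLI v2 reference), a workspace can be managed as follows:

      runai workspace list\nrunai workspace logs \"workload-name\"\nrunai workspace delete \"workload-name\"\n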

      "},{"location":"Researcher/workloads/workspaces/workspace-v2/#using-api","title":"Using API","text":"

      To view the available actions on workspaces, see the Workspaces API reference.

      "},{"location":"admin/overview-administrator/","title":"Overview: Infrastructure Administrator","text":"

      The Infrastructure Administrator is an IT person responsible for the installation, setup, and IT maintenance of the Run:ai product.

      As part of the Infrastructure Administrator documentation you will find:

      • Install Run:ai
        • Understand the Run:ai installation
        • Set up a Run:ai Cluster.
        • Set up Researchers to work with Run:ai.
      • IT Configuration of the Run:ai system
      • Connect Run:ai to an identity provider.
      • Maintenance & monitoring of the Run:ai system
      • Troubleshooting.
      "},{"location":"admin/authentication/accessrules/","title":"Access Rules","text":"

      This article explains the procedure to manage Access rules.

      Access rules provide users, groups, or applications privileges to system entities.

      An access rule is the assignment of a role to a subject in a scope: <Subject> is a <Role> in a <Scope>.

      For example, user user@domain.com is a department admin in department A.

      "},{"location":"admin/authentication/accessrules/#access-rules-table","title":"Access rules table","text":"

      The Access rules table can be found under Access in the Run:ai platform.

      The Access rules table provides a list of all the access rules defined in the platform and allows you to manage them.

      Note

      Flexible management

      It is also possible to manage access rules directly for a specific user, application, project, or department.

      The Access rules table consists of the following columns:

      • Type - The type of subject assigned to the access rule (user, SSO group, or application)
      • Subject - The user, SSO group, or application assigned with the role
      • Role - The role assigned to the subject
      • Scope - The scope to which the subject has access. Click the name of the scope to see the scope and its subordinates
      • Authorized by - The user who granted the access rule
      • Creation time - The timestamp for when the rule was created
      • Last updated - The last time the access rule was updated
      "},{"location":"admin/authentication/accessrules/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
      "},{"location":"admin/authentication/accessrules/#adding-new-access-rules","title":"Adding new access rules","text":"

      To add a new access rule:

      1. Click +NEW ACCESS RULE
      2. Select a subject User, SSO Group, or Application
      3. Select or enter the subject identifier:
        • User Email for a local user created in Run:ai, or for an SSO user as recognized by the IDP
        • Group name as recognized by the IDP
        • Application name as created in Run:ai
      4. Select a role
      5. Select a scope
      6. Click SAVE RULE

      Note

      An access rule consists of a single subject with a single role in a single scope. To assign multiple roles or multiple scopes to the same subject, multiple access rules must be added.

      "},{"location":"admin/authentication/accessrules/#editing-an-access-rule","title":"Editing an access rule","text":"

      Access rules cannot be edited. To change an access rule, you must delete the rule, and then create a new rule to replace it.

      "},{"location":"admin/authentication/accessrules/#deleting-an-access-rule","title":"Deleting an access rule","text":"
      1. Select the access rule you want to delete
      2. Click DELETE
      3. On the dialog, click DELETE to confirm the deletion
      "},{"location":"admin/authentication/accessrules/#using-api","title":"Using API","text":"

      Go to the Access rules API reference to view the available actions
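
      For example, the rule described above could be created programmatically with a call along these lines. The endpoint path, field names, and the role and scope identifiers are assumptions to be checked against the Access rules API reference:

      curl -L -X POST 'https://<COMPANY-URL>/api/v1/authorization/access-rules' \\\n-H 'Content-Type: application/json' \\\n-H 'Authorization: Bearer <TOKEN>' \\\n-d '{\"subjectId\":\"user@domain.com\", \"subjectType\":\"user\", \"roleId\":<ROLE-ID>, \"scopeType\":\"department\", \"scopeId\":\"<DEPARTMENT-ID>\"}' # field names and IDs are assumptions\n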

      "},{"location":"admin/authentication/applications/","title":"Applications","text":"

      This article explains the procedure to manage your organization's applications.

      Applications are used for API integrations with Run:ai. An application contains a client ID and a client secret. With the client credentials, you can obtain a token as detailed in API authentication and use it within subsequent API calls.

      Applications are assigned with Access Rules to manage permissions. For example, application ci-pipeline-prod is assigned with a Researcher role in Cluster: A.

      "},{"location":"admin/authentication/applications/#applications-table","title":"Applications table","text":"

      The Applications table can be found under Access in the Run:ai platform.

      The Applications table provides a list of all the applications defined in the platform, and allows you to manage them.

      The Applications table consists of the following columns:

      • Application - The name of the application
      • Client ID - The client ID of the application
      • Access rule(s) - The access rules assigned to the application
      • Last login - The timestamp for the last time the user signed in
      • Created by - The user who created the application
      • Creation time - The timestamp for when the application was created
      • Last updated - The last time the application was updated
      "},{"location":"admin/authentication/applications/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
      "},{"location":"admin/authentication/applications/#creating-an-application","title":"Creating an application","text":"

      To create an application:

      1. Click +NEW APPLICATION
      2. Enter the application\u2019s name
      3. Click CREATE
      4. Copy the Client ID and Client secret and store them securely
      5. Click DONE

      Note

      The client secret is visible only at the time of creation. It cannot be recovered but can be regenerated.

      "},{"location":"admin/authentication/applications/#adding-an-access-rule-to-an-application","title":"Adding an access rule to an application","text":"

      To create an access rule:

      1. Select the application you want to add an access rule for
      2. Click ACCESS RULES
      3. Click +ACCESS RULE
      4. Select a role
      5. Select a scope
      6. Click SAVE RULE
      7. Click CLOSE
      "},{"location":"admin/authentication/applications/#deleting-an-access-rule-from-an-application","title":"Deleting an access rule from an application","text":"

      To delete an access rule:

      1. Select the application you want to remove an access rule from
      2. Click ACCESS RULES
      3. Find the access rule you would like to delete
      4. Click on the trash icon
      5. Click CLOSE
      "},{"location":"admin/authentication/applications/#regenerating-client-secret","title":"Regenerating client secret","text":"

      To regenerate a client secret:

      1. Locate the application whose client secret you want to regenerate
      2. Click REGENERATE CLIENT SECRET
      3. Click REGENERATE
      4. Copy the New client secret and store it securely
      5. Click DONE

      Warning

      Regenerating a client secret revokes the previous one.

      "},{"location":"admin/authentication/applications/#deleting-an-application","title":"Deleting an application","text":"
      1. Select the application you want to delete
      2. Click DELETE
      3. On the dialog, click DELETE to confirm
      "},{"location":"admin/authentication/applications/#using-api","title":"Using API","text":"

      Go to the Applications and Access rules API references to view the available actions

      "},{"location":"admin/authentication/authentication-overview/","title":"Authentication & Authorization","text":"

      Run:ai Authentication & Authorization enables a streamlined experience for the user with precise controls covering the data each user can see and the actions each user can perform in the Run:ai platform.

      Authentication verifies user identity during login, and Authorization assigns the user with specific permissions according to the assigned access rules.

      Authenticated access is required to use all aspects of the Run:ai interfaces, including the Run:ai platform, the Run:ai Command Line Interface (CLI) and APIs.

      "},{"location":"admin/authentication/authentication-overview/#authentication","title":"Authentication","text":"

      There are multiple methods to authenticate and access Run:ai.

      "},{"location":"admin/authentication/authentication-overview/#single-sign-on-sso","title":"Single Sign-On (SSO)","text":"

      Single Sign-On (SSO) is the preferred authentication method by large organizations, as it avoids the need to manage duplicate sets of user identities.

      Run:ai offers SSO integration, enabling users to utilize existing organizational credentials to access Run:ai without requiring dedicated credentials.

      Run:ai supports three methods to set up SSO:

      • SAML
      • OpenID Connect (OIDC)
      • OpenShift

      When using SSO, it is highly recommended to manage at least one local user, as a breakglass account (an emergency account), in case access to SSO is not possible.

      "},{"location":"admin/authentication/authentication-overview/#username-and-password","title":"Username and password","text":"

      Username and password access can be used when SSO integration is not possible.

      "},{"location":"admin/authentication/authentication-overview/#secret-key-for-application-programmatic-access","title":"Secret key (for Application programmatic access)","text":"

      A Secret is the authentication method for Applications. Applications use the Run:ai APIs to perform automated tasks including scripts and pipelines based on their assigned access rules.

      "},{"location":"admin/authentication/authentication-overview/#authorization","title":"Authorization","text":"

      The Run:ai platform uses Role-Based Access Control (RBAC) to manage authorization.

      Once a user or an application is authenticated, they can perform actions according to their assigned access rules.

      "},{"location":"admin/authentication/authentication-overview/#role-based-access-control-rbac-in-runai","title":"Role Based Access Control (RBAC) in Run:ai","text":"

      While Kubernetes RBAC is limited to a single cluster, Run:ai expands the scope of Kubernetes RBAC, making it easy for administrators to manage access rules across multiple clusters.

      RBAC at Run:ai is configured using access rules.

      An access rule is the assignment of a role to a subject in a scope: <Subject> is a <Role> in a <Scope>.

      • Subject
        • A user, a group, or an application assigned with the role
      • Role
        • A set of permissions that can be assigned to subjects
        • A permission is a set of actions (view, edit, create and delete) over a Run:ai entity (e.g. projects, workloads, users)
          • For example, a role might allow a user to create and read Projects, but not update or delete them
          • Roles at Run:ai are system defined and cannot be created, edited or deleted
      • Scope
        • A scope is part of an organization in which a set of permissions (roles) is effective. Scopes include Projects, Departments, Clusters, Account (all clusters).

      Below is an example of an access rule: username@company.com is a Department admin in Department: A

      "},{"location":"admin/authentication/non-root-containers/","title":"User Identity in Container","text":"

      The identity of the user in the container determines its access to resources. For example, network file storage solutions typically use this identity to determine the container's access to network volumes. This document explains multiple ways for propagating the user identity into the container.

      "},{"location":"admin/authentication/non-root-containers/#the-default-root-access","title":"The Default: Root Access","text":"

      In Docker, as well as in Kubernetes, containers run as root by default. The implication is that processes running within the container have enough permissions to change anything in the container and, if propagated to network resources, can have permissions outside the container as well.

      This gives a lot of power to the Researcher, but does not sit well with modern enterprise security standards.

      By default, if you run:

      runai submit -i ubuntu --attach --interactive -- bash\n
      then run id, you will see the root user.

      "},{"location":"admin/authentication/non-root-containers/#use-runai-flags-to-limit-root-access","title":"Use Run:ai flags to limit root access","text":"

      There are two runai submit flags that control user identity at the Researcher level:

      • The flag --run-as-user starts the container with a specific user. The user is the current Linux user (see below for other behaviors if used in conjunction with Single sign-on).
      • The flag --prevent-privilege-escalation prevents the container from elevating its own privileges into root (e.g., running sudo or changing system files).

      Equivalent flags exist in the Researcher User Interface.

      "},{"location":"admin/authentication/non-root-containers/#run-as-current-user","title":"Run as Current User","text":"

      From a Linux/Mac box, run:

      runai submit -i ubuntu --attach --interactive --run-as-user -- bash\n

      then run id, you will see the users and groups of the box you have been using to launch the Job.

      "},{"location":"admin/authentication/non-root-containers/#prevent-escalation","title":"Prevent Escalation","text":"

      From a Linux/Mac box, run:

      runai submit -i ubuntu --attach --interactive --run-as-user \\\n  --prevent-privilege-escalation  -- bash\n

      then verify that you cannot run su to become root within the container.

      "},{"location":"admin/authentication/non-root-containers/#setting-a-cluster-wide-default","title":"Setting a Cluster-Wide Default","text":"

      The two flags are voluntary and are not enforced by the system. It is, however, possible to enforce them using Policies. Policies allow an Administrator to force compliance on both the user interface and the command-line interface.

      "},{"location":"admin/authentication/non-root-containers/#passing-user-identity","title":"Passing user identity","text":""},{"location":"admin/authentication/non-root-containers/#passing-user-identity-from-identity-provider","title":"Passing user identity from Identity Provider","text":"

      A best practice is to store the user identifier (UID) and the group identifier (GID) in the organization's directory. Run:ai allows you to pass these values to the container and use them as the container identity.

      To perform this, you must:

      • Set up single sign-on. Perform the steps for UID/GID integration.
      • Run: runai login and enter your credentials
      • Use the flag --run-as-user

      Running id should show the identifier from the directory.
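
      For example, the complete flow from a terminal might look like the following (user and group values are illustrative):

      runai login\nrunai submit -i ubuntu --attach --interactive --run-as-user -- bash\n# inside the container:\nid\n# uid=1036(jane) gid=500(researchers) groups=500(researchers)   <- illustrative output\n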

      "},{"location":"admin/authentication/non-root-containers/#passing-user-identity-explicitly-via-the-researcher-ui","title":"Passing user identity explicitly via the Researcher UI","text":"

      Via the Researcher User Interface, it is possible to explicitly provide the user id and group id:

      "},{"location":"admin/authentication/non-root-containers/#using-openshift-or-gatekeeper-to-provide-cluster-level-controls","title":"Using OpenShift or Gatekeeper to provide Cluster Level Controls","text":"

      Run:ai supports OpenShift as a Kubernetes platform. In OpenShift the system will provide a random UID to containers. The flags --run-as-user and --prevent-privilege-escalation are disabled on OpenShift. It is possible to achieve a similar effect on Kubernetes systems that are not OpenShift. A leading tool is Gatekeeper. Gatekeeper similarly enforces non-root on containers at the system level.

      "},{"location":"admin/authentication/non-root-containers/#creating-a-temporary-home-directory","title":"Creating a Temporary Home Directory","text":"

      When containers run as a specific user, the user needs to have a pre-created home directory within the image. Otherwise, when running a shell, you will not have a home directory:

      runai submit -i ubuntu --attach --interactive --run-as-user -- bash\nThe job 'job-0' has been submitted successfully\nYou can run `runai describe job job-0 -p team-a` to check the job status\nWaiting for pod to start running...\nINFO[0007] Job started\nConnecting to pod job-0-0-0\nIf you don't see a command prompt, try pressing enter.\nI have no name!@job-0-0-0:/$ \n

      Adding home directories to an image per user is not a viable solution. To overcome this, Run:ai provides an additional flag --create-home-dir. Adding this flag creates a temporary home directory for the user within the container.
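
      For example, to set the flag explicitly on submission:

      runai submit -i ubuntu --attach --interactive --run-as-user --create-home-dir -- bash\n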

      Notes

      • Data saved in this directory will not be saved when the container exits.
      • This flag is set by default to true when the --run-as-user flag is used, and false if not.
      "},{"location":"admin/authentication/researcher-authentication/","title":"Setup Researcher Access Control","text":""},{"location":"admin/authentication/researcher-authentication/#introduction","title":"Introduction","text":"

      The following instructions explain how to complete the configuration of access control for Researchers. This requires several steps:

      • (Mandatory) Modify the Kubernetes entry point (called the Kubernetes API server) to validate the credentials of incoming requests against the Run:ai Authentication authority.
      • (Command-line Interface usage only) Modify the Kubernetes profile to prompt the Researcher for credentials when running runai login (or oc login for OpenShift).

      Important

      • As of Run:ai version 2.16, you only need to perform these steps when accessing Run:ai from the command-line interface or sending YAMLs directly to Kubernetes
      • As of Run:ai version 2.18, you only need to perform these steps if using the older command-line interface or sending YAMLs directly to Kubernetes.
      "},{"location":"admin/authentication/researcher-authentication/#kubernetes-configuration","title":"Kubernetes Configuration","text":"

      You must direct the Kubernetes API server to authenticate via Run:ai. This requires adding flags to the Kubernetes API Server. The flags are shown in the Run:ai user interface under Settings | General | Researcher Authentication | Server configuration.

      Modifying the API Server configuration differs between Kubernetes distributions:

      Vanilla KubernetesOpenShiftRKERKE2GKEEKSBCMAKSOther
      • Locate the Kubernetes API Server configuration file. The file's location may differ between different Kubernetes distributions. The location for vanilla Kubernetes is /etc/kubernetes/manifests/kube-apiserver.yaml
      • Edit the file and, under the command tag, add the server configuration text described above (an illustrative snippet follows the verification command below).
      • Verify that the kube-apiserver-<master-node-name> pod in the kube-system namespace has been restarted and that the changes have been incorporated. Run the command below and verify that the oidc flags you added are present:
      kubectl get pods -n kube-system kube-apiserver-<master-node-name> -o yaml\n
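
      For illustration, the added flags under the command section typically look like the following (example values only; always copy the actual parameters from Settings | General | Researcher Authentication):

      - command:\n    - kube-apiserver\n    - --oidc-client-id=runai                    # example value\n    - --oidc-issuer-url=<issuer-url-from-the-ui>\n    - --oidc-username-prefix=-                  # example value\n    ...\n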

      No configuration is needed. Instead, Run:ai assumes that an Identity Provider has been defined at the OpenShift level and that the Run:ai Cluster installation has set the OpenshiftIdp flag to true. For more information see the Run:ai OpenShift control-plane setup.

      Edit Rancher cluster.yml (with Rancher UI, follow this). Add the following:

      cluster.yml
      kube-api:\n    always_pull_images: false\n    extra_args:\n        oidc-client-id: runai  # (1)\n        ...\n
      1. These are example parameters. Copy the actual parameters from Settings | General | Researcher Authentication as described above.

      You can verify that the flags have been incorporated into the RKE cluster by following the instructions here and running docker inspect <kube-api-server-container-id>, where <kube-api-server-container-id> is the container ID of the api-server container obtained as described in the Rancher document.

      If working via the RKE2 Quickstart, edit /etc/rancher/rke2/config.yaml. Add the parameters provided in the server configuration section as described above in the following fashion:

      /etc/rancher/rke2/config.yaml
      kube-apiserver-arg:\n- \"oidc-client-id=runai\" # (1)\n...\n
      1. These are example parameters. Copy the actual parameters from Settings | General | Researcher Authentication as described above.

      If working via the Rancher UI, you need to add the flags as part of the cluster provisioning.

      Under Cluster Management | Create, turn on RKE2 and select a platform. Then, under Cluster Configuration | Advanced | Additional API Server Args, add the Run:ai flags as <key>=<value> (e.g. oidc-username-prefix=-).

      Install Anthos identity service by running:

      gcloud container clusters update <gke-cluster-name> \\\n    --enable-identity-service --project=<gcp-project-name> --zone=<gcp-zone-name>\n

      Install the yq utility and run:

      For username-password authentication, run:

      kubectl get clientconfig default -n kube-public -o yaml > login-config.yaml\nyq -i e \".spec +={\\\"authentication\\\":[{\\\"name\\\":\\\"oidc\\\",\\\"oidc\\\":{\\\"clientID\\\":\\\"runai\\\",\\\"issuerURI\\\":\\\"$OIDC_ISSUER_URL\\\",\\\"kubectlRedirectURI\\\":\\\"http://localhost:8000/callback\\\",\\\"userClaim\\\":\\\"sub\\\",\\\"userPrefix\\\":\\\"-\\\"}}]}\" login-config.yaml\nkubectl apply -f login-config.yaml\n

      For single-sign-on, run:

      kubectl get clientconfig default -n kube-public -o yaml > login-config.yaml\nyq -i e \".spec +={\\\"authentication\\\":[{\\\"name\\\":\\\"oidc\\\",\\\"oidc\\\":{\\\"clientID\\\":\\\"runai\\\",\\\"issuerURI\\\":\\\"$OIDC_ISSUER_URL\\\",\\\"groupsClaim\\\":\\\"groups\\\",\\\"kubectlRedirectURI\\\":\\\"http://localhost:8000/callback\\\",\\\"userClaim\\\":\\\"email\\\",\\\"userPrefix\\\":\\\"-\\\"}}]}\" login-config.yaml\nkubectl apply -f login-config.yaml\n

      Where the OIDC flags are provided in the Run:ai server configuration section as described above.

      Then update runaiconfig with the Anthos endpoint, gke-oidc-envoy. Get the external IP of the service in the anthos-identity-service namespace.

      kubectl get svc -n anthos-identity-service\nNAME               TYPE           CLUSTER-IP    EXTERNAL-IP     PORT(S)              AGE\ngke-oidc-envoy     LoadBalancer   10.37.3.111   35.236.229.19   443:31545/TCP        12h\n

      Add the IP to runaiconfig

      kubectl -n runai patch runaiconfig runai -p '{\"spec\": {\"researcher-service\": {\"args\": {\"gkeOidcEnvoyHost\": \"35.236.229.19\"}}}}'  --type=\"merge\"\n

      To create a kubeconfig profile for Researchers run:

      kubectl oidc login --cluster=CLUSTER_NAME --login-config=login-config.yaml \\\n    --kubeconfig=developer-kubeconfig\n

      (this requires installing the kubectl oidc plug-in, as described in the Anthos document above: gcloud components install kubectl-oidc)

      Then modify the developer-kubeconfig file as described in the Command-line Interface Access section below.

      • In the AWS Console, under EKS, find your cluster.
      • Go to Configuration and then to Authentication.
      • Associate a new identity provider. Use the parameters provided in the server configuration section as described above. The process can take up to 30 minutes.
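
      Alternatively, the same association can be performed from the AWS CLI; a sketch, assuming the standard aws eks associate-identity-provider-config command (copy the actual OIDC values from the Run:ai server configuration):

      aws eks associate-identity-provider-config --cluster-name <eks-cluster-name> \\\n    --oidc identityProviderConfigName=runai,issuerUrl=<issuer-url-from-the-ui>,clientId=runai,usernameClaim=email,usernamePrefix=-\n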

      Please follow the \"Vanilla Kubernetes\" instructions

      Please contact Run:ai customer support.

      See specific instructions in the documentation of the Kubernetes distribution.

      "},{"location":"admin/authentication/researcher-authentication/#command-line-interface-access","title":"Command-line Interface Access","text":"

      To control access to Run:ai (and Kubernetes) resources, you must modify the Kubernetes configuration file. The file is distributed to users as part of the Command-line interface installation.

      When making changes to the file, keep a copy of the original file to be used for cluster administration. After making the modifications, distribute the modified file to Researchers.

      • Under the ~/.kube directory edit the config file, remove the administrative user, and replace it with text from Settings | General | Researcher Authentication | Client Configuration.
      • Under contexts | context | user change the user to runai-authenticated-user.
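
      A minimal sketch of how the relevant sections of the kubeconfig file look after the change (cluster and context names are illustrative):

      contexts:\n- context:\n    cluster: my-cluster              # illustrative name\n    user: runai-authenticated-user\n  name: my-context                   # illustrative name\nusers:\n- name: runai-authenticated-user\n  # user definition copied from Settings | General | Researcher Authentication | Client Configuration\n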

      Important

      • After adding the new user, delete the following fields from the kubeconfig file to prevent unauthorized access: client-certificate-data, client-key-data, and any references to the admin user.
      "},{"location":"admin/authentication/researcher-authentication/#test-via-command-line-interface","title":"Test via Command-line interface","text":"
      • Run: runai login (in OpenShift environments use oc login rather than runai login).
      • You will be prompted for a username and password. In a single sign-on flow, you will be asked to copy a link to a browser, log in and return a code.
      • Once login is successful, submit a Job.
      • If the Job was submitted to a Project to which you have no access, the submission will be denied.
      • If the Job was submitted to a Project to which you have access, the submission will be accepted.

      You can also submit a Job from the Run:ai user interface and verify that the new Job appears in the job list under your user name.

      "},{"location":"admin/authentication/researcher-authentication/#test-via-user-interface","title":"Test via User Interface","text":"
      • Open the Run:ai user interface, go to Workloads.
      • On the top-right, select Submit Workload.
      "},{"location":"admin/authentication/roles/","title":"Roles","text":"

      This article explains the available roles in the Run:ai platform.

      A role is a set of permissions that can be assigned to a subject in a scope.

      A permission is a set of actions (View, Edit, Create and Delete) over a Run:ai entity (e.g. projects, workloads, users).

      "},{"location":"admin/authentication/roles/#roles-table","title":"Roles table","text":"

      The Roles table can be found under Access in the Run:ai platform.

      The Roles table displays a list of predefined roles available to users in the Run:ai platform. It is not possible to create additional roles, or to edit or delete existing roles.

      The Roles table consists of the following columns:

      Column Description Role The name of the role Created by The name of the role creator Creation time The timestamp when the role was created"},{"location":"admin/authentication/roles/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
      "},{"location":"admin/authentication/roles/#reviewing-a-role","title":"Reviewing a role","text":"
      1. To review a role, click the role name in the table
      2. In the role form review the following:
        • Role name The name of the role
        • Entity A system-managed object that can be viewed, edited, created or deleted by a user based on their assigned role and scope
        • Actions The actions that the role assignee is authorized to perform for each entity
          • View If checked, an assigned user with this role can view instances of this type of entity within their defined scope
          • Edit If checked, an assigned user with this role can change the settings of an instance of this type of entity within their defined scope
          • Create If checked, an assigned user with this role can create new instances of this type of entity within their defined scope
          • Delete If checked, an assigned user with this role can delete instances of this type of entity within their defined scope
      "},{"location":"admin/authentication/roles/#roles-in-runai","title":"Roles in Run:ai","text":"

      Run:ai supports the following roles and their permissions. Under each role is a detailed list of the actions that the role assignee is authorized to perform for each entity.

      Compute resource administrator

      Data source administrator

      Data volume administrator

      Department administrator

      Department viewer

      Editor

      Environment administrator

      L1 researcher

      L2 researcher

      ML engineer

      Research manager

      System administrator

      Template administrator

      Viewer

      Notes

      Keep the following in mind when upgrading from versions 2.13 or earlier:

      • Admin becomes System Admin with full access to all managed objects and scopes
      • Research Manager is not automatically assigned to all projects, but to projects set by the relevant Admin when assigning this role to a user, group or app
      • To preserve backwards compatibility, users with the role of Research Manager are assigned to all current projects, but not to new projects
      • To allow the Department Admin to assign a Researcher role to a user, group or app, the Department Admin must have VECD permissions for jobs and workspaces. This creates a broader span of managed objects
      • To preserve backwards compatibility, users with the role of Editor are assigned to the same scope they had before the upgrade. However, with new user assignments, the Admin can limit the scope to only part of the organizational scope.
      "},{"location":"admin/authentication/roles/#permitted-workloads","title":"Permitted workloads","text":"

      When assigning a role with either one, all or any combination of the View, Edit, Create and Delete permissions for workloads, the subject has permissions to manage not only Run:ai native workloads (Workspace, Training, Inference), but also a list of 3rd party workloads:

      • k8s: StatefulSet
      • k8s: ReplicaSet
      • k8s: Pod
      • k8s: Deployment
      • batch: Job
      • batch: CronJob
      • machinelearning.seldon.io: SeldonDeployment
      • kubevirt.io: VirtualMachineInstance
      • kubeflow.org: TFJob
      • kubeflow.org: PyTorchJob
      • kubeflow.org: XGBoostJob
      • kubeflow.org: MPIJob
      • kubeflow.org: Notebook
      • kubeflow.org: ScheduledWorkflow
      • amlarc.azureml.com: AmlJob
      • serving.knative.dev: Service
      • workspace.devfile.io: DevWorkspace
      • ray.io: RayCluster
      • ray.io: RayJob
      • ray.io: RayService
      • tekton.dev: TaskRun
      • tekton.dev: PipelineRun
      • argoproj.io: Workflow
      "},{"location":"admin/authentication/roles/#using-api","title":"Using API","text":"

      Go to the Roles API reference to view the available actions.
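
      For example, roles can also be listed programmatically. The sketch below assumes a bearer token is already available; the endpoint path shown is an assumption, so take the exact path from the Roles API reference:

      curl -H \"Authorization: Bearer $TOKEN\" \\\n    \"https://<company-url>/api/v1/authorization/roles\"   # path is an assumption - verify in the API reference\n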

      "},{"location":"admin/authentication/users/","title":"Users","text":"

      This article explains the procedure to manage users and their permissions.

      Users can be managed locally or via the identity provider, and are assigned access rules to manage their permissions.

      For example, user user@domain.com is a department admin in department A.

      "},{"location":"admin/authentication/users/#users-table","title":"Users table","text":"

      The Users table can be found under Access in the Run:ai platform.

      The users table provides a list of all the users in the platform. You can manage local users and manage user permissions (access rules) for both local and SSO users.

      Note

      Single Sign-On users

      SSO users are managed by the identity provider and appear once they have signed in to Run:ai

      The Users table consists of the following columns:

      Column Description User The unique identity of the user (email address) Type The type of the user - SSO / local Last login The timestamp for the last time the user signed in Access rule(s) The access rules assigned to the user Created By The user who created the user Creation time The timestamp for when the user was created Last updated The last time the user was updated"},{"location":"admin/authentication/users/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
      "},{"location":"admin/authentication/users/#creating-a-local-user","title":"Creating a local user","text":"

      To create a local user:

      1. Click +NEW LOCAL USER
      2. Enter the user\u2019s Email address
      3. Click CREATE
      4. Review and copy the user\u2019s credentials:
        • User Email
        • Temporary password to be used on first sign-in
      5. Click DONE

      Note

      The temporary password is visible only at the time of user\u2019s creation, and must be changed after the first sign-in

      "},{"location":"admin/authentication/users/#adding-an-access-rule-to-a-user","title":"Adding an access rule to a user","text":"

      To create an access rule:

      1. Select the user you want to add an access rule for
      2. Click ACCESS RULES
      3. Click +ACCESS RULE
      4. Select a role
      5. Select a scope
      6. Click SAVE RULE
      7. Click CLOSE
      "},{"location":"admin/authentication/users/#deleting-users-access-rule","title":"Deleting user\u2019s access rule","text":"

      To delete an access rule:

      1. Select the user you want to remove an access rule from
      2. Click ACCESS RULES
      3. Find the access rule assigned to the user you would like to delete
      4. Click on the trash icon
      5. Click CLOSE
      "},{"location":"admin/authentication/users/#resetting-a-user-password","title":"Resetting a user password","text":"

      To reset a user\u2019s password:

      1. Select the user whose password you want to reset
      2. Click RESET PASSWORD
      3. Click RESET
      4. Review and copy the user\u2019s credentials:
        • User Email
        • Temporary password to be used on next sign-in
      5. Click DONE
      "},{"location":"admin/authentication/users/#deleting-a-user","title":"Deleting a user","text":"
      1. Select the user you want to delete
      2. Click DELETE
      3. In the dialog, click DELETE to confirm the deletion

      Note

      To ensure administrative operations are always available, at least one local user with the System Administrator role should exist.

      "},{"location":"admin/authentication/users/#using-api","title":"Using API","text":"

      Go to the Users and Access rules API reference to view the available actions.

      "},{"location":"admin/authentication/sso/openidconnect/","title":"Setup SSO with OpenID Connect","text":"

      Single Sign-On (SSO) is an authentication scheme that allows users to log in to multiple, independent software systems with a single pair of credentials.

      This article explains the procedure to configure single sign-on to Run:ai using the OpenID Connect protocol.

      "},{"location":"admin/authentication/sso/openidconnect/#prerequisites","title":"Prerequisites","text":"

      Before starting, make sure you have the following available from your identity provider:

      • Discovery URL - the OpenID server where the content discovery information is published.
      • ClientID - the ID used to identify the client with the Authorization Server.
      • Client Secret - a secret password that only the Client and Authorization server know.
      • Optional: Scopes - a set of user attributes to be used during authentication to authorize access to a user's details.
      "},{"location":"admin/authentication/sso/openidconnect/#setup","title":"Setup","text":"

      Follow the steps below to setup SSO with OpenID Connect.

      "},{"location":"admin/authentication/sso/openidconnect/#adding-the-identity-provider","title":"Adding the identity provider","text":"
      1. Go to General settings
      2. Open the Security section and click +IDENTITY PROVIDER
      3. Select Custom OpenID Connect
      4. Enter the Discovery URL, Client ID, and Client Secret
      5. Copy the Redirect URL to be used in your identity provider
      6. Optional: Add the OIDC scopes
      7. Optional: Enter the user attributes and their value in the identity provider (see the user attributes table below)
      8. Click SAVE User attributes
      Attribute Default value in Run:ai Description User role groups GROUPS If it exists in the IDP, it allows you to assign Run:ai role groups via the IDP. The IDP attribute must be a list of strings. Linux User ID UID If it exists in the IDP, it allows Researcher containers to start with the Linux User UID. Used to map access to network resources such as file systems to users. The IDP attribute must be of type integer. Linux Group ID GID If it exists in the IDP, it allows Researcher containers to start with the Linux Group GID. The IDP attribute must be of type integer. Supplementary Groups SUPPLEMENTARYGROUPS If it exists in the IDP, it allows Researcher containers to start with the relevant Linux supplementary groups. The IDP attribute must be a list of integers. Email email Defines the user attribute in the IDP holding the user's email address, which is the user identifier in Run:ai User first name firstName Used as the user\u2019s first name appearing in the Run:ai user interface User last name lastName Used as the user\u2019s last name appearing in the Run:ai user interface"},{"location":"admin/authentication/sso/openidconnect/#testing-the-setup","title":"Testing the setup","text":"
      1. Log-in to the Run:ai platform as an admin
      2. Add Access Rules to an SSO user defined in the IDP
      3. Open the Run:ai platform in an incognito browser tab
      4. On the sign-in page, click CONTINUE WITH SSO. You are redirected to the identity provider sign-in page
      5. In the identity provider sign-in page, log in with the SSO user to whom you granted access rules
      6. If you are unable to sign in to the identity provider, follow the Troubleshooting section below
      "},{"location":"admin/authentication/sso/openidconnect/#editing-the-identity-provider","title":"Editing the identity provider","text":"

      You can view the identity provider details and edit its configuration:

      1. Go to General settings
      2. Open the Security section
      3. On the identity provider box, click Edit identity provider
      4. You can edit either the Discovery URL, Client ID, Client Secret, OIDC scopes, or the User attributes
      "},{"location":"admin/authentication/sso/openidconnect/#removing-the-identity-provider","title":"Removing the identity provider","text":"

      You can remove the identity provider configuration:

      1. Go to General settings
      2. Open the Security section
      3. On the identity provider card, click Remove identity provider
      4. In the dialog, click REMOVE to confirm the action

      Note

      To avoid losing access, removing the identity provider must be carried out by a local user.

      "},{"location":"admin/authentication/sso/openidconnect/#troubleshooting","title":"Troubleshooting","text":"

      If testing the setup was unsuccessful, try the different troubleshooting scenarios according to the error you received.

      "},{"location":"admin/authentication/sso/openidconnect/#troubleshooting-scenarios","title":"Troubleshooting scenarios","text":"403 - Sorry, we can\u2019t let you see this page. Something about permissions\u2026

      Description: The authenticated user is missing permissions

      Mitigation:

      1. Validate either the user or its related group/s are assigned with access rules
      2. Validate groups attribute is available in the configured OIDC Scopes
      3. Validate the user\u2019s groups attribute is mapped correctly

      Advanced:

      1. Open the Chrome DevTools: Right-click on page \u2192 Inspect \u2192 Console tab
      2. Run the following command to retrieve and copy the user\u2019s token: localStorage.token;
      3. Paste in https://jwt.io
      4. Under the Payload section validate the values of the user\u2019s attributes
      401 - We\u2019re having trouble identifying your account because your email is incorrect or can\u2019t be found.

      Description: Authentication failed because email attribute was not found.

      Mitigation:

      1. Validate email attribute is available in the configured OIDC Scopes
      2. Validate the user\u2019s email attribute is mapped correctly
      Unexpected error when authenticating with identity provider

      Description: User authentication failed

      Mitigation:

      1. Validate that the configured OIDC Scopes exist and match the Identity Provider\u2019s available scopes

      Advanced:

      1. Look for the specific error message in the URL address
      Unexpected error when authenticating with identity provider (SSO sign-in is not available)

      Description: User authentication failed

      Mitigation:

      1. Validate that the configured OIDC scope exists in the Identity Provider
      2. Validate the configured Client Secret match the Client Secret in the Identity Provider

      Advanced:

      1. Look for the specific error message in the URL address
      Client not found

      Description: OIDC Client ID was not found in the Identity Provider

      Mitigation:

      1. Validate that the configured Client ID matches the Identity Provider Client ID
      "},{"location":"admin/authentication/sso/openshift/","title":"Setup SSO with OpenShift","text":"

      Single Sign-On (SSO) is an authentication scheme that allows users to log in to multiple, independent software systems with a single pair of credentials.

      This article explains the procedure to configure single sign-on to Run:ai using the OpenID Connect protocol in OpenShift V4.

      "},{"location":"admin/authentication/sso/openshift/#prerequisites","title":"Prerequisites","text":"

      Before starting, make sure you have the following available from your OpenShift cluster:

      • OpenShift OAuth client - see Registering an additional OAuth client
      • ClientID - the ID used to identify the client with the Authorization Server.
      • Client Secret - a secret password that only the Client and Authorization Server know.
      • Base URL - the OpenShift API Server endpoint (example: https://api.<cluster-url>:6443)
      "},{"location":"admin/authentication/sso/openshift/#setup","title":"Setup","text":"

      Follow the steps below to setup SSO with OpenShift.

      "},{"location":"admin/authentication/sso/openshift/#adding-the-identity-provider","title":"Adding the identity provider","text":"
      1. Go to General settings
      2. Open the Security section and click +IDENTITY PROVIDER
      3. Select OpenShift V4
      4. Enter the Base URL, Client ID, and Client Secret from your OpenShift OAuth client.
      5. Copy the Redirect URL to be used in your OpenShift OAuth client
      6. Optional: Enter the user attributes and their value in the identity provider (see the user attributes table below)
      7. Click SAVE User attributes
      Attribute Default value in Run:ai Description User role groups GROUPS If it exists in the IDP, it allows you to assign Run:ai role groups via the IDP. The IDP attribute must be a list of strings. Linux User ID UID If it exists in the IDP, it allows researcher containers to start with the Linux User UID. Used to map access to network resources such as file systems to users. The IDP attribute must be of type integer. Linux Group ID GID If it exists in the IDP, it allows researcher containers to start with the Linux Group GID. The IDP attribute must be of type integer. Supplementary Groups SUPPLEMENTARYGROUPS If it exists in the IDP, it allows researcher containers to start with the relevant Linux supplementary groups. The IDP attribute must be a list of integers. Email email Defines the user attribute in the IDP holding the user's email address, which is the user identifier in Run:ai User first name firstName Used as the user\u2019s first name appearing in the Run:ai platform User last name lastName Used as the user\u2019s last name appearing in the Run:ai platform"},{"location":"admin/authentication/sso/openshift/#testing-the-setup","title":"Testing the setup","text":"
      1. Open the Run:ai platform as an admin
      2. Add Access Rules to an SSO user defined in the IDP
      3. Open the Run:ai platform in an incognito browser tab
      4. On the sign-in page, click CONTINUE WITH SSO. You are redirected to the OpenShift IDP sign-in page
      5. In the identity provider sign-in page, log in with the SSO user to whom you granted access rules
      6. If you are unable to sign in to the identity provider, follow the Troubleshooting section below
      "},{"location":"admin/authentication/sso/openshift/#editing-the-identity-provider","title":"Editing the identity provider","text":"

      You can view the identity provider details and edit its configuration:

      1. Go to General settings
      2. Open the Security section
      3. On the identity provider box, click Edit identity provider
      4. You can edit either the Base URL, Client ID, Client Secret, or the User attributes
      "},{"location":"admin/authentication/sso/openshift/#removing-the-identity-provider","title":"Removing the identity provider","text":"

      You can remove the identity provider configuration:

      1. Go to General settings
      2. Open the Security section
      3. On the identity provider card, click Remove identity provider
      4. In the dialog, click REMOVE to confirm the action

      Note

      To avoid losing access, removing the identity provider must be carried out by a local user.

      "},{"location":"admin/authentication/sso/openshift/#troubleshooting","title":"Troubleshooting","text":"

      If testing the setup was unsuccessful, try the different troubleshooting scenarios according to the error you received.

      "},{"location":"admin/authentication/sso/openshift/#troubleshooting-scenarios","title":"Troubleshooting scenarios","text":"403 - Sorry, we can\u2019t let you see this page. Something about permissions\u2026

      Description: The authenticated user is missing permissions

      Mitigation:

      1. Validate either the user or its related group/s are assigned with access rules
      2. Validate groups attribute is available in the configured OIDC Scopes
      3. Validate the user\u2019s groups attribute is mapped correctly

      Advanced:

      1. Open the Chrome DevTools: Right-click on page \u2192 Inspect \u2192 Console tab
      2. Run the following command to retrieve and copy the user\u2019s token: localStorage.token;
      3. Paste in https://jwt.io
      4. Under the Payload section validate the value of the user\u2019s attributes
      401 - We\u2019re having trouble identifying your account because your email is incorrect or can\u2019t be found.

      Description: Authentication failed because the email attribute was not found.

      Mitigation:

      1. Validate email attribute is available in the configured OIDC Scopes
      2. Validate the user\u2019s email attribute is mapped correctly
      Unexpected error when authenticating with identity provider

      Description: User authentication failed

      Mitigation:

      1. Validate that the configured OIDC Scopes exist and match the Identity Provider\u2019s available scopes

      Advanced:

      1. Look for the specific error message in the URL address
      Unexpected error when authenticating with identity provider (SSO sign-in is not available)

      Description: User authentication failed

      Mitigation:

      1. Validate that the configured OIDC scope exists in the Identity Provider
      2. Validate that the configured Client Secret matches the Client Secret value in the OAuthclient Kubernetes object.

      Advanced:

      1. Look for the specific error message in the URL address
      unauthorized_client

      Description: OIDC Client ID was not found in the OpenShift IDP

      Mitigation:

      1. Validate that the configured Client ID matches the value in the OAuthclient Kubernetes object.
      "},{"location":"admin/authentication/sso/saml/","title":"Setup SSO with SAML","text":"

      Single Sign-On (SSO) is an authentication scheme that allows users to log in to multiple, independent software systems with a single pair of credentials.

      This article explains the procedure to configure SSO to Run:ai using the SAML 2.0 protocol.

      "},{"location":"admin/authentication/sso/saml/#prerequisites","title":"Prerequisites","text":"

      Before starting, ensure you have the following available from your identity provider:

      • SAML XML Metadata
      "},{"location":"admin/authentication/sso/saml/#setup","title":"Setup","text":"

      Follow the steps below to setup SSO with SAML.

      "},{"location":"admin/authentication/sso/saml/#adding-the-identity-provider","title":"Adding the identity provider","text":"
      1. Go to General settings
      2. Open the Security section and click +IDENTITY PROVIDER
      3. Select Custom SAML 2.0
      4. Select either From computer or From URL
        • From computer - click the Metadata XML file field, then select your file for upload
        • From URL - in the Metadata XML URL field, enter the URL to the XML Metadata file
      5. Copy the Redirect URL and Entity ID to be used in your identity provider
      6. Optional: Enter the user attributes and their value in the identity provider (see the user attributes table below)
      Attribute Default value in Run:ai Description User role groups GROUPS If it exists in the IDP, it allows you to assign Run:ai role groups via the IDP. The IDP attribute must be a list of strings. Linux User ID UID If it exists in the IDP, it allows Researcher containers to start with the Linux User UID. Used to map access to network resources such as file systems to users. The IDP attribute must be of type integer. Linux Group ID GID If it exists in the IDP, it allows Researcher containers to start with the Linux Group GID. The IDP attribute must be of type integer. Supplementary Groups SUPPLEMENTARYGROUPS If it exists in the IDP, it allows Researcher containers to start with the relevant Linux supplementary groups. The IDP attribute must be a list of integers. Email email Defines the user attribute in the IDP holding the user's email address, which is the user identifier in Run:ai. User first name firstName Used as the user\u2019s first name appearing in the Run:ai platform. User last name lastName Used as the user\u2019s last name appearing in the Run:ai platform.
      7. Click SAVE
      "},{"location":"admin/authentication/sso/saml/#testing-the-setup","title":"Testing the setup","text":"
      1. Open the Run:ai platform as an admin
      2. Add Access Rules to an SSO user defined in the IDP
      3. Open the Run:ai platform in an incognito browser tab
      4. On the sign-in page, click CONTINUE WITH SSO. You are redirected to the identity provider sign-in page
      5. In the identity provider sign-in page, log in with the SSO user to whom you granted access rules
      6. If you are unable to sign in to the identity provider, follow the Troubleshooting section below
      "},{"location":"admin/authentication/sso/saml/#editing-the-identity-provider","title":"Editing the identity provider","text":"

      You can view the identity provider details and edit its configuration:

      1. Go to General settings
      2. Open the Security section
      3. On the identity provider box, click Edit identity provider
      4. You can edit either the metadata file or the user attributes
      5. You can view the identity provider URL, identity provider entity ID, and the certificate expiration date
      "},{"location":"admin/authentication/sso/saml/#removing-the-identity-provider","title":"Removing the identity provider","text":"

      You can remove the identity provider configuration:

      1. Go to General settings
      2. Open the Security section
      3. On the identity provider card, click Remove identity provider
      4. In the dialog, click REMOVE to confirm the action

      Note

      To avoid losing access, removing the identity provider must be carried out by a local user.

      "},{"location":"admin/authentication/sso/saml/#downloading-the-xml-metadata-file","title":"Downloading the XML metadata file","text":"

      You can download the XML file to view the identity provider settings:

      1. Go to General settings
      2. Open the Security section
      3. On the identity provider card, click Download metadata XML file
      "},{"location":"admin/authentication/sso/saml/#troubleshooting","title":"Troubleshooting","text":"

      If testing the setup was unsuccessful, try the different troubleshooting scenarios according to the error you received. If an error still occurs, check the advanced troubleshooting section.

      "},{"location":"admin/authentication/sso/saml/#troubleshooting-scenarios","title":"Troubleshooting scenarios","text":"Invalid signature in response from identity provider

      Description: After trying to log in, the following message is received on the Run:ai log-in page.

      Mitigation:

      1. Go to General settings
      2. Open the Security section
      3. In the identity provider box, check for a \u201cCertificate expired\u201d error
      4. If it is expired, update the SAML metadata file to include a valid certificate

      401 - We\u2019re having trouble identifying your account because your email is incorrect or can\u2019t be found.

      Description: Authentication failed because email attribute was not found.

      Mitigation:

      1. Validate the user\u2019s email attribute is mapped correctly
      403 - Sorry, we can\u2019t let you see this page. Something about permissions\u2026

      Description: The authenticated user is missing permissions

      Mitigation:

      1. Validate either the user or its related group/s are assigned with access rules
      2. Validate the user\u2019s groups attribute is mapped correctly

      Advanced:

      1. Open the Chrome DevTools: Right-click on page \u2192 Inspect \u2192 Console tab
      2. Run the following command to retrieve and copy the user\u2019s token: localStorage.token;
      3. Paste in https://jwt.io
      4. Under the Payload section validate the values of the user\u2019s attributes
      "},{"location":"admin/authentication/sso/saml/#advanced-troubleshooting","title":"Advanced Troubleshooting","text":"Validating the SAML request

      The SAML login flow can be separated into two parts:

      • Run:ai redirects to the IDP for log-ins using a SAML Request
      • On successful log-in, the IDP redirects back to Run:ai with a SAML Response

      Validate the SAML Request to ensure the SAML flow works as expected:

      1. Go to the Run:ai login screen
      2. Open the Chrome Network inspector: Right-click \u2192 Inspect on the page \u2192 Network tab
      3. On the sign-in page click CONTINUE WITH SSO.
      4. Once redirected to the Identity Provider, search in the Chrome network inspector for an HTTP request showing the SAML Request. Depending on the IDP URL, this would be a request to the IDP domain name. For example, accounts.google.com/idp?1234.
      5. When found, go to the Payload tab and copy the value of the SAML Request
      6. Paste the value into a SAML decoder (e.g. https://www.samltool.com/decode.php)
      7. Validate the request:
        • The content of the <saml:Issuer> tag is the same as Entity ID given when adding the identity provider
        • The content of the AssertionConsumerServiceURL is the same as the Redirect URI given when adding the identity provider
      8. Validate the response:
        • The user email under the <saml2:Subject> tag is the same as the logged-in user
        • Make sure that under the <saml2:AttributeStatement> tag, there is an Attribute named email (lowercase). This attribute is mandatory.
        • If other, optional user attributes (groups, firstName, lastName, uid, gid) are mapped make sure they also exist under <saml2:AttributeStatement> along with their respective values.
      "},{"location":"admin/config/access-roles/","title":"Understand the Kubernetes Cluster Access provided to Run:ai","text":"

      Run:ai has configuration flags that control specific behavioral aspects of Run:ai, specifically those which require additional permissions, such as automatic namespace/project creation, secret propagation, and more.

      The purpose of this document is to provide security officers with the ability to review what cluster-wide access Run:ai requires, and verify that it is in line with organizational policy, before installing the Run:ai cluster.

      "},{"location":"admin/config/access-roles/#review-cluster-access-roles","title":"Review Cluster Access Roles","text":"

      Run the following:

      helm repo add runai https://run-ai-charts.storage.googleapis.com\nhelm repo update\nhelm install runai-cluster runai/runai-cluster -n runai -f runai-<cluster-name>.yaml \\\n        --dry-run > cluster-all.yaml\n

      The file cluster-all.yaml can then be reviewed. You can use the internal filenames (provided in comments within the file) to gain more understanding, according to the table below:
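
      For example, to quickly locate the cluster-scoped access objects in the rendered file before a detailed review (generic shell):

      grep -n \"kind: ClusterRole\" cluster-all.yaml   # matches both ClusterRole and ClusterRoleBinding\n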

      Folder File Purpose clusterroles base.yaml Mandatory Kubernetes Cluster Roles and Cluster Role Bindings clusterroles project-controller-ns-creation.yaml Automatic Project Creation and Maintenance. Provides Run:ai with the ability to create Kubernetes namespaces when the Run:ai administrator creates new Projects. Can be turned on/off via flag clusterroles project-controller-rb-creation.yaml Automatically assign Users to Projects. Can be turned on/off via flag clusterroles project-controller-limit-range.yaml Disables the usage of the Kubernetes Limit Range feature. Can be turned on/off via flag ocp scc.yaml OpenShift-specific Security Contexts priorityclasses 4 files Folder contains a list of Priority Classes used by Run:ai"},{"location":"admin/config/admin-messages/","title":"Administrator Messages","text":"

      System administrators can use Administrator messages to make announcements to users once they have logged in. These messages are typically used to keep users informed about different aspects of the platform.

      To configure an Administrator message:

      1. Press General settings.
      2. Expand the Message from administrator pane.
      3. Press Message.
      4. Enter your message in the text box. Use the formatting tools in the toolbar to add special formatting or links to the message.
      5. Enable the Display \"Don't show this again\" checkbox on message to users option to let users choose to see the message only once.
      6. Press Publish when complete.
      "},{"location":"admin/config/advanced-cluster-config/","title":"Advanced Cluster Configuration","text":"

      Advanced cluster configurations can be used to tailor your Run:ai cluster deployment to meet specific operational requirements and optimize resource management. By fine-tuning these settings, you can enhance functionality, ensure compatibility with organizational policies, and achieve better control over your cluster environment. This article provides guidance on implementing and managing these configurations to adapt the Run:ai cluster to your unique needs.

      After the Run:ai cluster is installed, you can adjust various settings to better align with your organization's operational needs and security requirements.

      "},{"location":"admin/config/advanced-cluster-config/#edit-cluster-configurations","title":"Edit cluster configurations","text":"

      Advanced cluster configurations are managed through the runaiconfig Kubernetes Custom Resource. To modify the cluster configurations, use the following command:

      kubectl edit runaiconfig runai -n runai\n
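
      If you prefer a non-interactive change, the same resource can also be patched directly. The sketch below uses the spec.global.core.swap.enabled key from the table in the next section; substitute the key you need:

      kubectl patch runaiconfig runai -n runai --type=\"merge\" \\\n    -p '{\"spec\": {\"global\": {\"core\": {\"swap\": {\"enabled\": true}}}}}'\n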
      "},{"location":"admin/config/advanced-cluster-config/#configurations","title":"Configurations","text":"

      The following configurations allow you to enable or disable features, control permissions, and customize the behavior of your Run:ai cluster:

      Key Description Default spec.project-controller.createNamespaces (boolean) Allows Kubernetes namespace creation for new projects true spec.mps-server.enabled (boolean) Enabled when using NVIDIA MPS false spec.global.subdomainSupport (boolean) Allows the creation of subdomains for ingress endpoints, enabling access to workloads via unique subdomains on the Fully Qualified Domain Name (FQDN). For details, see External Access to Container false spec.runai-container-toolkit.enabled (boolean) Allows workloads to use GPU fractions true spec.prometheus.spec.retention (string) Defines how long Prometheus retains Run:ai metrics locally, which is useful in case of potential connectivity issues. For more information, see Prometheus Storage 2h spec.prometheus.spec.retentionSize (string) Allocates storage space for Run:ai metrics in Prometheus, which is useful in case of potential connectivity issues. For more information, see Prometheus Storage \"\" spec.prometheus.logLevel (string) Sets the Prometheus log levelPossible values: [debug, info, warn, error] \u201cinfo\" spec.prometheus.additionalAlertLabels (object) Sets additional custom labels for the built-in alerts Example: {\u201cenv\u201d: \u201cprod\u201d} {} spec.global.schedulingServices (object) Defines resource constraints uniformly for the entire set of Run:ai scheduling services. For more information, see Resource requests and limits of Pod and container {resources: {}} spec.global.syncServices (object) Defines resource constraints uniformly for the entire set of Run:ai sync services. For more information, see Resource requests and limits of Pod and container {resources: {}} spec.global.workloadServices (object) Defines resource constraints uniformly for the entire set of Run:ai workload services. For more information, see Resource requests and limits of Pod and container {resources: {}} spec.global.nodeAffinity.restrictScheduling (boolean) Enables setting node roles and restricting workload scheduling to designated nodes false spec.global.affinity (object) Sets the system nodes where Run:ai system-level services are scheduled. Using global.affinity will overwrite the node roles set using the Administrator CLI (runai-adm). Prefer to schedule on nodes that are labeled with node-role.kubernetes.io/runai-system spec.global.tolerations (object) Configure Kubernetes tolerations for Run:ai system-level services. spec.daemonSetsTolerations (object) Configure Kubernetes tolerations for Run:ai daemonSets / engine. spec.runai-container-toolkit.logLevel (boolean) Specifies the run:ai-container-toolkit logging level: either 'SPAM', 'DEBUG', 'INFO', 'NOTICE', 'WARN', or 'ERROR' INFO node-scale-adjuster.args.gpuMemoryToFractionRatio (object) A scaling-pod requesting a single GPU device will be created for every 1 to 10 pods requesting fractional GPU memory (1/gpuMemoryToFractionRatio). This value represents the ratio (0.1-0.9) of fractional GPU memory (any size) to GPU fraction (portion) conversion. 
0.1 spec.global.core.dynamicFractions.enabled (boolean) Enables dynamic GPU fractions true spec.global.core.swap.enabled (boolean) Enables memory swap for GPU workloads false spec.global.core.swap.limits.cpuRam (string) Sets the CPU memory size used to swap GPU workloads 100Gi spec.global.core.swap.limits.reservedGpuRam (string) Sets the reserved GPU memory size used to swap GPU workloads 2Gi spec.global.core.nodeScheduler.enabled (boolean) Enables the node-level scheduler false spec.global.replicaCount (int) Sets a global number of pod replicas to be created for services that support replication 1 spec.limitRange.cpuDefaultRequestCpuLimitFactorNoGpu (string) Sets a default ratio between the CPU request and the limit for workloads without GPU requests 0.1 spec.limitRange.memoryDefaultRequestMemoryLimitFactorNoGpu (string) Sets a default ratio between the memory request and the limit for workloads without GPU requests 0.1 spec.limitRange.cpuDefaultRequestGpuFactor (string) Sets a default amount of CPU allocated per GPU when the CPU is not specified spec.limitRange.cpuDefaultLimitGpuFactor (int) Sets a default CPU limit based on the number of GPUs requested when no CPU limit is specified NO DEFAULT spec.limitRange.memoryDefaultRequestGpuFactor (string) Sets a default amount of memory allocated per GPU when the memory is not specified 100Mi spec.limitRange.memoryDefaultLimitGpuFactor (string) Sets a default memory limit based on the number of GPUs requested when no memory limit is specified NO DEFAULT spec.global.core.timeSlicing.mode (string) Sets the GPU time-slicing mode.Possible values:timesharing - all pods on a GPU share the GPU compute time evenly.\u2018strict\u2019 - each pod gets an exact time slice according to its memory fraction value.fair - each pod gets an exact time slice according to its memory fraction value and any unused GPU compute time is split evenly between the running pods. timesharing runai-scheduler.fullHierarchyFairness (boolean) Enables fairness between departments, on top of projects fairness true spec.pod-grouper.args.gangSchedulingKnative (boolean) Enables gang scheduling for inference workloads.For backward compatibility with versions earlier than v2.19, change the value to false true runai-scheduler.args.defaultStalenessGracePeriod Sets the timeout in seconds before the scheduler evicts a stale pod-group (gang) that went below its min-members in running state: 0s - Immediately (no timeout) -1 - Never 60s spec.runai-scheduler.args.verbosity (int) Configures the level of detail in the logs generated by the scheduler service 4 pod-grouper.args.gangScheduleArgoWorkflow (boolean) Groups all pods of a single ArgoWorkflow workload into a single Pod-Group for gang scheduling. true"},{"location":"admin/config/advanced-cluster-config/#runai-managed-nodes","title":"Run:ai Managed Nodes","text":"

      To include or exclude specific nodes from running workloads within a cluster managed by Run:ai, use the nodeSelectorTerms flag. For additional details, see Kubernetes nodeSelector.

      Configure the node selector terms using the following fields:

      • key: Label key (e.g., zone, instance-type).
      • operator: Operator defining the inclusion/exclusion condition (In, NotIn, Exists, DoesNotExist).
      • values: List of values for the key when using In or NotIn.

      The example below shows how to include only nodes with NVIDIA GPUs and exclude all other GPU types in a cluster with mixed nodes, based on the GPU product-type label:

      nodeSelectorTerms:\n- matchExpressions:\n  - key: nvidia.com/gpu.product  \n    operator: Exists\n
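
      Similarly, to restrict scheduling to specific values of a label using the In operator (the zone label key and values below are illustrative):

      nodeSelectorTerms:\n- matchExpressions:\n  - key: topology.kubernetes.io/zone  # illustrative label key\n    operator: In\n    values:\n    - us-east-1a\n    - us-east-1b\n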

      Tip

      To view the full runaiconfig object structure, use the following command:

      kubectl get crds/runaiconfigs.run.ai -n runai -o yaml\n

      "},{"location":"admin/config/allow-external-access-to-containers/","title":"External access to Containers","text":""},{"location":"admin/config/allow-external-access-to-containers/#introduction","title":"Introduction","text":"

      Researchers working with containers may at times need to remotely access the container. Some examples:

      • Using a Jupyter notebook that runs within the container
      • Using PyCharm to run python commands remotely.
      • Using TensorBoard to view machine learning visualizations

      This requires exposing container ports. With Docker, Researchers expose ports by declaring them when starting the container. Run:ai has similar syntax.

      Run:ai is based on Kubernetes. Kubernetes offers an abstraction of the container's location. This complicates the exposure of ports. Kubernetes offers several options:

      Method Description Prerequisites Port Forwarding Simple port forwarding allows access to the container via local and/or remote port. None NodePort Exposes the service on each Node\u2019s IP at a static port (the NodePort). You\u2019ll be able to contact the NodePort service from outside the cluster by requesting <NODE-IP>:<NODE-PORT> regardless of which node the container actually resides in. None LoadBalancer Exposes the service externally using a cloud provider\u2019s load balancer. Only available with cloud providers

      See https://kubernetes.io/docs/concepts/services-networking/service for further details on these options.
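
      For example, with the CLI, a container port can be exposed at submission time; a sketch (flag names per the runai submit reference, verify them against your CLI version; the workload name, image and ports are illustrative):

      runai submit build1 -i jupyter/base-notebook --interactive \\\n    --service-type=nodeport --port 30500:8888\n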

      "},{"location":"admin/config/allow-external-access-to-containers/#workspaces-configuration","title":"Workspaces configuration","text":"

      Workspaces allow the Researcher to build AI models interactively.

      Workspaces allow the Researcher to launch tools such as Visual Studio Code, TensorFlow, and TensorBoard. These tools require access to the container. Access is provided via URLs.

      Run:ai uses the Cluster URL provided to dynamically create SSL-secured URLs for researchers\u2019 workspaces in the format of https://<CLUSTER_URL>/project-name/workspace-name.

      While this form of path-based routing conveniently works with applications like Jupyter Notebooks, it may often not be compatible with other applications. These applications assume they are running at the root path, so hardcoded paths and settings within the container may become invalid when running at a path other than the root. For instance, if the container is expecting to find a file at /etc/config.json but is running at /project-name/workspace-name, the file will not be found. This can cause the container to fail or not function as intended.

      To address this issue, Run:ai provides support for host-based routing. When enabled, Run:ai creates workspace URLs in a subdomain format (https://project-name-workspace-name.<CLUSTER_URL>/), which allows all workspaces to run at the root path and function properly.

      To enable host-based routing you must perform the following steps:

      Note

      For OpenShift, editing the Runaiconfig (the last step below) is the only step required to generate workspace URLs.

      1. Create a second DNS entry (A record) for *.<CLUSTER_URL>, pointing to the same IP as the cluster Fully Qualified Domain Name (FQDN)
      2. Obtain a wildcard SSL certificate for this DNS.

      3. Add the certificate as a secret:

      kubectl create secret tls runai-cluster-domain-star-tls-secret -n runai \\ \n    --cert /path/to/fullchain.pem --key /path/to/private.pem\n
4. Create the following ingress rule:
      apiVersion: networking.k8s.io/v1\nkind: Ingress\nmetadata:\n  name: runai-cluster-domain-star-ingress\n  namespace: runai\nspec:\n  ingressClassName: nginx\n  rules:\n  - host: '*.<CLUSTER_URL>'\n  tls:\n  - hosts:\n    - '*.<CLUSTER_URL>'\n    secretName: runai-cluster-domain-star-tls-secret\n

      Replace <CLUSTER_URL> as described above and run: kubectl apply -f <filename>.

5. Edit the RunaiConfig to generate the URLs correctly:
      kubectl patch RunaiConfig runai -n runai --type=\"merge\" \\\n    -p '{\"spec\":{\"global\":{\"subdomainSupport\": true}}}' \n

      Once these requirements have been met, all workspaces will automatically be assigned a secured URL with a subdomain, ensuring full functionality for all researcher applications.
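
After completing these steps, one way to sanity-check the wildcard DNS record and the certificate is to resolve and request an arbitrary subdomain; the subdomain name below is arbitrary and used only for the check:

nslookup test.<CLUSTER_URL>            # the name should resolve to the cluster IP\ncurl -v https://test.<CLUSTER_URL>/    # the TLS handshake in the verbose output shows which certificate is served\n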

      "},{"location":"admin/config/allow-external-access-to-containers/#see-also","title":"See Also","text":"
      • To learn how to use port forwarding see the Quickstart document: Launch an Interactive Build Workload with Connected Ports.
      • See CLI command runai submit.
      "},{"location":"admin/config/cli-admin-install/","title":"Administrator CLI","text":"

      The Run:ai Administrator (runai-adm) is a lightweight tool designed to support infrastructure administrators by simplifying two key tasks:

      • Collecting logs for troubleshooting and sharing with Run:ai support.
      • Configuring node roles in the cluster for optimal performance and reliability.

      This article outlines the installation and usage of the Run:ai Administrator CLI to help you get started quickly.

      "},{"location":"admin/config/cli-admin-install/#prerequisites","title":"Prerequisites","text":"

      Before installing the CLI, review the following:

      • Operating system: The CLI is supported on Mac and Linux.
      • Kubectl: The Kubernetes command-line interface must be installed and configured to access your cluster. Follow the official guide.
      • Cluster administrative permissions: The CLI requires a Kubernetes profile with administrative privileges.
      "},{"location":"admin/config/cli-admin-install/#installation","title":"Installation","text":"

      To install the Run:ai Administrator CLI, ensure that the CLI version matches the version of your Run:ai cluster. You can either install the latest version or a specific version from the list.

      "},{"location":"admin/config/cli-admin-install/#installing-the-latest-version","title":"Installing the latest version","text":"

      Use the following commands to download and install the latest version of the CLI:

Mac:
wget --content-disposition https://app.run.ai/v1/k8s/admin-cli/darwin # (1) \nchmod +x runai-adm  \nsudo mv runai-adm /usr/local/bin/runai-adm\n
1. In a self-hosted environment, use the control-plane URL instead of app.run.ai

Linux:
wget --content-disposition https://app.run.ai/v1/k8s/admin-cli/linux # (1) \nchmod +x runai-adm  \nsudo mv runai-adm /usr/local/bin/runai-adm\n
1. In a self-hosted environment, use the control-plane URL instead of app.run.ai
      "},{"location":"admin/config/cli-admin-install/#installing-a-specific-version","title":"Installing a specific version","text":"

      To install a specific version of the Administrator CLI that matches your Run:ai cluster version, append the version number to the download URL. Refer to the list of available versions linked above for the correct version number.

Mac:
wget --content-disposition https://app.run.ai/v1/k8s/admin-cli/<version>/darwin # Replace <version> with the desired version in the format vX.X.X (e.g., v2.19.5) \nchmod +x runai-adm  \nsudo mv runai-adm /usr/local/bin/runai-adm\n

Linux:
wget --content-disposition https://app.run.ai/v1/k8s/admin-cli/<version>/linux # Replace <version> with the desired version in the format vX.X.X (e.g., v2.19.5)\nchmod +x runai-adm  \nsudo mv runai-adm /usr/local/bin/runai-adm\n
      "},{"location":"admin/config/cli-admin-install/#verifying-installation","title":"Verifying installation","text":"

      Verify your installation completed successfully by running the following command:

      runai-adm version\n
      "},{"location":"admin/config/cli-admin-install/#reference","title":"Reference","text":""},{"location":"admin/config/cli-admin-install/#node-roles","title":"Node roles","text":"

To set or remove node roles using the runai-adm tool, run the following:

      runai-adm set node-role [--runai-system-worker | --gpu-worker | --cpu-worker] <node-name>\n
      runai-adm remove node-role [--runai-system-worker | --gpu-worker | --cpu-worker] <node-name>\n

      Note

      Use the --all flag to set or remove a role to all nodes.
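
For example, a sketch of applying one role to every node at once (the role flag shown here is just one of the options listed above):

runai-adm set node-role --cpu-worker --all\n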

      "},{"location":"admin/config/cli-admin-install/#collect-logs","title":"Collect logs","text":"

      To collect logs using the runai-adm tool:

      1. Run the following command:

        runai-adm collect-logs\n
      2. Locate the generated compressed log file.

      "},{"location":"admin/config/cluster-wide-pvc/","title":"Cluster wide PVCs","text":"

      A PersistentVolumeClaim (PVC) is a request for storage by a user. It is similar to a Pod. Pods consume node resources and PVCs consume PV resources. Pods can request specific levels of resources (CPU and Memory). Claims can request specific size and access modes. For more information about PVCs, see Persistent Volumes.

      PVCs are namespace-specific. If your PVC relates to all run:ai Projects, do the following to propagate the PVC to all Projects:

      Create a PVC within the run:ai namespace, then run the following once to propagate the PVC to all run:ai Projects:

      kubectl label persistentvolumeclaims -n runai <PVC_NAME> runai/cluster-wide=true\n
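
For reference, a minimal sketch of a PVC that could be created in the runai namespace before labeling; the name, access mode and size are illustrative and should be adapted to your storage setup:

apiVersion: v1\nkind: PersistentVolumeClaim\nmetadata:\n  name: shared-data-pvc        # illustrative name\n  namespace: runai\nspec:\n  accessModes:\n    - ReadWriteMany            # typical for data shared across Projects\n  resources:\n    requests:\n      storage: 10Gi\n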

      To delete a PVC from all run:ai Projects, run:

      kubectl label persistentvolumeclaims -n runai <PVC_NAME> runai/cluster-wide-\n

      You can add a PVC to a job using the New job form.

      To add a PVC to a new job:

      1. On the New job form, press Storage.
      2. In Persistent Volume Claims press Add.
      3. Enable Existing PVC.
      4. Enter the name (claim name) of the PVC.
      5. Enter the storage class. (Optional)
      6. Enter the size.
      7. Enable / disable access modes.
      "},{"location":"admin/config/clusters/","title":"Clusters","text":"

      This article explains the procedure to view and manage Clusters.

      The Cluster table provides a quick and easy way to see the status of your cluster.

      "},{"location":"admin/config/clusters/#clusters-table","title":"Clusters table","text":"

      The Clusters table can be found under Resources in the Run:ai platform.

      The clusters table provides a list of the clusters added to Run:ai platform, along with their status.

      The clusters table consists of the following columns:

• Cluster: The name of the cluster
• Status: The status of the cluster. For more information see the table below. Hover over the information icon for a short description and links to troubleshooting
• Creation time: The timestamp when the cluster was created
• URL: The URL that was given to the cluster
• Run:ai cluster version: The Run:ai version installed on the cluster
• Kubernetes distribution: The flavor of Kubernetes distribution
• Kubernetes version: The version of Kubernetes installed
• Run:ai cluster UUID: The unique ID of the cluster

"},{"location":"admin/config/clusters/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
• Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
      "},{"location":"admin/config/clusters/#cluster-status","title":"Cluster status","text":"Status Description Waiting to connect The cluster has never been connected. Disconnected There is no communication from the cluster to the {{glossary.Control plane}}. This may be due to a network issue. See the troubleshooting scenarios. Missing prerequisites Some prerequisites are missing from the cluster. As a result, some features may be impacted. See the troubleshooting scenarios. Service issues At least one of the services is not working properly. You can view the list of nonfunctioning services for more information. See the troubleshooting scenarios. Connected The Run:ai cluster is connected, and all Run:ai services are running."},{"location":"admin/config/clusters/#adding-a-new-cluster","title":"Adding a new cluster","text":"

      To add a new cluster see the installation guide.

      "},{"location":"admin/config/clusters/#removing-a-cluster","title":"Removing a cluster","text":"
      1. Select the cluster you want to remove
      2. Click REMOVE
      3. A dialog appears: Make sure to carefully read the message before removing
      4. Click REMOVE to confirm the removal.
      "},{"location":"admin/config/clusters/#using-the-api","title":"Using the API","text":"

      Go to the Clusters API reference to view the available actions

      "},{"location":"admin/config/clusters/#troubleshooting","title":"Troubleshooting","text":"

      Before starting, make sure you have the following:

      • Access to the Kubernetes cluster where Run:ai is deployed with the necessary permissions
      • Access to the Run:ai Platform
      "},{"location":"admin/config/clusters/#troubleshooting-scenarios","title":"Troubleshooting scenarios","text":"Cluster disconnected

Description: When the cluster's status is 'disconnected', there is no communication from the cluster services reaching the Run:ai Platform. This may be due to networking issues or issues with Run:ai services.

      Mitigation:

1. Check Run:ai's services status:

        • Open your terminal
        • Make sure you have access to the Kubernetes cluster with permission to view pods
  • Copy and paste the following command to verify that Run:ai's services are running:

        kubectl get pods -n runai | grep -E 'runai-agent|cluster-sync|assets-sync'\n
  • If any of the services are not running, see the 'cluster has service issues' scenario.

      2. Check the network connection

        • Open your terminal
        • Make sure you have access to the Kubernetes cluster with permissions to create pods
        • Copy and paste the following command to create a connectivity check pod:
        kubectl run control-plane-connectivity-check -n runai --image=wbitt/network-multitool \\\n    --command -- /bin/sh -c 'curl -sSf <control-plane-endpoint> > /dev/null && echo \"Connection Successful\" \\\n    || echo \"Failed connecting to the Control Plane\"'\n
        • Replace <control-plane-endpoint> with the URL of the Control Plane in your environment. If the pod fails to connect to the Control Plane, check for potential network policies
      3. Check and modify the network policies

        • Open your terminal
        • Copy and paste the following command to check the existence of network policies:

          kubectl get networkpolicies -n runai\n

        • Review the policies to ensure that they allow traffic from the Run:ai namespace to the Control Plane. If necessary, update the policies to allow the required traffic. Example of allowing traffic:

apiVersion: networking.k8s.io/v1\nkind: NetworkPolicy\nmetadata:\n  name: allow-control-plane-traffic\n  namespace: runai\nspec:\n  podSelector:\n    matchLabels:\n      app: runai\n  policyTypes:\n    - Ingress\n    - Egress\n  egress:\n    - to:\n        - ipBlock:\n            cidr: <control-plane-ip-range>\n      ports:\n        - protocol: TCP\n          port: <control-plane-port>\n  ingress:\n    - from:\n        - ipBlock:\n            cidr: <control-plane-ip-range>\n      ports:\n        - protocol: TCP\n          port: <control-plane-port>\n
        • Check infrastructure-level configurations:

          • Ensure that firewall rules and security groups allow traffic between your Kubernetes cluster and the Control Plane
          • Verify required ports and protocols:
            • Ensure that the necessary ports and protocols for Run:ai\u2019s services are not blocked by any firewalls or security groups
      4. Check Run:ai services logs

        • Open your terminal
        • Make sure you have access to the Kubernetes cluster with permissions to view logs
        • Copy and paste the following commands to view the logs of the Run:ai services:
        kubectl logs deployment/runai-agent -n runai\nkubectl logs deployment/cluster-sync -n runai\nkubectl logs deployment/assets-sync -n runai\n
        • Try to identify the problem from the logs. If you cannot resolve the issue, continue to the next step.
5. Contact Run:ai's support

  • If the issue persists, contact Run:ai's support for assistance.
      Cluster has service issues

      Description: When a cluster's status is Has service issues, it means that one or more Run:ai services running in the cluster are not available.

      Mitigation:

      1. Verify non-functioning services

        • Open your terminal
        • Make sure you have access to the Kubernetes cluster with permissions to view the runaiconfig resource
        • Copy and paste the following command to determine which services are not functioning:
        kubectl get runaiconfig -n runai runai -ojson | jq -r '.status.conditions | map(select(.type == \"Available\"))'\n
      2. Check for Kubernetes events

        • Open your terminal
        • Make sure you have access to the Kubernetes cluster with permissions to view events
        • Copy and paste the following command to get all Kubernetes events:
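  A common command for this (shown here as a typical example, listing events in the runai namespace sorted by time) is:
  kubectl get events -n runai --sort-by='.lastTimestamp'\n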
      3. Inspect resource details

        • Open your terminal
        • Make sure you have access to the Kubernetes cluster with permissions to describe resources
        • Copy and paste the following command to check the details of the required resource:
        kubectl describe <resource_type> <name>\n
4. Contact Run:ai's Support

  • If the issue persists, contact Run:ai's support for assistance.
      Cluster is waiting to connect

Description: When the cluster's status is 'waiting to connect', it means that no communication from the cluster services reaches the Run:ai Platform. This may be due to networking issues or issues with Run:ai services.

      Mitigation:

1. Check Run:ai's services status

        • Open your terminal
        • Make sure you have access to the Kubernetes cluster with permissions to view pods
  • Copy and paste the following command to verify that Run:ai's services are running:
        kubectl get pods -n runai | grep -E 'runai-agent|cluster-sync|assets-sync'\n
  • If any of the services are not running, see the 'cluster has service issues' scenario.
      2. Check the network connection

        • Open your terminal
        • Make sure you have access to the Kubernetes cluster with permissions to create pods
        • Copy and paste the following command to create a connectivity check pod:
        kubectl run control-plane-connectivity-check -n runai --image=wbitt/network-multitool --command -- /bin/sh -c 'curl -sSf <control-plane-endpoint> > /dev/null && echo \"Connection Successful\" || echo \"Failed connecting to the Control Plane\"'\n
        • Replace <control-plane-endpoint> with the URL of the Control Plane in your environment. If the pod fails to connect to the Control Plane, check for potential network policies:
      3. Check and modify the network policies

        • Open your terminal
        • Copy and paste the following command to check the existence of network policies:
        kubectl get networkpolicies -n runai\n
        • Review the policies to ensure that they allow traffic from the Run:ai namespace to the Control Plane. If necessary, update the policies to allow the required traffic. Example of allowing traffic:
apiVersion: networking.k8s.io/v1\nkind: NetworkPolicy\nmetadata:\n  name: allow-control-plane-traffic\n  namespace: runai\nspec:\n  podSelector:\n    matchLabels:\n      app: runai\n  policyTypes:\n    - Ingress\n    - Egress\n  egress:\n    - to:\n        - ipBlock:\n            cidr: <control-plane-ip-range>\n      ports:\n        - protocol: TCP\n          port: <control-plane-port>\n  ingress:\n    - from:\n        - ipBlock:\n            cidr: <control-plane-ip-range>\n      ports:\n        - protocol: TCP\n          port: <control-plane-port>\n
        • Check infrastructure-level configurations:
        • Ensure that firewall rules and security groups allow traffic between your Kubernetes cluster and the Control Plane
        • Verify required ports and protocols:
          • Ensure that the necessary ports and protocols for Run:ai\u2019s services are not blocked by any firewalls or security groups
      4. Check Run:ai services logs

        • Open your terminal
        • Make sure you have access to the Kubernetes cluster with permission to view logs
        • Copy and paste the following commands to view the logs of the Run:ai services:
        kubectl logs deployment/runai-agent -n runai\nkubectl logs deployment/cluster-sync -n runai\nkubectl logs deployment/assets-sync -n runai\n
        • Try to identify the problem from the logs. If you cannot resolve the issue, continue to the next step
5. Contact Run:ai's support

  • If the issue persists, contact Run:ai's support for assistance.
      Cluster is missing prerequisites

      Description: When a cluster's status displays Missing prerequisites, it indicates that at least one of the Mandatory Prerequisites has not been fulfilled. In such cases, Run:ai services may not function properly.

      Mitigation:

      If you have ensured that all prerequisites are installed and the status still shows missing prerequisites, follow these steps:

      1. Check the message in the Run:ai platform for further details regarding the missing prerequisites.
      2. Inspect the runai-public ConfigMap:

        • Open your terminal. In the terminal, type the following command to list all ConfigMaps in the runai namespace:
        kubectl get configmap -n runai\n
      3. Describe the ConfigMap

        • Locate the ConfigMap named runai-public from the list
        • To view the detailed contents of this ConfigMap, type the following command:
        kubectl describe configmap runai-public -n runai\n
      4. Find Missing Prerequisites

        • In the output displayed, look for a section labeled dependencies.required
        • This section provides detailed information about any missing resources or prerequisites. Review this information to identify what is needed
5. Contact Run:ai's support

  • If the issue persists, contact Run:ai's support for assistance.
      "},{"location":"admin/config/create-k8s-assets-in-advance/","title":"Creating Kubernetes Assets in Advance","text":"

This article describes how to mark Kubernetes assets for use by Run:ai.

      "},{"location":"admin/config/create-k8s-assets-in-advance/#creating-pvcs-in-advance","title":"Creating PVCs in advance","text":"

      Add PVCs in advance to be used when creating a PVC-type data source via the Run:ai UI.

      Follow the steps below for each required scope:

      "},{"location":"admin/config/create-k8s-assets-in-advance/#cluster-scope","title":"Cluster scope","text":"
      1. Locate the PVC in the Run:ai namespace (runai)
2. To authorize Run:ai to use the PVC, label it: run.ai/cluster-wide: \"true\". The PVC is now displayed for that scope in the list of existing PVCs.
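
For example, assuming a PVC named my-pvc already exists in the runai namespace (the PVC name is illustrative), the label can be applied with:

kubectl label persistentvolumeclaims -n runai my-pvc run.ai/cluster-wide=true\n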
      "},{"location":"admin/config/create-k8s-assets-in-advance/#department-scope","title":"Department scope","text":"
      1. Locate the PVC in the Run:ai namespace (runai)
      2. To authorize Run:ai to use the PVC, label it: run.ai/department: \"id\" The PVC is now displayed for that scope in the list of existing PVCs.
      "},{"location":"admin/config/create-k8s-assets-in-advance/#project-scope","title":"Project scope","text":"
1. Locate the PVC in the project's namespace. The PVC is now displayed for that scope in the list of existing PVCs.
      "},{"location":"admin/config/create-k8s-assets-in-advance/#creating-configmaps-in-advance","title":"Creating ConfigMaps in advance","text":"

      Add ConfigMaps in advance to be used when creating a ConfigMap-type data source via the Run:ai UI.

      "},{"location":"admin/config/create-k8s-assets-in-advance/#cluster-scope_1","title":"Cluster scope","text":"
      1. Locate the ConfigMap in the Run:ai namespace (runai)
2. To authorize Run:ai to use the ConfigMap, label it: run.ai/cluster-wide: \"true\"
      3. The ConfigMap must have a label of run.ai/resource: <resource-name>

        The ConfigMap is now displayed for that scope in the list of existing ConfigMaps.
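
As an illustrative sketch, for a ConfigMap named my-config in the runai namespace (the ConfigMap name is a placeholder, and <resource-name> is the resource name described above), both labels can be applied with:

kubectl label configmap my-config -n runai run.ai/cluster-wide=true run.ai/resource=<resource-name>\n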

      "},{"location":"admin/config/create-k8s-assets-in-advance/#department-scope_1","title":"Department scope","text":"
      1. Locate the ConfigMap in the Run:ai namespace (runai)
      2. To authorize Run:ai to use the ConfigMap, label it: run.ai/department: \"<department-id>\"
      3. The ConfigMap must have a label of run.ai/resource: <resource-name>

        The ConfigMap is now displayed for that scope in the list of existing ConfigMaps.

      "},{"location":"admin/config/create-k8s-assets-in-advance/#project-scope_1","title":"Project scope","text":"
1. Locate the ConfigMap in the project's namespace
      2. The ConfigMap must have a label of run.ai/resource: <resource-name>

        The ConfigMap is now displayed for that scope in the list of existing ConfigMaps.

      "},{"location":"admin/config/default-scheduler/","title":"Setting Run:ai as the default scheduler per Namespace (Project)","text":""},{"location":"admin/config/default-scheduler/#introduction","title":"Introduction","text":"

Kubernetes has a default scheduler that makes decisions on where to place Kubernetes Pods. Run:ai has implemented a different scheduler called the runai-scheduler. By default, Run:ai uses its own scheduler.

      You can decide to use the Run:ai scheduler for other, non-Run:ai, workloads by adding the following to the workload's YAML file:

      schedulerName: runai-scheduler\n
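
For example, a minimal sketch of a non-Run:ai Pod that is directed to the Run:ai scheduler; the Pod name and image are illustrative:

apiVersion: v1\nkind: Pod\nmetadata:\n  name: example-pod               # illustrative name\nspec:\n  schedulerName: runai-scheduler  # directs scheduling to the Run:ai scheduler\n  containers:\n    - name: main\n      image: ubuntu:22.04         # illustrative image\n      command: [\"sleep\", \"infinity\"]\n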
      "},{"location":"admin/config/default-scheduler/#making-runai-the-default-scheduler","title":"Making Run:ai the default scheduler","text":"

      There may be cases where you cannot change the YAML file but still want to use the Run:ai Scheduler to schedule those workloads.

      For such cases, another option is to configure the Run:ai Scheduler as the default scheduler for a specific namespace. This will now make any workload type that is submitted to that namespace (equivalent to a Run:ai Project) use the Run:ai scheduler.

      To configure this, add the following annotation to the namespace itself:

      runai/enforce-scheduler-name: true

      "},{"location":"admin/config/default-scheduler/#example","title":"Example","text":"

      To annotate a project named proj-a, use the following command:

      kubectl annotate ns runai-proj-a runai/enforce-scheduler-name=true\n

      Verify the namespace in YAML format to see the annotation:

      kubectl get ns runai-proj-a -o yaml\n

      Output:

      apiVersion: v1\nkind: Namespace\nmetadata:\n  annotations:\n    runai/enforce-scheduler-name: \"true\"\n  creationTimestamp: \"2024-04-09T08:15:50Z\"\n  labels:\n    kubernetes.io/metadata.name: runai-proj-a\n    runai/namespace-version: v2\n    runai/queue: proj-a\n  name: runai-proj-a\n  resourceVersion: \"388336\"\n  uid: c53af666-7989-43df-9804-42bf8965ce83\nspec:\n  finalizers:\n  - kubernetes\nstatus:\n  phase: Active\n
      "},{"location":"admin/config/dr/","title":"Backup & Restore","text":""},{"location":"admin/config/dr/#runai-cluster-restore","title":"Run:ai Cluster Restore","text":"

      This article explains how to restore a Run:ai cluster on a different Kubernetes environment.

In the event of a critical Kubernetes failure, or if you want to migrate a Run:ai cluster to a new Kubernetes environment, simply reinstall the Run:ai cluster. Once you have reinstalled and reconnected the cluster, projects, workloads and other cluster data are synced automatically.

Restoring or backing up the Run:ai cluster Advanced features and Customized deployment configurations, which are stored locally on the Kubernetes cluster, is optional; these can be backed up and restored separately.

      "},{"location":"admin/config/dr/#backup","title":"Backup","text":"

Since a backup of cluster data is not required, the backup procedure below is optional and relevant mainly for advanced deployments, as explained above.

      "},{"location":"admin/config/dr/#backup-cluster-configurations","title":"Backup cluster configurations","text":"

      To backup Run:ai cluster configurations:

      1. Run the following command in your terminal:
        kubectl get runaiconfig runai -n runai -o yaml -o=jsonpath='{.spec}' > runaiconfig_backup.yaml\n
2. Once the runaiconfig_backup.yaml backup file is created, save the file externally so that it can be retrieved later.
      "},{"location":"admin/config/dr/#restore","title":"Restore","text":"

      Follow the steps below to restore the Run:ai cluster on a new Kubernetes environment.

      "},{"location":"admin/config/dr/#prerequisites","title":"Prerequisites","text":"

      Before restoring the Run:ai cluster, it is essential to validate that it is both disconnected and uninstalled.

      1. If the Kubernetes cluster is still available, uninstall the Run:ai cluster - make sure not to remove the cluster from the Control Plane
      2. Navigate to the Cluster page in the Run:ai platform
      3. Search for the cluster, and make sure its status is Disconnected
      "},{"location":"admin/config/dr/#re-installing-runai-cluster","title":"Re-installing Run:ai Cluster","text":"
      1. Follow the Run:ai cluster installation instructions and ensure all prerequisites are met
      2. If you have a back-up of the cluster configurations, reload it once the installation is complete
        kubectl apply -f runaiconfig_backup.yaml -n runai\n
      3. Navigate to the Cluster page in the Run:ai platform
      4. Search for the cluster, and make sure its status is Connected
      "},{"location":"admin/config/dr/#runai-control-plane","title":"Run:ai Control Plane","text":"

      The self-hosted variant of Run:ai also installs the control-plane at the customer site. As such, it becomes the responsibility of the IT organization to verify that the system is configured for proper backup and learn how to recover the data when needed.

      "},{"location":"admin/config/dr/#database-storage","title":"Database Storage","text":"

      Run:ai uses an internal PostgreSQL database. The database is stored on a Kubernetes Persistent Volume (PV). You must provide a backup solution for the database. Some options:

• Backing up PostgreSQL itself. Example: kubectl -n runai-backend exec -it runai-backend-postgresql-0 -- env PGPASSWORD=password pg_dump -U postgres backend > cluster_name_db_backup.sql (a matching restore sketch follows this list)
      • Backing up the persistent volume holding the database storage.
      • Using third-party backup solutions.
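
As a sketch of a restore matching the pg_dump example above (the pod name, user and password are taken from that example; verify them against your deployment before use):

kubectl -n runai-backend exec -i runai-backend-postgresql-0 -- env PGPASSWORD=password psql -U postgres backend < cluster_name_db_backup.sql\n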

      Run:ai also supports an external PostgreSQL database. For details on how to configure an external database please contact Run:ai customer support.

      "},{"location":"admin/config/dr/#metrics-storage","title":"Metrics Storage","text":"

      Run:ai stores metric history using Thanos. Thanos is configured to store data on a persistent volume. The recommendation is to back up the PV.

      "},{"location":"admin/config/dr/#backing-up-control-plane-configuration","title":"Backing up Control-Plane Configuration","text":"

The installation of the Run:ai control plane can be configured. The configuration is provided as --set flags in the helm installation command. These changes are preserved on upgrade, but not on uninstall or upon damage to Kubernetes. Thus, it is best to back up these customizations. To list the customizations used during the installation, run:

      helm get values runai-backend -n runai-backend
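
To keep these values for a later restore, the output can simply be redirected to a file; the file name below is illustrative:

helm get values runai-backend -n runai-backend > runai-backend-values.yaml\n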

      "},{"location":"admin/config/dr/#recovery","title":"Recovery","text":"

      To recover Run:ai

      • Re-create the Kubernetes/OpenShift cluster.
      • Recover the persistent volumes for metrics and database.
      • Re-install the Run:ai control plane. Use the additional configuration previously saved and connect to the restored PostgreSQL PV. Connect Prometheus to the stored metrics PV.
      • Re-install the cluster. Add additional configuration post-install.
      • If the cluster is configured such that Projects do not create a namespace automatically, you will need to re-create namespaces and apply role bindings as discussed in Kubernetes or OpenShift.
      "},{"location":"admin/config/ha/","title":"High Availability","text":"

      The purpose of this document is to configure Run:ai such that it will continue to provide service even if parts of the system are down.

A frequent failure scenario is a physical node in the system becoming non-responsive due to physical problems or lack of resources. In such a case, Kubernetes will attempt to relocate the running pods, but the process may take time, during which Run:ai will be down.

      A different scenario is a high transaction load, leading to system overload. To address such a scenario, please review the article: scaling the Run:ai system.

      "},{"location":"admin/config/ha/#runai-control-plane","title":"Run:ai Control Plane","text":""},{"location":"admin/config/ha/#runai-system-workers","title":"Run:ai system workers","text":"

      The Run:ai control plane allows the optional gathering of Run:ai pods into specific nodes. When this feature is used, it is important to set more than one node as a Run:ai system worker. Otherwise, the horizontal scaling described below will not span multiple nodes, and the system will remain with a single point of failure.

      "},{"location":"admin/config/ha/#horizontal-scalability-of-runai-services","title":"Horizontal Scalability of Run:ai services","text":"

      Horizontal scalability is about instructing the system to create more pods to dynamically scale according to incoming load and downsize when the load subsides.

      The Run:ai control plane is running on a single Kubernetes namespace named runai-backend. The namespace contains various Kubernetes Deployments and StatefulSets. Each of these services can be scaled horizontally.

      "},{"location":"admin/config/ha/#deployments","title":"Deployments","text":"

Each of the Run:ai deployments can be set to scale up by adding a helm setting on install/upgrade, e.g. --set frontend.autoscaling.enabled=true. For a full list of settings, please contact Run:ai customer support.

      "},{"location":"admin/config/ha/#statefulsets","title":"StatefulSets","text":"

      Run:ai uses three third parties which are managed as Kubernetes StatefulSets:

• Keycloak - Stores the Run:ai authentication configuration as well as user identities. To scale Keycloak, use the Run:ai control-plane helm flag --set keycloakx.autoscaling.enabled=true. By default, Keycloak sets a minimum of 3 pods and will scale to more on transaction load.
• PostgreSQL - It is not possible to configure an internal PostgreSQL to scale horizontally. If this is of importance, please contact Customer Support to understand how to connect Run:ai to an external PostgreSQL service which can be configured for high availability.
• Thanos - To enable Thanos autoscaling, use the following Run:ai control-plane helm flags:
      --set thanos.query.autoscaling.enabled=true  \n--set thanos.query.autoscaling.maxReplicas=2\n--set thanos.query.autoscaling.minReplicas=2 \n
      "},{"location":"admin/config/ha/#runai-cluster","title":"Run:ai Cluster","text":""},{"location":"admin/config/ha/#runai-system-workers_1","title":"Run:ai system workers","text":"

      The Run:ai cluster allows the mandatory gathering of Run:ai pods into specific nodes. When this feature is used, it is important to set more than one node as a Run:ai system worker. Otherwise, the horizontal scaling described below may not span multiple nodes, and the system will remain with a single point of failure.

      "},{"location":"admin/config/ha/#prometheus","title":"Prometheus","text":"

      The default Prometheus installation uses a single pod replica. If the node running the pod is unresponsive, metrics will not be scraped from the cluster and will not be sent to the Run:ai control-plane.

Prometheus supports high availability by running multiple instances. The tradeoff of this approach is that all instances will scrape and send the same data. The Run:ai control plane will identify duplicate metric series and ignore them. This approach therefore increases network, CPU and memory consumption.

      To change the number of Prometheus instances, edit the runaiconfig as described under customizing the Run:ai cluster. Under prometheus.spec, set replicas to 2.
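
As a sketch, and assuming the Prometheus settings sit under spec.prometheus.spec in the runaiconfig, the change could be applied with a merge patch along these lines:

kubectl patch RunaiConfig runai -n runai --type=\"merge\" \\\n    -p '{\"spec\":{\"prometheus\":{\"spec\":{\"replicas\":2}}}}'\n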

      "},{"location":"admin/config/large-clusters/","title":"Scaling the Run:ai system","text":"

      The purpose of this document is to provide information on how to scale the Run:ai cluster and the Run:ai control-plane to withstand large transaction loads

      "},{"location":"admin/config/large-clusters/#scaling-the-runai-control-plane","title":"Scaling the Run:ai Control Plane","text":"

      The Control plane deployments which may encounter load are:

• Backend (Kubernetes Deployment: runai-backend-backend): Main control-plane service
• Frontend (Kubernetes Deployment: runai-backend-frontend): Serving of the Run:ai console
• Grafana (Kubernetes Deployment: runai-backend-grafana): Serving of the Run:ai metrics inside the Run:ai console

To increase the number of replicas, use the following Run:ai control-plane helm flags:

      --set backend.autoscaling.enabled=true \n--set frontend.autoscaling.enabled=true\n--set grafana.autoscaling.enabled=true --set grafana.autoscaling.minReplicas=2\n

      Important

      If you have chosen to mark some of the nodes as Run:ai System Workers, the new replicas will attempt to use these nodes first. Thus, for high availability purposes, you will want to mark more than one node as a Run:ai System Worker.

      "},{"location":"admin/config/large-clusters/#thanos","title":"Thanos","text":"

Thanos is the third-party component used by Run:ai to store metrics. Under a significant user load, you may also need to increase resources for the Thanos query function. Use the following Run:ai control-plane helm flags:

      --set thanos.query.resources.limits.memory=3G\n--set thanos.query.resources.requests.memory=3G\n--set thanos.query.resources.limits.cpu=1\n--set thanos.query.resources.requests.cpu=1\n\n--set thanos.receive.resources.limits.memory=6G \n--set thanos.receive.resources.requests.memory=6G\n--set thanos.receive.resources.limits.cpu=1 \n--set thanos.receive.resources.requests.cpu=1\n
      "},{"location":"admin/config/large-clusters/#scaling-the-runai-cluster","title":"Scaling the Run:ai Cluster","text":""},{"location":"admin/config/large-clusters/#cpu-memory-resources","title":"CPU & Memory Resources","text":"

Under Kubernetes, each of the Run:ai containers has default resource requirements that reflect an average customer load. With significantly larger cluster loads, certain Run:ai services will require more CPU and memory resources. Run:ai supports configuring these resources, and doing so for each Run:ai service group separately.

      "},{"location":"admin/config/large-clusters/#service-groups","title":"Service Groups","text":"

      Run:ai supports setting requests and limits configurations for CPU and memory for Run:ai containers. The configuration is set per service group. Each service group reflects a certain load type:

• SchedulingServices: Containers associated with the Run:ai scheduler. Run:ai containers: Scheduler, StatusUpdater, MetricsExporter, PodGrouper, PodGroupAssigner, Binder
• SyncServices: Containers associated with syncing updates between the Run:ai cluster and the Run:ai control plane. Run:ai containers: Agent, ClusterSync, AssetsSync
• WorkloadServices: Containers associated with submitting Run:ai Workloads. Run:ai containers: WorkloadController, JobController

"},{"location":"admin/config/large-clusters/#configuration-steps","title":"Configuration Steps","text":"

To configure resource requirements for a group of services, update the RunaiConfig. Set the spec.global.<service-group>.resources section. The following example shows the configuration of scheduling services resource requirements:

      apiVersion: run.ai/v1\nkind: RunaiConfig\nmetadata:\nspec:\n global:\n   schedulingServices:\n     resources:\n       limits:\n         cpu: 1000m\n         memory: 1Gi\n       requests:\n         cpu: 100m\n         memory: 512Mi\n

      Use syncServices and workloadServices for the other two service groups.

      "},{"location":"admin/config/large-clusters/#recommended-resource-specifications-for-large-clusters","title":"Recommended Resource Specifications For Large Clusters","text":"

      In large clusters (100 nodes or 1500 GPUs or more), we recommend the following configuration for SchedulingServices and SyncServices groups:

      resources:\n requests:\n   cpu: 1\n   memory: 1Gi\n limits:\n   cpu: 2\n   memory: 2Gi\n
      "},{"location":"admin/config/large-clusters/#sending-metrics","title":"Sending Metrics","text":"

      Run:ai uses Prometheus to scrape metrics from the Run:ai cluster and to send them to the Run:ai control plane. The number of metrics is a function of the number of Nodes, Jobs and Projects which the system contains. When reaching hundreds of Nodes and Projects, the system will be sending large quantities of metrics which, in turn, will create a strain on the network as well as the receiving side in the control plane (SaaS or self-hosted).

To reduce this strain, we suggest configuring Prometheus to send metrics in larger batches and to reduce the number of network connections:

      • Edit the runaiconfig as described under customizing the cluster.
      • Under prometheus.remoteWrite add the following:
      queueConfig:\n  capacity: 5000\n  maxSamplesPerSend: 1000\n  maxShards: 100\n

      This article provides additional details and insight.

Also, note that this configuration enlarges the Prometheus queues and thus increases the required memory. It is therefore suggested to reduce the metrics retention period as described here.

      "},{"location":"admin/config/limit-to-node-group/","title":"Group Nodes","text":""},{"location":"admin/config/limit-to-node-group/#why","title":"Why?","text":"

In some business scenarios, you may want to direct the Run:ai scheduler to schedule a Workload to a specific node or node group. For example, in some academic institutions, hardware is bought using a specific grant and thus \"belongs\" to a specific research group. Another example is an inference workload that is optimized for a specific GPU type and must have dedicated resources reserved to ensure enough capacity.

      Run:ai provides two methods to designate, and group, specific resources:

      • Node Pools: Run:ai allows administrators to group specific nodes into a node pool. A node pool is a group of nodes identified by a given name (node pool name) and grouped by any label (key and value combination). The label can be chosen by the administrator or can be an existing, pre-set, label (such as an NVIDIA GPU type label).
• Node Affinity: Run:ai allows restricting scheduling to specific nodes by labeling a node, or a set of nodes, and then using the --node-type <label> flag at submission time to force this allocation.

      Important

One can set and use both a node pool and node affinity combined as a prerequisite to the scheduler. For example, if a researcher wants to use a T4 node with an InfiniBand card, he or she can use a node pool of T4 nodes and, from that group, choose only the nodes with an InfiniBand card (node-type = infiniband).

      There is a tradeoff in place when allowing Researchers to designate specific nodes. Overuse of this feature limits the scheduler in finding an optimal resource and thus reduces overall cluster utilization.

      "},{"location":"admin/config/limit-to-node-group/#configuring-node-groups","title":"Configuring Node Groups","text":"

      To configure a node pool:

      • Find the label key & value you want to use for Run:ai to create the node pool.
      • Check that the nodes you want to group as a pool have a unique label to use, otherwise you should mark those nodes with your own uniquely identifiable label.
      • Get the names of the nodes you want Run:ai to group together. To get a list of nodes, run:
kubectl get nodes\nkubectl get nodes --show-labels\n
      • If you chose to set your own label, run the following:
      kubectl label node <node-name> <label-key>=<label-value>\n

The same value can be set to a single node or multiple nodes. A node pool can only use one label (key & value) at a time.

      • To create a node pool use the create node pool Run:ai API.

      To configure a node affinity:

      • Get the names of the nodes where you want to limit Run:ai. To get a list of nodes, run:
      kubectl get nodes\n
      • For each node run the following:
      kubectl label node <node-name> run.ai/type=<label>\n

      The same value can be set to a single node, or for multiple nodes. A node can only be set with a single value.

      "},{"location":"admin/config/limit-to-node-group/#using-node-groups-via-the-cli","title":"Using Node Groups via the CLI","text":"

To use a Run:ai node pool with a workload, use the Run:ai CLI --node-pools flag:

      runai submit job1 ... --node-pools \"my-pool\" ...\n

      To use multiple node pools with a workload, use the Run:ai CLI command:

      runai submit job1 ... --node-pools \"my-pool my-pool2 my-pool3\" ...\n

      With multiple node pools, the researcher creates a list of prioritized node pools and lets the scheduler try and choose from any of the node pools in the list, according to the given priority.

      To use node affinity, use the node type label with the --node-type flag:

      runai submit job1 ... --node-type \"my-nodes\"\n

      A researcher may combine the two flags to select both a node pool and a specific set of nodes out of that node pool (e.g. gpu-type=t4 and node-type=infiniband):

runai submit job1 ... --node-pools \"my-pool\" --node-type \"my-nodes\"\n

      Note

When submitting a workload, if you choose a node pool label and a node affinity (node type) label that do not intersect, the Run:ai scheduler will not be able to schedule that workload, as the combination represents an empty group of nodes.

      See the runai submit documentation for further information.

      "},{"location":"admin/config/limit-to-node-group/#assigning-node-groups-to-a-project","title":"Assigning Node Groups to a Project","text":"

Node Pools are automatically assigned to all Projects and Departments with zero resource allocation by default. Allocating resources to a node pool can be done for each Project and Department. Submitting a workload to a node pool that has zero allocation for a specific project (or department) results in that workload running as an over-quota workload.

      To assign and configure specific node affinity groups or node pools to a Project see working with Projects.

      When the command-line interface flag is used in conjunction with Project-based affinity, the flag is used to refine the list of allowable node groups set in the Project.

      "},{"location":"admin/config/node-affinity-with-cloud-node-pools/","title":"Node affinity with cloud node pools","text":"

Run:ai allows for node affinity. Node affinity is the ability to assign a Project to run on specific nodes. To use the node affinity feature, you will need to label the target nodes with the label run.ai/type. Most cloud clusters allow configuring node labels for the node pools in the cluster. This guide shows how to apply this configuration to different cloud providers.

      To make the node affinity work with node pools on various cloud providers, we need to make sure the node pools are configured with the appropriate Kubernetes label (run.ai/type=<TYPE_VALUE>).

      "},{"location":"admin/config/node-affinity-with-cloud-node-pools/#setting-node-labels-while-creating-a-new-cluster","title":"Setting node labels while creating a new cluster","text":"

      You can configure node-pool labels at cluster creation time

GKE:
• At the first creation screen, you will see a menu on the left side named node-pools.
• Expand the node pool you want to label.
• Click on Metadata.
• Near the bottom, you will find the Kubernetes label section. Add the key run.ai/type and the value <TYPE_VALUE>.

AKS:
• When creating an AKS cluster, at the node-pools page click on create new node-pool.
• Go to the labels section and add the key run.ai/type and the value <TYPE_VALUE>.

EKS:
• Create a regular EKS cluster.
• Click on compute.
• Click on Add node group.
• In the Kubernetes labels section click on Add label. Add the key run.ai/type and the value <TYPE_VALUE>.
      "},{"location":"admin/config/node-affinity-with-cloud-node-pools/#setting-node-labels-for-a-new-node-pool","title":"Setting node labels for a new node pool","text":"GKEAKSEKS
      • At the node pool creation screen, go to the metadata section.
      • Near the bottom, you will find the Kubernetes label section. Add the key run.ai/type and the value <TYPE_VALUE>.
      • Go to your AKS page at Azure.
      • On the left menu click the node-pools button.
      • Click on Add Node Pool.
      • In the new Node Pool page go to Optional settings.
      • In the Kubernetes labels section click on Add label. Add the key run.ai/type and the value <TYPE_VALUE>.
      • Go to Add node group screen.
      • In the Kubernetes labels section click on Add label. Add the key run.ai/type and the value <TYPE_VALUE>.
      "},{"location":"admin/config/node-affinity-with-cloud-node-pools/#editing-node-labels-for-an-existing-node-pool","title":"Editing node labels for an existing node pool","text":"GKEAKSEKS
      • Go to the Google Kubernetes Engine page in the Google Cloud console.
      • Go to Google Kubernetes Engine.
      • In the cluster list, click the name of the cluster you want to modify.
      • Click the Nodes tab
      • Under Node Pools, click the name of the node pool you want to modify, then click Edit.
      • Near the bottom, you will find the Kubernetes label section. Add the key run.ai/type and the value <TYPE_VALUE>.

      To update an existing node pool label you must use the azure cli. Run the following command:

      az aks nodepool update \\\n    --resource-group [RESOURCE GROUP] \\\n    --cluster-name [CLUSTER NAME] \\\n    --name labelnp \\\n    --labels run.ai/type=[TYPE_VALUE] \\\n    --no-wait\n
      • Go to the node group page and click on Edit.
      • In the Kubernetes labels section click on Add label. Add the key run.ai/type and the value <TYPE_VALUE>.
      "},{"location":"admin/config/node-roles/","title":"Node roles","text":"

      This article explains how to designate specific node roles in a Kubernetes cluster to ensure optimal performance and reliability in production deployments.

      For optimal performance in production clusters, it is essential to avoid extensive CPU usage on GPU nodes where possible. This can be done by ensuring the following:

      • Run:ai system-level services run on dedicated CPU-only nodes.
      • Workloads that do not request GPU resources (e.g. Machine Learning jobs) are executed on CPU-only nodes.
      "},{"location":"admin/config/node-roles/#prerequisites","title":"Prerequisites","text":"

      To perform these tasks, make sure to install the Run:ai Administrator CLI.

      "},{"location":"admin/config/node-roles/#configure-node-roles","title":"Configure Node Roles","text":"

      The following node roles can be configured on the cluster:

      • System node: Reserved for Run:ai system-level services.
      • GPU Worker node: Dedicated for GPU-based workloads.
      • CPU Worker node: Used for CPU-only workloads.
      "},{"location":"admin/config/node-roles/#system-nodes","title":"System nodes","text":"

Run:ai system nodes run the system-level services required for Run:ai to operate. Setting this role can be done via the Run:ai Administrator CLI.

      Recommendation

      To ensure high availability and prevent a single point of failure, it is recommended to configure at least three system nodes in your cluster.

      To set a system role for a node in your Kubernetes cluster, follow these steps:

      1. Run the kubectl get nodes command to list all the nodes in your cluster and identify the name of the node you want to modify.
2. Run one of the following commands to set or remove a node's role:
        runai-adm set node-role --runai-system-worker <node-name>\nrunai-adm remove node-role --runai-system-worker <node-name>\n

      The runai-adm CLI will label the node and set relevant cluster configurations.

The Run:ai cluster applies Kubernetes Node Affinity, using node labels, to manage the scheduling of cluster (system) services.

      Warning

      Do not assign a system node role to the Kubernetes master node. This may disrupt Kubernetes functionality, particularly if the Kubernetes API Server is configured to use port 443 instead of the default 6443.

      "},{"location":"admin/config/node-roles/#worker-nodes","title":"Worker nodes","text":"

Run:ai worker nodes run user-submitted workloads and the system-level DaemonSets required to operate. This can be managed via the Run:ai Administrator CLI or kubectl.

      "},{"location":"admin/config/node-roles/#runai-administrator-cli","title":"Run:ai Administrator CLI","text":"

To set a worker role for a node in your Kubernetes cluster via the Run:ai Administrator CLI, follow these steps:

      1. Use the kubectl get nodes command to list all the nodes in your cluster and identify the name of the node you want to modify.
2. Run one of the following commands to set or remove a node's role:
         runai-adm set node-role [--gpu-worker | --cpu-worker] <node-name>\n runai-adm remove node-role [--gpu-worker | --cpu-worker] <node-name>\n

      The runai-adm CLI will label the node and set relevant cluster configurations.

      Tip

      Use the --all flag to set or remove a role to all nodes.

      "},{"location":"admin/config/node-roles/#kubectl","title":"Kubectl","text":"

      To set a worker role for a node in your Kubernetes cluster using Kubectl, follow these steps:

1. Validate that global.nodeAffinity.restrictScheduling is set to true in the cluster's Configurations (a check command sketch follows this list).
      2. Use the kubectl get nodes command to list all the nodes in your cluster and identify the name of the node you want to modify.
      3. Run one of the following commands to label the node with its role:
        kubectl label nodes <node-name> [node-role.kubernetes.io/runai-gpu-worker=true | node-role.kubernetes.io/runai-cpu-worker=true]\nkubectl label nodes <node-name> [node-role.kubernetes.io/runai-gpu-worker=false | node-role.kubernetes.io/runai-cpu-worker=false]\n
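
As a quick check for step 1, and assuming the flag sits under spec.global.nodeAffinity in the runaiconfig, the current value can be read with:

kubectl get runaiconfig runai -n runai -o jsonpath='{.spec.global.nodeAffinity.restrictScheduling}'\n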
      "},{"location":"admin/config/notifications/","title":"Notifications System","text":""},{"location":"admin/config/notifications/#email-notifications-for-data-scientists","title":"Email Notifications for Data Scientists","text":"

      Managing numerous data science workloads requires monitoring various stages, including submission, scheduling, initialization, execution, and completion. Additionally, handling suspensions and failures is crucial for ensuring timely workload completion. Email Notifications address this need by sending alerts for critical workload life cycle changes. This empowers data scientists to take necessary actions and prevent delays.

      "},{"location":"admin/config/notifications/#setting-up-email-notifications","title":"Setting Up Email Notifications","text":"

      Important

The system administrator needs to enable and set up email notifications so that users are kept informed about different system statuses.

      To enable email notifications for the system:

      1. Press General settings, then select Notifications.

        Note

        For SaaS deployments, use the Enable email notifications toggle.

      2. In the SMTP Host field, enter the SMTP server address and in the SMTP port field the port number.

      3. Select an Authentication type Plain or Login. Enter a username and password to be used for authentication.
      4. Enter the From email address and the Display name.
      5. Press Verify to ensure that the email configuration is working.
      6. Press Save when complete.
      "},{"location":"admin/config/notifications/#system-notifications","title":"System Notifications","text":"

Administrators can set system-wide notifications for all users in order to alert them of important information. System notifications allow administrators to update users with events that may be occurring within the Run:ai platform. The system notification appears at each login, or after the message has changed for users who are already logged in.

      To configure system notifications:

      1. Press General settings, then select Notifications.
      2. In the System notification pane, press +MESSAGE.
      3. Enter your message in the text box. Use the formatting tool bar to add special formats to your message text.
      4. Enable the \"Don't show this again\" option to allow users to opt out of seeing the message multiple times.
      5. When complete, press Save & Publish.
      "},{"location":"admin/config/org-cert/","title":"Working with a Local Certificate Authority","text":"

      Run:ai can be installed in an isolated network. In this air-gapped configuration, the organization will not be using an established root certificate authority. Instead, the organization creates a local certificate which serves as the root certificate for the organization. The certificate is installed in all browsers within the organization.

      In the context of Run:ai, the cluster and control-plane need to be aware of this certificate for consumers to be able to connect to the system.

      "},{"location":"admin/config/org-cert/#preparation","title":"Preparation","text":"

      You will need to have the public key of the local certificate authority.

      "},{"location":"admin/config/org-cert/#control-plane-installation","title":"Control-Plane Installation","text":"
      • Create the runai-backend namespace if it does not exist.
      • Add the public key to the runai-backend namespace:

        kubectl -n runai-backend create secret generic runai-ca-cert \\ \n    --from-file=runai-ca.pem=<ca_bundle_path>\n

      • As part of the installation instructions, you need to create a secret for runai-backend-tls. Use the local certificate authority instead.

      • Install the control plane, add the following flag to the helm command --set global.customCA.enabled=true
      "},{"location":"admin/config/org-cert/#cluster-installation","title":"Cluster Installation","text":"
      • Create the runai namespace if it does not exist.
      • Add the public key to the runai namespace:
        kubectl -n runai create secret generic runai-ca-cert \\\n    --from-file=runai-ca.pem=<ca_bundle_path>\n
• If you are using OpenShift, add the public key to the openshift-monitoring namespace:
        kubectl -n openshift-monitoring create secret generic runai-ca-cert \\\n    --from-file=runai-ca.pem=<ca_bundle_path>\n
      • Install the Run:ai operator, add the following flag to the helm command --set global.customCA.enabled=true
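
      As an illustration only, the flag is appended to the Helm command from your installation instructions. A minimal sketch, where the release, repository, chart and values file names are placeholders to be taken from those instructions:

      # <release-name>, <runai-repo>/<runai-cluster-chart> and <values-file> are placeholders from your installation instructions\nhelm upgrade -i <release-name> <runai-repo>/<runai-cluster-chart> \\\n    -n runai -f <values-file>.yaml \\\n    --set global.customCA.enabled=true\n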
      "},{"location":"admin/config/overview/","title":"Run:ai Configuration Articles","text":"

      This section provides a list of installation-related articles dealing with a wide range of subjects:

      Article Purpose Designating Specific Role Nodes Set one or more designated Run:ai system nodes or limit Run:ai monitoring and scheduling to specific nodes in the cluster. Create and Troubleshoot Clusters Create new clusters, view properties and status, and troubleshoot cluster connectivity related issues. Set Default Scheduler Set the default scheduler for a specific namespace Review Kubernetes Access provided to Run:ai In Restrictive Kubernetes environments such as when using OpenShift, understand and control what Kubernetes roles are provided to Run:ai External access to Containers Understand the available options for Researchers to access containers from the outside Install the Run:ai Administrator Command-line Interface The Administrator command-line is useful in a variety of flows such as cluster upgrade, node setup etc. Set Node affinity with cloud node pools Set node affinity when using a cloud provider for your cluster Local Certificate Authority For self-hosted Run:ai environments, specifically air-gapped installation, setup a local certificate authority to allow customers to safely connect to Run:ai Backup & Restore For self-hosted Run:ai environments, set up a scheduled backup of Run:ai data High Availability Configure Run:ai such that it will continue to provide service even if parts of the system are down. Scaling Scale the Run:ai cluster and the Run:ai control-plane to withstand large transaction loads Emails and system notification Configure e-mail notification"},{"location":"admin/config/secure-cluster/","title":"Secure your cluster","text":"

      This article details the security considerations for deploying Run:ai. It is intended to help administrators and security officers understand the specific permissions required by Run:ai.

      "},{"location":"admin/config/secure-cluster/#access-to-the-kubernetes-cluster","title":"Access to the Kubernetes cluster","text":"

      Run:ai integrates with Kubernetes clusters and requires specific permissions to operate successfully. These permissions are controlled with configuration flags that dictate how Run:ai interacts with cluster resources. Prior to installation, security teams can review the permissions and ensure they align with their organization\u2019s policies.

      "},{"location":"admin/config/secure-cluster/#permissions-and-their-related-use-case","title":"Permissions and their related use-case","text":"

      Run:ai provides various security-related permissions that can be customized to fit specific organizational needs. Below are brief descriptions of the key use cases for these customizations:

      Permission Use case Automatic Namespace creation Controls whether Run:ai automatically creates Kubernetes namespaces when new projects are created. Useful in environments where namespace creation must be strictly managed. Automatic user assignment Decides if users are automatically assigned to projects within Run:ai. Helps manage user access more tightly in certain compliance-driven environments. Secret propagation Determines whether Run:ai should propagate secrets across the cluster. Relevant for organizations with specific security protocols for managing sensitive data. Disabling Kubernetes limit range Chooses whether to disable the Kubernetes Limit Range feature. May be adjusted in environments with specific resource management needs.

      Note

      These security customizations allow organizations to tailor Run:ai to their specific needs. All changes should be modified cautiously and only when necessary to meet particular security, compliance or operational requirements.

      "},{"location":"admin/config/secure-cluster/#secure-installation","title":"Secure installation","text":"

      Many organizations enforce IT compliance rules for Kubernetes, with strict access control for installing and running workloads. OpenShift uses Security Context Constraints (SCC) for this purpose. Run:ai fully supports SCC, ensuring integration with OpenShift's security requirements.

      "},{"location":"admin/config/secure-cluster/#security-vulnerabilities","title":"Security vulnerabilities","text":"

      The platform is actively monitored for security vulnerabilities, with regular scans conducted to identify and address potential issues. Necessary fixes are applied to ensure that the software remains secure and resilient against emerging threats, providing a safe and reliable experience.

      "},{"location":"admin/config/shared-storage/","title":"Shared Storage","text":"

      Shared storage is a critical component in AI and machine learning workflows, particularly in scenarios involving distributed training and shared datasets. In AI and ML environments, data must be readily accessible across multiple nodes, especially when training large models or working with vast datasets. Shared storage enables seamless access to data, ensuring that all nodes in a distributed training setup can read and write to the same datasets simultaneously. This setup not only enhances efficiency but is also crucial for maintaining consistency and speed in high-performance computing environments.

      While Run:ai Platform supports a variety of remote data sources, such as Git and S3, it is often more efficient to keep data close to the compute resources. This proximity is typically achieved through the use of shared storage, accessible to multiple nodes in your Kubernetes cluster.

      "},{"location":"admin/config/shared-storage/#shared-storage","title":"Shared storage","text":"

      When implementing shared storage in Kubernetes, there are two primary approaches:

      • Utilizing the Kubernetes Storage Classes of your storage provider; or
      • Using a direct NFS (Network File System) mount

      Storage Classes are the recommended option.

      Run:ai Data Sources support both direct NFS mount and Kubernetes Storage Classes.

      "},{"location":"admin/config/shared-storage/#kubernetes-storage-classes","title":"Kubernetes storage classes","text":"

      Storage classes in Kubernetes define how storage is provisioned and managed. This allows you to select storage types optimized for AI workloads. For example, you can choose storage with high IOPS (Input/Output Operations Per Second) for rapid data access during intensive training sessions, or tiered storage options to balance cost and performance based on your organization\u2019s requirements. This approach supports dynamic provisioning, enabling storage to be allocated on-demand as required by your applications.

      Run:ai data sources such as Persistent Volume Claims (PVC) and Data Volumes leverage storage class to manage and allocate storage efficiently. This ensures that the most suitable storage option is always accessible, contributing to the efficiency and performance of AI workloads.
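
      For illustration, a minimal PVC manifest that requests shared storage from a storage class; the name, namespace, storage class and size below are placeholders:

      apiVersion: v1\nkind: PersistentVolumeClaim\nmetadata:\n  name: shared-data-pvc            # placeholder name\n  namespace: <namespace>           # placeholder namespace\nspec:\n  accessModes:\n  - ReadWriteMany                  # shared access across nodes\n  storageClassName: <storage-class-name>\n  resources:\n    requests:\n      storage: 100Gi               # adjust to your data size\n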

      Note

      Run:ai lists all available storage classes in the Kubernetes cluster, making it easy for users to select the appropriate storage. Additionally, policies can be set to restrict or enforce the use of specific storage classes, to help maintain compliance with organizational standards and optimize resource utilization.

      Kubernetes 1.23 (old)

      When using Kubernetes 1.23, a Data Source of PVC type does not work with a Storage Class whose volumeBindingMode property is set to WaitForFirstConsumer

      "},{"location":"admin/config/shared-storage/#direct-nfs-mount","title":"Direct NFS mount","text":"

      Direct NFS allows you to mount a shared file system directly across multiple nodes in your Kubernetes cluster. This method provides a straightforward way to share data among nodes and is often used for simple setups or when a dedicated NFS server is available.

      However, using NFS can present challenges related to security and control. Direct NFS setups might lack the fine-grained control and security features available with storage classes.
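
      As a hedged sketch, a statically provisioned NFS PersistentVolume and a claim that binds to it; the server address, export path, namespace and size are placeholders:

      apiVersion: v1\nkind: PersistentVolume\nmetadata:\n  name: nfs-datasets-pv\nspec:\n  capacity:\n    storage: 100Gi\n  accessModes:\n  - ReadWriteMany\n  nfs:\n    server: <nfs-server-address>   # placeholder NFS server\n    path: /exports/datasets        # placeholder exported path\n---\napiVersion: v1\nkind: PersistentVolumeClaim\nmetadata:\n  name: nfs-datasets-pvc\n  namespace: <namespace>\nspec:\n  accessModes:\n  - ReadWriteMany\n  storageClassName: \"\"             # empty string binds to the statically provisioned PV\n  resources:\n    requests:\n      storage: 100Gi\n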

      "},{"location":"admin/config/workload-ownership-protection/","title":"Workload Deletion Protection","text":""},{"location":"admin/config/workload-ownership-protection/#workload-deletion-protection","title":"Workload Deletion Protection","text":"

      Workload deletion protection in Run:ai ensures that only the user who created a workload can delete or modify it. This feature is designed to safeguard important jobs and configurations from accidental or unauthorized modifications by users who did not originally create the workload.

      By enforcing ownership rules, Run:ai helps maintain the integrity and security of your machine learning operations. This additional layer of security ensures that only users with the appropriate permissions can delete and suspend workloads.

      The protection feature is implemented at the cluster level.

      To enable deletion protection run the following command:

      kubectl patch -n runai runaiconfigs.run.ai/runai --type='merge' --patch '{\"spec\":{\"global\":{\"enableWorkloadOwnershipProtection\":true}}}'\n
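
      To confirm that the flag was set, the value can be read back, for example:

      kubectl get runaiconfigs.run.ai/runai -n runai \\\n    -o jsonpath='{.spec.global.enableWorkloadOwnershipProtection}'\n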
      "},{"location":"admin/maintenance/alert-monitoring/","title":"System Monitoring","text":"

      This article explains how to configure Run:ai to generate health alerts and to connect these alerts to alert-management systems within your organization. Alerts are generated for Run:ai clusters.

      "},{"location":"admin/maintenance/alert-monitoring/#alert-infrastructure","title":"Alert infrastructure","text":"

      Run:ai uses Prometheus for externalizing metrics and providing visibility to end-users. The Run:ai Cluster installation includes Prometheus or can connect to an existing Prometheus instance used in your organization. The alerts are based on the Prometheus AlertManager. Once installed, it is enabled by default.

      This document explains how to:

      • Configure alert destinations - triggered alerts send data to specified destinations
      • Understand the out-of-the-box cluster alerts, provided by Run:ai
      • Add additional custom alerts
      "},{"location":"admin/maintenance/alert-monitoring/#prerequisites","title":"Prerequisites","text":"
      • A Kubernetes cluster with the necessary permissions
      • Up and running Run:ai environment, including Prometheus Operator
      • kubectl command-line tool installed and configured to interact with the cluster
      "},{"location":"admin/maintenance/alert-monitoring/#set-up","title":"Set-up","text":"

      Use the steps below to set up monitoring alerts.

      "},{"location":"admin/maintenance/alert-monitoring/#validating-prometheus-operator-installed","title":"Validating Prometheus operator installed","text":"
      1. Verify that the Prometheus Operator Deployment is running. Copy the following command and paste it in your terminal, where you have access to the Kubernetes cluster: kubectl get deployment kube-prometheus-stack-operator -n monitoring The output indicates the deployment's status, including the number of replicas and their current state.
      2. Verify that Prometheus instances are running. Copy the following command and paste it in your terminal: kubectl get prometheus -n runai The Prometheus instance(s) are listed along with their status.
      "},{"location":"admin/maintenance/alert-monitoring/#enabling-prometheus-alertmanager","title":"Enabling Prometheus AlertManager","text":"

      In each of the steps in this section, copy the content of the code snippet to a new YAML file (e.g., step1.yaml).

      • Create the AlertManager CustomResource to enable AlertManager. Copy the following snippet to a YAML file and apply it to the cluster with kubectl apply -f step1.yaml:

      apiVersion: monitoring.coreos.com/v1  \nkind: Alertmanager  \nmetadata:  \n   name: runai  \n   namespace: runai  \nspec:  \n   replicas: 1  \n   alertmanagerConfigSelector:  \n      matchLabels:\n         alertmanagerConfig: runai \n
      • Copy the following command to your terminal to validate that the AlertManager instance has started: kubectl get alertmanager -n runai
      • Copy the following command to your terminal to validate that the Prometheus operator has created a Service for AlertManager: kubectl get svc alertmanager-operated -n runai
      "},{"location":"admin/maintenance/alert-monitoring/#configuring-prometheus-to-send-alerts","title":"Configuring Prometheus to send alerts","text":"
      1. Open the terminal on your local machine or another machine that has access to your Kubernetes cluster
      2. Copy and paste the following command in your terminal to edit the Prometheus configuration for the runai Namespace:

        kubectl edit prometheus runai -n runai\n
        This command opens the Prometheus configuration file in your default text editor (usually vi or nano).

      3. Copy and paste the following text into the configuration file:

        alerting:  \n   alertmanagers:  \n      - namespace: runai  \n        name: alertmanager-operated  \n        port: web\n

      4. Save the changes and exit the text editor.

      Note

      To save changes using vi, type :wq and press Enter. The changes are applied to the Prometheus configuration in the cluster.

      "},{"location":"admin/maintenance/alert-monitoring/#alert-destinations","title":"Alert destinations","text":"

      Set out below are the various alert destinations.

      "},{"location":"admin/maintenance/alert-monitoring/#configuring-alertmanager-for-custom-email-alerts","title":"Configuring AlertManager for custom email alerts","text":"

      In each step, copy the contents of the code snippets to a new file and apply it to the cluster using kubectl apply -f.

      Add your SMTP password as a secret:

      apiVersion: v1  \nkind: Secret  \nmetadata:  \n   name: alertmanager-smtp-password  \n   namespace: runai  \nstringData:\n   password: \"your_smtp_password\"\n

      Replace the relevant SMTP details with your own, then apply the AlertmanagerConfig using kubectl apply.

      apiVersion: monitoring.coreos.com/v1alpha1  \nkind: AlertmanagerConfig  \nmetadata:  \n  name: runai  \n  namespace: runai  \n  labels:  \n    alertmanagerConfig: runai  \nspec:  \n  route:  \n    continue: true  \n    groupBy:  \n    - alertname  \n    groupWait: 30s  \n    groupInterval: 5m  \n    repeatInterval: 1h  \n    matchers:  \n    - matchType: =~  \n      name: alertname  \n      value: Runai.*  \n    receiver: email  \n  receivers:  \n  - name: 'email'  \n    emailConfigs:  \n    - to: '<destination_email_address>'  \n      from: '<from_email_address>'  \n      smarthost: 'smtp.gmail.com:587'  \n      authUsername: '<smtp_server_user_name>'  \n      authPassword:  \n        name: alertmanager-smtp-password  \n        key: password\n

      Once applied, the configuration is automatically reloaded.
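
      To check that the AlertmanagerConfig object exists and carries the expected label, a quick verification:

      kubectl get alertmanagerconfig runai -n runai --show-labels\n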

      "},{"location":"admin/maintenance/alert-monitoring/#third-party-alert-destinations","title":"Third-party alert destinations","text":"

      Prometheus AlertManager provides a structured way to connect to alert-management systems. There are built-in plugins for popular systems such as PagerDuty and OpsGenie, including a generic Webhook.

      "},{"location":"admin/maintenance/alert-monitoring/#example-integrating-runai-with-a-webhook","title":"Example: Integrating Run:ai with a Webhook","text":"
      1. Use webhook.site to get a unique URL.
      2. Use the upgrade cluster instructions to modify the values file: Edit the values file to add the following, and replace <WEB-HOOK-URL> with the URL from webhook.site.

      kube-prometheus-stack:  \n  ...  \n  alertmanager:  \n    enabled: true  \n    config:  \n      global:  \n        resolve_timeout: 5m  \n      receivers:  \n      - name: \"null\"  \n      - name: webhook-notifications  \n        webhook_configs:  \n          - url: <WEB-HOOK-URL>  \n            send_resolved: true  \n      route:  \n        group_by:  \n        - alertname  \n        group_interval: 5m  \n        group_wait: 30s  \n        receiver: 'null'  \n        repeat_interval: 10m  \n        routes:  \n        - receiver: webhook-notifications\n
      3. Verify that you are receiving alerts on webhook.site, in the left pane.

      "},{"location":"admin/maintenance/alert-monitoring/#built-in-alerts","title":"Built-in alerts","text":"

      A Run:ai cluster comes with several built-in alerts. Each alert notifies about a specific function of a Run:ai entity. There is also a single, inclusive alert: Run:ai Critical Problems, which aggregates all component-based alerts into a single cluster health test.

      Runai agent cluster info push rate low

      Meaning The cluster-sync Pod in the runai namespace might not be functioning properly Impact Possible impact - no info/partial info from the cluster is being synced back to the control-plane Severity Critical Diagnosis kubectl get pod -n runai to see if the cluster-sync pod is running Troubleshooting/Mitigation To diagnose issues with the cluster-sync pod, follow these steps: Paste the following command to your terminal, to receive detailed information about the cluster-sync deployment: kubectl describe deployment cluster-sync -n runai Check the Logs: Use the following command to view the logs of the cluster-sync deployment: kubectl logs deployment/cluster-sync -n runai Analyze the Logs and Pod Details: From the information provided by the logs and the deployment details, attempt to identify the reason why the cluster-sync pod is not functioning correctly Check Connectivity: Ensure there is a stable network connection between the cluster and the Run:ai Control Plane. A connectivity issue may be the root cause of the problem. Contact Support: If the network connection is stable and you are still unable to resolve the issue, contact Run:ai support for further assistance

      Runai cluster sync handling rate low

      Meaning The cluster-sync Pod in the runai namespace might be functioning slowly Impact Possible impact - info from the cluster is being synced back to the control-plane at a slow rate Severity Warning Diagnosis kubectl logs deployment/cluster-sync -n runai to see if the cluster-sync pod is running properly Troubleshooting/Mitigation To diagnose issues with the cluster-sync pod, follow these steps: Check the Logs: Use the following command to view the logs of the cluster-sync deployment: kubectl logs deployment/cluster-sync -n runai Analyze the Logs and Pod Details: From the information provided by the logs and the deployment details, attempt to identify the reason why the cluster-sync pod is not functioning correctly Check Connectivity: Ensure there is a stable network connection between the cluster and the Run:ai Control Plane. A connectivity issue may be the root cause of the problem. Contact Support: If the network connection is stable and you are still unable to resolve the issue, contact Run:ai support for further assistance

      Runai agent pull rate low

      Meaning The runai-agent pod may be too loaded, is slow in processing data (possible in very big clusters), or the runai-agent pod itself in the runai namespace may not be functioning properly. Impact Possible impact - no info/partial info from the control-plane is being synced in the cluster Severity Critical Diagnosis Run: kubectl get pod -n runai and see if the runai-agent pod is running. Troubleshooting/Mitigation To diagnose issues with the runai-agent pod, follow these steps: Describe the Deployment: Run the following command to get detailed information about the runai-agent deployment: kubectl describe deployment runai-agent -n runai Check the Logs: Use the following command to view the logs of the runai-agent deployment: kubectl logs deployment/runai-agent -n runai Analyze the Logs and Pod Details: From the information provided by the logs and the deployment details, attempt to identify the reason why the runai-agent pod is not functioning correctly. There may be a connectivity issue with the control plane. Check Connectivity: Ensure there is a stable network connection between the runai-agent and the control plane. A connectivity issue may be the root cause of the problem. Consider Cluster Load: If the runai-agent appears to be functioning properly but the cluster is very large and heavily loaded, it may take more time for the agent to process data from the control plane. Adjust Alert Threshold: If the cluster load is causing the alert to fire, you can adjust the threshold at which the alert triggers. The default value is 0.05. You can try changing it to a lower value (e.g., 0.045 or 0.04). To edit the value, paste the following in your terminal: kubectl edit runaiconfig -n runai In the editor, navigate to spec -> prometheus -> agentPullPushRateMinForAlert. If the agentPullPushRateMinForAlert value does not exist, add it under spec -> prometheus
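
      Alternatively, rather than editing interactively, the same value can be set with a merge patch, following the same pattern used elsewhere in this documentation (0.045 below is only an example value):

      kubectl patch -n runai runaiconfigs.run.ai/runai --type='merge' --patch '{\"spec\":{\"prometheus\":{\"agentPullPushRateMinForAlert\":0.045}}}'\n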

      Runai container memory usage critical

      Meaning Runai container is using more than 90% of its Memory limit Impact The container might run out of memory and crash. Severity Critical Diagnosis Calculate the memory usage, this is performed by pasting the following to your terminal: container_memory_usage_bytes{namespace=~\"runai|runai-backend\"} Troubleshooting/Mitigation Add more memory resources to the container. If the issue persists, contact Run:ai

      Runai container memory usage warning

      Meaning Runai container is using more than 80% of its memory limit Impact The container might run out of memory and crash Severity Warning Diagnosis Calculate the memory usage, this can be done by pasting the following to your terminal: container_memory_usage_bytes{namespace=~\"runai|runai-backend\"} Troubleshooting/Mitigation Add more memory resources to the container. If the issue persists, contact Run:ai
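
      To relate the usage to the configured limit, a hedged PromQL sketch, assuming the standard cAdvisor metrics container_memory_usage_bytes and container_spec_memory_limit_bytes are scraped; the 0.8 threshold mirrors the 80% warning level:

      container_memory_usage_bytes{namespace=~\"runai|runai-backend\", container!=\"\"}\n  / (container_spec_memory_limit_bytes{namespace=~\"runai|runai-backend\", container!=\"\"} > 0)\n  > 0.8\n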

      Runai container restarting

      Meaning Runai container has restarted more than twice in the last 10 min Impact The container might become unavailable and impact the Run:ai system Severity Warning Diagnosis To diagnose the issue and identify the problematic pods, paste this into your terminal: kubectl get pods -n runai kubectl get pods -n runai-backend One or more of the pods have a restart count >= 2. Troubleshooting/Mitigation Paste this into your terminal: kubectl logs -n NAMESPACE POD_NAME Replace NAMESPACE and POD_NAME with the relevant pod information from the previous step. Check the logs for any standout issues and verify that the container has sufficient resources. If you need further assistance, contact Run:ai

      Runai CPU usage warning

      Meaning runai container is using more than 80% of its CPU limit Impact This might cause slowness in the operation of certain Run:ai features. Severity Warning Diagnosis Paste the following query to your terminal in order to calculate the CPU usage: rate(container_cpu_usage_seconds_total{namespace=~\"runai|runai-backend\"}[2m]) Troubleshooting/Mitigation Add more CPU resources to the container. If the issue persists, please contact Run:ai.

      Runai critical problem

      Meaning One of the critical Run:ai alerts is currently active Impact Impact is based on the active alert Severity Critical Diagnosis Check Run:ai alerts in Prometheus to identify any active critical alerts

      Unknown state alert for a node

      Meaning The Kubernetes node hosting GPU workloads is in an unknown state, and its health and readiness cannot be determined. Impact This may interrupt GPU workload scheduling and execution. Severity Critical - Node is either unschedulable or has unknown status. The node is in one of the following states: Ready=Unknown: The control plane cannot communicate with the node. Ready=False: The node is not healthy. Unschedulable=True: The node is marked as unschedulable. Diagnosis Check the node's status using kubectl describe node, verify Kubernetes API server connectivity, and inspect system logs for GPU-specific or node-level errors.

      Low Memory Node Alert

      Meaning The Kubernetes node hosting GPU workloads has insufficient memory to support current or upcoming workloads. Impact GPU workloads may fail to schedule, experience degraded performance, or crash due to memory shortages, disrupting dependent applications. Severity Critical - Node is using more than 90% of its memory. Warning - Node is using more than 80% of its memory. Diagnosis Use kubectl top node to assess memory usage, identify memory-intensive pods, consider resizing the node or optimizing memory usage in affected pods.

      Runai daemonSet rollout stuck / Runai DaemonSet unavailable on nodes

      Meaning There are currently 0 available pods for the runai daemonset on the relevant node Impact No fractional GPU workloads support Severity Critical Diagnosis Paste the following command to your terminal: kubectl get daemonset -n runai-backend In the result of this command, identify the daemonset(s) that don\u2019t have any running pods Troubleshooting/Mitigation Paste the following command to your terminal, where X is the problematic daemonset from the previous step: kubectl describe daemonset X -n runai Then look for the specific error which prevents it from creating pods. Possible reasons might be: Node Resource Constraints: The nodes in the cluster may lack sufficient resources (CPU, memory, etc.) to accommodate new pods from the daemonset. Node Selector or Affinity Rules: The daemonset may have node selector or affinity rules that do not match any nodes currently available in the cluster, thus preventing pod creation.

      Runai deployment insufficient replicas / Runai deployment no available replicas /RunaiDeploymentUnavailableReplicas

      Meaning Runai deployment has one or more unavailable pods Impact When this happens, there may be scale issues. Additionally, new versions cannot be deployed, potentially resulting in missing features. Severity Critical Diagnosis Paste the following commands to your terminal, in order to get the status of the deployments in the runai and runai-backend namespaces: kubectl get deployment -n runai kubectl get deployment -n runai-backend Identify any deployments that have missing pods. Look for discrepancies in the DESIRED and AVAILABLE columns. If the number of AVAILABLE pods is less than the DESIRED pods, it indicates that there are missing pods. Troubleshooting/Mitigation Paste the following commands to your terminal, to receive detailed information about the problematic deployment: kubectl describe deployment <DEPLOYMENT_NAME> -n runai kubectl describe deployment <DEPLOYMENT_NAME> -n runai-backend Paste the following commands to your terminal, to check the replicaset details associated with the deployment: kubectl describe replicaset <REPLICASET_NAME> -n runai kubectl describe replicaset <REPLICASET_NAME> -n runai-backend Paste the following commands to your terminal to retrieve the logs for the deployment to identify any errors or issues: kubectl logs deployment/<DEPLOYMENT_NAME> -n runai kubectl logs deployment/<DEPLOYMENT_NAME> -n runai-backend From the logs and the detailed information provided by the describe commands, analyze the reasons why the deployment is unable to create pods. Look for common issues such as: Resource constraints (CPU, memory) Misconfigured deployment settings or replicasets Node selector or affinity rules preventing pod scheduling. If the issue persists, contact Run:ai.

      Runai project controller reconcile failure

      Meaning The project-controller in runai namespace had errors while reconciling projects Impact Some projects might not be in the \u201cReady\u201d state. This means that they are not fully operational and may not have all the necessary components running or configured correctly. Severity Critical Diagnosis Retrieve the logs for the project-controller deployment by pasting the following command in your terminal: kubectl logs deployment/project-controller -n runai Carefully examine the logs for any errors or warning messages. These logs help you understand what might be going wrong with the project controller. Troubleshooting/Mitigation Once errors in the log have been identified, follow these steps to mitigate the issue: The error messages in the logs should provide detailed information about the problem. Read through them to understand the nature of the issue. If the logs indicate which project failed to reconcile, you can further investigate by checking the status of that specific project. Run the following command, replacing <PROJECT_NAME> with the name of the problematic project: kubectl get project <PROJECT_NAME> -o yaml Review the status section in the YAML output. This section describes the current state of the project and provides insights into what might be causing the failure. If the issue persists, contact Run:ai.

      Runai StatefulSet insufficient replicas / Runai StatefulSet no available replicas

      Meaning Runai statefulset has no available pods Impact Absence of metrics / database unavailability Severity Critical Diagnosis To diagnose the issue, follow these steps: Check the status of the stateful sets in the runai-backend namespace by running the following command: kubectl get statefulset -n runai-backend Identify any stateful sets that have no running pods. These are the ones that might be causing the problem. Troubleshooting/Mitigation Once you've identified the problematic stateful sets, follow these steps to mitigate the issue: Describe the stateful set to get detailed information on why it cannot create pods. Replace X with the name of the stateful set: kubectl describe statefulset X -n runai-backend Review the description output to understand the root cause of the issue. Look for events or error messages that explain why the pods are not being created. If you're unable to resolve the issue based on the information gathered, contact Run:ai support for further assistance."},{"location":"admin/maintenance/alert-monitoring/#adding-a-custom-alert","title":"Adding a custom alert","text":"

      You can add additional alerts from Run:ai. Alerts are triggered by using the Prometheus query language with any Run:ai metric.

      To create an alert, follow these steps using Prometheus query language with Run:ai Metrics:

      • Modify Values File: Use the upgrade cluster instructions to modify the values file.
      • Add Alert Structure: Incorporate alerts according to the structure outlined below. Replace placeholders <ALERT-NAME>, <ALERT-SUMMARY-TEXT>, <PROMQL-EXPRESSION>, <optional: duration s/m/h>, and <critical/warning> with appropriate values for your alert, as described below.

      kube-prometheus-stack:  \n   additionalPrometheusRulesMap:  \n     custom-runai:  \n       groups:  \n       - name: custom-runai-rules  \n         rules:  \n         - alert: <ALERT-NAME>  \n           annotations:  \n             summary: <ALERT-SUMMARY-TEXT>  \n           expr:  <PROMQL-EXPRESSION>  \n           for: <optional: duration s/m/h>  \n           labels:  \n             severity: <critical/warning>\n
      • <ALERT-NAME>: Choose a descriptive name for your alert, such as HighCPUUsage or LowMemory.
      • <ALERT-SUMMARY-TEXT>: Provide a brief summary of what the alert signifies, for example, High CPU usage detected or Memory usage below threshold.
      • <PROMQL-EXPRESSION>: Construct a Prometheus query (PROMQL) that defines the conditions under which the alert should trigger. This query should evaluate to a boolean value (1 for alert, 0 for no alert).
      • <optional: duration s/m/h>: Optionally, specify a duration in seconds (s), minutes (m), or hours (h) that the alert condition should persist before triggering an alert. If not specified, the alert triggers as soon as the condition is met.
      • <critical/warning>: Assign a severity level to the alert, indicating its importance. Choose between critical for severe issues requiring immediate attention, or warning for less critical issues that still need monitoring.

      You can find an example in the Prometheus documentation.
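
      For instance, a purely illustrative rule that reuses the container memory metric referenced by the built-in alerts above; the alert name, threshold and duration are arbitrary examples:

      kube-prometheus-stack:  \n   additionalPrometheusRulesMap:  \n     custom-runai:  \n       groups:  \n       - name: custom-runai-rules  \n         rules:  \n         - alert: RunaiBackendHighMemoryExample  \n           annotations:  \n             summary: A runai-backend container is using more than 16GiB of memory  \n           expr: max(container_memory_usage_bytes{namespace=\"runai-backend\"}) > 17179869184  \n           for: 10m  \n           labels:  \n             severity: warning\n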

      "},{"location":"admin/maintenance/audit-log/","title":"Audit Log","text":"

      This article provides details about Run:ai\u2019s Audit log. The Run:ai control plane provides the audit log API and the event history table in the Run:ai UI. Both reflect the same information regarding changes to business objects: clusters, projects, assets, etc.

      "},{"location":"admin/maintenance/audit-log/#events-history-table","title":"Events history table","text":"

      The Events history table can be found under Event history in the Run:ai UI.

      The Event history table consists of the following columns:

      Column Description Subject The name of the subject Subject type The user or application assigned with the role Source IP The IP address of the subject Date & time The exact timestamp at which the event occurred. Format dd/mm/yyyy for date and hh:mm am/pm for time. Event The type of the event. Possible values: Create, Update, Delete, Login Event ID Internal event ID, can be used for support purposes Status The outcome of the logged operation. Possible values: Succeeded, Failed Entity type The type of the logged business object. Entity name The name of logged business object. Entity ID The system's internal id of the logged business object. URL The endpoint or address that was accessed during the logged event. HTTP Method The HTTP operation method used for the request. Possible values include standard HTTP methods such as GET, POST, PUT, DELETE, indicating what kind of action was performed on the specified URL."},{"location":"admin/maintenance/audit-log/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then Click Download as CSV or Download as JSON
      "},{"location":"admin/maintenance/audit-log/#using-the-event-history-date-selector","title":"Using the event history date selector","text":"

      The Event history table saves events for the last 90 days. However, the table itself presents up to the last 30 days of information due to the potentially very high number of operations that might be logged during this period.

      To view older events, or to refine your search for more specific results or fewer results, use the time selector and change the period you search for. You can also refine your search by clicking and using ADD FILTER accordingly.

      "},{"location":"admin/maintenance/audit-log/#using-api","title":"Using API","text":"

      Go to the Audit log API reference to view the available actions. Since the amount of data is not trivial, the API is based on paging. It retrieves a specified number of items for each API call. You can get more data by using subsequent calls.
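
      As a purely illustrative sketch of a paged request, where the endpoint path, paging parameter names and token are placeholders rather than the documented API (take the real ones from the Audit log API reference):

      # <control-plane-url>, <audit-log-endpoint>, <API_TOKEN> and the paging parameters are placeholders\ncurl -s -H \"Authorization: Bearer <API_TOKEN>\" \\\n    \"https://<control-plane-url>/<audit-log-endpoint>?offset=0&limit=100\"\n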

      "},{"location":"admin/maintenance/audit-log/#limitations","title":"Limitations","text":"

      Submissions of workloads are not audited. As a result, the system does not track or log details of workload submissions, such as timestamps or user activity.

      "},{"location":"admin/maintenance/node-downtime/","title":"Node Maintenance","text":"

      This article provides detailed instructions on how to manage both planned and unplanned node downtime in a Kubernetes cluster that is running Run:ai. It covers all the steps to maintain service continuity and ensure the proper handling of workloads during these events.

      "},{"location":"admin/maintenance/node-downtime/#prerequisites","title":"Prerequisites","text":"
      • Access to Kubernetes cluster Administrative access to the Kubernetes cluster, including permissions to run kubectl commands
      • Basic knowledge of Kubernetes Familiarity with Kubernetes concepts such as nodes, taints, and workloads
      • Run:ai installation The Run:ai software installed and configured within your Kubernetes cluster
      • Node naming conventions Know the names of the nodes within your cluster, as these are required when executing the commands
      "},{"location":"admin/maintenance/node-downtime/#node-types","title":"Node types","text":"

      This article distinguishes between two types of nodes within a Run:ai installation:

      • Worker nodes. Nodes on which AI practitioners can submit and run workloads
      • Run:ai system nodes. Nodes on which the Run:ai software runs, managing the cluster's operations
      "},{"location":"admin/maintenance/node-downtime/#worker-nodes","title":"Worker nodes","text":"

      Worker Nodes are responsible for running workloads. When a worker node goes down, either due to planned maintenance or unexpected failure, workloads ideally migrate to other available nodes or wait in the queue to be executed when possible.

      "},{"location":"admin/maintenance/node-downtime/#training-vs-interactive-workloads","title":"Training vs. Interactive workloads","text":"

      The following workload types can run on worker nodes:

      • Training workloads. These are long-running processes that, in case of node downtime, can automatically move to another node.

      • Interactive workloads. These are short-lived, interactive processes that require manual intervention to be relocated to another node.

      Note

      While training workloads can be automatically migrated, it is recommended to plan maintenance and manually manage this process for a faster response, as it may take time for Kubernetes to detect a node failure.

      "},{"location":"admin/maintenance/node-downtime/#planned-maintenance","title":"Planned maintenance","text":"

      Before stopping a worker node for maintenance, perform the following steps:

      1. Prevent new workloads on the node To stop the Kubernetes Scheduler from assigning new workloads to the node and to safely remove all existing workloads, copy the following command to your terminal:

        kubectl taint nodes <node-name> runai=drain:NoExecute\n

        Explanation:

        • <node-name> Replace this placeholder with the actual name of the node you want to drain
        • kubectl taint nodes This command is used to add a taint to the node, which prevents any new pods from being scheduled on it
        • runai=drain:NoExecute This specific taint ensures that all existing pods on the node are evicted and rescheduled on other available nodes, if possible.

        Result: The node stops accepting new workloads, and existing workloads either migrate to other nodes or are placed in a queue for later execution.

      2. Shut down and perform maintenance After draining the node, you can safely shut it down and perform the necessary maintenance tasks.

      3. Restart the node Once maintenance is complete and the node is back online, remove the taint to allow the node to resume normal operations. Copy the following command to your terminal:

        kubectl taint nodes <node-name> runai=drain:NoExecute-\n

        Explanation:

        • runai=drain:NoExecute- The - at the end of the command indicates the removal of the taint. This allows the node to start accepting new workloads again.

        Result: The node rejoins the cluster's pool of available resources, and workloads can be scheduled on it as usual

      "},{"location":"admin/maintenance/node-downtime/#unplanned-downtime","title":"Unplanned downtime","text":"

      In the event of unplanned downtime:

      1. Automatic Restart If a node fails but immediately restarts, all services and workloads automatically resume.
      2. Extended Downtime If the node remains down for an extended period, drain the node to migrate workloads to other nodes. Copy the following command to your terminal:

        kubectl taint nodes <node-name> runai=drain:NoExecute\n

        Explanation: The command works the same as in the planned maintenance section, ensuring that no workloads remain scheduled on the node while it is down.

      3. Reintegrate the Node Once the node is back online, remove the taint to allow it to rejoin the cluster's operations. Copy the following command to your terminal:

        kubectl taint nodes <node-name> runai=drain:NoExecute-\n
        Result: This action reintegrates the node into the cluster, allowing it to accept new workloads.

      4. Permanent Shutdown If the node is to be permanently decommissioned, remove it from Kubernetes with the following command:

        kubectl delete node <node-name>\n
        Explanation:

        • kubectl delete node This command completely removes the node from the cluster
        • <node-name> Replace this placeholder with the actual name of the node

        Result: The node is no longer part of the Kubernetes cluster. If you plan to bring the node back later, it must be rejoined to the cluster using the steps outlined in the next section.

      "},{"location":"admin/maintenance/node-downtime/#runai-system-nodes","title":"Run:ai System nodes","text":"

      In a production environment, the services responsible for scheduling, submitting and managing Run:ai workloads operate on one or more Run:ai system nodes. It is recommended to have more than one system node to ensure high availability. If one system node goes down, another can take over, maintaining continuity. If a second system node does not exist, you must designate another node in the cluster as a temporary Run:ai system node to maintain operations.

      The protocols for handling planned maintenance and unplanned downtime are identical to those for worker nodes. Refer to the above section for detailed instructions.

      "},{"location":"admin/maintenance/node-downtime/#rejoining-a-node-into-the-kubernetes-cluster","title":"Rejoining a node into the Kubernetes cluster","text":"

      To rejoin a node to the Kubernetes cluster, follow these steps:

      1. Generate a join command on the master node On the master node, copy the following command to your terminal:

        kubeadm token create --print-join-command\n

        Explanation:

        • kubeadm token create This command generates a token that can be used to join a node to the Kubernetes cluster.
        • --print-join-command This option outputs the full command that needs to be run on the worker node to rejoin it to the cluster.

        Result: The command outputs a kubeadm join command.

      2. Run the Join Command on the Worker Node Copy the kubeadm join command generated from the previous step and run it on the worker node that needs to rejoin the cluster.

        Explanation:

        • The kubeadm join command re-enrolls the node into the cluster, allowing it to start participating in the cluster's workload scheduling.
      3. Verify Node Rejoining Verify that the node has successfully rejoined the cluster by running:

        kubectl get nodes\n

        Explanation:

        This command lists all nodes currently part of the Kubernetes cluster, along with their status

        Result: The rejoined node should appear in the list with a status of Ready

      4. Re-label Nodes Once the node is back online, ensure it is labeled according to its role within the cluster
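
        For example, a label can be re-applied with kubectl; the label key and value below are placeholders for whatever convention your cluster uses to mark node roles:

        kubectl label node <node-name> <label-key>=<label-value> --overwrite\n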

      "},{"location":"admin/maintenance/overview/","title":"Monitoring and maintenance Overview","text":"

      Deploying Run:ai in mission-critical environments requires proper monitoring and maintenance of resources to ensure workloads run and are deployed as expected.

      Details on how to monitor different parts of the physical resources in your Kubernetes system, including clusters and nodes, can be found in the monitoring and maintenance section. Adjacent configuration and troubleshooting sections also cover high availability, restoring and securing clusters, collecting logs, and reviewing audit logs to meet compliance requirements.

      In addition to monitoring Run:ai resources, it is also highly recommended to monitor the Kubernetes environment that Run:ai runs on, which manages the containerized applications. In particular, focus on three main layers:

      "},{"location":"admin/maintenance/overview/#runai-control-plane-and-cluster-services","title":"Run:ai Control Plane and cluster services","text":"

      This is the highest layer and includes the Run:ai pods, which run in containers managed by Kubernetes.

      "},{"location":"admin/maintenance/overview/#kubernetes-cluster","title":"Kubernetes cluster","text":"

      This layer includes the main Kubernetes system that runs and manages Run:ai components. Important elements to monitor include:

      • The health of the cluster and nodes (machines in the cluster).
      • The status of key Kubernetes services, such as the API server. For detailed information on managing clusters, see the official Kubernetes documentation.
      "},{"location":"admin/maintenance/overview/#host-infrastructure","title":"Host infrastructure","text":"

      This is the base layer, representing the actual machines (virtual or physical) that make up the cluster. IT teams need to handle:

      • Managing CPU, memory, and storage
      • Keeping the operating system updated
      • Setting up the network and balancing the load

      Run:ai does not require any special configurations at this level.

      The articles below explain how to monitor these layers, maintain system security and compliance, and ensure the reliable operation of Run:ai in critical environments.

      "},{"location":"admin/researcher-setup/cli-install/","title":"Install the Run:ai V1 Command-line Interface","text":"

      The Run:ai Command-line Interface (CLI) is one of the ways for a Researcher to send deep learning workloads, acquire GPU-based containers, list jobs, etc.

      The instructions below will guide you through the process of installing the CLI. The Run:ai CLI runs on Mac, Linux and Windows.

      "},{"location":"admin/researcher-setup/cli-install/#researcher-authentication","title":"Researcher Authentication","text":"

      When enabled, Researcher authentication requires additional setup when installing the CLI. To configure authentication see Setup Project-based Researcher Access Control. Use the modified Kubernetes configuration file described in the article.

      "},{"location":"admin/researcher-setup/cli-install/#prerequisites","title":"Prerequisites","text":"
      • When installing the command-line interface, it is worth considering future upgrades:
        • Install the CLI on a dedicated Jumpbox machine. Researchers will connect to the Jumpbox from which they can submit Run:ai commands
        • Install the CLI on a shared directory that is mounted on Researchers' machines.
      • A Kubernetes configuration file.
      "},{"location":"admin/researcher-setup/cli-install/#setup","title":"Setup","text":""},{"location":"admin/researcher-setup/cli-install/#kubernetes-configuration","title":"Kubernetes Configuration","text":"
      • In the Researcher's root folder, create a directory .kube. Copy the Kubernetes configuration file into the directory. Each Researcher should have a separate copy of the configuration file. The Researcher should have write access to the configuration file as it stores user defaults.
      • If you choose to locate the file at a different location than ~/.kube/config, you must create a shell variable to point to the configuration file as follows:
      export KUBECONFIG=<Kubernetes-config-file>\n
      • Test the connection by running:
      kubectl get nodes\n
      "},{"location":"admin/researcher-setup/cli-install/#install-runai-cli","title":"Install Run:ai CLI","text":"
      • Go to the Run:ai user interface. On the top right select Researcher Command Line Interface.
      • Select Mac, Linux or Windows.
      • Download directly using the button or copy the file to run it on a remote machine
      Mac or LinuxWindows

      Run:

      chmod +x runai\nsudo mv runai /usr/local/bin/runai\n

      Rename the downloaded file to have a .exe extension and move the file to a folder that is a part of the PATH.

      Note

      An alternative way of downloading the CLI is provided under the CLI Troubleshooting section.

      To verify the installation run:

      runai list jobs\n
      "},{"location":"admin/researcher-setup/cli-install/#install-command-auto-completion","title":"Install Command Auto-Completion","text":"

      It is possible to configure your Linux/Mac shell to complete Run:ai CLI commands. This feature works on bash and zsh shells only.

      ZshBash

      Edit the file ~/.zshrc. Add the lines:

      autoload -U compinit; compinit -i\nsource <(runai completion zsh)\n

      Install the bash-completion package:

      • Mac: brew install bash-completion
      • Ubuntu/Debian: sudo apt-get install bash-completion
      • Fedora/Centos: sudo yum install bash-completion

      Edit the file ~/.bashrc. Add the lines:

      [[ -r \"/usr/local/etc/profile.d/bash_completion.sh\" ]] && . \"/usr/local/etc/profile.d/bash_completion.sh\"\nsource <(runai completion bash)\n
      "},{"location":"admin/researcher-setup/cli-install/#troubleshoot-the-cli-installation","title":"Troubleshoot the CLI Installation","text":"

      See Troubleshooting a CLI installation

      "},{"location":"admin/researcher-setup/cli-install/#update-the-runai-cli","title":"Update the Run:ai CLI","text":"

      To update the CLI to the latest version perform the same install process again.

      "},{"location":"admin/researcher-setup/cli-install/#delete-the-runai-cli","title":"Delete the Run:ai CLI","text":"

      If you have installed using the default path, run:

      sudo rm /usr/local/bin/runai\n
      "},{"location":"admin/researcher-setup/docker-to-runai/","title":"From Docker to Run:ai","text":""},{"location":"admin/researcher-setup/docker-to-runai/#dockers-images-and-kubernetes","title":"Dockers, Images, and Kubernetes","text":"

      Researchers are typically proficient in working with Docker. Docker is an isolation layer above the operating system that allows you to bundle the operating system and a deep learning environment together and package them into a single file. The file is called a docker image.

      You create a container by starting a docker image on a machine.

      Run:ai is based on Kubernetes. At its core, Kubernetes is an orchestration software above Docker: Among other things, it allows location abstraction as to where the actual container is running. This calls for some adaptation to the Researcher's workflow as follows.

      "},{"location":"admin/researcher-setup/docker-to-runai/#image-repository","title":"Image Repository","text":"

      If your Kubernetes cluster contains a single GPU node (machine), then your image can reside on the node itself (in which case, when submitting workloads with runai submit, the Researcher must use the flag --local-image).

      If your Kubernetes cluster contains more than a single node, then, to enable location abstraction, the image can no longer reside on the node itself. It must be relocated to an image repository. There are quite a few repository-as-a-service offerings, most notably Docker Hub. Alternatively, the organization can install a private repository on-prem.

      Day-to-day work with the image located remotely is almost identical to local work. The image name now contains its location. For example, nvcr.io/nvidia/pytorch:19.12-py3 is a PyTorch image that is located in nvcr.io, the Nvidia image repository as found on the web.

      "},{"location":"admin/researcher-setup/docker-to-runai/#data","title":"Data","text":"

      Deep learning is about data. It can be your code, the training data, saved checkpoints, etc.

      If your Kubernetes cluster contains a single GPU node (machine), then your data can reside on the node itself.

      If your Kubernetes cluster contains more than a single node, then, to enable location abstraction, the data must sit outside the machine, typically on network storage. The storage must be uniformly mapped to your container when it starts (using the -v flag).

      "},{"location":"admin/researcher-setup/docker-to-runai/#working-with-containers","title":"Working with Containers","text":"

      Starting a container using docker usually involves a single command-line with multiple flags. A typical example:

      docker run --runtime=nvidia --shm-size 16G -it --rm -e HOSTNAME='hostname' \\\n    -v /raid/public/my_datasets:/root/dataset:ro   -i  nvcr.io/nvidia/pytorch:19.12-py3\n

      The docker command docker run should be replaced with a Run:ai command runai submit. The flags are usually the same but some adaptation is required. A complete list of flags can be found here: runai submit.
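
      As a rough, hedged sketch only, the docker run example above might translate to a runai submit command along the following lines; check the exact flag names against the runai submit reference:

      runai submit my-job --interactive \\\n    -i nvcr.io/nvidia/pytorch:19.12-py3 \\\n    -g 1 \\\n    -v /raid/public/my_datasets:/root/dataset\n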

      There are similar commands to get a shell into the container (runai bash), get the container logs (runai logs), and more. For a complete list see the Run:ai CLI reference.

      "},{"location":"admin/researcher-setup/docker-to-runai/#schedule-an-onboarding-session","title":"Schedule an Onboarding Session","text":"

      It is highly recommended to schedule an onboarding session for Researchers with a Run:ai customer success professional. Run:ai can help with the above transition. Beyond that, Run:ai has acquired a large body of knowledge on data science best practices which can help streamline Researchers' work as well as save money for the organization.

      "},{"location":"admin/researcher-setup/new-cli-install/","title":"Installing the V2 Command-line interface","text":"

      This article explains the procedure for installing and configuring the new researcher Command Line Interface (CLI).

      Important

      This document refers to the new CLI which only works with clusters of version 2.18 and up. The installation instructions for the older CLI are here.

      "},{"location":"admin/researcher-setup/new-cli-install/#enabling-the-v2-cli","title":"Enabling the V2 CLI","text":"

      Under General settings \u2192 Workloads, enable the flag Improved command line interface

      "},{"location":"admin/researcher-setup/new-cli-install/#installing-the-cli","title":"Installing the CLI","text":"
      1. Click the Help (?) icon in the top right corner
      2. Select Researcher Command Line Interface
      3. Select the cluster you want the CLI to communicate with
      4. Select your computer\u2019s operating system
      5. Copy the installer command and run it in the terminal
      6. Follow the installation process instructions
      7. Click Enter to use the default values (recommended)
      "},{"location":"admin/researcher-setup/new-cli-install/#testing-the-installation","title":"Testing the installation","text":"

      To verify the CLI client was installed properly

      1. Open the terminal
      2. Run the command runai version
      "},{"location":"admin/researcher-setup/new-cli-install/#configuring-the-cli","title":"Configuring the CLI","text":"

      Follow the steps below to configure the CLI.

      "},{"location":"admin/researcher-setup/new-cli-install/#authenticating-the-cli","title":"Authenticating the CLI","text":"

      After installation, sign in to the Run:ai platform to authenticate the CLI:

      1. Open the terminal on your local machine.
      2. Run runai login.
      3. Enter your username and password on the Run:ai platform's sign-in page.
      4. Return to the terminal window to use the CLI.
      "},{"location":"admin/researcher-setup/new-cli-install/#setting-the-default-cluster","title":"Setting the default cluster","text":"

      If only one cluster is connected to the account, it is set as the default cluster when you first sign in. If there are multiple clusters, you must follow the steps below to set your preferred cluster for workload submission:

      1. Open the terminal on your local machine.
      2. Run runai cluster list to find the required cluster name.
      3. Run the following command: runai cluster set <CLUSTER_NAME>
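
      For example (my-cluster is a placeholder; use one of the names returned by the first command):

      runai cluster list\nrunai cluster set my-cluster # my-cluster is a placeholder; use a name returned by the previous command\n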
      "},{"location":"admin/researcher-setup/new-cli-install/#setting-a-default-project","title":"Setting a default project","text":"

      Set a default working project to easily submit workloads without specifying the project name in every command.

      1. Run the following command on the terminal: runai project set <PROJECT_NAME>
      2. If successful, the following message is returned: project <PROJECT_NAME> configured successfully
      3. To see the current configuration run: runai config generate --json
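
      For example, assuming a hypothetical project named team-a:

      runai project set team-a\nrunai config generate --json # verify the current configuration\n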
      "},{"location":"admin/researcher-setup/new-cli-install/#installing-command-auto-completion","title":"Installing command auto-completion","text":"

      Auto-completion assists with completing the command syntax automatically for ease of use. Auto-completion is installed automatically. The interfaces below require manual installation:

      Zsh / Bash / Windows
      1. Edit the file ~/.zshrc
      2. Add the following code:
      autoload -U compinit; compinit -i\nsource <(runai completion zsh)\n
      1. Install the bash-completion package
      2. Choose your operating system: Mac: brew install bash-completion, Ubuntu/Debian: sudo apt-get install bash-completion, Fedora/CentOS: sudo yum install bash-completion
      3. Edit the file ~/.bashrc and add the following lines:
      [[ $PS1 && -f /usr/share/bash-completion/bash_completion ]] && . /usr/share/bash-completion/bash_completion\nsource <(runai completion bash)\n

      Add the following code to your PowerShell profile:

      runai.exe completion powershell | Out-String | Invoke-Expression\nSet-PSReadLineKeyHandler -Key Tab -Function MenuComplete\n
      For more completion mode options, see PowerShell completions.

      "},{"location":"admin/researcher-setup/researcher-setup-intro/","title":"Researcher Setup Overview","text":"

      Following is a step-by-step guide for getting a new Researcher up to speed with Run:ai and Kubernetes.

      "},{"location":"admin/researcher-setup/researcher-setup-intro/#change-of-paradigms-from-docker-to-kubernetes","title":"Change of Paradigms: from Docker to Kubernetes","text":"

      As part of Run:ai, the organization is typically moving from Docker-based workflows to Kubernetes. This document is an attempt to help the Researcher with this paradigm shift. It explains the basic concepts and provides links for further information about the Run:ai CLI.

      "},{"location":"admin/researcher-setup/researcher-setup-intro/#setup-the-runai-command-line-interface","title":"Setup the Run:ai Command-Line Interface","text":"

      The Run:ai CLI needs to be installed on the Researcher's machine. This document provides step-by-step instructions.

      "},{"location":"admin/researcher-setup/researcher-setup-intro/#provide-the-researcher-with-a-gpu-quota","title":"Provide the Researcher with a GPU Quota","text":"

      To submit workloads with Run:ai, the Researcher must be provided with a Project that contains a GPU quota. Please see Working with Projects document on how to create Projects and set a quota.

      "},{"location":"admin/researcher-setup/researcher-setup-intro/#provide-access-to-the-runai-user-interface","title":"Provide access to the Run:ai User Interface","text":"

      See Setting up users for further information on how to provide access to users.

      "},{"location":"admin/researcher-setup/researcher-setup-intro/#schedule-an-onboarding-session","title":"Schedule an Onboarding Session","text":"

      It is highly recommended to schedule an onboarding session for Researchers with a Run:ai customer success professional. Run:ai can help with the above transition; beyond that, Run:ai has also acquired a large body of knowledge on data science best practices, which can help streamline the Researchers' work as well as save money for the organization.

      "},{"location":"admin/runai-setup/installation-types/","title":"Installation Types","text":"

      Run:ai consists of two components:

      • The Run:ai Cluster. One or more data-science GPU clusters hosted by the customer (on-prem or cloud).
      • The Run:ai Control plane. A single entity that monitors clusters, sets priorities, and enforces business policies.

      There are two main installation options:

      • Classic (SaaS): Run:ai is installed on the customer's data science GPU clusters. The cluster connects to the Run:ai control plane on the cloud (https://<tenant-name>.run.ai). With this installation, the cluster requires an outbound connection to the Run:ai cloud.
      • Self-hosted: The Run:ai control plane is also installed in the customer's data center.

      The self-hosted option is for organizations that cannot use a SaaS solution due to data leakage concerns. The self-hosted installation is priced differently. For further information please talk to Run:ai sales.

      "},{"location":"admin/runai-setup/installation-types/#self-hosted-installation","title":"Self-hosted Installation","text":"

      Run:ai self-hosting comes with two variants:

      • Connected: The organization can freely download from the internet (though upload is not allowed).
      • Air-gapped: The organization has no connection to the internet.

      "},{"location":"admin/runai-setup/installation-types/#self-hosting-with-kubernetes-vs-openshift","title":"Self-hosting with Kubernetes vs OpenShift","text":"

      Kubernetes has many Certified Kubernetes Providers. Run:ai has been certified with several of them (see the Kubernetes distribution section). The OpenShift installation is different from the rest. As such, the Run:ai self-hosted installation instructions are divided into two separate sections:

      • OpenShift-based installation. See Run:ai OpenShift installation.
      • Kubernetes-based installation. See Run:ai Kubernetes installation.
      "},{"location":"admin/runai-setup/installation-types/#secure-installation","title":"Secure Installation","text":"

      In many organizations, Kubernetes is governed by IT compliance rules. In this scenario, there are strict access control rules during the installation and running of workloads:

      • OpenShift is secured using Security Context Constraints (SCC). The Run:ai installation supports SCC.
      • Run:ai provides limited support for Kubernetes Pod Security Admission (PSA). For more information see Kubernetes prerequisites.
      "},{"location":"admin/runai-setup/cluster-setup/cluster-delete/","title":"Cluster Uninstall","text":"

      This article explains how to uninstall Run:ai Cluster installation from the Kubernetes cluster.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-delete/#unistall-runai-cluster","title":"Unistall Run:ai cluster","text":"

      Uninstalling the Run:ai cluster from the Kubernetes cluster does not delete existing projects, departments, or workloads submitted by users.

      To uninstall the Run:ai cluster, run the following helm command in your terminal:

      helm uninstall runai-cluster -n runai\n

      To delete the Run:ai cluster from the Run:ai Platform, see Removing a cluster.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-install/","title":"Cluster Install","text":"

      This article explains the steps required to install the Run:ai cluster on a Kubernetes cluster using Helm.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#before-installation","title":"Before installation","text":"

      There are a number of matters to consider prior to installing using Helm.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#system-and-network-requirements","title":"System and network requirements","text":"

      Before installing the Run:ai cluster, validate that the system requirements and network requirements are met.

      Once all the requirements are met, it is highly recommended to use the Run:ai cluster preinstall diagnostics tool to:

      • Test the below requirements in addition to failure points related to Kubernetes, NVIDIA, storage, and networking
      • Look at additional components installed and analyze their relevance to a successful installation

      To run the preinstall diagnostics tool, download the latest version, and run:

      SaaS / Self-hosted / Airgap
      • On EKS deployments, run aws configure prior to execution
      chmod +x ./preinstall-diagnostics-<platform> && \\\n./preinstall-diagnostics-<platform> \\\n  --domain ${COMPANY_NAME}.run.ai \\\n  --cluster-domain ${CLUSTER_FQDN}\n
      chmod +x ./preinstall-diagnostics-<platform> && \\ \n./preinstall-diagnostics-<platform> \\\n  --domain ${CONTROL_PLANE_FQDN} \\\n  --cluster-domain ${CLUSTER_FQDN} \\\n#if the diagnostics image is hosted in a private registry\n  --image-pull-secret ${IMAGE_PULL_SECRET_NAME} \\\n  --image ${PRIVATE_REGISTRY_IMAGE_URL}    \n

      In an air-gapped deployment, the diagnostics image is saved, pushed, and pulled manually from the organization's registry.

      #Save the image locally\ndocker save --output preinstall-diagnostics.tar gcr.io/run-ai-lab/preinstall-diagnostics:${VERSION}\n#Load the image to the organization's registry\ndocker load --input preinstall-diagnostics.tar\ndocker tag gcr.io/run-ai-lab/preinstall-diagnostics:${VERSION} ${CLIENT_IMAGE_AND_TAG} \ndocker push ${CLIENT_IMAGE_AND_TAG}\n

      Run the binary with the --image parameter to modify the diagnostics image to be used:

      chmod +x ./preinstall-diagnostics-darwin-arm64 && \\\n./preinstall-diagnostics-darwin-arm64 \\\n  --domain ${CONTROL_PLANE_FQDN} \\\n  --cluster-domain ${CLUSTER_FQDN} \\\n  --image-pull-secret ${IMAGE_PULL_SECRET_NAME} \\\n  --image ${PRIVATE_REGISTRY_IMAGE_URL}    \n

      For more information see preinstall diagnostics.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#helm","title":"Helm","text":"

      Run:ai cluster requires Helm 3.14 or above. To install Helm, see Helm Install.
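
      To check whether a suitable Helm version is already installed, and as one common way to install it (the script URL below is the Helm project's published installer, not a Run:ai artifact):

      helm version --short # should report v3.14 or above\ncurl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3\nchmod +x get_helm.sh && ./get_helm.sh\n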

      "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#permissions","title":"Permissions","text":"

      A Kubernetes user with the cluster-admin role is required to ensure a successful installation, for more information see Using RBAC authorization.
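
      As a minimal sketch, assuming a hypothetical user admin@mycorp.local, the installing user can be granted the cluster-admin role with:

      kubectl create clusterrolebinding runai-install-admin \\\n  --clusterrole=cluster-admin \\\n  --user=admin@mycorp.local # hypothetical user; the binding name is arbitrary\n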

      "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#runai-namespace","title":"Run:ai namespace","text":"

      Run:ai cluster must be installed in a namespace named runai. Create the namespace by running:

      kubectl create ns runai\n
      "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#tls-certificates","title":"TLS certificates","text":"

      TLS private and public keys are required for HTTPS access to the cluster. Create a Kubernetes Secret named runai-cluster-domain-tls-secret in the runai namespace with the cluster\u2019s Fully Qualified Domain Name (FQDN) private and public keys, by running the following:

      kubectl create secret tls runai-cluster-domain-tls-secret -n runai \\\n    --cert /path/to/fullchain.pem  \\ # Replace /path/to/fullchain.pem with the actual path to your TLS certificate\n    --key /path/to/private.pem # Replace /path/to/private.pem with the actual path to your private key\n
      "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#installation","title":"Installation","text":"

      Follow these instructions to install using Helm.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#adding-a-new-cluster","title":"Adding a new cluster","text":"

      Follow the steps below to add a new cluster.

      Note

      When adding a cluster for the first time, the New Cluster form automatically opens when you log in to the Run:ai platform. Other actions are prevented until the cluster is created.

      If this is your first cluster and you have completed the New Cluster form, start at step 3. Otherwise, start at step 1.

      1. In the Run:ai platform, go to Resources
      2. Click +NEW CLUSTER
      3. Enter a unique name for your cluster
      4. Optional: Choose the Run:ai cluster version (latest, by default)
      5. Enter the Cluster URL. For more information see Domain Name Requirement
      6. Click Continue
      "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#installing-runai-cluster","title":"Installing Run:ai cluster","text":"

      This section presents the Run:ai cluster installation steps.

      1. Follow the installation instructions and run the commands provided on your Kubernetes cluster.
      2. Click DONE

      The cluster is displayed in the table with the status Waiting to connect. Once installation is complete, the cluster status changes to Connected.
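
      For reference only, the Helm command provided by the installation instructions generally has the shape shown below; always copy the exact command, including the secret values, from the Run:ai platform (the values below are elided placeholders):

      helm upgrade -i runai-cluster runai/runai-cluster -n runai --create-namespace \\\n  --set controlPlane.url=... \\\n  --set controlPlane.clientSecret=... \\\n  --set cluster.uid=... \\\n  --set cluster.url=... # all values are provided by the Run:ai platform\n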

      Note

      To customize the installation based on your environment, see Customize cluster installation.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#troubleshooting","title":"Troubleshooting","text":"

      If you encounter an issue with the installation, try the troubleshooting scenario below.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#installation_1","title":"Installation","text":"

      If the Run:ai cluster installation failed, check the installation logs to identify the issue. Run the following script to print the installation logs:

      curl -fsSL https://raw.githubusercontent.com/run-ai/public/main/installation/get-installation-logs.sh\n
      "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#cluster-status","title":"Cluster status","text":"

      If the Run:ai cluster installation completed, but the cluster status did not change to Connected, check the cluster troubleshooting scenarios.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/","title":"System Requirements","text":"

      The Run:ai Cluster is a Kubernetes application.

      This article explains the required hardware and software system requirements for the Run:ai cluster.

      Set out below are the system requirements for the Run:ai cluster.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#hardware-requirements","title":"Hardware Requirements","text":"

      The following hardware requirements are for the Kubernetes cluster nodes. By default, all Run:ai cluster services run on all available nodes. For production deployments, you may want to Set Node Roles, to separate system and worker nodes, reduce downtime, and save CPU cycles on expensive GPU machines.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#runai-cluster-system-nodes","title":"Run:ai Cluster - system nodes","text":"

      This configuration is the minimum requirement you need to install and use Run:ai Cluster.

      • CPU: 10 cores
      • Memory: 20GB
      • Disk space: 50GB

      "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#runai-cluster-worker-nodes","title":"Run:ai Cluster - Worker nodes","text":"

      The Run:ai Cluster supports x86 CPUs and NVIDIA GPUs. For the list of supported GPU models, see Supported NVIDIA Data Center GPUs and Systems.

      The following configuration represents the minimum hardware requirements for installing and operating the Run:ai cluster on worker nodes. Each node must meet these specifications:

      • CPU: 2 cores
      • Memory: 4GB

      "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#shared-storage","title":"Shared storage","text":"

      Run:ai workloads must be able to access data from any worker node in a uniform way, in order to read training data and code as well as save checkpoints, weights, and other machine-learning-related artifacts.

      Typical protocols are Network File Storage (NFS) or Network-attached storage (NAS). Run:ai Cluster supports both, for more information see Shared storage.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#software-requirements","title":"Software requirements","text":"

      The following software requirements must be fulfilled on the Kubernetes cluster.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#operating-system","title":"Operating system","text":"
      • Any Linux operating system supported by both Kubernetes and NVIDIA GPU Operator
      • Run:ai cluster on Google Kubernetes Engine (GKE) supports both Ubuntu and Container Optimized OS (COS). COS is supported only with NVIDIA GPU Operator 24.6 or newer, and Run:ai cluster version 2.19 or newer.
      • Internal tests are being performed on Ubuntu 22.04 and CoreOS for OpenShift.
      "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#kubernetes-distribution","title":"Kubernetes distribution","text":"

      Run:ai Cluster requires Kubernetes. The following Kubernetes distributions are supported:

      • Vanilla Kubernetes
      • OpenShift Container Platform (OCP)
      • NVIDIA Base Command Manager (BCM)
      • Elastic Kubernetes Engine (EKS)
      • Google Kubernetes Engine (GKE)
      • Azure Kubernetes Service (AKS)
      • Oracle Kubernetes Engine (OKE)
      • Rancher Kubernetes Engine (RKE1)
      • Rancher Kubernetes Engine 2 (RKE2)

      Important

      The latest release of the Run:ai cluster supports Kubernetes 1.29 to 1.32 and OpenShift 4.14 to 4.17

      For existing Kubernetes clusters, see the following Kubernetes version support matrix for the latest Run:ai cluster releases:

      • v2.13: Kubernetes 1.23 to 1.28, OpenShift 4.10 to 4.13
      • v2.16: Kubernetes 1.26 to 1.28, OpenShift 4.11 to 4.14
      • v2.17: Kubernetes 1.27 to 1.29, OpenShift 4.12 to 4.15
      • v2.18: Kubernetes 1.28 to 1.30, OpenShift 4.12 to 4.16
      • v2.19: Kubernetes 1.28 to 1.31, OpenShift 4.12 to 4.17
      • v2.20 (latest): Kubernetes 1.29 to 1.32, OpenShift 4.14 to 4.17

      For information on supported versions of managed Kubernetes, it's important to consult the release notes provided by your Kubernetes service provider. There, you can confirm the specific version of the underlying Kubernetes platform supported by the provider, ensuring compatibility with Run:ai. For an up-to-date end-of-life statement see Kubernetes Release History or OpenShift Container Platform Life Cycle Policy

      "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#kubernetes-pod-security-admission","title":"Kubernetes Pod Security Admission","text":"

      Run:ai v2.15 and above supports restricted policy for Pod Security Admission (PSA) on OpenShift only. Other Kubernetes distributions are only supported with privileged policy.

      For Run:ai on OpenShift to run with PSA restricted policy:

      • Label the runai namespace as described in Pod Security Admission with the following labels:
      pod-security.kubernetes.io/audit=privileged\npod-security.kubernetes.io/enforce=privileged\npod-security.kubernetes.io/warn=privileged\n
      • The workloads submitted through Run:ai should comply with the restrictions of the PSA restricted policy. This can be enforced using Policies.
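
      A minimal sketch of applying the labels listed above with kubectl, assuming the runai namespace already exists:

      kubectl label ns runai \\\n  pod-security.kubernetes.io/audit=privileged \\\n  pod-security.kubernetes.io/enforce=privileged \\\n  pod-security.kubernetes.io/warn=privileged --overwrite\n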
      "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#kubernetes-ingress-controller","title":"Kubernetes Ingress Controller","text":"

      Run:ai cluster requires Kubernetes Ingress Controller to be installed on the Kubernetes cluster.

      • OpenShift, RKE, and RKE2 come with a pre-installed ingress controller.
      • Internal tests are being performed on NGINX, Rancher NGINX, OpenShift Router, and Istio.
      • Make sure that a default ingress controller is set.
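
      To check which ingress class, if any, is marked as the default, and to mark one as the default if needed (nginx below is an assumed class name; use the name reported by the first command):

      kubectl get ingressclass # the default class is marked (default)\nkubectl annotate ingressclass nginx ingressclass.kubernetes.io/is-default-class=\"true\"\n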

      There are many ways to install and configure different ingress controllers. A simple example of installing and configuring the NGINX ingress controller using Helm:

      Vanilla KubernetesManaged Kubernetes (EKS, GKE, AKS)Oracle Kubernetes Engine (OKE)

      Run the following commands:

      helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx\nhelm repo update\nhelm upgrade -i nginx-ingress ingress-nginx/ingress-nginx \\\n    --namespace nginx-ingress --create-namespace \\\n    --set controller.kind=DaemonSet \\\n    --set controller.service.externalIPs=\"{<INTERNAL-IP>,<EXTERNAL-IP>}\" # Replace <INTERNAL-IP> and <EXTERNAL-IP> with the internal and external IP addresses of one of the nodes\n

      Run the following commands:

      helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx\nhelm repo update\nhelm install nginx-ingress ingress-nginx/ingress-nginx \\\n    --namespace nginx-ingress --create-namespace\n

      Run the following commands:

      helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx\nhelm repo update\nhelm install nginx-ingress ingress-nginx/ingress-nginx \\\n    --namespace ingress-nginx --create-namespace \\\n    --set controller.service.annotations.oci.oraclecloud.com/load-balancer-type=nlb \\\n    --set controller.service.annotations.oci-network-load-balancer.oraclecloud.com/is-preserve-source=True \\\n    --set controller.service.annotations.oci-network-load-balancer.oraclecloud.com/security-list-management-mode=None \\\n    --set controller.service.externalTrafficPolicy=Local \\\n    --set controller.service.annotations.oci-network-load-balancer.oraclecloud.com/subnet=<SUBNET-ID> # Replace <SUBNET-ID> with the subnet ID of one of your cluster\n
      "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#nvidia-gpu-operator","title":"NVIDIA GPU Operator","text":"

      Run:ai Cluster requires the NVIDIA GPU Operator to be installed on the Kubernetes cluster. Versions 22.9 to 24.6 are supported.

      See Installing the NVIDIA GPU Operator, followed by the notes below:

      • Use the default gpu-operator namespace. Otherwise, you must specify the target namespace using the flag runai-operator.config.nvidiaDcgmExporter.namespace as described in customized cluster installation.
      • NVIDIA drivers may already be installed on the nodes. In such cases, use the NVIDIA GPU Operator flag --set driver.enabled=false. DGX OS is one such example, as it comes bundled with NVIDIA drivers.
      • For distribution-specific additional instructions see below:
      OpenShift Container Platform (OCP)

      The Node Feature Discovery (NFD) Operator is a prerequisite for the NVIDIA GPU Operator in OpenShift. Install the NFD Operator using the Red Hat OperatorHub catalog in the OpenShift Container Platform web console. For more information see Installing the Node Feature Discovery (NFD) Operator

      Elastic Kubernetes Service (EKS)
      • When setting up the cluster, do not install the NVIDIA device plug-in (we want the NVIDIA GPU Operator to install it instead).
      • When using the eksctl tool to create a cluster, use the flag --install-nvidia-plugin=false to disable the installation.

      For GPU nodes, EKS uses an AMI which already contains the NVIDIA drivers. As such, you must use the GPU Operator flag --set driver.enabled=false.
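
      A sketch of installing the GPU Operator with the driver disabled via Helm (chart and repository names follow NVIDIA's public documentation; verify the Operator version against the supported range above):

      helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update\nhelm install gpu-operator nvidia/gpu-operator \\\n  -n gpu-operator --create-namespace \\\n  --set driver.enabled=false # drivers are already present in the EKS GPU AMI\n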

      Google Kubernetes Engine (GKE)

      Before installing the GPU Operator, create the gpu-operator namespace by running

      kubectl create ns gpu-operator\n

      Create the following file:

      resourcequota.yaml
      apiVersion: v1\nkind: ResourceQuota\nmetadata:\n  name: gcp-critical-pods\n  namespace: gpu-operator\nspec:\n  scopeSelector:\n    matchExpressions:\n    - operator: In\n      scopeName: PriorityClass\n      values:\n      - system-node-critical\n      - system-cluster-critical\n

      And then run:

      kubectl apply -f resourcequota.yaml\n
      Rancher Kubernetes Engine 2 (RKE2)

      Make sure to specify the CONTAINERD_CONFIG option exactly as outlined in the documentation and custom configuration guide, using the path /var/lib/rancher/rke2/agent/etc/containerd/config.toml.tmpl. Do not create the file manually if it does not already exist. The GPU Operator will handle this configuration during deployment.

      Oracle Kubernetes Engine (OKE)
      • During cluster setup, create a node pool and set initial_node_labels to include oci.oraclecloud.com/disable-gpu-device-plugin=true, which disables the NVIDIA GPU device plugin.
      • For GPU nodes, OKE defaults to Oracle Linux, which is incompatible with NVIDIA drivers. To resolve this, use a custom Ubuntu image instead.

      For troubleshooting information, see the NVIDIA GPU Operator Troubleshooting Guide.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#prometheus","title":"Prometheus","text":"

      Run:ai Cluster requires Prometheus to be installed on the Kubernetes cluster.

      • OpenShift comes pre-installed with Prometheus
      • For RKE2 see Enable Monitoring instructions to install Prometheus

      There are many ways to install Prometheus. As a simple example, to install the community Kube-Prometheus Stack using Helm, run the following commands:

      helm repo add prometheus-community https://prometheus-community.github.io/helm-charts\nhelm repo update\nhelm install prometheus prometheus-community/kube-prometheus-stack \\\n    -n monitoring --create-namespace --set grafana.enabled=false\n
      "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#optional-software-requirements","title":"Optional software requirements","text":"

      Optional Run:ai capabilities, Distributed Training and Inference require additional Kubernetes applications (frameworks) to be installed on the cluster.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#distributed-training","title":"Distributed training","text":"

      Distributed training enables training of AI models over multiple nodes. This requires installing a distributed training framework on the cluster. The following frameworks are supported:

      • TensorFlow
      • PyTorch
      • XGBoost
      • MPI v2

      There are several ways to install each framework. A simple installation method is the Kubeflow Training Operator, which includes TensorFlow, PyTorch, and XGBoost.

      It is recommended to use Kubeflow Training Operator v1.8.1, and MPI Operator v0.6.0 or later for compatibility with advanced workload capabilities, such as Stopping a workload and Scheduling rules.

      • To install the Kubeflow Training Operator for TensorFlow, PyTorch and XGBoost frameworks, run the following command:
      kubectl apply -k \"github.com/kubeflow/training-operator.git/manifests/overlays/standalone?ref=v1.8.1\"\n
      • To install the MPI Operator for MPI v2, run the following command:
      kubectl apply --server-side -f https://raw.githubusercontent.com/kubeflow/mpi-operator/v0.6.0/deploy/v2beta1/mpi-operator.yaml\n

      Note

      If you require both the MPI Operator and Kubeflow Training Operator, follow the steps below:

      • Install the Kubeflow Training Operator as described above.
      • Disable and delete MPI v1 in the Kubeflow Training Operator by running:
      kubectl patch deployment training-operator -n kubeflow --type='json' -p='[{\"op\": \"add\", \"path\": \"/spec/template/spec/containers/0/args\", \"value\": [\"--enable-scheme=tfjob\", \"--enable-scheme=pytorchjob\", \"--enable-scheme=xgboostjob\"]}]'\nkubectl delete crd mpijobs.kubeflow.org\n
      • Install the MPI Operator as described above.
      "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#inference","title":"Inference","text":"

      Inference enables serving of AI models. This requires the Knative Serving framework to be installed on the cluster; Knative versions 1.11 to 1.16 are supported.

      Follow the Installing Knative instructions. After installation, configure Knative to use the Run:ai scheduler and features, by running:

      kubectl patch configmap/config-autoscaler \\\n  --namespace knative-serving \\\n  --type merge \\\n  --patch '{\"data\":{\"enable-scale-to-zero\":\"true\"}}' && \\\nkubectl patch configmap/config-features \\\n  --namespace knative-serving \\\n  --type merge \\\n  --patch '{\"data\":{\"kubernetes.podspec-schedulername\":\"enabled\",\"kubernetes.podspec-affinity\":\"enabled\",\"kubernetes.podspec-tolerations\":\"enabled\",\"kubernetes.podspec-volumes-emptydir\":\"enabled\",\"kubernetes.podspec-securitycontext\":\"enabled\",\"kubernetes.containerspec-addcapabilities\":\"enabled\",\"kubernetes.podspec-persistent-volume-claim\":\"enabled\",\"kubernetes.podspec-persistent-volume-write\":\"enabled\",\"multi-container\":\"enabled\",\"kubernetes.podspec-init-containers\":\"enabled\"}}'\n
      "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#knative-autoscaling","title":"Knative Autoscaling","text":"

      Run:ai allows for autoscaling a deployment according to the below metrics:

      • Latency (milliseconds)
      • Throughput (requests/sec)
      • Concurrency (requests)

      Using a custom metric (for example, Latency) requires installing the Kubernetes Horizontal Pod Autoscaler (HPA). Use the following command to install. Make sure to update the VERSION in the below command with a supported Knative version.

      kubectl apply -f https://github.com/knative/serving/releases/download/knative-{VERSION}/serving-hpa.yaml\n
      "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#domain-name-requirement","title":"Domain Name Requirement","text":"

      The following requirement must be followed when naming the domain.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#fully-qualified-domain-name-fqdn","title":"Fully Qualified Domain Name (FQDN)","text":"

      You must have a Fully Qualified Domain Name (FQDN) to install Run:ai Cluster (for example, runai.mycorp.local). This cannot be an IP address. The domain name must be accessible inside the organization only. You also need a TLS certificate (private and public) for HTTPS access.
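
      As a quick sanity check, using the example domain from the text and a hypothetical certificate path, you can confirm that the FQDN resolves internally and inspect the certificate's subject and expiry:

      nslookup runai.mycorp.local # confirm the FQDN resolves inside the organization\nopenssl x509 -in /path/to/fullchain.pem -noout -subject -enddate # inspect the certificate\n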

      "},{"location":"admin/runai-setup/cluster-setup/cluster-setup-intro/","title":"SaaS Cluster Setup Introduction","text":"

      This section is a step-by-step guide for setting up a Run:ai cluster.

      • A Run:ai cluster is a Kubernetes application installed on top of a Kubernetes cluster.
      • A Run:ai cluster connects to the Run:ai control plane on the cloud. The control plane provides a control point as well as a monitoring and control user interface for Administrators and Researchers.
      • A customer may have multiple Run:ai Clusters, all connecting to a single control plane.

      For additional details see the Run:ai system components

      "},{"location":"admin/runai-setup/cluster-setup/cluster-setup-intro/#documents","title":"Documents","text":"
      • Review Run:ai cluster System Requirements and Network Requirements.
      • Cluster Install step-by-step guide.
      • Look for troubleshooting tips if required.
      • Cluster Upgrade and Cluster Uninstall instructions.
      "},{"location":"admin/runai-setup/cluster-setup/cluster-setup-intro/#customization","title":"Customization","text":"

      For a list of optional customizations see Customize Installation

      "},{"location":"admin/runai-setup/cluster-setup/cluster-setup-intro/#additional-configuration","title":"Additional Configuration","text":"

      For a list of advanced configuration scenarios such as configuring researcher authentication, Single Sign-On, limiting the installation to specific nodes, and more, see the Configuration Articles section.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-setup-intro/#next-steps","title":"Next Steps","text":"

      After setting up the cluster, you may want to start setting up Researchers. See: Researcher Setup.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-upgrade/","title":"Cluster Upgrade","text":"

      This article explains how to upgrade Run:ai cluster version.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-upgrade/#before-upgrade","title":"Before upgrade","text":"

      There are a number of matters to consider prior to upgrading the Run:ai cluster version.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-upgrade/#system-and-network-requirements","title":"System and network requirements","text":"

      Before upgrading the Run:ai cluster, validate that the latest system requirements and network requirements are met, as they can change from time to time.

      Important

      It is highly recommended to upgrade the Kubernetes version together with the Run:ai cluster version, to ensure compatibility with the latest supported version of your Kubernetes distribution.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-upgrade/#helm","title":"Helm","text":"

      The latest releases of the Run:ai cluster require Helm 3.14 or above.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-upgrade/#upgrade","title":"Upgrade","text":"

      Follow the instructions to upgrade using Helm. The Helm commands to upgrade the Run:ai cluster version may differ between versions. The steps below describe how to get the instructions from the Run:ai UI.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-upgrade/#getting-the-installation-instructions","title":"Getting the installation instructions","text":"

      Follow the setup and installation instructions below to get the installation instructions to upgrade the Run:ai cluster.

      "},{"location":"admin/runai-setup/cluster-setup/cluster-upgrade/#setup","title":"Setup","text":"
      1. In the Run:ai UI, go to Clusters
      2. Select the cluster you want to upgrade
      3. Click INSTALLATION INSTRUCTIONS
      4. Optional: Select the Run:ai cluster version (latest, by default)
      5. Click CONTINUE
      "},{"location":"admin/runai-setup/cluster-setup/cluster-upgrade/#installation-instructions","title":"Installation instructions","text":"
      1. Follow the installation instructions (see the additional instructions below when upgrading to v2.13) and run the Helm commands provided on your Kubernetes cluster (see the troubleshooting section below if installation fails)
      2. Click DONE
      3. Once installation is complete, validate that the cluster is Connected and listed with the new cluster version (see the cluster troubleshooting scenarios). The cluster is now upgraded to the latest version.

      Note

      To upgrade to a specific version, modify the --version flag by specifying the desired <version-number>. You can find all available versions by using the helm search repo command.
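
      For example, assuming the Run:ai Helm repository has already been added on the machine running the upgrade:

      helm search repo runai-cluster -l # list all available chart versions, then pin one with --version <version-number>\n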

      "},{"location":"admin/runai-setup/cluster-setup/cluster-upgrade/#upgrade-to-runai-cluster-version-213-old-release","title":"Upgrade to Run:ai cluster version 2.13 (old release)","text":"

      Run:ai cluster version 2.13 (old release) does not support migration of the configured Helm values. If you have customized configurations you want to migrate, follow the additional steps below:

      1. Download the Run:ai Helm values file by running the command provided in your terminal
      2. Run the following command to save existing cluster Helm values into old-values.yaml
      helm get values runai-cluster -n runai > old-values.yaml\n
      3. Identify configured custom values that you want to migrate
      4. Manually merge the values from old-values.yaml into the new values file
      "},{"location":"admin/runai-setup/cluster-setup/cluster-upgrade/#troubleshooting","title":"Troubleshooting","text":"

      If you encounter an issue with the cluster upgrade, use the troubleshooting scenario below.

      Installation fails

      If the Run:ai cluster upgrade fails, check the installation logs to identify the issue.

      Run the following script to print the installation logs:

      curl -fsSL https://raw.githubusercontent.com/run-ai/public/main/installation/get-installation-logs.sh\n
      Cluster status

      If the Run:ai cluster upgrade completes, but the cluster status does not show as Connected, refer to the cluster troubleshooting scenarios.

      "},{"location":"admin/runai-setup/cluster-setup/customize-cluster-install/","title":"Customize Installation","text":"

      This article explains the available configurations for customizing the Run:ai cluster installation.

      "},{"location":"admin/runai-setup/cluster-setup/customize-cluster-install/#helm-chart-values","title":"Helm chart values","text":"

      The Run:ai cluster installation can be customized to support your environment via Helm values files or Helm install flags.

      These configurations are saved in the runaiconfig Kubernetes object and can be edited post-installation as needed. For more information, see Advanced Cluster Configurations.
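
      Post-installation, these settings can be inspected or edited directly on the runaiconfig object. The resource name below is an assumption based on a default installation, so verify it with the first command:

      kubectl get runaiconfig -n runai # list the configuration object(s)\nkubectl edit runaiconfig runai -n runai\n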

      "},{"location":"admin/runai-setup/cluster-setup/customize-cluster-install/#values","title":"Values","text":"

      The following table lists the available Helm chart values that can be configured to customize the Run:ai cluster installation.

      • global.image.registry (string): Global Docker image registry. Default: \"\"
      • global.additionalImagePullSecrets (list): List of image pull secrets references. Default: []
      • spec.researcherService.ingress.tlsSecret (string): Existing secret key where cluster TLS certificates are stored (non-OpenShift). Default: runai-cluster-domain-tls-secret
      • spec.researcherService.route.tlsSecret (string): Existing secret key where cluster TLS certificates are stored (OpenShift only). Default: \"\"
      • spec.prometheus.spec.image (string): Due to a known issue in the Prometheus Helm chart, the imageRegistry setting is ignored. To pull the image from a different registry, you can manually specify the Prometheus image reference. Default: quay.io/prometheus/prometheus
      • spec.prometheus.spec.imagePullSecrets (string): List of image pull secrets references in the runai namespace to use for pulling Prometheus images (relevant for air-gapped installations). Default: []

      "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/","title":"Install using Base Command Manager","text":"

      This article explains the steps required to install the Run:ai cluster on a DGX Kubernetes Cluster using NVIDIA Base Command Manager (BCM).

      "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#runai-installer","title":"Run:ai Installer","text":"

      The Run:ai Installer is a User Interface (UI) wizard that simplifies the deployment of the Run:ai Cluster on DGX. The Run:ai Installer can be installed via the BCM cluster wizard during cluster creation.

      Note

      For advanced configuration and custom deployment options, refer to the Install using Helm.

      "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#before-installation","title":"Before installation","text":"

      There are a number of matters to consider prior to installing using the Run:ai Installer.

      "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#application-secret-key","title":"Application secret key","text":"

      An Application secret key is required to connect the cluster to the Run:ai platform. To get the Application secret key, a new cluster must be added.

      1. Follow the Adding a new cluster setup instructions. Do not follow the Installation instructions.
      2. Once the cluster instructions are displayed, find the controlPlane.clientSecret flag in the displayed Helm command, then copy and save its value.

      Note

      For DGX Bundle customers installing their first Run:ai cluster, the Application secret key is provided by the Run:ai Support team.

      "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#system-and-network-requirements","title":"System and network requirements","text":"

      Before installing the Run:ai cluster on a DGX system using BCM, ensure that the System requirements and Network requirements are met.

      The BCM cluster wizard deploys essential Software Requirements, such as the Kubernetes Ingress Controller, NVIDIA GPU Operator, and Prometheus, as part of the Run:ai Installer deployment. Additional optional software requirements for Distributed training and Inference require manual setup.

      "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#tenant-name","title":"Tenant Name","text":"

      Your tenant name is predefined and supplied by Run:ai. Each customer is provided with a unique, dedicated URL in the format <tenant-name>.run.ai which includes the required tenant name.

      "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#tls-certificate","title":"TLS certificate","text":"

      TLS private and public keys for the cluster\u2019s Fully Qualified Domain Name (FQDN) are required for HTTPS access to the cluster.

      Important

      The TLS certificate must be trusted. Self-signed certificates are not supported.

      "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#installation","title":"Installation","text":"

      Follow these instructions to install using BCM.

      "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#installing-a-cluster","title":"Installing a cluster","text":"

      The cluster installer is available via the locally installed BCM landing page.

      1. Go to the locally installed BCM landing page and select the Run:ai tile, or go directly to http://<BCM-CLUSTER-IP>:30080/runai-installer (HTTP only)
      2. Click VERIFY to check that the System Requirements are met.
      3. After verification completes successfully, click CONTINUE.
      4. Enter the cluster information and click CONTINUE.
      5. The Run:ai installation starts and should complete within a few minutes.
      6. Once the message Run:ai was installed successfully! is displayed, click START USING RUN:AI to open the tenant's login page in a new browser tab.
      "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#troubleshooting","title":"Troubleshooting","text":"

      If you encounter an issue with the installation, try the troubleshooting scenario below.

      "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#runai-installer_1","title":"Run:ai Installer","text":"

      The Run:ai Installer is a pod in Kubernetes. The pod is responsible for the installation preparation and prerequisite gathering phase. In case of an error during the prerequisites verification, run the following commands to print the logs:

      kubectl get pods -n runai | grep 'cluster-installer' # Find the cluster installer pod's name\nkubectl logs <POD-NAME> -n runai # Print the cluster installer pod logs\n
      "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#installation_1","title":"Installation","text":"

      If the Run:ai cluster installation failed, check the installation logs to identify the issue. Run the following script to print the installation logs:

      curl -fsSL https://raw.githubusercontent.com/run-ai/public/main/installation/get-installation-logs.sh\n
      "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#cluster-status","title":"Cluster status","text":"

      If the Run:ai cluster installation completed, but the cluster status did not change to Connected, check the cluster troubleshooting scenarios.

      "},{"location":"admin/runai-setup/cluster-setup/network-req/","title":"Network Requirements","text":"

      The following network requirements are for the Run:ai cluster installation and usage.

      "},{"location":"admin/runai-setup/cluster-setup/network-req/#external-access","title":"External access","text":"

      Set out below are the domains to whitelist and ports to open for installation, upgrade, and usage of the application and its management.

      Ensure the inbound and outbound rules are correctly applied to your firewall.

      "},{"location":"admin/runai-setup/cluster-setup/network-req/#inbound-rules","title":"Inbound rules","text":"

      To allow your organization\u2019s Run:ai users to interact with the cluster using the Run:ai Command-line interface, or access specific UI features, certain inbound ports need to be open.

      • Run:ai cluster: Run:ai cluster HTTPS entrypoint. Source: 0.0.0.0, Destination: all k8s nodes, Port: 443

      "},{"location":"admin/runai-setup/cluster-setup/network-req/#outbound-rules","title":"Outbound rules","text":"

      For the Run:ai cluster installation and usage, certain outbound ports must be open.

      • Run:ai Platform: Run:ai cloud instance. Source: Run:ai system nodes, Destination: app.run.ai, Port: 443
      • Grafana: Run:ai cloud metrics store. Source: Run:ai system nodes, Destination: prometheus-us-central1.grafana.net and runailabs.com, Port: 443
      • Google Container Registry: Run:ai image repository. Source: All K8S nodes, Destination: gcr.io/run-ai-prod, Port: 443
      • JFrog Artifactory: Run:ai Helm repository. Source: Helm client machine, Destination: runai.jfrog.io, Port: 443

      The Run:ai installation has software requirements that require additional components to be installed on the cluster. This article includes simple installation examples which can be used optionally and require the following cluster outbound ports to be open:

      • Kubernetes Registry: Ingress Nginx image repository. Source: All K8S nodes, Destination: registry.k8s.io, Port: 443
      • Google Container Registry: GPU Operator and Knative image repository. Source: All K8S nodes, Destination: gcr.io, Port: 443
      • Red Hat Container Registry: Prometheus Operator image repository. Source: All K8S nodes, Destination: quay.io, Port: 443
      • Docker Hub Registry: Training Operator image repository. Source: All K8S nodes, Destination: docker.io, Port: 443

      Note

      If you are using an HTTP proxy, contact Run:ai support for further instructions.

      "},{"location":"admin/runai-setup/cluster-setup/network-req/#internal-network","title":"Internal network","text":"

      Ensure that all Kubernetes nodes can communicate with each other across all necessary ports. Kubernetes assumes full interconnectivity between nodes, so you must configure your network to allow this seamless communication. Specific port requirements may vary depending on your network setup.

      "},{"location":"admin/runai-setup/cluster-setup/project-management/","title":"Manually Create Projects","text":""},{"location":"admin/runai-setup/cluster-setup/project-management/#manual-creation-of-namespaces-for-projects","title":"Manual Creation of Namespaces for Projects","text":""},{"location":"admin/runai-setup/cluster-setup/project-management/#introduction","title":"Introduction","text":"

      The Administrator creates Run:ai Projects via the Run:ai user interface. When enabling Researcher Authentication, you also assign users to Projects.

      Run:ai Projects are implemented as Kubernetes namespaces. When creating a new Run:ai Project, Run:ai does the following automatically:

      1. Creates a namespace by the name of runai-<PROJECT-NAME>.
      2. Labels the namespace as managed by Run:ai.
      3. Provides access to the namespace for Run:ai services.
      4. Associates users with the namespace.
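
      For example, after creating a hypothetical Project named team-a, the automatically created namespace and its labels can be inspected with:

      kubectl get ns runai-team-a --show-labels\n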

      This process may need to be altered if:

      • Researchers already have existing Kubernetes namespaces
      • The organization's Kubernetes namespace naming convention does not allow the runai- prefix.
      • The organization's policy does not allow the automatic creation of namespaces.
      "},{"location":"admin/runai-setup/cluster-setup/project-management/#process","title":"Process","text":"

      Run:ai allows the association of a Run:ai Project with any existing Kubernetes namespace:

      • Disable namespace creation by setting the cluster flag createNamespaces to false. For more information, see Advanced Cluster Configuration.
      • Using the Run:ai User Interface, create a new Project <PROJECT-NAME>. A namespace will not be created.
      • Associate an existing namespace <NAMESPACE> with the Run:ai project by running:
      kubectl label ns <NAMESPACE>  runai/queue=<PROJECT_NAME>\n

      Caution

      Setting the createNamespaces flag to false moves the responsibility for creating namespaces that match Run:ai Projects to the administrator.
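
      Putting it together, a minimal sketch of the manual flow with a hypothetical project team-a and an existing namespace ml-namespace:

      kubectl create ns ml-namespace # skip if the namespace already exists\nkubectl label ns ml-namespace runai/queue=team-a\n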

      "},{"location":"admin/runai-setup/self-hosted/overview/","title":"Self Hosted Run:ai Installation","text":"

      The self-hosted option is for organizations that cannot use a SaaS solution due to data leakage concerns.

      Run:ai self-hosting comes with two variants:

      • Connected: The organization can freely download from the internet (though upload is not allowed).
      • Air-gapped: The organization has no connection to the internet.

      The self-hosted installation is priced differently. For further information please talk to Run:ai sales.

      "},{"location":"admin/runai-setup/self-hosted/overview/#self-hosting-with-kubernetes-vs-openshift","title":"Self-hosting with Kubernetes vs OpenShift","text":"

      Run:ai has been certified with a specified set of Kubernetes distributions. The OpenShift installation is different from the rest. As such, the Run:ai self-hosted installation instructions are divided into two separate sections:

      • OpenShift-based installation. See Run:ai OpenShift installation. The Run:ai operator for OpenShift is certified by Red Hat.
      • Kubernetes-based installation. See Run:ai Kubernetes installation.
      "},{"location":"admin/runai-setup/self-hosted/k8s/additional-clusters/","title":"Installing additional Clusters","text":"

      The first Run:ai cluster is typically installed on the same Kubernetes cluster as the Run:ai control plane. Run:ai supports multiple clusters per single control plane. This document is about installing additional clusters on different Kubernetes clusters.

      "},{"location":"admin/runai-setup/self-hosted/k8s/additional-clusters/#installation","title":"Installation","text":"

      Follow the Run:ai SaaS installation network instructions as described in Domain name requirement. Specifically:

      1. Install the Run:ai prerequisites, including an ingress controller and Prometheus.
      2. The Cluster should have a dedicated URL with a trusted certificate.
      3. Create a secret in the Run:ai namespace containing the details of a trusted certificate.
      4. Run the helm command as instructed.
      "},{"location":"admin/runai-setup/self-hosted/k8s/backend/","title":"Install the Run:ai Control Plane","text":""},{"location":"admin/runai-setup/self-hosted/k8s/backend/#prerequisites-and-preparations","title":"Prerequisites and preparations","text":"

      Make sure you have followed the Control Plane prerequisites and preparations.

      "},{"location":"admin/runai-setup/self-hosted/k8s/backend/#helm-install","title":"Helm install","text":"

      Run the helm command below:

      Connected / Airgapped
      helm repo add runai-backend https://runai.jfrog.io/artifactory/cp-charts-prod\nhelm repo update\nhelm upgrade -i runai-backend -n runai-backend runai-backend/control-plane --version \"~2.20.0\" \\\n    --set global.domain=<DOMAIN>  # (1)\n
      1. Domain name described here.

      Info

      To install a specific version, add --version <version> to the install command. You can find available versions by running helm search repo -l runai-backend.

      helm upgrade -i runai-backend control-plane-<VERSION>.tgz  \\ # (1)\n    --set global.domain=<DOMAIN>  \\ # (2)\n    --set global.customCA.enabled=true \\  # (3)\n    -n runai-backend -f custom-env.yaml  # (4)\n
      1. Replace <VERSION> with the Run:ai control plane version.
      2. Domain name described here.
      3. See the Local Certificate Authority instructions below
      4. custom-env.yaml should have been created by the prepare installation script in the previous section.

      Tip

      Use the --dry-run flag to gain an understanding of what is being installed before the actual installation.

      "},{"location":"admin/runai-setup/self-hosted/k8s/backend/#additional-runai-configurations-optional","title":"Additional Run:ai configurations (optional)","text":"

      There may be cases where you need to set additional properties. To apply the changes, run helm upgrade and use --set to set specific configurations, then restart the relevant Run:ai pods so they can fetch the new configurations.
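
      A sketch of changing a single property (using one of the keys listed below) while keeping previously set values, and then restarting the affected pods; the release name and namespace follow the install command above:

      helm upgrade -i runai-backend -n runai-backend runai-backend/control-plane \\\n  --reuse-values --set global.ingress.ingressClass=nginx\nkubectl rollout restart deployment -n runai-backend # blunt option; restarting only the affected service is usually enough\n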

      Key Change Description global.ingress.ingressClass Ingress class Run:ai default is using NGINX. If your cluster has a different ingress controller, you can configure the ingress class to be created by Run:ai global.ingress.tlsSecretName TLS secret name Run:ai requires the creation of a secret with domain certificate. If the runai-backend namespace already had such a secret, you can set the secret name here <component> resources: limits: cpu: 500m memory: 512Mi requests: cpu: 250m memory: 256Mi Pod request and limits Set Run:ai and 3rd party services' resources disableIstioSidecarInjection.enabled Disable Istio sidecar injection Disable the automatic injection of Istio sidecars across the entire Run:ai Control Plane services. global.affinity System nodes Sets the system nodes where the Run:ai control plane services are scheduled."},{"location":"admin/runai-setup/self-hosted/k8s/backend/#additional-3rd-party-configurations-optional","title":"Additional 3rd party configurations (optional)","text":"

      The Run:ai Control Plane chart, includes multiple sub-charts of 3rd party components:

      • PostgreSQL - Data store
      • Thanos - Metrics Store
      • Keycloakx - Identity & Access Management
      • Grafana - Analytics Dashboard
      • Redis - Caching (Disabled, by default)

      Tip

      Click any component to view its chart values and configurations.

      If you have opted to connect to an external PostgreSQL database, refer to the additional configurations table below. Adjust the following parameters based on your connection details:

      1. Disable PostgreSQL deployment - postgresql.enabled
      2. Run:ai connection details - global.postgresql.auth
      3. Grafana connection details - grafana.dbUser, grafana.dbPassword
      "},{"location":"admin/runai-setup/self-hosted/k8s/backend/#postgresql","title":"PostgreSQL","text":"Key Change Description postgresql.enabled PostgreSQL installation If set to false the PostgreSQL will not be installed global.postgresql.auth.host PostgreSQL host Hostname or IP address of the PostgreSQL server global.postgresql.auth.port PostgreSQL port Port number on which PostgreSQL is running global.postgresql.auth.username PostgreSQL username Username for connecting to PostgreSQL global.postgresql.auth.password PostgreSQL password Password for the PostgreSQL user specified by global.postgresql.auth.username global.postgresql.auth.postgresPassword PostgreSQL default admin password Password for the built-in PostgreSQL superuser (postgres) global.postgresql.auth.existingSecret Postgres Credentials (secret) Existing secret name with authentication credentials global.postgresql.auth.dbSslMode Postgres connection SSL mode Set the SSL mode, see list in Protection Provided in Different Modes, prefer mode is not supported postgresql.primary.initdb.password PostgreSQL default admin password Set the same password as in global.postgresql.auth.postgresPassword (if changed) postgresql.primary.persistence.storageClass Storage class The installation to work with a specific storage class rather than the default one"},{"location":"admin/runai-setup/self-hosted/k8s/backend/#thanos","title":"Thanos","text":"Key Change Description thanos.receive.persistence.storageClass Storage class The installation to work with a specific storage class rather than the default one"},{"location":"admin/runai-setup/self-hosted/k8s/backend/#keycloakx","title":"Keycloakx","text":"Key Change Description keycloakx.adminUser User name of the internal identity provider administrator This user is the administrator of Keycloak keycloakx.adminPassword Password of the internal identity provider administrator This password is for the administrator of Keycloak keycloakx.existingSecret Keycloakx Credentials (secret) Existing secret name with authentication credentials global.keycloakx.host KeyCloak (Run:ai internal identity provider) host path Override the DNS for Keycloak. This can be used to access Keycloak from outside the Run:ai Control Plane cluster via ingress

      The keycloakx.adminUser can only be set during the initial installation. The admin password, however, can also be changed later through the Keycloak UI, but you must also update the keycloakx.adminPassword value in the Helm chart using helm upgrade. Failing to update the Helm values after changing the password can lead to control plane services encountering errors.

      "},{"location":"admin/runai-setup/self-hosted/k8s/backend/#grafana","title":"Grafana","text":"Key Change Description grafana.db.existingSecret Grafana database connection credentials (secret) Existing secret name with authentication credentials grafana.dbUser Grafana database username Username for accessing the Grafana database grafana.dbPassword Grafana database password Password for the Grafana database user grafana.admin.existingSecret Grafana admin default credentials (secret) Existing secret name with authentication credentials grafana.adminUser Grafana username Override the Run:ai default user name for accessing Grafana grafana.adminPassword Grafana password Override the Run:ai default password for accessing Grafana"},{"location":"admin/runai-setup/self-hosted/k8s/backend/#redis","title":"Redis","text":"Key Change Description redisCache.auth.password Redis (Runai internal cache mechanism) applicative password Override the default password redisCache.auth.existingSecret Redis credentials (secret) Existing secret name with authentication credentials"},{"location":"admin/runai-setup/self-hosted/k8s/backend/#next-steps","title":"Next Steps","text":""},{"location":"admin/runai-setup/self-hosted/k8s/backend/#connect-to-runai-user-interface","title":"Connect to Run:ai User interface","text":"

      Go to: runai.<domain>. Log in using the default credentials: User: test@run.ai, Password: Abcd!234. Go to the Users area and change the password.

      "},{"location":"admin/runai-setup/self-hosted/k8s/backend/#enable-forgot-password-optional","title":"Enable Forgot Password (optional)","text":"

      To support the Forgot password functionality, follow the steps below.

      • Go to runai.<domain>/auth and Log in.
      • Under Realm settings, select the Login tab and enable the Forgot password feature.
      • Under the Email tab, define an SMTP server, as explained here
      "},{"location":"admin/runai-setup/self-hosted/k8s/backend/#install-runai-cluster","title":"Install Run:ai Cluster","text":"

      Continue with installing a Run:ai Cluster.

      "},{"location":"admin/runai-setup/self-hosted/k8s/cluster/","title":"Self Hosted installation over Kubernetes - Cluster Setup","text":""},{"location":"admin/runai-setup/self-hosted/k8s/cluster/#prerequisites","title":"Prerequisites","text":"

      Install prerequisites as per System Requirements document.

      Note

      For self-hosted deployments, Kubernetes Ingress Controller and Cluster Fully Qualified Domain Name (FQDN) requirements are only necessary when the Run:ai Control Plane and Run:ai Cluster reside on separate Kubernetes clusters.

      "},{"location":"admin/runai-setup/self-hosted/k8s/cluster/#install-cluster","title":"Install Cluster","text":"ConnectedAirgapped

      Perform the cluster installation instructions explained here.

      Perform the cluster installation instructions explained here.

      On the second tab of the cluster wizard, when copying the helm command for installation, you will need to use the pre-provided installation file instead of using helm repositories. As such:

      • Do not add the helm repository and do not run helm repo update.
      • Instead, edit the helm upgrade command.
        • Replace runai/runai-cluster with runai-cluster-<version>.tgz.
        • Add --set global.image.registry=<Docker Registry address> where the registry address is as entered in the preparation section

      The command should look like the following:

      helm upgrade -i runai-cluster runai-cluster-<version>.tgz \\\n    --set controlPlane.url=... \\\n    --set controlPlane.clientSecret=... \\\n    --set cluster.uid=... \\\n    --set cluster.url=... --create-namespace \\\n    --set global.image.registry=registry.mycompany.local \\\n

      Tip

      Use the --dry-run flag to gain an understanding of what is being installed before the actual installation. For more details see Understanding cluster access roles.

      "},{"location":"admin/runai-setup/self-hosted/k8s/cluster/#optional-customize-installation","title":"(Optional) Customize Installation","text":"

      To customize specific aspects of the cluster installation see customize cluster installation.

      "},{"location":"admin/runai-setup/self-hosted/k8s/next-steps/","title":"Next Steps","text":"
      • Create additional Run:ai Users.
      • Set up Project-based Researcher Access Control.
      • Set up Researchers to work with the Run:ai Command-line interface (CLI). See Installing the Run:ai Command-line Interface on how to install the CLI for users.
      • Review advanced setup and maintenance scenarios.
      "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/","title":"Preparing for a Run:ai Kubernetes installation","text":"

      The following section provides IT with the information needed to prepare for a Run:ai installation.

      "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#prerequisites","title":"Prerequisites","text":"

      Follow the prerequisites as explained in Self-Hosted installation over Kubernetes.

      "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#software-artifacts","title":"Software artifacts","text":"ConnectedAirgapped

      You should receive a file: runai-reg-creds.yaml from Run:ai Customer Support. The file provides access to the Run:ai Container registry.

      SSH into a node with kubectl access to the cluster and Docker installed. Run the following to enable image download from the Run:ai Container Registry on Google cloud:

      kubectl create namespace runai-backend\nkubectl apply -f runai-reg-creds.yaml\n

      You should receive a single file runai-air-gapped-<VERSION>.tar.gz from Run:ai customer support

      SSH into a node with kubectl access to the cluster and Docker installed.

      Run:ai assumes the existence of a Docker registry for images, most likely installed within the organization. The installation requires the network address and port of the registry (referenced below as <REGISTRY_URL>).

      To extract Run:ai files, replace <VERSION> in the command below and run:

      tar xvf runai-airgapped-package-<VERSION>.tar.gz\n\nkubectl create namespace runai-backend\n

      Upload images

      Upload images to a local Docker Registry. Set the Docker Registry address in the form of NAME:PORT (do not add https):

      export REGISTRY_URL=<Docker Registry address>\n

      Run the following script (you must have dockerd installed and at least 20GB of free disk space):

      sudo -E ./setup.sh\n

      If Docker is configured to run as non-root then sudo is not required.

      The script should create a file named custom-env.yaml which will be used by the control-plane installation.

      "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#private-docker-registry-optional","title":"Private Docker Registry (optional)","text":"

      To access the organization's Docker registry, you must set the registry's credentials (imagePullSecret).

      Create the secret named runai-reg-creds based on your existing credentials. For more information, see Pull an Image from a Private Registry.

      "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#configure-your-environment","title":"Configure your environment","text":""},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#domain-certificate","title":"Domain Certificate","text":"

      The Run:ai control plane requires a domain name (FQDN). You must supply a domain name as well as a trusted certificate for that domain.

      • When installing the first Run:ai cluster on the same Kubernetes cluster as the control plane, the Run:ai cluster URL will be the same as the control-plane URL.
      • When installing the Run:ai cluster on a separate Kubernetes cluster, follow the Run:ai Domain name requirement.
      • If your network is air-gapped, you will need to provide the Run:ai control-plane and cluster with information about the local certificate authority.

      You must provide the domain's private key and crt as a Kubernetes secret in the runai-backend namespace. Run:

      kubectl create secret tls runai-backend-tls -n runai-backend \\\n    --cert /path/to/fullchain.pem --key /path/to/private.pem\n
      "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#local-certificate-authority-air-gapped-only","title":"Local Certificate Authority (air-gapped only)","text":"

      In air-gapped environments, you must prepare the public key of your local certificate authority as described here. It will need to be installed in Kubernetes for the installation to succeed.

      "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#mark-runai-system-workers-optional","title":"Mark Run:ai system workers (optional)","text":"

      You can optionally set the Run:ai control plane to run on specific nodes. Kubernetes will attempt to schedule Run:ai pods to these nodes. If these nodes lack resources, the Run:ai pods will be scheduled to other, non-labeled nodes.

      To set system worker nodes run:

      kubectl label node <NODE-NAME> node-role.kubernetes.io/runai-system=true\n

      Warning

      Do not select the Kubernetes master as a runai-system node. This may cause Kubernetes to stop working (specifically if Kubernetes API Server is configured on 443 instead of the default 6443).

      "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#external-postgres-database-optional","title":"External Postgres database (optional)","text":"

      If you have opted to use an external PostgreSQL database, you need to perform initial setup to ensure successful installation. Follow these steps:

      1. Create a SQL script file, edit the parameters below, and save it locally:

        • Replace <DATABASE_NAME> with a dedicated database name for Run:ai in your PostgreSQL database.
        • Replace <ROLE_NAME> with a dedicated role name (user) for the Run:ai database.
        • Replace <ROLE_PASSWORD> with a password for the new PostgreSQL role.
        • Replace <GRAFANA_PASSWORD> with the password to be set for Grafana integration.
        -- Create a new database for runai\nCREATE DATABASE <DATABASE_NAME>; \n\n-- Create the role with login and password\nCREATE ROLE <ROLE_NAME>  WITH LOGIN PASSWORD '<ROLE_PASSWORD>'; \n\n-- Grant all privileges on the database to the role\nGRANT ALL PRIVILEGES ON DATABASE <DATABASE_NAME> TO <ROLE_NAME>; \n\n-- Connect to the newly created database\n\\c <DATABASE_NAME> \n\n-- grafana\nCREATE ROLE grafana WITH LOGIN PASSWORD '<GRAFANA_PASSWORD>'; \nCREATE SCHEMA grafana authorization grafana;\nALTER USER grafana set search_path='grafana';\n-- Exit psql\n\\q\n
      2. Run the following command on a machine where PostgreSQL client (pgsql) is installed:

        psql --host <POSTGRESQL_HOST> \\ # (1)\n--user <POSTGRESQL_USER> \\ # (2)\n--port <POSTGRESQL_PORT> \\ # (3)\n--dbname <POSTGRESQL_DB> \\ # (4)\n-a -f <SQL_FILE> \\ # (5)\n
        1. Replace <POSTGRESQL_HOST> with the PostgreSQL ip address or hostname.
        2. Replace <POSTGRESQL_USER> with the PostgreSQL username.
        3. Replace <POSTGRESQL_PORT> with the port number where PostgreSQL is running.
        4. Replace <POSTGRESQL_DB> with the name of your PostgreSQL database.
        5. Replace <SQL_FILE> with the path to the SQL script created in the previous step.
      "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#additional-permissions","title":"Additional permissions","text":"

      As part of the installation, you will be required to install the Run:ai Control Plane and Cluster Helm Charts. The Helm Charts require Kubernetes administrator permissions. You can review the exact permissions provided by using the --dry-run on both helm charts.
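
      For instance, a sketch of such a dry run for the control plane chart (substitute your actual domain and any other flags you plan to use) renders the manifests, including the RBAC objects, without installing anything:

      helm upgrade -i runai-backend -n runai-backend runai-backend/control-plane --version \"~2.20.0\" \\\n    --set global.domain=<DOMAIN> --dry-run\n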

      "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#validate-prerequisites","title":"Validate Prerequisites","text":"

      Once you believe that the Run:ai prerequisites and preparations are met, we highly recommend installing and running the Run:ai pre-install diagnostics script. The tool:

      • Tests the below requirements as well as additional failure points related to Kubernetes, NVIDIA, storage, and networking.
      • Looks at additional components installed and analyzes their relevance to a successful Run:ai installation.

      To use the script download the latest version of the script and run:

      chmod +x preinstall-diagnostics-<platform>\n./preinstall-diagnostics-<platform> --domain <dns-entry>\n

      If the script fails, or if the script succeeds but the Kubernetes system contains components other than Run:ai, locate the file runai-preinstall-diagnostics.txt in the current directory and send it to Run:ai technical support.

      For more information on the script including additional command-line flags, see here.

      "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#next-steps","title":"Next steps","text":"

      Continue with installing the Run:ai Control Plane.

      "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/","title":"Self-Hosted installation over Kubernetes - Prerequisites","text":"

      Before proceeding with this document, please review the installation types documentation to understand the difference between air-gapped and connected installations.

      "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#runai-components","title":"Run:ai Components","text":"

      As part of the installation process you will install:

      • A control-plane managing cluster
      • One or more clusters

      Both the control plane and clusters require Kubernetes. Typically the control plane and first cluster are installed on the same Kubernetes cluster but this is not a must.

      "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#installer-machine","title":"Installer machine","text":"

      The machine running the installation script (typically the Kubernetes master) must have:

      • At least 50GB of free space.
      • Docker installed.
      "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#helm","title":"Helm","text":"

      Run:ai requires Helm 3.14 or later. To install Helm, see Installing Helm. If you are installing an air-gapped version of Run:ai, the Run:ai tar file contains the helm binary.

      "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#cluster-hardware-requirements","title":"Cluster hardware requirements","text":"

      The Run:ai control plane services require the following resources:

      Component Required Capacity CPU 10 cores Memory 12GB Disk space 110GB

      If the Run:ai cluster is planned to be installed on the same Kubernetes cluster as the Run:ai control plane, ensure the control plane requirements above are in addition to the Run:ai cluster hardware requirements.

      "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#arm-limitation","title":"ARM Limitation","text":"

      The control plane does not support CPU nodes with ARM64 architecture. To schedule the Run:ai control plane services on supported nodes, use the global.affinity configuration parameter as detailed in Additional Run:ai configurations.

      "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#runai-software-requirements","title":"Run:ai software requirements","text":""},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#cluster-nodes","title":"Cluster Nodes","text":"

      See Run:ai Cluster prerequisites operating system requirements.

      Nodes are required to be synchronized by time using NTP (Network Time Protocol) for proper system functionality.

      "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#kubernetes","title":"Kubernetes","text":"

      See Run:ai Cluster prerequisites Kubernetes distribution requirements.

      The Run:ai control plane operating system prerequisites are identical.

      The Run:ai control-plane requires a default storage class to create persistent volume claims for Run:ai storage. The storage class, as per Kubernetes standards, controls the reclaim behavior: whether the Run:ai persistent data is saved or deleted when the Run:ai control plane is deleted.

      Note

      For a simple (nonproduction) storage class example see Kubernetes Local Storage Class. The storage class will set the directory /opt/local-path-provisioner to be used across all nodes as the path for provisioning persistent volumes.

      Then set the new storage class as default:

      kubectl patch storageclass local-path -p '{\"metadata\": {\"annotations\":{\"storageclass.kubernetes.io/is-default-class\":\"true\"}}}'\n
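
      You can then verify which storage class is marked as default; the default class is annotated with (default) in the output of:

      kubectl get storageclass\n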
      "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#install-prerequisites","title":"Install prerequisites","text":""},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#ingress-controller","title":"Ingress Controller","text":"

      The Run:ai control plane installation assumes an existing installation of NGINX as the ingress controller. You can follow the Run:ai Cluster prerequisites Kubernetes ingress controller installation.

      "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#nvidia-gpu-operator","title":"NVIDIA GPU Operator","text":"

      See Run:ai Cluster prerequisites NVIDIA GPU operator requirements.

      The Run:ai control plane, when installed without a Run:ai cluster, does not require the NVIDIA prerequisites.

      "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#prometheus","title":"Prometheus","text":"

      See Run:ai Cluster prerequisites Prometheus requirements.

      The Run:ai control plane, when installed without a Run:ai cluster, does not require the Prometheus prerequisites.

      "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#inference-optional","title":"Inference (optional)","text":"

      See Run:ai Cluster prerequisites Inference requirements.

      The Run:ai control plane, when installed without a Run:ai cluster, does not require the Inference prerequisites.

      "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#external-postgres-database-optional","title":"External Postgres database (optional)","text":"

      The Run:ai control plane installation includes a default PostgreSQL database. However, you may opt to use an existing PostgreSQL database if you have specific requirements or preferences. Please ensure that your PostgreSQL database is version 16 or higher.
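
      As a quick sanity check before installation, you can confirm the server version with the psql client (a sketch; the placeholders follow the preparation steps and it assumes the default postgres maintenance database exists):

      psql --host <POSTGRESQL_HOST> --user <POSTGRESQL_USER> --port <POSTGRESQL_PORT> --dbname postgres -c \"SHOW server_version;\"\n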

      "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#next-steps","title":"Next steps","text":"

      Continue to Preparing for a Run:ai Kubernetes Installation .

      "},{"location":"admin/runai-setup/self-hosted/k8s/project-management/","title":"Self Hosted installation over Kubernetes - Create Projects","text":""},{"location":"admin/runai-setup/self-hosted/k8s/project-management/#introduction","title":"Introduction","text":"

      The Administrator creates Run:ai Projects via the Run:ai user interface. When enabling Researcher Authentication you also assign users to Projects.

      Run:ai Projects are implemented as Kubernetes namespaces. When creating a new Run:ai Project, Run:ai does the following automatically:

      1. Creates a namespace by the name of runai-<PROJECT-NAME>.
      2. Labels the namespace as managed by Run:ai.
      3. Provides access to the namespace for Run:ai services.
      4. Associates users with the namespace.

      This process may need to be altered if:

      • Researchers already have existing Kubernetes namespaces
      • The organization's Kubernetes namespace naming convention does not allow the runai- prefix.
      • The organization's policy does not allow the automatic creation of namespaces.
      "},{"location":"admin/runai-setup/self-hosted/k8s/project-management/#process","title":"Process","text":"

      Run:ai allows the association of a Run:ai Project with any existing Kubernetes namespace:

      • When setting up a Run:ai cluster, disable namespace creation by setting the cluster flag createNamespaces to false.
      • Using the Run:ai User Interface, create a new Project <PROJECT-NAME>. A namespace will not be created.
      • Associate an existing namespace <NAMESPACE> with the Run:ai Project by running:
      kubectl label ns <NAMESPACE>  runai/queue=<PROJECT_NAME>\n

      Caution

      Setting the createNamespaces flag to false moves the responsibility of creating namespaces to match Run:ai Projects to the administrator.

      "},{"location":"admin/runai-setup/self-hosted/k8s/uninstall/","title":"Uninstall Run:ai","text":""},{"location":"admin/runai-setup/self-hosted/k8s/uninstall/#uninstall-a-runai-cluster","title":"Uninstall a Run:ai Cluster","text":"

      To uninstall the cluster see: cluster delete

      "},{"location":"admin/runai-setup/self-hosted/k8s/uninstall/#uninstall-the-runai-control-plane","title":"Uninstall the Run:ai Control Plane","text":"

      To delete the control plane, run:

      helm uninstall runai-backend -n runai-backend\n
      "},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/","title":"Upgrade Run:ai","text":""},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#preparations","title":"Preparations","text":""},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#helm","title":"Helm","text":"

      Run:ai requires Helm 3.14 or later. Before you continue, validate your installed helm client version. To install or upgrade Helm, see Installing Helm. If you are installing an air-gapped version of Run:ai, the Run:ai tar file contains the helm binary.
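
      For example, you can check the installed client version with:

      helm version --short\n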

      "},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#software-files","title":"Software files","text":"ConnectedAirgapped

      Run the helm command below:

      helm repo add runai-backend https://runai.jfrog.io/artifactory/cp-charts-prod\nhelm repo update\n
      • Ask for a tar file runai-air-gapped-<NEW-VERSION>.tar.gz from Run:ai customer support. The file contains the new version you want to upgrade to. <NEW-VERSION> is the updated version of the Run:ai control plane.
      • Upload the images as described here.
      "},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#before-upgrade","title":"Before upgrade","text":"

      Before proceeding with the upgrade, it's crucial to apply the specific prerequisites associated with your current version of Run:ai and every version in between up to the version you are upgrading to.

      "},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#upgrade-from-version-29","title":"Upgrade from version 2.9","text":"

      Three significant changes to the control-plane installation have happened with version 2.12: PVC ownership, Ingress, and installation customization.

      "},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#pvc-ownership","title":"PVC ownership","text":"

      Run:ai will no longer directly create the PVCs that store Run:ai data (metrics and database). Instead, going forward,

      • Run:ai requires a Kubernetes storage class to be installed.
      • The PVCs are created by the Kubernetes StatefulSets.

      The storage class, as per Kubernetes standards, controls the reclaim behavior: whether the data is saved or deleted when the Run:ai control plane is deleted.

      To remove the ownership in an older installation, run:

      kubectl patch pvc -n runai-backend pvc-thanos-receive  -p '{\"metadata\": {\"annotations\":{\"helm.sh/resource-policy\": \"keep\"}}}'\nkubectl patch pvc -n runai-backend pvc-postgresql  -p '{\"metadata\": {\"annotations\":{\"helm.sh/resource-policy\": \"keep\"}}}'\n
      "},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#ingress","title":"Ingress","text":"

      Delete the ingress object, which will be recreated by the control plane upgrade:

      kubectl delete ing -n runai-backend runai-backend-ingress\n
      "},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#installation-customization","title":"Installation customization","text":"

      The Run:ai control-plane installation has been rewritten and no longer uses a backend values file. Instead, to customize the installation, use standard --set flags. If you have previously customized the installation, you must now extract these customizations and add them as --set flags to the helm installation:

      • Find any previous customizations to the control plane, if they exist. Run:ai provides a utility for this at https://raw.githubusercontent.com/run-ai/docs/v2.13/install/backend/cp-helm-vals-diff.sh. For information on how to use this utility, please contact Run:ai customer support.
      • Search for the customizations you found in the optional configurations table and add them in the new format.
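
      As a purely illustrative example, if the old backend values file had overridden the Grafana admin user, the equivalent in the new format would be an extra --set flag on the control plane upgrade command (the key name is taken from the optional configurations table; <CUSTOM_ADMIN_USER> is a hypothetical placeholder, and you should keep any other flags your upgrade path requires):

      helm upgrade -i runai-backend -n runai-backend runai-backend/control-plane --version \"~2.20.0\" \\\n    --set global.domain=<DOMAIN> \\\n    --set grafana.adminUser=<CUSTOM_ADMIN_USER>  # hypothetical placeholder\n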
      "},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#upgrade-control-plane","title":"Upgrade Control Plane","text":""},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#upgrade-from-version-213-or-later","title":"Upgrade from version 2.13, or later","text":"ConnectedAirgapped
      helm get values runai-backend -n runai-backend > runai_control_plane_values.yaml\nhelm upgrade runai-backend -n runai-backend runai-backend/control-plane --version \"~2.20.0\" -f runai_control_plane_values.yaml --reset-then-reuse-values\n
      helm get values runai-backend -n runai-backend > runai_control_plane_values.yaml\nhelm upgrade runai-backend control-plane-<NEW-VERSION>.tgz -n runai-backend  -f runai_control_plane_values.yaml --reset-then-reuse-values\n
      "},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#upgrade-from-version-29_1","title":"Upgrade from version 2.9","text":"
      • Create a tls secret as described in the control plane installation.
      • Upgrade the control plane as described in the control plane installation. During the upgrade, you must tell the installation not to create the two PVCs:
      ConnectedAirgapped
      helm upgrade -i runai-backend -n runai-backend runai-backend/control-plane --version \"~2.20.0\" \\\n--set global.domain=<DOMAIN> \\\n--set postgresql.primary.persistence.existingClaim=pvc-postgresql \\ \n--set thanos.receive.persistence.existingClaim=pvc-thanos-receive \n

      Note

      The helm repository name has changed from runai-backend/runai-backend to runai-backend/control-plane.

      helm upgrade -i runai-backend control-plane-<NEW-VERSION>.tgz -n runai-backend \\\n--set global.domain=<DOMAIN> \\\n--set postgresql.primary.persistence.existingClaim=pvc-postgresql \\ \n--set thanos.receive.persistence.existingClaim=pvc-thanos-receive \n
      "},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#upgrade-cluster","title":"Upgrade Cluster","text":"

      To upgrade the cluster follow the instructions here.

      "},{"location":"admin/runai-setup/self-hosted/ocp/additional-clusters/","title":"Installing additional clusters","text":"

      The first Run:ai cluster is typically installed on the same OpenShift cluster as the Run:ai control plane. Run:ai supports multiple clusters per single control plane. This document is about installing additional clusters on different OpenShift clusters.

      "},{"location":"admin/runai-setup/self-hosted/ocp/additional-clusters/#additional-cluster-installation","title":"Additional cluster installation","text":"

      Create a new cluster, then:

      • Select OpenShift as the target platform.
      • Select Remote to Control Plane as the cluster location.
      • You must enter a specific cluster URL in the format https://runai.apps.<BASE_DOMAIN>. To get the base domain, run oc get dns cluster -oyaml | grep baseDomain.
      • Ignore the instructions for creating a secret.
      "},{"location":"admin/runai-setup/self-hosted/ocp/backend/","title":"Install the Run:ai Control Plane","text":""},{"location":"admin/runai-setup/self-hosted/ocp/backend/#prerequisites-and-preparations","title":"Prerequisites and preparations","text":"

      Make sure you have followed the Control Plane prerequisites and preparations.

      "},{"location":"admin/runai-setup/self-hosted/ocp/backend/#helm-install","title":"Helm Install","text":"

      Run the helm command below:

      ConnectedAirgapped
      helm repo add runai-backend https://runai.jfrog.io/artifactory/cp-charts-prod\nhelm repo update\nhelm upgrade -i runai-backend -n runai-backend runai-backend/control-plane --version \"~2.20.0\" \\\n    --set global.domain=runai.apps.<OPENSHIFT-CLUSTER-DOMAIN> \\ # (1)\n    --set global.config.kubernetesDistribution=openshift\n
      1. The subdomain configured for the OpenShift cluster.

      Info

      To install a specific version, add --version <version> to the install command. You can find available versions by running helm search repo -l runai-backend.

      helm upgrade -i runai-backend  ./control-plane-<version>.tgz -n runai-backend \\\n    --set global.domain=runai.apps.<OPENSHIFT-CLUSTER-DOMAIN> \\ # (1)\n    --set global.config.kubernetesDistribution=openshift \\\n    --set global.customCA.enabled=true \\ # (2)\n    -f custom-env.yaml  # (3)\n
      1. The domain configured for the OpenShift cluster. To find out the OpenShift cluster domain, run oc get routes -A
      2. See the Local Certificate Authority instructions below
      3. custom-env.yaml should have been created by the prepare installation script in the previous section.

      (replace <version> with the control plane version)

      Tip

      Use the --dry-run flag to gain an understanding of what is being installed before the actual installation.

      "},{"location":"admin/runai-setup/self-hosted/ocp/backend/#additional-runai-configurations-optional","title":"Additional Run:ai configurations (optional)","text":"

      There may be cases where you need to set additional properties. To apply the changes, run helm upgrade and use --set to set specific configurations, then restart the relevant Run:ai pods so they can fetch the new configurations.
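
      For example, to disable Istio sidecar injection for the control plane services (one of the keys listed below), the install command from the Helm Install section might be extended as follows (a sketch; keep any other flags you originally used):

      helm upgrade -i runai-backend -n runai-backend runai-backend/control-plane --version \"~2.20.0\" \\\n    --set global.domain=runai.apps.<OPENSHIFT-CLUSTER-DOMAIN> \\\n    --set global.config.kubernetesDistribution=openshift \\\n    --set disableIstioSidecarInjection.enabled=true\n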

      Key Change Description <component> resources: limits: cpu: 500m memory: 512Mi requests: cpu: 250m memory: 256Mi Pod request and limits Set Run:ai and 3rd party services' resources disableIstioSidecarInjection.enabled Disable Istio sidecar injection Disable the automatic injection of Istio sidecars across the entire Run:ai Control Plane services."},{"location":"admin/runai-setup/self-hosted/ocp/backend/#additional-3rd-party-configurations-optional","title":"Additional 3rd party configurations (optional)","text":"

      The Run:ai Control Plane chart, includes multiple sub-charts of 3rd party components:

      • PostgreSQL - Data store
      • Keycloakx - Identity & Access Management
      • Grafana - Analytics Dashboard
      • Redis - Caching (Disabled, by default)

      Tip

      Click on any component to view its chart values and configurations.

      If you have opted to connect to an external PostgreSQL database, refer to the additional configurations table below. Adjust the following parameters based on your connection details:

      1. Disable PostgreSQL deployment - postgresql.enabled
      2. Run:ai connection details - global.postgresql.auth
      3. Grafana connection details - grafana.dbUser, grafana.dbPassword
      "},{"location":"admin/runai-setup/self-hosted/ocp/backend/#postgresql","title":"PostgreSQL","text":"Key Change Description postgresql.enabled PostgreSQL installation If set to false the PostgreSQL will not be installed global.postgresql.auth.host PostgreSQL host Hostname or IP address of the PostgreSQL server global.postgresql.auth.port PostgreSQL port Port number on which PostgreSQL is running global.postgresql.auth.username PostgreSQL username Username for connecting to PostgreSQL global.postgresql.auth.password PostgreSQL password Password for the PostgreSQL user specified by global.postgresql.auth.username global.postgresql.auth.postgresPassword PostgreSQL default admin password Password for the built-in PostgreSQL superuser (postgres) global.postgresql.auth.existingSecret Postgres Credentials (secret) Existing secret name with authentication credentials global.postgresql.auth.dbSslMode Postgres connection SSL mode Set the SSL mode, see list in Protection Provided in Different Modes, prefer mode is not supported postgresql.primary.initdb.password PostgreSQL default admin password Set the same password as in global.postgresql.auth.postgresPassword (if changed) postgresql.primary.persistence.storageClass Storage class The installation to work with a specific storage class rather than the default one"},{"location":"admin/runai-setup/self-hosted/ocp/backend/#keycloakx","title":"Keycloakx","text":"Key Change Description keycloakx.adminUser User name of the internal identity provider administrator This user is the administrator of Keycloak keycloakx.adminPassword Password of the internal identity provider administrator This password is for the administrator of Keycloak keycloakx.existingSecret Keycloakx credentials (secret) Existing secret name with authentication credentials global.keycloakx.host KeyCloak (Run:ai internal identity provider) host path Override the DNS for Keycloak. This can be used to access Keycloak from outside the Run:ai Control Plane cluster via ingress

      The keycloakx.adminUser can only be set during the initial installation. The admin password, however, can also be changed later through the Keycloak UI, but you must also update the keycloakx.adminPassword value in the Helm chart using helm upgrade. Failing to update the Helm values after changing the password can lead to control plane services encountering errors.

      "},{"location":"admin/runai-setup/self-hosted/ocp/backend/#grafana","title":"Grafana","text":"Key Change Description grafana.db.existingSecret Grafana database connection credentials (secret) Existing secret name with authentication credentials grafana.dbUser Grafana database username Username for accessing the Grafana database grafana.dbPassword Grafana database password Password for the Grafana database user grafana.admin.existingSecret Grafana admin default credentials (secret) Existing secret name with authentication credentials grafana.adminUser Grafana username Override the Run:ai default user name for accessing Grafana grafana.adminPassword Grafana password Override the Run:ai default password for accessing Grafana"},{"location":"admin/runai-setup/self-hosted/ocp/backend/#redis","title":"Redis","text":"Key Change Description redisCache.auth.password Redis (Runai internal cache mechanism) applicative password Override the default password redisCache.auth.existingSecret Redis credentials (secret) Existing secret name with authentication credentials"},{"location":"admin/runai-setup/self-hosted/ocp/backend/#next-steps","title":"Next steps","text":""},{"location":"admin/runai-setup/self-hosted/ocp/backend/#connect-to-runai-user-interface","title":"Connect to Run:ai user interface","text":"
      • Run: oc get routes -n runai-backend to find the Run:ai Administration User Interface URL.
      • Log in using the default credentials: User: test@run.ai, Password: Abcd!234.
      • Go to the Users area and change the password.
      "},{"location":"admin/runai-setup/self-hosted/ocp/backend/#enable-forgot-password-optional","title":"Enable Forgot Password (optional)","text":"

      To support the Forgot password functionality, follow the steps below.

      • Go to runai.<openshift-cluster-domain>/auth and Log in.
      • Under Realm settings, select the Login tab and enable the Forgot password feature.
      • Under the Email tab, define an SMTP server, as explained here
      "},{"location":"admin/runai-setup/self-hosted/ocp/backend/#install-runai-cluster","title":"Install Run:ai Cluster","text":"

      Continue with installing a Run:ai Cluster.

      "},{"location":"admin/runai-setup/self-hosted/ocp/cluster/","title":"Self-Hosted installation over OpenShift - Cluster Setup","text":""},{"location":"admin/runai-setup/self-hosted/ocp/cluster/#prerequisites","title":"Prerequisites","text":"

      Install prerequisites as per System Requirements document.

      "},{"location":"admin/runai-setup/self-hosted/ocp/cluster/#create-openshift-projects","title":"Create OpenShift Projects","text":"

      Run:ai cluster installation uses several namespaces (or projects in OpenShift terminology). Run the following:

      oc new-project runai\noc new-project runai-reservation\noc new-project runai-scale-adjust\n

      The last namespace (runai-scale-adjust) is only required if the cluster is a cloud cluster and is configured for auto-scaling.

      "},{"location":"admin/runai-setup/self-hosted/ocp/cluster/#cluster-installation","title":"Cluster Installation","text":"ConnectedAirgapped

      Perform the cluster installation instructions explained in Cluster install. When creating a new cluster, select the OpenShift target platform.

      Info

      To install a specific version, add --version <version> to the install command. You can find available versions by running helm search repo -l runai-cluster.

      Perform the cluster installation instructions explained in Cluster install. When creating a new cluster, select the OpenShift target platform.

      On the second tab of the cluster wizard, when copying the helm command for installation, you will need to use the pre-provided installation file instead of using helm repositories. As such:

      • Do not add the helm repository and do not run helm repo update.
      • Instead, edit the helm upgrade command.
        • Replace runai/runai-cluster with runai-cluster-<version>.tgz.
        • Add --set global.image.registry=<Docker Registry address> where the registry address is as entered in the preparation section
        • Add --set global.customCA.enabled=true and perform the instructions for local certificate authority.

      The command should look like the following:

      helm upgrade -i runai-cluster runai-cluster-<version>.tgz \\\n    --set controlPlane.url=... \\\n    --set controlPlane.clientSecret=... \\\n    --set cluster.uid=... \\\n    --set cluster.url=... --create-namespace \\\n    --set global.image.registry=registry.mycompany.local \\\n    --set global.customCA.enabled=true\n

      "},{"location":"admin/runai-setup/self-hosted/ocp/cluster/#optional-customize-installation","title":"(Optional) Customize Installation","text":"

      To customize specific aspects of the cluster installation see customize cluster installation.

      "},{"location":"admin/runai-setup/self-hosted/ocp/cluster/#next-steps","title":"Next Steps","text":"

      Continue to create Run:ai Projects.

      "},{"location":"admin/runai-setup/self-hosted/ocp/next-steps/","title":"Next Steps","text":"
      • Create additional Run:ai Users.
      • Set up Project-based Researcher Access Control.
      • Set up Researchers to work with the Run:ai Command-line interface (CLI). See Installing the Run:ai Command-line Interface on how to install the CLI for users.
      • Review advanced setup and maintenace scenarios.
      "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/","title":"Preparing for a Run:ai OpenShift installation","text":"

      The following section provides IT with the information needed to prepare for a Run:ai installation.

      "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#prerequisites","title":"Prerequisites","text":"

      See the Prerequisites section above.

      "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#software-artifacts","title":"Software artifacts","text":"ConnectedAirgapped

      You should receive a file: runai-reg-creds.yaml from Run:ai Customer Support. The file provides access to the Run:ai Container registry.

      SSH into a node with oc access (oc is the OpenShift command line) to the cluster and Docker installed.

      Run the following to enable image download from the Run:ai Container Registry on Google cloud:

      oc apply -f runai-reg-creds.yaml -n runai-backend\n

      You should receive a single file runai-<version>.tar from Run:ai customer support

      Run:ai assumes the existence of a Docker registry for images, most likely installed within the organization. The installation requires the network address and port of the registry (referenced below as <REGISTRY_URL>).

      SSH into a node with oc access (oc is the OpenShift command line) to the cluster and Docker installed.

      To extract Run:ai files, replace <VERSION> in the command below and run:

      tar xvf runai-airgapped-package-<VERSION>.tar.gz\n
      Upload images

      Upload images to a local Docker Registry. Set the Docker Registry address in the form of NAME:PORT (do not add https):

      export REGISTRY_URL=<Docker Registry address>\n

      Run the following script (you must have at least 20GB of free disk space to run):

      ./setup.sh\n

      (If docker is configured to run as non-root then sudo is not required).

      The script should create a file named custom-env.yaml which will be used by the control-plane installation.

      "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#private-docker-registry-optional","title":"Private Docker Registry (optional)","text":"

      To access the organization's Docker registry, you must set the registry's credentials (imagePullSecret).

      Create the secret named runai-reg-creds in the runai-backend namespace based on your existing credentials. The configuration will be copied over to the runai namespace at cluster install. For more information, see Allowing pods to reference images from other secured registries.

      "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#configure-your-environment","title":"Configure your environment","text":""},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#create-openshift-project","title":"Create OpenShift project","text":"

      The Run:ai control plane uses a namespace (or project in OpenShift terminology) named runai-backend. You must create it before installing:

      oc new-project runai-backend\n
      "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#local-certificate-authority-air-gapped-only","title":"Local Certificate Authority (air-gapped only)","text":"

      In air-gapped environments, you must prepare the public key of your local certificate authority as described here. It will need to be installed in Kubernetes for the installation to succeed.

      "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#mark-runai-system-workers-optional","title":"Mark Run:ai system workers (optional)","text":"

      You can optionally set the Run:ai control plane to run on specific nodes. Kubernetes will attempt to schedule Run:ai pods to these nodes. If these nodes lack resources, the Run:ai pods will be scheduled to other, non-labeled nodes.

      To set system worker nodes run:

      kubectl label node <NODE-NAME> node-role.kubernetes.io/runai-system=true\n

      Warning

      Do not select the Kubernetes master as a runai-system node. This may cause Kubernetes to stop working (specifically if Kubernetes API Server is configured on 443 instead of the default 6443).

      "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#external-postgres-database-optional","title":"External Postgres database (optional)","text":"

      If you have opted to use an external PostgreSQL database, you need to perform initial setup to ensure successful installation. Follow these steps:

      1. Create a SQL script file, edit the parameters below, and save it locally:

        • Replace <DATABASE_NAME> with a dedicated database name for Run:ai in your PostgreSQL database.
        • Replace <ROLE_NAME> with a dedicated role name (user) for the Run:ai database.
        • Replace <ROLE_PASSWORD> with a password for the new PostgreSQL role.
        • Replace <GRAFANA_PASSWORD> with the password to be set for Grafana integration.
        -- Create a new database for runai\nCREATE DATABASE <DATABASE_NAME>; \n\n-- Create the role with login and password\nCREATE ROLE <ROLE_NAME>  WITH LOGIN PASSWORD '<ROLE_PASSWORD>'; \n\n-- Grant all privileges on the database to the role\nGRANT ALL PRIVILEGES ON DATABASE <DATABASE_NAME> TO <ROLE_NAME>; \n\n-- Connect to the newly created database\n\\c <DATABASE_NAME> \n\n-- grafana\nCREATE ROLE grafana WITH LOGIN PASSWORD '<GRAFANA_PASSWORD>'; \nCREATE SCHEMA grafana authorization grafana;\nALTER USER grafana set search_path='grafana';\n-- Exit psql\n\\q\n
      2. Run the following command on a machine where PostgreSQL client (pgsql) is installed:

        psql --host <POSTGRESQL_HOST> \\ # (1)\n--user <POSTGRESQL_USER> \\ # (2)\n--port <POSTGRESQL_PORT> \\ # (3)\n--dbname <POSTGRESQL_DB> \\ # (4)\n-a -f <SQL_FILE> \\ # (5)\n
        1. Replace <POSTGRESQL_HOST> with the PostgreSQL ip address or hostname.
        2. Replace <POSTGRESQL_USER> with the PostgreSQL username.
        3. Replace <POSTGRESQL_PORT> with the port number where PostgreSQL is running.
        4. Replace <POSTGRESQL_DB> with the name of your PostgreSQL database.
        5. Replace <SQL_FILE> with the path to the SQL script created in the previous step.
      "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#additional-permissions","title":"Additional permissions","text":"

      As part of the installation, you will be required to install the Control plane and Cluster Helm Charts. The Helm Charts require Kubernetes administrator permissions. You can review the exact permissions provided by using the --dry-run on both helm charts.

      "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#validate-prerequisites","title":"Validate prerequisites","text":"

      Once you believe that the Run:ai prerequisites and preparations are met, we highly recommend installing and running the Run:ai pre-install diagnostics script. The tool:

      • Tests the below requirements as well as additional failure points related to Kubernetes, NVIDIA, storage, and networking.
      • Looks at additional components installed and analyzes their relevancy to a successful Run:ai installation.

      To use the script download the latest version of the script and run:

      chmod +x preinstall-diagnostics-<platform>\n./preinstall-diagnostics-<platform> \n

      If the script fails, or if the script succeeds but the Kubernetes system contains components other than Run:ai, locate the file runai-preinstall-diagnostics.txt in the current directory and send it to Run:ai technical support.

      For more information on the script including additional command-line flags, see here.

      "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#next-steps","title":"Next steps","text":"

      Continue with installing the Run:ai Control Plane.

      "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/","title":"Self Hosted installation over OpenShift - prerequisites","text":"

      Before proceeding with this document, please review the installation types documentation to understand the difference between air-gapped and connected installations.

      "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#runai-components","title":"Run:ai components","text":"

      As part of the installation process you will install:

      • A control-plane managing cluster
      • One or more clusters

      Both the control plane and clusters require Kubernetes. Typically the control plane and first cluster are installed on the same Kubernetes cluster but this is not a must.

      Important

      In OpenShift environments, adding a cluster connecting to a remote control plane currently requires the assistance of customer support.

      "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#installer-machine","title":"Installer machine","text":"

      The machine running the installation script (typically the Kubernetes master) must have:

      • At least 50GB of free space.
      • Docker installed.
      "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#helm","title":"Helm","text":"

      Run:ai requires Helm 3.14 or later. To install Helm, see Installing Helm. If you are installing an air-gapped version of Run:ai, the Run:ai tar file contains the helm binary.

      "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#cluster-hardware-requirements","title":"Cluster hardware requirements","text":"

      The Run:ai control plane services require the following resources:

      Component Required Capacity CPU 10 cores Memory 12GB Disk space 110GB

      If the Run:ai cluster is planned to be installed on the same Kubernetes cluster as the Run:ai control plane, ensure the control plane requirements above are in addition to the Run:ai cluster hardware requirements.

      "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#runai-software-requirements","title":"Run:ai software requirements","text":""},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#cluster-nodes","title":"Cluster Nodes","text":"

      Nodes are required to be synchronized by time using NTP (Network Time Protocol) for proper system functionality.

      "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#openshift","title":"OpenShift","text":"

      Run:ai supports OpenShift. OpenShift Versions supported are detailed in Kubernetes distribution.

      • OpenShift must be configured with a trusted certificate. Run:ai installation relies on OpenShift to create certificates for subdomains.
      • OpenShift must have a configured identity provider (IdP).
      • If your network is air-gapped, you will need to provide the Run:ai control-plane and cluster with information about the local certificate authority.
      "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#install-prerequisites","title":"Install prerequisites","text":""},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#nvidia-gpu-operator","title":"NVIDIA GPU Operator","text":"

      See Run:ai Cluster prerequisites installing NVIDIA dependencies in OpenShift.

      The Run:ai control plane, when installed without a Run:ai cluster, does not require the NVIDIA prerequisites.

      Information on how to download the GPU Operator for air-gapped installation can be found in the NVIDIA GPU Operator pre-requisites.

      "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#inference-optional","title":"Inference (optional)","text":"

      See Run:ai Cluster prerequisites Inference requirements.

      The Run:ai control plane, when installed without a Run:ai cluster, does not require the Inference prerequisites.

      "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#external-postgresql-database-optional","title":"External PostgreSQL database (optional)","text":"

      The Run:ai control plane installation includes a default PostgreSQL database. However, you may opt to use an existing PostgreSQL database if you have specific requirements or preferences. Please ensure that your PostgreSQL database is version 16 or higher.

      "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#next-steps","title":"Next steps","text":"

      Continue to Preparing for a Run:ai OpenShift Installation .

      "},{"location":"admin/runai-setup/self-hosted/ocp/project-management/","title":"Self Hosted installation over OpenShift - Create Projects","text":""},{"location":"admin/runai-setup/self-hosted/ocp/project-management/#introduction","title":"Introduction","text":"

      The Administrator creates Run:ai Projects via the Run:ai User Interface. When enabling Researcher Authentication you also assign users to Projects.

      Run:ai Projects are implemented as Kubernetes namespaces. When creating a new Run:ai Project, Run:ai does the following automatically:

      1. Creates a namespace by the name of runai-<PROJECT-NAME>.
      2. Labels the namespace as managed by Run:ai.
      3. Provides access to the namespace for Run:ai services.
      4. Associates users with the namespace.

      This process may need to be altered if:

      • Researchers already have existing Kubernetes namespaces
      • The organization's Kubernetes namespace naming convention does not allow the runai- prefix.
      • The organization's policy does not allow the automatic creation of namespaces
      "},{"location":"admin/runai-setup/self-hosted/ocp/project-management/#process","title":"Process","text":"

      Run:ai allows the association of a Run:ai Project with any existing Kubernetes namespace:

      • When setting up a Run:ai cluster, disable namespace creation by setting the cluster flag createNamespaces to false.
      • Using the Run:ai User Interface, create a new Project <PROJECT-NAME>. A namespace will not be created.
      • Associate an existing namespace <NAMESPACE> with the Run:ai Project by running:
      oc label ns <NAMESPACE>  runai/queue=<PROJECT_NAME>\n

      Caution

      Setting the createNamespaces flag to false moves the responsibility of creating namespaces to match Run:ai Projects to the administrator.

      "},{"location":"admin/runai-setup/self-hosted/ocp/uninstall/","title":"Uninstall Run:ai","text":"

      See uninstall section here

      "},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/","title":"Upgrade Run:ai","text":""},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#preparations","title":"Preparations","text":""},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#helm","title":"Helm","text":"

      Run:ai requires Helm 3.14 or later. Before you continue, validate your installed helm client version. To install or upgrade Helm, see Installing Helm. If you are installing an air-gapped version of Run:ai, the Run:ai tar file contains the helm binary.

      "},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#software-files","title":"Software files","text":"ConnectedAirgapped

      Run the helm command below:

      helm repo add runai-backend https://runai.jfrog.io/artifactory/cp-charts-prod\nhelm repo update\n
      • Ask for a tar file runai-air-gapped-<NEW-VERSION>.tar.gz from Run:ai customer support. The file contains the new version you want to upgrade to. <NEW-VERSION> is the updated version of the Run:ai control plane.
      • Upload the images as described here.
      "},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#before-upgrade","title":"Before upgrade","text":"

      Before proceeding with the upgrade, it's crucial to apply the specific prerequisites associated with your current version of Run:ai and every version in between up to the version you are upgrading to.

      "},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#upgrade-from-version-29","title":"Upgrade from version 2.9","text":"

      Two significant changes to the control-plane installation have happened with version 2.12: PVC ownership and installation customization.

      "},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#pvc-ownership","title":"PVC ownership","text":"

      Run:ai no longer directly creates the PVCs that store Run:ai data (metrics and database). Instead, going forward,

      • Run:ai requires a Kubernetes storage class to be installed.
      • The PVCs are created by the Kubernetes StatefulSets.

      The storage class, as per Kubernetes standards, controls the reclaim behavior: whether the data is saved or deleted when the Run:ai control plane is deleted.

      To remove the ownership in an older installation, run:

      kubectl patch pvc -n runai-backend pvc-postgresql  -p '{\"metadata\": {\"annotations\":{\"helm.sh/resource-policy\": \"keep\"}}}'\n
      "},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#installation-customization","title":"Installation customization","text":"

      The Run:ai control-plane installation has been rewritten and no longer uses a backend values file. Instead, to customize the installation, use standard --set flags. If you have previously customized the installation, you must now extract these customizations and add them as --set flags to the helm installation:

      • Find any previous customizations to the control plane, if they exist. Run:ai provides a utility for this at https://raw.githubusercontent.com/run-ai/docs/v2.13/install/backend/cp-helm-vals-diff.sh. For information on how to use this utility, please contact Run:ai customer support.
      • Search for the customizations you found in the optional configurations table and add them in the new format.
      "},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#upgrade-control-plane","title":"Upgrade Control Plane","text":""},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#upgrade-from-version-213-or-later","title":"Upgrade from version 2.13, or later","text":"ConnectedAirgapped
      helm get values runai-backend -n runai-backend > runai_control_plane_values.yaml\nhelm upgrade runai-backend -n runai-backend runai-backend/control-plane --version \"~2.20.0\" -f runai_control_plane_values.yaml --reset-then-reuse-values\n
      helm get values runai-backend -n runai-backend > runai_control_plane_values.yaml\nhelm upgrade runai-backend control-plane-<NEW-VERSION>.tgz -n runai-backend  -f runai_control_plane_values.yaml --reset-then-reuse-values\n
      "},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#upgrade-from-version-29_1","title":"Upgrade from version 2.9","text":"ConnectedAirgapped
      helm upgrade -i runai-backend -n runai-backend runai-backend/control-plane --version \"~2.20.0\" \\\n--set global.domain=runai.apps.<OPENSHIFT-CLUSTER-DOMAIN> \\ #(1)\n--set global.config.kubernetesDistribution=openshift \\\n--set thanos.query.stores={thanos-grpc-port-forwarder:10901} \\\n--set postgresql.primary.persistence.existingClaim=pvc-postgresql\n
      1. The subdomain configured for the OpenShift cluster.

      Note

      The helm repository name has changed from runai-backend/runai-backend to runai-backend/control-plane.

      helm upgrade -i runai-backend  ./control-plane-<NEW-VERSION>.tgz -n runai-backend \\\n--set global.domain=runai.apps.<OPENSHIFT-CLUSTER-DOMAIN> \\ #(1)\n--set global.config.kubernetesDistribution=openshift \\\n--set thanos.query.stores={thanos-grpc-port-forwarder:10901} \\\n--set postgresql.primary.persistence.existingClaim=pvc-postgresql\n
      1. The subdomain configured for the OpenShift cluster.
      "},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#upgrade-cluster","title":"Upgrade Cluster","text":"

To upgrade the cluster, follow the instructions here.

      "},{"location":"admin/troubleshooting/diagnostics/","title":"Diagnostic Tools","text":""},{"location":"admin/troubleshooting/diagnostics/#add-verbosity-to-the-database-container","title":"Add Verbosity to the Database container","text":"

The Run:ai self-hosted installation contains an internal database. To diagnose database issues, you can run the database in debug mode.

In the runai-backend values file, search for postgresql and add:

      postgresql:\n  image:\n    debug: true\n

      Re-install the Run:ai control-plane and then review the database logs by running:

      kubectl logs -n runai-backend runai-postgresql-0\n
      "},{"location":"admin/troubleshooting/diagnostics/#internal-networking-issues","title":"Internal Networking Issues","text":"

      Run:ai is based on Kubernetes. Kubernetes runs its own internal subnet with a separate DNS service. If you see in the logs that services have trouble connecting, the problem may reside there. You can find further information on how to debug Kubernetes DNS here. Specifically, it is useful to start a pod with networking utilities and use it for network resolution:

      kubectl run -i --tty netutils --image=dersimn/netutils -- bash\n
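Once the pod shell opens, you can test DNS resolution from inside the cluster. A minimal sketch (kubernetes.default always exists; replace the second name with the service you are debugging):

nslookup kubernetes.default.svc.cluster.local   # built-in API server service, should always resolve\nnslookup <service-name>.<namespace>.svc.cluster.local   # the Run:ai service you suspect is unreachable\n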
      "},{"location":"admin/troubleshooting/diagnostics/#add-verbosity-to-prometheus","title":"Add Verbosity to Prometheus","text":"

      Add verbosity to Prometheus by editing RunaiConfig:

      kubectl edit runaiconfig runai -n runai\n

      Add a debug log level:

      prometheus-operator:\n  prometheus:\n    prometheusSpec:\n      logLevel: debug\n

      To view logs, run:

      kubectl logs prometheus-runai-prometheus-operator-prometheus-0 prometheus \\\n      -n monitoring -f --tail 100\n

      "},{"location":"admin/troubleshooting/diagnostics/#add-verbosity-to-scheduler","title":"Add Verbosity to Scheduler","text":"

To view extended logs, run:

kubectl edit runaiconfig runai -n runai\n

      Then under the scheduler section add:

      runai-scheduler:\n   args:\n     verbosity: 6\n
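To review the resulting scheduler logs, locate the scheduler pod and follow its output (standard kubectl; the pod name varies per installation):

kubectl get pods -n runai | grep scheduler\nkubectl logs -n runai <scheduler-pod-name> -f --tail 100\n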

      Warning

      Verbose scheduler logs consume a significant amount of disk space.

      "},{"location":"admin/troubleshooting/logs-collection/","title":"Logs Collection","text":"

      This article provides instructions for IT administrators on collecting Run:ai logs for support, including prerequisites, CLI commands, and log file retrieval. It also covers enabling verbose logging for Prometheus and the Run:ai Scheduler.

      "},{"location":"admin/troubleshooting/logs-collection/#collect-logs-to-send-to-support","title":"Collect logs to send to support","text":"

      To collect Run:ai logs, follow these steps:

      "},{"location":"admin/troubleshooting/logs-collection/#prerequisites","title":"Prerequisites","text":"
      • Ensure that you have administrator-level access to the Kubernetes cluster where Run:ai is installed.
      • The Run:ai Administrator Command-Line Interface (CLI) must be installed.
      "},{"location":"admin/troubleshooting/logs-collection/#step-by-step-instructions","title":"Step-by-Step Instructions","text":"
1. Run the command from your local machine or a bastion host (secure server). Open a terminal on your local machine (or on any machine that has network access to the Kubernetes cluster) where the Run:ai Administrator CLI is installed.
      2. Collect the Logs Execute the following command to collect the logs:

        runai-adm collect-logs\n

  This command gathers all relevant Run:ai logs from the system and generates a compressed file.

      3. Locate the Generated File After running the command, note the location of the generated compressed log file. You can retrieve and send this file to Run:ai Support for further troubleshooting.

      Note

The tar file packages the logs of Run:ai components only. It does not include logs of researcher containers that may contain private information.

      "},{"location":"admin/troubleshooting/logs-collection/#logs-verbosity","title":"Logs verbosity","text":"

Increase log verbosity to capture more detailed information, providing deeper insights into system behavior and making it easier to identify and resolve issues.

      "},{"location":"admin/troubleshooting/logs-collection/#prerequisites_1","title":"Prerequisites","text":"

      Before you begin, ensure you have the following:

• Access to the Kubernetes cluster where Run:ai is installed, including the necessary permissions to view and modify configurations.
• kubectl installed and configured: the Kubernetes command-line tool must be installed and configured to interact with the cluster, with sufficient privileges to edit configurations and view logs.
• Disk space monitoring: when enabling verbose logging, ensure adequate disk space to handle the increased log output, especially when enabling debug or high verbosity levels.
      "},{"location":"admin/troubleshooting/logs-collection/#adding-verbosity","title":"Adding verbosity","text":"Adding verbosity to Prometheus

      To increase the logging verbosity for Prometheus, follow these steps:

      1. Edit the RunaiConfig to adjust Prometheus log levels. Copy the following command to your terminal:
      kubectl edit runaiconfig runai -n runai\n
2. In the configuration file that opens, add or modify the following section to set the log level to debug:
      spec:\n    prometheus:\n        spec:\n            logLevel: debug\n
3. Save the changes. To view the Prometheus logs with the new verbosity level, run:
kubectl logs -n runai prometheus-runai-0 -f --tail 100\n

      This command streams the last 100 lines of logs from Prometheus, providing detailed information useful for debugging.

      Adding verbosity to the scheduler

      To enable extended logging for the Run:ai scheduler:

      1. Edit the RunaiConfig to adjust scheduler verbosity:
      kubectl edit runaiconfig runai -n runai\n

2. Add or modify the following section under the scheduler settings:

      runai-scheduler:\n    args:\n        verbosity: 6\n

      This increases the verbosity level of the scheduler logs to provide more detailed output.

      Warning

      Enabling verbose logging can significantly increase disk space usage. Monitor your storage capacity and adjust the verbosity level as necessary.

      "},{"location":"admin/troubleshooting/troubleshooting/","title":"Troubleshooting Run:ai","text":""},{"location":"admin/troubleshooting/troubleshooting/#installation","title":"Installation","text":"Upgrade fails with \"Ingress already exists\"

      Symptom: The installation fails with error: Error: rendered manifests contain a resource that already exists. Unable to continue with install: IngressClass \"nginx\" in namespace \"\" exists

      Root cause: Run:ai installs NGINX, but there is an existing NGINX on the cluster.

      Resolution: In the Run:ai cluster YAML file, disable the installation of NGINX by setting:

      ingress-nginx:\n    enabled: false\n
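To confirm that an ingress controller already exists on the cluster, you can list the ingress classes (a quick check with standard kubectl):

kubectl get ingressclass   # an existing \"nginx\" class indicates another NGINX installation\n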
      How to get installation logs

      Symptom: Installation fails and you need to troubleshoot the issue.

      Resolution: Run the following script to obtain any relevant installation logs in case of an error.

      curl -fsSL https://raw.githubusercontent.com/run-ai/public/main/installation/get-installation-logs.sh | bash\n
      Upgrade fails with \"rendered manifests contain a resource that already exists\" error

      Symptom: The installation fails with error: Error: rendered manifests contain a resource that already exists. Unable to continue with install:...

      Root cause: The Run:ai installation is trying to create a resource that already exists, which may be due to a previous installation that was not properly removed.

      Resolution: Run the following script to remove all Run:ai resources and reinstall:

      helm template <release-name> <chart-name> --namespace <namespace> | kubectl delete -f -\n

      Then reinstall Run:ai.

      Pods are failing due to certificate issues

      Symptom: Pods are failing with certificate issues.

      Root cause: The certificate provided during the Control Plane's installation is not valid.

      Resolution: Verify that the certificate is valid and trusted. If the certificate is valid, but is signed by a local CA, make sure you have followed the procedure for a local certificate authority.
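A minimal way to inspect the certificate actually served by the control plane (assuming openssl is available; replace the placeholder with your control plane domain):

openssl s_client -connect <control-plane-domain>:443 -servername <control-plane-domain> </dev/null 2>/dev/null | openssl x509 -noout -issuer -subject -dates\n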

      "},{"location":"admin/troubleshooting/troubleshooting/#cluster-health","title":"Cluster Health","text":"

      See Cluster Health Troubleshooting

      "},{"location":"admin/troubleshooting/troubleshooting/#dashboard-issues","title":"Dashboard Issues","text":"No Metrics are showing on Dashboard

      Symptom: No metrics are showing on dashboards at https://<company-name>.run.ai/dashboards/now

      Typical root causes:

      • Firewall-related issues.
      • Internal clock is not synced.
      • Prometheus pods are not running.

      Firewall issues

Add verbosity to Prometheus as described here. Verify that there are no errors. If there are connectivity-related errors, you may need to:

      • Check your firewall for outbound connections. See the required permitted URL list in Network requirements.
      • If you need to set up an internet proxy or certificate, please contact Run:ai customer support.

      Machine Clocks are not synced

      Run: date on cluster nodes and verify that date/time is correct. If not:

• Set the Linux time service (NTP). See the example check below.
• Restart Run:ai services. Depending on the previous time gap between servers, you may need to reinstall the Run:ai cluster.
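For example, on systemd-based nodes you can check whether the clock is synchronized (a quick check, assuming timedatectl is available):

date                 # compare the output across nodes\ntimedatectl status   # look for \"System clock synchronized: yes\"\n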

      Prometheus pods are not running

      Run: kubectl get pods -n monitoring -o wide

      • Verify that all pods are running.
• The default Prometheus installation is not built for high availability. If a node goes down, the Prometheus pod may not recover by itself unless it is manually deleted. Delete the pod so that it restarts on a different node, and consider adding a second replica to Prometheus.
      GPU Related metrics not showing

Symptom: GPU-related metrics such as GPU Nodes and Total GPUs are showing zero, but other metrics, such as Cluster load, are shown.

      Root cause: An installation issue related to the NVIDIA stack.

      Resolution:

Work through the NVIDIA stack to find the issue. The current NVIDIA stack looks as follows:

      1. NVIDIA Drivers (at the OS level, on every node)
      2. NVIDIA Docker (extension to Docker, on every node)
      3. Kubernetes Node feature discovery (mark node properties)
      4. NVIDIA GPU Feature discovery (mark nodes as \u201chaving GPUs\u201d)
      5. NVIDIA Device plug-in (Exposes GPUs to Kubernetes)
      6. NVIDIA DCGM Exporter (Exposes metrics from GPUs in Kubernetes)

      Run:ai requires the installation of the NVIDIA GPU Operator which installs the entire stack above. However, there are two alternative methods for using the operator:

      • Use the default operator values to install 1 through 6.
• If NVIDIA Drivers (#1 above) are already installed on all nodes, use the operator with a flag that disables the driver installation.

For more information, see System requirements.

      NVIDIA GPU Operator

      Run: kubectl get pods -n gpu-operator | grep nvidia and verify that all pods are running.

      Node and GPU feature discovery

      Kubernetes Node feature discovery identifies and annotates nodes. NVIDIA GPU Feature Discovery identifies and annotates nodes with GPU properties. See that:

      • All such pods are up.
      • The GPU feature discovery pod is available for every node with a GPU.
• And finally, when describing nodes, they show an active nvidia.com/gpu resource (see the example below).
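A minimal check with standard kubectl (replace the placeholder with one of your GPU nodes):

kubectl describe node <node-name> | grep nvidia.com/gpu   # should appear under Capacity and Allocatable\n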

      NVIDIA Drivers

      • If NVIDIA drivers have been installed on the nodes themselves, ssh into each node and run nvidia-smi. Run sudo systemctl status docker and verify that docker is running. Run nvidia-docker and verify that it is installed and working. Linux software upgrades may require a node restart.
      • If NVIDIA drivers are installed by the Operator, verify that the NVIDIA driver daemonset has created a pod for each node and that all nodes are running. Review the logs of all such pods. A typical problem may be the driver version which is too advanced for the GPU hardware. You can set the driver version via operator flags.

      NVIDIA DCGM Exporter

      • View the logs of the DCGM exporter pod and verify that no errors are prohibiting the sending of metrics.
      • To validate that the dcgm-exporter exposes metrics, find one of the DCGM Exporter pods and run:
      kubectl port-forward <dcgm-exporter-pod-name> 9400:9400\n

      Then browse to http://localhost:9400/metrics and verify that the metrics have reached the DCGM exporter.
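Alternatively, from the machine running the port-forward, you can query the endpoint with curl (DCGM_FI_DEV_GPU_UTIL is one of the standard DCGM exporter metrics):

curl -s http://localhost:9400/metrics | grep DCGM_FI_DEV_GPU_UTIL\n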

      • The next step after the DCGM Exporter is Prometheus. To validate that metrics from the DCGM Exporter reach Prometheus, run:
      kubectl port-forward svc/runai-cluster-kube-prometh-prometheus -n monitoring 9090:9090\n

      Then browse to localhost:9090. In the UI, type DCGM_FI_DEV_GPU_UTIL as the metric name, and verify that the metric has reached Prometheus.
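You can also query the Prometheus HTTP API directly through the same port-forward (a minimal sketch using the standard /api/v1/query endpoint):

curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL'   # an empty result list means the metric has not reached Prometheus\n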

      If the DCGM Exporter is running correctly and exposing metrics, but this metric does not appear in Prometheus, there may be a connectivity issue between these components.

      Allocation-related metrics not showing

      Symptom: GPU Allocation-related metrics such as Allocated GPUs are showing zero but other metrics, such as Cluster load are shown.

      Root cause: The origin of such metrics is the scheduler.

      Resolution:

      • Run: kubectl get pods -n runai | grep scheduler. Verify that the pod is running.
      • Review the scheduler logs and look for errors. If such errors exist, contact Run:ai customer support.
      All metrics are showing \"No Data\"

      Symptom: All data on all dashboards is showing the text \"No Data\".

      Root cause: Internal issue with metrics infrastructure.

      Resolution: Please contact Run:ai customer support.

      "},{"location":"admin/troubleshooting/troubleshooting/#authentication-issues","title":"Authentication Issues","text":"After a successful login, you are redirected to the same login page

      For a self-hosted installation, check Linux clock synchronization as described above. Use the Run:ai preinstall diagnostics tool to validate System and network requirements and test this automatically.

      Single-sign-on issues

      For single-sign-on issues, see the troubleshooting section in the single-sign-on configuration documents.

      "},{"location":"admin/troubleshooting/troubleshooting/#user-interface-submit-job-issues","title":"User Interface Submit Job Issues","text":"New Job button is grayed out

      Symptom: The New Job button on the top right of the Job list is grayed out.

Root Cause: This can happen due to multiple configuration issues. To diagnose:

• Open Chrome developer tools and refresh the screen.
• Under the Network tab, locate a network call error and note the HTTP error code.

      Resolution for 401 HTTP Error

• Verify that the cluster certificate provided as part of the installation is valid and trusted (not self-signed).
• Verify that Researcher Authentication is properly configured. Try running runai login from the command-line interface. Alternatively, run: kubectl get pods -n kube-system, identify the api-server pod, and review its logs.

      Resolution for 403 HTTP Error

      Run: kubectl get pods -n runai, identify the agent pod, see that it's running, and review its logs.

      New Job button is not showing

      Symptom: The New Job button on the top right of the Job list does not show.

      Root Causes: (multiple)

      • You do not have Researcher or Research Manager permissions.
      • Under Settings | General, verify that Unified UI is on.
      Submit form is distorted

      Symptom: Submit form is showing vertical lines.

      Root Cause: The control plane does not know the cluster URL.

To verify, go to the Clusters list in the Run:ai user interface and check whether the cluster URL is missing next to your cluster.

      Resolution: Cluster must be re-installed.

      Submit form does not show the list of Projects

      Symptom: When connected with Single-sign-on, in the Submit form, the list of Projects is empty.

      Root Cause: SSO is on and researcher authentication is not properly configured as such.

      Resolution: Verify API Server settings as described in Researcher Authentication configuration.

      Job form is not opening on OpenShift

Symptom: When clicking \"New Job\", the Job form does not load and the Network tab shows a 405 error.

      Root Cause: An installation step has been missed.

      Resolution: Open the Cluster list and open the cluster installation wizard again. After selecting OpenShift, you will see a patch command at the end of the instruction set. Run it.

      "},{"location":"admin/troubleshooting/troubleshooting/#networking-issues","title":"Networking Issues","text":"'admission controller' connectivity issue

      Symptoms:

      • Pods are failing with 'admission controller' connectivity errors.
      • The command-line runai submit fails with an 'admission controller' connectivity error.
      • Agent or cluster sync pods are crashing in self-hosted installation.

      Root cause: Connectivity issues between different nodes in the cluster.

      Resolution:

      • Run the preinstall diagnostics tool to validate System and network requirements and test connectivity issues.
      • Run: kubectl get pods -n kube-system -o wide. Verify that all networking pods are running.
      • Run: kubectl get nodes. Check that all nodes are ready and connected.
      • Run: kubectl get pods -o wide -A to see which pods are Pending or in Error and which nodes they belong to.
      • See if pods from different nodes have trouble communicating with each other.
• Advanced: run kubectl exec -it <pod-name> -- /bin/sh on a pod on one node and ping a pod on another node.
      Projects are not syncing

      Symptom: Create a Project on the Run:ai user interface, then run: runai list projects. The new Project does not appear.

      Root cause: The Run:ai agent is not syncing properly. This may be due to firewall issues.

      Resolution

• Run: kubectl get pods -n runai | grep agent. See that the agent is in a Running state. Copy the agent pod's full name and run: kubectl logs -n runai runai-agent-<id>.
      • Verify that there are no errors. If there are connectivity-related errors you may need to check your firewall for outbound connections. See the required permitted URL list in Network requirements.
      • If you need to set up an internet proxy or certificate, please contact Run:ai customer support.
      Jobs are not syncing

      Symptom: A Job on the cluster (runai list jobs) does not show in the Run:ai user interface Job list.

      Root cause: The Run:ai cluster-sync pod is not syncing properly.

      Resolution: Search the cluster-sync pod for errors.

      "},{"location":"admin/troubleshooting/troubleshooting/#job-related-issues","title":"Job-related Issues","text":"Jobs fail with ContainerCannotRun status

      Symptom: When running runai list jobs, your Job has a status of ContainerCannotRun.

Root Cause: The issue may be caused by an unattended upgrade of the NVIDIA driver.

To verify, run: runai describe job <job-name> and search for the error driver/library version mismatch.

      Resolution: Reboot the node on which the Job attempted to run.

Going forward, we recommend blacklisting the NVIDIA driver from unattended-upgrades. You can do that by editing /etc/apt/apt.conf.d/50unattended-upgrades and adding nvidia-driver- to the Unattended-Upgrade::Package-Blacklist section. It should look something like this:

Unattended-Upgrade::Package-Blacklist {\n    // The following matches all packages starting with linux-\n    //  \"linux-\";\n    \"nvidia-driver-\";\n};\n
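To confirm that the blacklist entry is picked up (a quick check, assuming Ubuntu's unattended-upgrades package is installed):

apt-config dump | grep -i \"nvidia-driver-\"   # the entry should appear under Unattended-Upgrade::Package-Blacklist\n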
      "},{"location":"admin/troubleshooting/troubleshooting/#inference-issues","title":"Inference Issues","text":"New Deployment button is grayed out

      Symptoms:

      • The New workload type -> Inference button is grayed out.
      • Cannot create a deployment via Inference API.

      Root Cause: Run:ai Inference prerequisites have not been met.

      Resolution: Review inference prerequisites and install accordingly.

      Submitted workload type of inference remains in Pending state

      Symptom: A submitted inference is not running.

      Root Cause: The patch statement to add the runai-scheduler has not been performed.

      Workload of type inference status is \"Failed\"

      Symptom: Inference status is always Failed.

      Root Cause: (multiple)

      • Not enough resources in the cluster.
• Server model command is misconfigured (e.g., sleep infinity).
      • Server port is misconfigured.
Workload of type inference does not scale up from zero

      Symptom: In the Inference form, when \"Auto-scaling\" is enabled, and \"Minimum Replicas\" is set to zero, the inference cannot scale up from zero.

      Root Cause:

      • Clients are not sending requests.
      • Clients are not using the same port/protocol as the server model.
• Server model command is misconfigured (e.g., sleep infinity).
      "},{"location":"admin/troubleshooting/troubleshooting/#command-line-interface-issues","title":"Command-line interface Issues","text":"Unable to install CLI due to certificate errors

Symptom: The curl command and the download button for downloading the CLI are not working.

Root Cause: The cluster is not accessible from the download location.

      Resolution:

      Use an alternate method for downloading the CLI. Run:

      kubectl port-forward -n runai svc/researcher-service 4180\n

      In another shell, run:

      wget --content-disposition http://localhost:4180/cli/linux\n

      When running the CLI you get an error: open .../.kube/config.lock: permission denied

      Symptom: When running any CLI command you get a permission denied error.

      Root Cause: The user running the CLI does not have read permissions to the .kube directory.

      Resolution: Change permissions for the directory.

      When running 'runai logs', the logs are delayed

      Symptom: Printout from the container is not immediately shown in the log.

Root Cause: By default, Python buffers stdout and stderr, which are not flushed in real time. This may cause logs to appear minutes after being produced.

Resolution: Set the environment variable PYTHONUNBUFFERED to any non-empty string, or pass -u to Python, for example python -u main.py. See the example below.
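A minimal sketch of both options (main.py is a hypothetical script name):

PYTHONUNBUFFERED=1 python main.py   # set the variable for a single run\npython -u main.py                   # or disable buffering via the interpreter flag\n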

      CLI does not download properly on OpenShift

      Symptom: When trying to download the CLI on OpenShift, the wget statement downloads a text file named darwin or linux rather than the binary runai.

      Root Cause: An installation step has been missed.

      Resolution: Open the Cluster list and open the cluster installation wizard again. After selecting OpenShift, you will see a patch command at the end of the instruction set. Run it.

      "},{"location":"developer/overview-developer/","title":"Developer Documentation Overview","text":"

      Developers can access Run:ai through various programmatic interfaces.

      "},{"location":"developer/overview-developer/#api-architecture","title":"API Architecture","text":"

      Run:ai is composed of a single, multi-tenant control plane. Each tenant can be connected to one or more GPU clusters. See Run:ai system components for detailed information.

      The following programming interfaces are available:

API Description Purpose Run:ai REST API Get and modify any Run:ai business object This is the API mostly used by system developers. The API is also used by the Run:ai user interface as well as the new command-line interface Cluster API (Deprecated) Submit Workloads directly to the Cluster A YAML-based API allowing submission of Workloads directly to the Cluster. With Run:ai 2.18, this API is replaced by the Run:ai REST API above, which is now the recommended method Metrics API (deprecated) Get cluster metrics Get utilization metrics."},{"location":"developer/overview-developer/#runai-rest-api","title":"Run:ai REST API","text":"

Allows you to add, delete, modify, and list Run:ai metadata objects such as Projects, Departments, and Users. For clusters of Run:ai 2.18 and above, it also allows the submission of Workloads.

      The API is provided as REST and is accessible via the control plane endpoint.

      For more information see Run:ai REST API.

      Important

      The endpoints and fields specified in the API reference are the ones that are officially supported by Run:ai. Endpoints and fields that are not listed in the API reference are not supported.

      Run:ai does not recommend using API endpoints and fields marked as deprecated and will not add functionality to them. Once an API endpoint or field is marked as deprecated, Run:ai will stop supporting it after 2 major releases for self-hosted deployments, and after 6 months for SaaS deployments.

      For details, see the Deprecation notifications.

      "},{"location":"developer/overview-developer/#cluster-api-deprecated","title":"Cluster API (Deprecated)","text":"

      The Cluster API allows you to submit and delete Workloads directly to the cluster itself.

      The API is provided as Kubernetes API.

      Cluster API is accessible via the GPU cluster itself. As such, multiple clusters may have multiple endpoints.

      Important

      • This API is replaced by a Run:ai REST API to submit jobs, which is now the recommended method for cluster versions of 2.18 and above.
      • If you are looking to automate tasks with older versions of Run:ai, it's best to use the Run:ai Command-line interface which provides forward compatibility.
      "},{"location":"developer/overview-developer/#metrics-api","title":"Metrics API","text":"

      Retrieve metrics from multiple GPU clusters.

      See the Metrics API document.

      "},{"location":"developer/overview-developer/#api-authentication","title":"API Authentication","text":"

      See API Authentication for information on how to gain authenticated access to Run:ai APIs.

      "},{"location":"developer/rest-auth/","title":"API Authentication","text":"

      The following document explains how to authenticate with Run:ai APIs.

      Run:ai APIs are accessed using bearer tokens. A token can be obtained by creating an Application through the Run:ai user interface.

      An application contains a client ID and a client secret. With the client credentials you can obtain a token and use it within subsequent API calls.

      • To create applications for your organization, see Applications.
      • To create your own user applications, see User Applications.
      "},{"location":"developer/rest-auth/#request-an-api-token","title":"Request an API Token","text":"

      Use the client credentials created to get a temporary token to access Run:ai as follows.

      "},{"location":"developer/rest-auth/#example-command-to-get-an-api-token","title":"Example command to get an API token","text":"

      Replace <runai_url> below with:

      • For SaaS installations, use <tenant-name>.run.ai

      • For self-hosted use the Run:ai user interface URL.

      cURLPython
          curl  -X POST \\\n      'https://<runai_url>/api/v1/token' \\\n      --header 'Accept: */*' \\\n      --header 'Content-Type: application/json' \\\n      --data-raw '{\n      \"grantType\":\"client_credentials\",\n      \"clientId\":\"<CLIENT ID>\",\n      \"clientSecret\" : \"<CLIENT SECRET>\"\n    }'\n
          import requests\n    import json\n    reqUrl = \"https://<runai_url>/api/v1/token\"\n    headersList = {\n     \"Accept\": \"*/*\",\n     \"Content-Type\": \"application/json\"\n    }\n    payload = json.dumps({\n      \"grantType\":\"client_credentials\",\n      \"clientId\":\"<CLIENT ID>\",\n      \"clientSecret\" : \"<CLIENT SECRET>\"\n    })\n    response = requests.request(\"POST\", reqUrl, data=payload,  headers=headersList)\n    print(response.text)\n
      "},{"location":"developer/rest-auth/#response","title":"Response","text":"

      The API response will look as follows:

      API Response
{\n  \"accessToken\": \"<TOKEN>\"\n}\n

      To call Run:ai REST APIs, the application must pass the retrieved accessToken as a Bearer token in the Authorization header of your HTTP request.
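For example (a minimal sketch reusing the clusters endpoint shown later in this document; replace the placeholders with your values):

curl 'https://<runai_url>/v1/k8s/clusters' \\\n  --header 'Accept: application/json' \\\n  --header 'Authorization: Bearer <ACCESS-TOKEN>'\n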

      "},{"location":"developer/user-applications/","title":"User Applications","text":"

      This article explains the procedure to create your own user applications.

      Applications are used for API integrations with Run:ai. An application contains a client ID and a client secret. With the client credentials, you can obtain a token as detailed in API authentication and use it within subsequent API calls.

      Note

      • User applications are supported in cluster version 2.20 and above.
      • The token obtained through user applications assumes the roles and permissions of the user.
      "},{"location":"developer/user-applications/#creating-applications","title":"Creating Applications","text":"

      To create an application:

      1. Click the user icon, then select Settings
      2. Click +APPLICATION
      3. Enter the application\u2019s name
      4. Click CREATE
      5. Copy the Client ID and Client secret and store securely
      6. Click DONE

      You can create up to 20 user applications.

      Note

      The client secret is visible only at the time of creation. It cannot be recovered but can be regenerated.

      "},{"location":"developer/user-applications/#regenerating-client-secret","title":"Regenerating client secret","text":"

      To regenerate a client secret:

1. Locate the application whose client secret you want to regenerate
      2. Click Regenerate client secret
      3. Click REGENERATE
      4. Copy the New client secret and store it securely
      5. Click DONE

      Warning

      Regenerating a client secret revokes the previous one.

      "},{"location":"developer/user-applications/#deleting-an-application","title":"Deleting an application","text":"
      1. Locate the application you want to delete
      2. Click on the trash icon
      3. On the dialog, click DELETE to confirm
      "},{"location":"developer/user-applications/#using-api","title":"Using API","text":"

      Go to the User Applications API reference to view the available actions

      "},{"location":"developer/admin-rest-api/overview/","title":"Run:ai REST API","text":"

      The purpose of the Run:ai REST API is to provide an easy-to-use programming interface for administrative tasks.

      "},{"location":"developer/admin-rest-api/overview/#endpoint-url-for-api","title":"Endpoint URL for API","text":"

The domain used for Run:ai REST APIs is the same domain used to browse to the Run:ai User Interface: either <company>.run.ai, app.run.ai for older tenants, or a custom URL used for self-hosted installations.

      "},{"location":"developer/admin-rest-api/overview/#authentication","title":"Authentication","text":"
• Create a Client Application to make API requests. Use the client application ID and secret to obtain a time-bound bearer token (<ACCESS-TOKEN>). For details, see Calling REST APIs.
      • Use the token for subsequent API calls.
      "},{"location":"developer/admin-rest-api/overview/#example-usage","title":"Example Usage","text":"

      For example, if you have an Administrator role, you can get a list of clusters by running:

      cURLPython
      curl 'https://<COMPANY-URL>/v1/k8s/clusters' \\\n--header 'Accept: application/json' \\\n--header 'Content-Type: application/json' \\\n--header 'Authorization: Bearer <ACCESS-TOKEN>'\n
import http.client\n\nconn = http.client.HTTPSConnection(\"<COMPANY-URL>\")  # host only, without the https:// scheme\nheaders = {\n    'content-type': \"application/json\",\n    'authorization': \"Bearer <ACCESS-TOKEN>\"\n    }\nconn.request(\"GET\", \"/v1/k8s/clusters\", headers=headers)\n\nres = conn.getresponse()\ndata = res.read()\n\nprint(data.decode(\"utf-8\"))\n

      (replace <ACCESS-TOKEN> with the bearer token from above).

For an additional example, see the following code. It shows how to use the Run:ai REST API to create a User and a Project and assign the User to the Project.

      "},{"location":"developer/admin-rest-api/overview/#runai-rest-api-documentation","title":"Run:ai REST API Documentation","text":"

      The Run:ai REST API offers developers a robust interface for interacting with and managing Run:ai metadata objects, including Projects, Departments, Clusters, and Users.

      Public API documentation is available at api-docs.run.ai. For self-hosted deployments, access the documentation at https://<control-plane-url>/api/docs.

      View Documentation

      "},{"location":"developer/admin-rest-api/overview/#runai-api-policy","title":"Run:ai API Policy","text":"

      At Run:ai, we are dedicated to delivering stable, reliable, and well-documented APIs. Our goal is to ensure that our APIs evolve in a predictable, transparent manner, offering users a seamless experience.

      Run:ai follows strict API design and operational standards to ensure a consistent and high-quality experience for users.

      "},{"location":"developer/admin-rest-api/overview/#api-lifecycle-and-deprecation","title":"API Lifecycle and Deprecation","text":"

      While our goal is to maintain stable and backward-compatible APIs, there may be times when breaking changes or deprecations are necessary.

In case of breaking changes, the deprecated version of the API will be supported for two additional versions in self-hosted deployments and for six months in SaaS deployments. During this period, no new features or functionality will be added to the deprecated API. When an API or API field is deprecated, the following process is followed:

• Documentation: The deprecated API or field is clearly labeled in the documentation, with a replacement provided where applicable.
• Release Notes: Information about deprecated APIs, including those scheduled for future removal, is included in the release notes.
• Customer Notification: Customers are notified of upcoming deprecations as part of the regular release communications.

      "},{"location":"developer/admin-rest-api/overview/#api-removal","title":"API Removal","text":"

      After the defined backward compatibility period has ended, deprecated APIs or fields are removed from both the codebase and the documentation.

      "},{"location":"developer/cluster-api/other-resources/","title":"Support for other Kubernetes Applications","text":""},{"location":"developer/cluster-api/other-resources/#introduction","title":"Introduction","text":"

      Kubernetes has several built-in resources that encapsulate running Pods. These are called Kubernetes Workloads and should not be confused with Run:ai Workloads.

      Examples of such resources are a Deployment that manages a stateless application, or a Job that runs tasks to completion.

Run:ai natively runs Run:ai Workloads. A Run:ai workload encapsulates all the resources needed to run, creates them, and deletes them together. However, Run:ai, being an open platform, allows the scheduling of any Kubernetes Workload.

      "},{"location":"developer/cluster-api/other-resources/#how-to","title":"How To","text":"

      To run Kubernetes Workloads with Run:ai you must add the following to the YAML:

      • A namespace that is associated with a Run:ai Project.
      • A scheduler name: runai-scheduler.
• When using Fractions, use a specific syntax for the nvidia.com/gpu limit.
      "},{"location":"developer/cluster-api/other-resources/#example-job","title":"Example: Job","text":"job1.yaml
      apiVersion: batch/v1\nkind: Job # (1)\nmetadata:\n  name: job1\n  namespace: runai-team-a # (2)\nspec:\n  template:\n    spec:\n      containers:\n      - name: job1-container\n        image: runai.jfrog.io/demo/quickstart\n        resources:\n          limits:\n            nvidia.com/gpu: 1 # (4)\n      restartPolicy: Never\n      schedulerName: runai-scheduler # (3)\n
      1. This is a Kubernetes Job.
      2. Namespace: Replace runai-team-a with the name of the Run:ai namespace for the specific Project (typically runai-<Project-Name>).
      3. The job to be scheduled with the Run:ai scheduler.
      4. To run with half a GPU replace 1 with \"0.5\" (with apostrophes).

      To submit the Job run:

      kubectl apply -f job1.yaml\n

You will be able to see the Job in the Run:ai user interface, including all metrics and lists.

      "},{"location":"developer/cluster-api/other-resources/#example-deployment","title":"Example: Deployment","text":"deployment1.yaml
      apiVersion: apps/v1\nkind: Deployment # (1)\nmetadata:\n  name: inference-1\n  namespace: runai-team-a # (2)\nspec:\n  replicas: 1\n  selector:\n    matchLabels:\n      app: inference-1\n  template:\n    metadata:\n      labels:\n        app: inference-1\n    spec:\n      containers:\n        - resources:\n            limits:\n              nvidia.com/gpu: 1 # (4)\n          image: runai/example-marian-server\n          imagePullPolicy: Always\n          name: inference-1\n          ports:\n            - containerPort: 8888\n      schedulerName: runai-scheduler # (3)\n\n---\napiVersion: v1\nkind: Service # (5)\nmetadata:\n  labels:\n    app: inference-1\n  name: inference-1\nspec:\n  type: ClusterIP\n  ports:\n    - port: 8888\n      targetPort: 8888\n  selector:\n    app: inference-1\n
      1. This is a Kubernetes Deployment.
      2. Namespace: Replace runai-team-a with the name of the Run:ai namespace for the specific Project (typically runai-<Project-Name>).
      3. The job to be scheduled with the Run:ai scheduler.
      4. To run with half a GPU replace 1 with \"0.5\" (with apostrophes).
      5. This example also contains the creation of a service to connect to the deployment. It is not mandatory.

      To submit the Deployment run:

      kubectl apply -f deployment1.yaml\n
      "},{"location":"developer/cluster-api/other-resources/#example-submit-a-cron-job-via-yaml","title":"Example: Submit a Cron job via YAML","text":"

      The cron command-line utility is a job scheduler typically used to set up and maintain software environments at scheduled intervals. Run:ai now supports submitting jobs with cron using a YAML file.

      To submit a job using cron, run the following command:

      kubectl apply -f <file_name>.yaml\n

      The following is an example YAML file:

apiVersion: batch/v1\nkind: CronJob\nmetadata:\n  name: hello\nspec:\n  schedule: \"* * * * *\"\n  jobTemplate:\n    spec:\n      template:\n        metadata:\n          labels:\n            runai/queue: team-a # Mandatory\n        spec:\n          schedulerName: runai-scheduler # Mandatory\n          containers:\n          - name: hello\n            image: busybox:1.28\n            imagePullPolicy: IfNotPresent\n            command:\n            - /bin/sh\n            - -c\n            - date; echo Hello from the Kubernetes cluster\n          restartPolicy: OnFailure\n          priorityClassName: train # Optional: one of build / train / inference / interactivePreemptible\n
      "},{"location":"developer/cluster-api/other-resources/#limitations","title":"Limitations","text":"

      The Run:ai command line interface provides limited support for Kubernetes Workloads.

      "},{"location":"developer/cluster-api/other-resources/#see-also","title":"See Also","text":"

      Run:ai has specific integrations with additional third-party tools such as KubeFlow, MLFlow, and more. These integrations use the same instructions as described above.

      "},{"location":"developer/cluster-api/reference/","title":"Reference","text":"

      For a full reference for the YAML API parameters see the YAML Reference document.

      "},{"location":"developer/cluster-api/submit-rest/","title":"Submitting Workloads via HTTP/REST","text":"

      You can submit Workloads via HTTP calls, using the Kubernetes REST API.

      "},{"location":"developer/cluster-api/submit-rest/#submit-workload-example","title":"Submit Workload Example","text":"

      To submit a workload via HTTP, run the following:

curl -X POST \\ # (1) \n'https://<IP>:6443/apis/run.ai/v2alpha1/namespaces/<PROJECT>/trainingworkloads' \\ \n    --header 'Content-Type: application/yaml' \\\n    --header 'Authorization: Bearer <BEARER>' \\  # (2) \n    --data-raw 'apiVersion: run.ai/v2alpha1\nkind: TrainingWorkload  # (3)\nmetadata:\n  name: job-1\nspec:\n  gpu:\n    value: \"1\"\n  image:\n    value: runai.jfrog.io/demo/quickstart\n  name:\n    value: job-1'\n
      1. Replace <IP> with the Kubernetes control-plane endpoint (can be found in kubeconfig profile). Replace <PROJECT> with the name of the Run:ai namespace for the specific Project (typically runai-<Project-Name>). Replace trainingworkloads with interactiveworkloads, distributedworkloads or inferenceworkloads according to type.
      2. Add Bearer token. To obtain a Bearer token see API authentication.
      3. See Submitting a Workload via YAML for an explanation of the YAML-based workload.

      Run: runai list jobs to see the new Workload.

      "},{"location":"developer/cluster-api/submit-rest/#delete-workload-example","title":"Delete Workload Example","text":"

      To delete a workload run:

      curl -X DELETE \\ # (1) \n'https://<IP>:6443/apis/run.ai/v2alpha1/namespaces/<PROJECT>/trainingworkloads/<JOB-NAME>' \\ \n    --header 'Content-Type: application/yaml' \\\n    --header 'Authorization: Bearer <BEARER>'   # (2)\n
      1. Replace <IP> with the Kubernetes control-plane endpoint (can be found in kubeconfig profile). Replace <PROJECT> with the name of the Run:ai namespace for the specific Project (typically runai-<Project-Name>). Replace trainingworkloads with interactiveworkloads, distributedworkloads or inferenceworkloads according to type. Replace <JOB-NAME> with the name of the Job.
      2. Add Bearer token. To obtain a Bearer token see API authentication.
      "},{"location":"developer/cluster-api/submit-rest/#suspendstop-workload-example","title":"Suspend/Stop workload example","text":"

      To suspend or stop a workload run:

curl -X PATCH \\ # (1) \n'https://<IP>:6443/apis/run.ai/v2alpha1/namespaces/<PROJECT>/interactiveworkloads/<JOB-NAME>' \\\n    --header 'Content-Type: application/json' \\\n    --header 'Authorization: Bearer <TOKEN>' \\ # (2) \n    --data '{\"spec\":{\"active\": {\"value\": \"false\"}}}'\n
      1. Replace <IP> with the Kubernetes control-plane endpoint (can be found in kubeconfig profile). Replace <PROJECT> with the name of the Run:ai namespace for the specific Project (typically runai-<Project-Name>). Replace trainingworkloads with interactiveworkloads, distributedworkloads or inferenceworkloads according to type. Replace <JOB-NAME> with the name of the Job.
      2. Add Bearer token. To obtain a Bearer token see API authentication.
      "},{"location":"developer/cluster-api/submit-rest/#using-other-programming-languages","title":"Using other Programming Languages","text":"

      You can use any Kubernetes client library together with the YAML documentation above to submit workloads via other programming languages. For more information see Kubernetes client libraries.

      "},{"location":"developer/cluster-api/submit-rest/#python-example","title":"Python example","text":"

      Create the following file and run it via python:

      create-train.py
      import json\nimport requests\n\n# (1)\nurl = \"https://<IP>:6443/apis/run.ai/v2alpha1/namespaces/<PROJECT>/trainingworkloads\"\n\npayload = json.dumps({\n  \"apiVersion\": \"run.ai/v2alpha1\",\n  \"kind\": \"TrainingWorkload\",\n  \"metadata\": {\n    \"name\": \"train1\",\n    \"namespace\": \"runai-team-a\"\n  },\n  \"spec\": {\n    \"image\": {\n      \"value\": \"runai.jfrog.io/demo/quickstart\"\n    },\n    \"name\": {\n      \"value\": \"train1\"\n    },\n    \"gpu\": {\n      \"value\": \"1\"\n    }\n  }\n})\n\nheaders = {\n  'Content-Type': 'application/json',\n  'Authorization': 'Bearer <TOKEN>' #(2)\n}\n\nresponse = requests.request(\"POST\", url, headers=headers, data=payload) # (3)\n\nprint(json.dumps(json.loads(response.text), indent=4))\n
      1. Replace <IP> with the Kubernetes control-plane endpoint (can be found in kubeconfig profile). Replace <PROJECT> with the name of the Run:ai namespace for the specific Project (typically runai-<Project-Name>). Replace trainingworkloads with interactiveworkloads, distributedworkloadsor inferenceworkloads according to type.
      2. Add Bearer token. To obtain a Bearer token see API authentication.
3. If you do not have a valid certificate, you can add the argument verify=False to requests.request.
      "},{"location":"developer/cluster-api/submit-yaml/","title":"Submitting Workloads via YAML","text":"

      You can use YAML to submit Workloads directly to Run:ai. Below are examples of how to create training, interactive and inference workloads via YAML.

      For details on YAML parameters, see the YAML Reference.

      "},{"location":"developer/cluster-api/submit-yaml/#submit-workload-example","title":"Submit Workload Example","text":"

      Create a file named training1.yaml with the following text:

      training1.yaml
      apiVersion: run.ai/v2alpha1\nkind: TrainingWorkload # (1)\nmetadata:\n  name: job-1  # (2) \n  namespace: runai-team-a # (3)\nspec:\n  gpu:\n    value: \"1\"\n  image:\n    value: runai.jfrog.io/demo/quickstart\n  name:\n    value: job-1 # (4)\n
      1. This is a Training workload.
      2. Kubernetes object name. Mandatory, but has no functional significance.
      3. Namespace: Replace runai-team-a with the name of the Run:ai namespace for the specific Project (typically runai-<Project-Name>).
4. Job name as it appears in Run:ai. You can provide a name, or have one created automatically if a name prefix is configured.

      Change the namespace and run: kubectl apply -f training1.yaml

      Run: runai list jobs to see the new Workload.

      "},{"location":"developer/cluster-api/submit-yaml/#delete-workload-example","title":"Delete Workload Example","text":"

      Run: kubectl delete -f training1.yaml to delete the Workload.

      "},{"location":"developer/cluster-api/submit-yaml/#creating-a-yaml-syntax-from-a-cli-command","title":"Creating a YAML syntax from a CLI command","text":"

      An easy way to create a YAML for a workload is to generate it from the runai submit command by using the --dry-run flag. For example, run:

      runai submit build1 -i ubuntu -g 1 --interactive --dry-run \\\n     -- sleep infinity \n

      The result will be the following Kubernetes object declaration:

      apiVersion: run.ai/v2alpha1\nkind: InteractiveWorkload  # (1)\nmetadata:\n  creationTimestamp: null\n  labels:\n    PreviousJob: \"true\"\n  name: job-0-2022-05-02t08-50-57\n  namespace: runai-team-a\nspec:\n  command:\n    value: sleep infinity\n  gpu:\n    value: \"1\"\n  image:\n    value: ubuntu\n  imagePullPolicy:\n    value: Always\n  name:\n    value: job-0\n\n... Additional internal and status properties...\n
      1. This is an Interactive workload.
      "},{"location":"developer/cluster-api/submit-yaml/#inference-workload-example","title":"Inference Workload Example","text":"

      Creating an inference workload is similar to the above two examples.

      apiVersion: run.ai/v2alpha1\nkind: InferenceWorkload\nmetadata:\n  name: inference1\n  namespace: runai-team-a\nspec:\n  name:\n    value: inference1\n  gpu:\n    value: \"0.5\"\n  image:\n    value: \"runai.jfrog.io/demo/example-triton-server\"\n  minScale:\n    value: 1\n  maxScale:\n    value: 2\n  metric:\n    value: concurrency # (1)\n  target:\n    value: 80  # (2)\n  ports:\n      items:\n        port1:\n          value:\n            container: 8000\n            protocol: http\n            serviceType: ServingPort\n
      1. Possible metrics are throughput, concurrency and latency.
      2. Inference requires a port to receive requests.
      "},{"location":"developer/cluster-api/submit-yaml/#suspendresume-interactivetraining-workload","title":"Suspend/Resume Interactive/Training Workload","text":"

      To suspend training:

      apiVersion: run.ai/v2alpha1\nkind: TrainingWorkload # \nmetadata:\n  name: job-1  #  \n  namespace: runai-team-a # \nspec:\n  gpu:\n    value: \"1\"\n  active:\n    value: false\n  image:\n    value: runai.jfrog.io/demo/quickstart\n  name:\n    value: job-1 # \n
To suspend the workload, set active to false. To resume it, either set active to true or remove the field entirely.

      "},{"location":"developer/cluster-api/submit-yaml/#see-also","title":"See Also","text":"
      • To understand how to connect to the inference workload, see Inference Quickstart.
      • To learn more about Inference and Run:ai see Inference overview.
      "},{"location":"developer/cluster-api/workload-overview-dev/","title":"Cluster API (Deprecated)","text":"

      The Run:ai Cluster API allows the submission of Workloads via YAML, directly to Kubernetes.

      Important

With Run:ai clusters of version 2.18 and above, you can now submit Workloads via the Run:ai REST API. We recommend using this API if your cluster is of that version or later.

      "},{"location":"developer/cluster-api/workload-overview-dev/#workloads","title":"Workloads","text":"

      Run:ai schedules Workloads. Run:ai workloads contain:

      • The Kubernetes resource (Job, Deployment, etc) that is used to launch the container inside which the data science code runs.
      • A set of additional resources that is required to run the Workload. Examples: a service entry point that allows access to the Job, a persistent volume claim to access data on the network and more.

      Run:ai supports the following Workloads types:

      Workload Type Kubernetes Name Description Interactive InteractiveWorkload Submit an interactive workload Training TrainingWorkload Submit a training workload Distributed Training DistributedWorkload Submit a distributed training workload using TensorFlow, PyTorch or MPI Inference InferenceWorkload Submit an inference workload"},{"location":"developer/cluster-api/workload-overview-dev/#values","title":"Values","text":"

      A Workload will typically have a list of values, such as name, image, and resources. A full list of values is available in the runai-submit Command-line reference.

To find the exact YAML syntax, run:

      kubectl explain TrainingWorkload.spec\n

      (and similarly for other Workload types).

      To get information on a specific value (e.g. node type), you can also run:

      kubectl explain TrainingWorkload.spec.nodeType\n

      Result:

      KIND:     TrainingWorkload\nVERSION:  run.ai/v2alpha1\n\nRESOURCE: nodeType <Object>\n\nDESCRIPTION:\n     Specifies nodes (machines) or a group of nodes on which the workload will\n     run. To use this feature, your Administrator will need to label nodes as\n     explained in the Group Nodes guide at\n     https://docs.run.ai/admin/researcher-setup/limit-to-node-group. This flag\n     can be used in conjunction with Project-based affinity. In this case, the\n     flag is used to refine the list of allowable node groups set in the\n     Project. For more information consult the Projects guide at\n     https://docs.run.ai/admin/admin-ui-setup/project-setup.\n\nFIELDS:\n   value    <string>\n
      "},{"location":"developer/cluster-api/workload-overview-dev/#how-to-submit","title":"How to Submit","text":"

      A Workload can be submitted via various channels:

      • The Run:ai user interface.
      • The Run:ai command-line interface, via the runai submit command.
      • The Run:ai Cluster API.
      "},{"location":"developer/cluster-api/workload-overview-dev/#policies","title":"Policies","text":"

      An Administrator can set Policies for Workload submission. Policies serve two purposes:

      1. To constrain the values a researcher can specify.
      2. To provide default values.

      For example, an administrator can,

      • Set a maximum of 5 GPUs per Workload.
      • Provide a default value of 1 GPU for each container.

Each workload type has a matching kind of workload policy. For example, an InteractiveWorkload has a matching InteractivePolicy.

      A Policy of each type can be defined per-project. There is also a global policy that applies to any project that does not have a per-project policy.

      For further details on policies, see Policies.

      "},{"location":"developer/metrics/metrics-api/","title":"Metrics and telemetry","text":""},{"location":"developer/metrics/metrics-api/#telemetry","title":"Telemetry","text":"

      Telemetry is a numeric measurement recorded in real-time when emitted from the Run:ai cluster.

      "},{"location":"developer/metrics/metrics-api/#metrics","title":"Metrics","text":"

      Metrics are numeric measurements recorded over time that are emitted from the Run:ai cluster. Typical metrics involve utilization, allocation, time measurements and so on. Metrics are used in Run:ai dashboards as well as in the Run:ai administration user interface.

      The purpose of this document is to detail the structure and purpose of metrics emitted by Run:ai. This enables customers to create custom dashboards or integrate metric data into other monitoring systems.

Run:ai provides metrics via the Run:ai Control-plane API. Previously, Run:ai provided metrics information via direct access to an internal metrics store. This method is deprecated but is still documented here.

      "},{"location":"developer/metrics/metrics-api/#metric-and-telemetry-scopes","title":"Metric and telemetry Scopes","text":"

Run:ai provides a Control-plane API which supports and aggregates metrics at various levels.

      Level Description Cluster A cluster is a set of Nodes Pools & Nodes. With Cluster metrics, metrics are aggregated at the Cluster level Node Data is aggregated at the Node level. Node Pool Data is aggregated at the Node Pool level. Workload Data is aggregated at the Workload level. In some Workloads, e.g. with distributed workloads, these metrics aggregate data from all worker pods Pod The basic execution unit Project The basic organizational unit. Projects are the tool to implement resource allocation policies as well as the segregation between different initiatives. Department Departments are a grouping of projects. ## Supported Metrics Metric Cluster Node Pool Node Workload Pod Project Department API Cluster API Node Pool API Workload API Pod API ALLOCATED_GPU TRUE TRUE TRUE AVG_WORKLOAD_WAIT_TIME TRUE TRUE CPU_LIMIT_CORES TRUE CPU_MEMORY_LIMIT_BYTES TRUE CPU_MEMORY_REQUEST_BYTES TRUE CPU_MEMORY_USAGE_BYTES TRUE TRUE TRUE CPU_MEMORY_UTILIZATION TRUE TRUE TRUE CPU_REQUEST_CORES TRUE CPU_USAGE_CORES TRUE TRUE TRUE CPU_UTILIZATION TRUE TRUE TRUE GPU_ALLOCATION TRUE TRUE TRUE GPU_MEMORY_REQUEST_BYTES TRUE GPU_MEMORY_USAGE_BYTES TRUE TRUE GPU_MEMORY_USAGE_BYTES_PER_GPU TRUE TRUE GPU_MEMORY_UTILIZATION TRUE TRUE GPU_MEMORY_UTILIZATION_PER_GPU TRU GPU_QUOTA TRUE TRUE TRUE TRUE GPU_UTILIZATION TRUE TRUE TRUE TRUE GPU_UTILIZATION_PER_GPU TRUE TRUE POD_COUNT TRUE RUNNING_POD_COUNT TRUE TOTAL_GPU TRUE TRUE TOTAL_GPU_NODES TRUE TRUE GPU_UTILIZATION_DISTRIBUTION TRUE TRUE UNALLOCATED_GPU TRUE TRUE CPU_QUOTA_MILLICORES TRUE TRUE CPU_MEMORY_QUOTA_MB TRUE TRUE CPU_ALLOCATION_MILLICORES TRUE TRUE CPU_MEMORY_ALLOCATION_MB TRUE TRUE"},{"location":"developer/metrics/metrics-api/#advanced-metrics","title":"Advanced Metrics","text":"

      NVIDIA provides extended metrics at the Pod level. These are documented here. To enable these metrics please contact Run:ai customer support.

      Metric Cluster Node Pool Workload Pod GPU_FP16_ENGINE_ACTIVITY_PER_GPU TRUE GPU_FP32_ENGINE_ACTIVITY_PER_GPU TRUE GPU_FP64_ENGINE_ACTIVITY_PER_GPU TRUE GPU_GRAPHICS_ENGINE_ACTIVITY_PER_GPU TRUE GPU_MEMORY_BANDWIDTH_UTILIZATION_PER_GPU TRUE GPU_NVLINK_RECEIVED_BANDWIDTH_PER_GPU TRUE GPU_NVLINK_TRANSMITTED_BANDWIDTH_PER_GPU TRUE GPU_PCIE_RECEIVED_BANDWIDTH_PER_GPU TRUE GPU_PCIE_TRANSMITTED_BANDWIDTH_PER_GPU TRUE GPU_SM_ACTIVITY_PER_GPU TRUE GPU_SM_OCCUPANCY_PER_GPU TRUE GPU_TENSOR_ACTIVITY_PER_GPU TRUE"},{"location":"developer/metrics/metrics-api/#_1","title":"Metrics via API","text":""},{"location":"developer/metrics/metrics-api/#supported-telemetry","title":"Supported telemetry","text":"telemetry Node Workload Project Department API Node API Workload API WORKLOADS_COUNT TRUE ALLOCATED_GPUS TRUE TRUE TRUE TRUE READY_GPU_NODES TRUE READY_GPUS TRUE TOTAL_GPU_NODES TRUE TOTAL_GPUS TRUE IDLE_ALLOCATED_GPUS TRUE FREE_GPUS TRUE TOTAL_CPU_CORES TRUE USED_CPU_CORES TRUE ALLOCATED_CPU_CORES TRUE TRUE TRUE TOTAL_GPU_MEMORY_BYTES TRUE USED_GPU_MEMORY_BYTES TRUE TOTAL_CPU_MEMORY_BYTES TRUE USED_CPU_MEMORY_BYTES TRUE ALLOCATED_CPU_MEMORY_BYTES TRUE TRUE TRUE GPU_QUOTA TRUE TRUE CPU_QUOTA TRUE TRUE MEMORY_QUOTA TRUE TRUE GPU_ALLOCATION_NON_PREEMPTIBLE TRUE TRUE CPU_ALLOCATION_NON_PREEMPTIBLE TRUE TRUE MEMORY_ALLOCATION_NON_PREEMPTIBLE TRUE TRUE"},{"location":"developer/metrics/metrics/","title":"Metrics API","text":""},{"location":"developer/metrics/metrics/#what-are-metrics","title":"What are Metrics","text":"

      Metrics are numeric measurements recorded over time that are emitted from the Run:ai cluster. Typical metrics involve utilization, allocation, time measurements and so on. Metrics are used in Run:ai dashboards as well as in the Run:ai administration user interface.

      The purpose of this document is to detail the structure and purpose of metrics emitted by Run:ai to enable customers to create custom dashboards or integrate metric data into other monitoring systems.

      Run:ai uses Prometheus for collecting and querying metrics.

      Warning

      From cluster version 2.17 and onwards, Run:ai supports metrics via the Run:ai Control-plane API. Direct metrics queries (metrics that are queried directly from Prometheus) are deprecated.

      "},{"location":"developer/metrics/metrics/#published-runai-metrics","title":"Published Run:ai Metrics","text":"

      Following is the list of published Run:ai metrics, per cluster version (make sure to pick the right cluster version in the picker at the top of the page):

      Metric name Labels Measurement Description runai_active_job_cpu_requested_cores {clusterId, job_name, job_uuid} CPU Cores Workload's requested CPU cores runai_active_job_memory_requested_bytes {clusterId, job_name, job_uuid} Bytes Workload's requested CPU memory runai_cluster_cpu_utilization {clusterId} 0 to 1 CPU utilization of the entire cluster runai_cluster_memory_used_bytes {clusterId} Bytes Used CPU memory of the entire cluster runai_cluster_memory_utilization {clusterId} 0 to 1 CPU memory utilization of the entire cluster runai_allocated_gpu_count_per_gpu {gpu, clusterId, node} 0/1 Is a GPU hosting a pod runai_last_gpu_utilization_time_per_gpu {gpu, clusterId, node} Unix time Last time GPU was not idle runai_requested_gpu_memory_mb_per_workload {clusterId, job_type, job_uuid, job_name, project, workload_id} MegaBytes Requested GPU memory per workload (0 if not specified by the user) runai_requested_gpus_per_workload {clusterId, workload_type, workload_id, workload_name, project} Double Number of requested GPUs per workload runai_run_time_seconds_per_workload {clusterId, workload_id, workload_name} Seconds Total run time per workload runai_wait_time_seconds_per_workload {clusterId, workload_id, workload_name} Seconds Total wait time per workload runai_node_cpu_requested_cores {clusterId, node} Double Sum of the requested CPU cores of all workloads running in a node runai_node_cpu_utilization {clusterId, node} 0 to 1 CPU utilization per node runai_node_memory_utilization {clusterId, node} 0 to 1 CPU memory utilization per node runai_node_requested_memory_bytes {clusterId, node} Bytes Sum of the requested CPU memory of all workloads running in a node runai_node_used_memory_bytes {clusterId, node} Bytes Used CPU memory per node runai_project_guaranteed_gpus {clusterId, project} Double Guaranteed GPU quota per project runai_project_info {memory_quota, cpu_quota, gpu_guaranteed_quota, clusterId, project, department} N/A Information on CPU, CPU memory, GPU quota per project runai_queue_info {memory_quota, cpu_quota, gpu_guaranteed_quota, clusterId, nodepool, queue_name, department} N/A Information on CPU, CPU memory, GPU quota per project/department per nodepool runai_cpu_limits_per_active_workload {clusterId, job_name , job_uuid} CPU Cores Workloads CPU limit (in number of cores). See link runai_job_cpu_usage {clusterId, workload_id, workload_name, project} Double Workloads CPU usage (in number of cores) runai_memory_limits_per_active_workload {clusterId, job_name, job_uuid} Bytes Workloads CPU memory limit. See link runai_active_job_memory_requested_bytes {clusterId, job_name, job_uuid} Bytes Workloads requested CPU memory. 
See link runai_job_memory_used_bytes {clusterId, workload_id, workload_name, project} Bytes Workloads used CPU memory runai_mig_mode_gpu_count {clusterId, node} Double Number of GPUs on MIG nodes (Deprecated) runai_gpu_utilization_per_gpu {clusterId, gpu, node} % GPU Utilization per GPU runai_gpu_utilization_per_node {clusterId, node} % GPU Utilization per Node runai_gpu_memory_used_mebibytes_per_gpu {clusterId, gpu, node} MiB Used GPU memory per GPU runai_gpu_memory_used_mebibytes_per_node {clusterId, node} MiB Used GPU memory per Node runai_gpu_memory_total_mebibytes_per_gpu {clusterId, gpu, node} MiB Total GPU memory per GPU runai_gpu_memory_total_mebibytes_per_node {clusterId, node} MiB Total GPU memory per Node runai_gpu_count_per_node {clusterId, node, modelName, ready, schedulable} Number Number of GPUs per Node runai_allocated_gpu_count_per_workload {clusterId, workload_id, workload_name, workload_type, user} Double Number of allocated GPUs per Workload runai_allocated_gpu_count_per_project {clusterId, project} Double Number of allocated GPUs per Project runai_gpu_memory_used_mebibytes_per_pod_per_gpu {clusterId, pod_name, pod_uuid, pod_namespace, node, gpu} MiB Used GPU Memory per Pod, per Gpu on which the workload is running runai_gpu_memory_used_mebibytes_per_workload {clusterId, workload_id, workload_name, workload_type, user} MiB Used GPU Memory per Workload runai_gpu_utilization_per_pod_per_gpu {clusterId, pod_name, pod_uuid, pod_namespace, node, gpu} % GPU Utilization per Pod per GPU runai_gpu_utilization_per_workload {clusterId, workload_id, workload_name, workload_type, user} % Average GPU Utilization per Workload runai_gpu_utilization_per_project {clusterId, project} % Average GPU Utilization per Project runai_last_gpu_utilization_time_per_workload {clusterId, workload_id, workload_name, workload_type, user} Seconds (Unix Timestamp) The Last Time (Unix Timestamp) That The Workload Utilized Any Of Its Allocated GPUs runai_gpu_idle_seconds_per_workload {clusterId, workload_id, workload_name, workload_type, user} Seconds Seconds Passed Since The Workload Utilized Any Of Its Allocated GPUs runai_allocated_gpu_count_per_pod {clusterId, pod_name, pod_uuid, pod_namespace, node} Double Number Of Allocated GPUs per Pod runai_allocated_gpu_count_per_node {clusterId, node} Double Number Of Allocated GPUs per Node runai_allocated_millicpus_per_pod {clusterId, pod_name, pod_uuid, pod_namespace, node} Integer Number Of Allocated Millicpus per Pod runai_allocated_memory_per_pod {clusterId, pod_name, pod_uuid, pod_namespace, node} Bytes Allocated Memory per Pod
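
      For clusters that still use the deprecated direct-Prometheus path, a query against one of the published metrics above might look like the following sketch. It uses the standard Prometheus HTTP query API; the Prometheus address is deployment-specific and shown here as a placeholder.

      ```python
      # Minimal sketch: query a published Run:ai metric directly from Prometheus.
      # Note: direct Prometheus queries are deprecated from cluster version 2.17;
      # prefer the Control-plane API. The Prometheus URL below is a placeholder.
      import requests

      PROMETHEUS_URL = "http://<prometheus-host>:9090"   # deployment-specific

      response = requests.get(
          f"{PROMETHEUS_URL}/api/v1/query",
          params={"query": "runai_gpu_utilization_per_node"},
          timeout=30,
      )
      response.raise_for_status()
      for series in response.json()["data"]["result"]:
          node = series["metric"].get("node", "unknown")
          _timestamp, value = series["value"]
          print(f"{node}: {value}% GPU utilization")
      ```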

      Following is a list of labels appearing in Run:ai metrics:

      Label Description clusterId Cluster Identifier department Name of Run:ai Department cpu_quota CPU limit per project gpu GPU index gpu_guaranteed_quota Guaranteed GPU quota per project image Name of Docker image namespace_name Namespace deployment_name Deployment name job_name Job name job_type Job type: training, interactive or inference job_uuid Job identifier workload_name Workload name workload_type Workload type: training, interactive or inference workload_uuid Workload identifier pod_name Pod name. A Workload can contain many pods. pod_namespace Pod namespace memory_quota CPU memory limit per project node Node name project Name of Run:ai Project status Workload status: Running, Pending, etc. For more information on Workload statuses see document user User identifier"},{"location":"developer/metrics/metrics/#other-metrics","title":"Other Metrics","text":"

      Run:ai exports other metrics emitted by NVIDIA and Kubernetes packages, as follows:

      Metric name Description runai_gpu_utilization_per_gpu GPU utilization kube_node_status_capacity The capacity for different resources of a node kube_node_status_condition The condition of a cluster node kube_pod_container_resource_requests_cpu_cores The number of CPU cores requested by container kube_pod_container_resource_requests_memory_bytes Bytes of memory requested by a container kube_pod_info Information about pod

      For additional information, see Kubernetes kube-state-metrics and NVIDIA dcgm exporter.

      "},{"location":"developer/metrics/metrics/#changed-metrics-and-api-mapping","title":"Changed metrics and API mapping","text":"

      Starting in cluster version 2.17, some metric names have been changed. In addition, some Run:ai metrics are available as API endpoints. Using the API endpoints is more efficient and provides an easier way of retrieving metrics in any application. The following table lists the metrics that were changed.

      Metric name in version 2.16 2.17 Change Description 2.17 API Endpoint runai_active_job_cpu_requested_cores available also via API https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with \"CPU_REQUEST_CORES\" metricType runai_active_job_memory_requested_bytes available also via API https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with \"CPU_MEMORY_REQUEST_BYTES\" metricType runai_cluster_cpu_utilization available also via API https://app.run.ai/api/v2/clusters/{clusterUuid}/metrics ; with \"CPU_UTILIZATION\" metricType runai_cluster_memory_utilization available also via API https://app.run.ai/api/v2/clusters/{clusterUuid}/metrics ; with \"CPU_MEMORY_UTILIZATION\" metricType runai_gpu_utilization_non_fractional_jobs no longer available runai_allocated_gpu_count_per_workload labels changed runai_gpu_utilization_per_pod_per_gpu available also via API https://app.run.ai/api/v1/workloads/{workloadId}/pods/{podId}/metrics ; with \"GPU_UTILIZATION_PER_GPU\" metricType runai_gpu_utilization_per_workload available also via API and labels changed https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with \"GPU_UTILIZATION\" metricType runai_job_image no longer available runai_job_requested_gpu_memory available also via API and renamed to: \"runai_requested_gpu_memory_mb_per_workload\" with different labels https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with \"GPU_MEMORY_REQUEST_BYTES\" metricType runai_job_requested_gpus renamed to: \"runai_requested_gpus_per_workload\" with different labels runai_job_total_runtime renamed to: \"runai_run_time_seconds_per_workload\" with different labels runai_job_total_wait_time renamed to: \"runai_wait_time_seconds_per_workload\" with different labels runai_gpu_memory_used_mebibytes_per_workload available also via API and labels changed https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with \"GPU_MEMORY_USAGE_BYTES\" metricType runai_gpu_memory_used_mebibytes_per_pod_per_gpu available also via API and labels changed https://app.run.ai/api/v1/workloads/{workloadId}/pods/{podId}/metrics ; with \"GPU_MEMORY_USAGE_BYTES_PER_GPU\" metricType runai_node_gpu_used_memory_bytes renamed and changed units: \"runai_gpu_memory_used_mebibytes_per_node\" runai_node_total_memory_bytes renamed and changed units: \"runai_gpu_memory_total_mebibytes_per_node\" runai_project_info labels changed runai_active_job_cpu_limits available also via API and renamed to: \"runai_cpu_limits_per_active_workload\" https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with \"CPU_LIMIT_CORES\" metricType runai_job_cpu_usage available also via API and labels changed https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with \"CPU_USAGE_CORES\" metricType runai_active_job_memory_limits available also via API and renamed to: \"runai_memory_limits_per_active_workload\" https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with \"CPU_MEMORY_LIMIT_BYTES\" metricType runai_running_job_memory_requested_bytes was a duplication of \"runai_active_job_memory_requested_bytes\", see above runai_job_memory_used_bytes available also via API and labels changed https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with \"CPU_MEMORY_USAGE_BYTES\" metricType runai_job_swap_memory_used_bytes no longer available runai_gpu_count_per_node added labels runai_last_gpu_utilization_time_per_workload labels changed runai_gpu_idle_time_per_workload renamed to: \"runai_gpu_idle_seconds_per_workload\" with different 
labels"},{"location":"developer/metrics/metrics/#create-custom-dashboards","title":"Create custom dashboards","text":"

      To create custom dashboards based on the above metrics, please contact Run:ai customer support.
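
      Independently of the support-assisted dashboard flow, metric data for a custom dashboard can also be pulled programmatically using the API mapping above, for example cluster-level CPU utilization via https://app.run.ai/api/v2/clusters/{clusterUuid}/metrics with the CPU_UTILIZATION metricType. The sketch below is illustrative only; the token handling and the time-range parameter names are assumptions.

      ```python
      # Minimal sketch: feed a custom dashboard from the clusters metrics endpoint.
      # Assumptions (for illustration only): bearer-token auth and "start"/"end"
      # time-range query parameters.
      import requests

      BASE_URL = "https://app.run.ai"      # or your tenant's control-plane URL
      CLUSTER_UUID = "<cluster-uuid>"      # placeholder
      TOKEN = "<api-bearer-token>"         # placeholder

      response = requests.get(
          f"{BASE_URL}/api/v2/clusters/{CLUSTER_UUID}/metrics",
          headers={"Authorization": f"Bearer {TOKEN}"},
          params={
              "metricType": "CPU_UTILIZATION",
              "start": "2024-01-01T00:00:00Z",
              "end": "2024-01-02T00:00:00Z",
          },
          timeout=30,
      )
      response.raise_for_status()
      print(response.json())               # plot or store the samples in your dashboard tool
      ```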

      "},{"location":"home/components/","title":"Run:ai System Components","text":""},{"location":"home/components/#components","title":"Components","text":"

      Run:ai is made up of two components:

      • The Run:ai cluster provides scheduling services and workload management.
      • The Run:ai control plane provides resource management, Workload submission and cluster monitoring.

      Technology-wise, both are installed over a Kubernetes Cluster.

      Run:ai users:

      • Researchers submit Machine Learning workloads via the Run:ai Console, the Run:ai Command-Line Interface (CLI), or directly by sending YAML files to Kubernetes.
      • Administrators monitor and set priorities via the Run:ai User Interface.

      "},{"location":"home/components/#runai-cluster","title":"Run:ai Cluster","text":"
      • Run:ai comes with its own Scheduler. The Run:ai scheduler extends the Kubernetes scheduler. It uses business rules to schedule workloads sent by Researchers.
      • Run:ai schedules Workloads. Workloads include the actual researcher code running as a Kubernetes container, together with all the system resources required to run the code, such as user storage, network endpoints to access the container etc.
      • The cluster uses an outbound-only, secure connection to synchronize with the Run:ai control plane. Information includes meta-data sync and various metrics on Workloads, Nodes etc.
      • The Run:ai cluster is installed as a Kubernetes Operator.
      • Run:ai is installed in its own Kubernetes namespace named runai.
      • Workloads are run in the context of Run:ai Projects. Each Project is mapped to a Kubernetes namespace with its own settings and access control.
      "},{"location":"home/components/#runai-control-plane-on-the-cloud","title":"Run:ai Control Plane on the cloud","text":"

      The Run:ai control plane is used by multiple customers (tenants) to manage resources (such as Projects & Departments), submit Workloads and monitor multiple clusters.

      A single Run:ai customer (tenant) defined in the control plane can manage multiple Run:ai clusters. This means a single customer can manage multiple GPU clusters in multiple locations or subnets from a single interface.

      "},{"location":"home/components/#self-hosted-control-plane","title":"Self-hosted Control-Plane","text":"

      The Run:ai control plane can also be locally installed. To understand the various installation options see the installation types document.

      "},{"location":"home/data-privacy-details/","title":"Data Privacy","text":"

      This article details the data privacy and compliance considerations for deploying Run:ai. It is intended to help administrators and compliance teams understand the data management practices involved with Run:ai. This ensures the permissions align with organizational policies and regulatory requirements before installation and during integration and onboarding of the various teams.

      When using the Run:ai SaaS cluster, the Control plane operates through the Run:ai cloud, requiring the transmission of certain data for control and analytics. Below is a detailed breakdown of the specific data sent to the Run:ai cloud in the SaaS offering.

      Note

      For organizations where data privacy policies do not align with this data transmission, Run:ai offers a self-hosted version. This version includes an on-premises control plane and does not communicate with the cloud.

      "},{"location":"home/data-privacy-details/#data-sent-to-the-runai-cloud","title":"Data sent to the Run:ai cloud","text":"Asset Details Workload Metrics Includes workload names, CPU, GPU, and memory metrics, as well as parameters provided during the runai submit command. Workload Assets Covers environments, compute resources, and data resources associated with workloads. Resource Credentials Credentials for cluster resources, encrypted with a SHA-512 algorithm specific to each tenant. Node Metrics Node-specific data including names, IPs, and performance metrics (CPU, GPU, memory). Cluster Metrics Cluster-wide metrics such as names, CPU, GPU, and memory usage. Projects & Departments Includes names and quota information for projects and departments. Users User roles within Run:ai, email addresses, and passwords."},{"location":"home/data-privacy-details/#key-consideration","title":"Key consideration","text":"

      Run:ai ensures that no deep-learning artefacts, such as code, images, container logs, training data, models, or checkpoints, are transmitted to the cloud. These assets remain securely within your organization's firewalls, safeguarding sensitive intellectual property and data.

      "},{"location":"home/data-privacy-details/#see-also","title":"See Also","text":"

      The Run:ai privacy policy.

      "},{"location":"home/overview/","title":"Run:ai Documentation Library","text":"

      Welcome to the Run:ai documentation area. For an introduction to the Run:ai Platform, see Run:ai platform on the run.ai website.

      The Run:ai documentation targets four personas:

      • Infrastructure Administrator - An IT person, responsible for the installation, setup and IT maintenance of the Run:ai product. Infrastructure Administrator documentation can be found here.

      • Platform Administrator - Responsible for the day-to-day administration of the product. Platform Administrator documentation can be found here.

      • Researcher \u2014 Using Run:ai to spin up notebooks, submit Workloads, prompt models, etc. Researcher documentation can be found here.

      • Developer \u2014 Using various APIs to automate work with Run:ai. The Developer documentation can be found here.

      "},{"location":"home/overview/#how-to-get-support","title":"How to Get Support","text":"

      To get support use the following channels:

      • On the Run:ai user interface at <company-name>.run.ai, use the 'Contact Support' link on the top right.

      • Or submit a ticket by clicking the button below:

      Submit a Ticket

      "},{"location":"home/overview/#community","title":"Community","text":"

      Run:ai provides its customers with access to the Run:ai Customer Community portal to submit tickets, track ticket progress and update support cases.

      Customer Community Portal

      Reach out to customer support for credentials.

      "},{"location":"home/overview/#runai-cloud-status-page","title":"Run:ai Cloud Status Page","text":"

      Run:ai cloud availability is monitored at status.run.ai.

      "},{"location":"home/overview/#collect-logs-to-send-to-support","title":"Collect Logs to Send to Support","text":"

      As an IT Administrator, you can collect Run:ai logs to send to support. For more information see logs collection.

      "},{"location":"home/overview/#example-code","title":"Example Code","text":"

      Code for the Docker images referred to on this site is available at https://github.com/run-ai/docs/tree/master/quickstart.

      The following images are used throughout the documentation:

      Image Description Source runai.jfrog.io/demo/quickstart Basic training image. Multi-GPU support https://github.com/run-ai/docs/tree/master/quickstart/main runai.jfrog.io/demo/quickstart-distributed Distributed training using MPI and Horovod https://github.com/run-ai/docs/tree/master/quickstart/distributed zembutsu/docker-sample-nginx Build (interactive) with Connected Ports https://github.com/zembutsu/docker-sample-nginx runai.jfrog.io/demo/quickstart-x-forwarding Use X11 forwarding from Docker image https://github.com/run-ai/docs/tree/master/quickstart/x-forwarding runai.jfrog.io/demo/pycharm-demo Image used for tool integration (PyCharm and VSCode) https://github.com/run-ai/docs/tree/master/quickstart/python%2Bssh runai.jfrog.io/demo/example-triton-client and runai.jfrog.io/demo/example-triton-server Basic Inference https://github.com/run-ai/models/tree/main/models/triton"},{"location":"home/overview/#contributing-to-the-documentation","title":"Contributing to the documentation","text":"

      This documentation is made better by individuals from our customer and partner community. If you see something worth fixing, please comment at the bottom of the page or create a pull request via GitHub. The public GitHub repository can be found on the top-right of this page.

      "},{"location":"home/saas-updates/","title":"What's New for the Run:ai SaaS Platform","text":"

      These release notes provide transparency into the latest changes and improvements to Run:ai\u2019s SaaS platform. The updates include new features, optimizations, and fixes aimed at improving performance and user experience.

      Latest GA release notes (https://docs.run.ai/latest/home/whats-new-2-19/)

      "},{"location":"home/saas-updates/#gradual-rollout","title":"Gradual Rollout","text":"

      SaaS features are gradually rolled out to customers over the course of a week to ensure a smooth transition and minimize any potential disruption.

      "},{"location":"home/saas-updates/#november-release","title":"November Release","text":""},{"location":"home/saas-updates/#product-enhancements","title":"Product Enhancements","text":"
      • The display of the default GPU quota for the default department has been updated. Previously, the GPU quota was shown as -1. It has now been changed to display as \"-\" for better clarity.
      • New permissions have been added for the Application Administrator role, enabling full CRUD (Create, Read, Update, Delete) capabilities for managing applications.
      "},{"location":"home/saas-updates/#resolved-bugs","title":"Resolved Bugs","text":"ID Description RUN-23778 Resolved an issue where SAML mappers were displayed as null in the UI upon editing an Identity Provider (IdP). The mapper values now persist as expected, and associated attributes remain unchanged. RUN-23762 Fixed a bug that caused some customers to receive the incorrect version of the dashboard. This issue led to inconsistencies in the user interface and functionality, impacting affected users' ability to access the appropriate dashboard features. RUN-23735 Fixed an issue where the limit parameter on the Users page did not enforce the minimum value constraint. This allowed invalid values to be processed, potentially causing errors in pagination RUN-23669 Consumption report: The Inspect feature in Grafana, which allows users to export consumption data from the portal, has been re-enabled RUN-23664 An issue has been resolved where the GPU quota numbers displayed on the Department Overview page did not match the values shown on the Department Edit page. RUN-20116 An issue has been resolved where searching for certain pages in the UI only applied the search filter to the current page. Relevant tables are: Users, Applications, Workloads, Projects, departments, Node pools. RUN-23575 The dynamic refresh was not properly preserving the user\u2019s widget settings, causing them to reset to default values after each refresh cycle. RUN-23376 CLI v2: An issue was resolved where the runai logs command failed with a 401 Unauthorized error after a period of inactivity RUN-23373 An issue where AWS storage classes were not appearing when creating a new data source within a new workload has been resolved. Previously, AWS storage classes were only visible when creating a data source directly from the Data Sources tab."},{"location":"home/whats-new-2-13/","title":"Run:ai version 2.13","text":""},{"location":"home/whats-new-2-13/#version-2137","title":"Version 2.13.7","text":""},{"location":"home/whats-new-2-13/#release-date","title":"Release date","text":"

      July 2023

      "},{"location":"home/whats-new-2-13/#release-content","title":"Release content","text":"
      • Added filters to the historic quota ratio widget on the Quota management dashboard.
      "},{"location":"home/whats-new-2-13/#fixed-issues","title":"Fixed issues","text":"Internal ID Description RUN-11080 Fixed an issue in OpenShift environments where log in via SSO with the kubeadmin user, gets blank pages for every page. RUN-11119 Fixed an issue where values that should be the Order of priority column are in the wrong column. RUN-11120 Fixed an issue where the Projects table does not show correct metrics when Run:ai version 2.13 is paired with a Run:ai 2.8 cluster. RUN-11121 Fixed an issue where the wrong over quota memory alert is shown in the Quota management pane in project edit form. RUN-11272 Fixed an issue in OpenShift environments where the selection in the cluster drop down in the main UI does not match the cluster selected on the login page."},{"location":"home/whats-new-2-13/#version-2134","title":"Version 2.13.4","text":""},{"location":"home/whats-new-2-13/#release-date_1","title":"Release date","text":"

      July 2023

      "},{"location":"home/whats-new-2-13/#fixed-issues_1","title":"Fixed issues","text":"Internal ID Description RUN-11089 Fixed an issue when creating an environment, commands in the Runtime settings pane and are not persistent and cannot be found in other assets (for example in a new Training)."},{"location":"home/whats-new-2-13/#version-2131","title":"Version 2.13.1","text":""},{"location":"home/whats-new-2-13/#release-date_2","title":"Release date","text":"

      July 2023

      "},{"location":"home/whats-new-2-13/#release-content_1","title":"Release content","text":"
      • Made an improvement so that occurrences of labels that are not in use anymore are deleted.
      "},{"location":"home/whats-new-2-13/#fixed-issues_2","title":"Fixed issues","text":"

      N/A

      "},{"location":"home/whats-new-2-13/#version-2130","title":"Version 2.13.0","text":""},{"location":"home/whats-new-2-13/#release-content_2","title":"Release content","text":"

      This version contains features and fixes from previous versions starting with 2.9. Refer to the prior versions for specific features and fixes.

      Projects

      • Improved the Projects UI for ease of use. The Projects view follows UI upgrades and changes designed to make setting up components and assets easier for administrators and researchers. To configure a project, see Projects.

      Dashboards

      • Added a new dashboard for Quota management, which provides an efficient means to monitor and manage resource utilization within the AI cluster. The dashboard filters the display of resource quotas based on Departments, Projects, and Node pools. For more information, see Quota management dashboard.

      • Added to the Overview dashboard, the ability to filter the cluster by one or more node pools. For more information, see Node pools.

      Nodes and Node pools

      • The Run:ai scheduler supports two scheduling strategies: Bin Packing (default) and Spread. For more information, see Scheduling strategies. You can configure the scheduling strategy at the node pool level to improve support for clusters with mixed types of resources and workloads. For configuration information, see Creating new node pools.

      • GPU device-level DCGM metrics are collected per GPU and presented by Run:ai in the Nodes table. Each node contains a list of its embedded GPUs with their respective DCGM metrics. See DCGM Metrics for the list of metrics provided by NVIDIA DCGM and collected by Run:ai. Contact your Run:ai customer representative to enable this feature.

      • Added per node pool over-quota priority. Over-quota priority sets the relative amount of additional unused resources that an asset can get above its current quota. For more information, see Over-quota priority.
      • Added support for associating workspaces with node pools. The association between workspaces and node pools is done using the Compute resources section. To associate a compute resource with a node pool, in the Compute resource section, press More settings. Press Add new to add more node pools to the configuration. Drag and drop the node pools to set their priority.
      • Added Node pool selection as part of the workload submission form. This allows researchers to quickly determine the list of node pools available and their priority. Priority is set by dragging and dropping them in the desired order of priority. In addition, when the node pool priority list is locked by a policy, the list isn't editable by the Researcher even if the workspace is created from a template or copied from another workspace.

      Time limit duration

      • Improved the behavior of any workload time limit (for example, Idle time limit) so that the time limit will affect existing workloads that were created before the time limit was configured. This is an optional feature which provides help in handling situations where researchers leave sessions open even when they do not need to access the resources. For more information, see Limit duration of interactive training jobs.

      • Improved workspaces time limits. Workspaces that reach a time limit will now transition to a state of stopped so that they can be reactivated later.

      • Added time limits for training jobs per project. Administrators (Department Admin, Editor) can limit the duration of Run:ai Training jobs per Project using a specified time limit value. This capability can assist administrators to limit the duration and resources consumed over time by training jobs in specific projects. Each training job that reaches this duration will be terminated.

      Workload assets

      • Extended the collaboration functionality for any workload asset such as Environment, Compute resource, and some Data source types. These assets are now shared with Departments in the organization in addition to being shared with specific projects, or the entire cluster.
      • Added a search box for card galleries in any asset based workload creation form to provide an easy way to search for assets and resources. To filter use the asset name or one of the field values of the card.

      PVC data sources

      • Added support for PVC block storage in the New data source form. In the New data source form for a new PVC data source, in the Volume mode field, select from Filesystem or Block. For more information, see Create a PVC data source.

      Credentials

      • Added Docker registry to the Credentials menu. Users can create docker credentials for use in specific projects for image pulling. To configure credentials, see Configuring credentials.

      Policies

      • Improved policy support by adding DEFAULTS in the items section in the policy. The DEFAULTS section sets the default behavior for items declared in this section. For example, this can be used to limit the submission of workloads only to existing PVCs. For more information and an example, see Policies, Complex values.
      • Added support for making a PVC data source available to all projects. In the New data source form, when creating a new PVC data source, select All from the Project pane.

      Researcher API

      • Extended researcher's API to allow stopping and starting of workloads using the API. For more information, see Submitting Workloads via HTTP/REST.

      Integrations

      • Added support for Spark and Elastic jobs. For more information, see Running Spark jobs with Run:ai.
      • Added support for Ray jobs. Ray is an open-source unified framework for scaling AI and Python applications. For more information, see Integrate Run:ai with Ray.

      • Added integration with Weights & Biases Sweep to allow data scientists to submit hyperparameter optimization workloads directly from the Run:ai UI. To configure sweep, see Sweep configuration.

      • Added support for XGBoost. XGBoost, which stands for Extreme Gradient Boosting, is a scalable, distributed gradient-boosted decision tree (GBDT) machine learning library. It provides parallel tree boosting and is the leading machine learning library for regression, classification, and ranking problems. For more information, see runai submit-dist xgboost

      Compatibility

      • Added support for multiple OpenShift clusters. For configuration information, see Installing additional Clusters.
      "},{"location":"home/whats-new-2-13/#installation","title":"Installation","text":"
      • The manual process of upgrading Kubernetes CRDs is no longer needed when upgrading to the most recent version (2.13) of Run:ai.
      • From Run:ai 2.12 and above, the control-plane installation has been simplified and no longer requires the creation of a backend values file. Instead, install directly using helm as described in Install the Run:ai Control Plane.
      • From Run:ai 2.12 and above, the air-gapped, control-plane installation now generates a custom-env.yaml values file during the preparation stage. This is used when installing the control-plane.
      "},{"location":"home/whats-new-2-13/#known-issues","title":"Known issues","text":"Internal ID Description RUN-11005 Incorrect error messages when trying to run runai CLI commands in an OpenShift environment. RUN-11009 Incorrect error message when a user without permissions to tries to delete another user."},{"location":"home/whats-new-2-13/#fixed-issues_3","title":"Fixed issues","text":"Internal ID Description RUN-9039 Fixed an issue where in the new job screen, after toggling off the preemptible flag, and a job is submitted, the job still shows as preemptible. RUN-9323 Fixed an issue with a non-scaleable error message when scheduling hundreds of nodes is not successful. RUN-9324 Fixed an issue where the scheduler did not take into consideration the amount of storage so there is no explanation that pvc is not ready. RUN-9902 Fixed an issue in OpenShift environments, where there are no metrics in the dashboard because Prometheus doesn\u2019t have permissions to monitor the runai namespace after an installation or upgrade to 2.9. RUN-9920 Fixed an issue where the canEdit key in a policy is not validated properly for itemized fields when configuring an interactive policy. RUN-10052 Fixed an issue when loading a new job from a template gives an error until there are changes made on the form. RUN-10053 Fixed an issue where the Node pool column is unsearchable in the job list. RUN-10422 Fixed an issue where node details show running workloads that were actually finished (successfully/failed/etc.). RUN-10500 Fixed an issue where jobs are shown as running even though they don't exist in the cluster. RUN-10813 Fixed an issue in adding a data source where the path is case sensitive and didn't allow uppercase."},{"location":"home/whats-new-2-15/","title":"What's New 2.15 - December 3, 2023","text":""},{"location":"home/whats-new-2-15/#release-content","title":"Release Content","text":""},{"location":"home/whats-new-2-15/#researcher","title":"Researcher","text":""},{"location":"home/whats-new-2-15/#jobs-workloads-trainings-and-workspaces","title":"Jobs, Workloads, Trainings, and Workspaces","text":"
      • Added support to run distributed workloads via the training view in the UI. You can configure distributed training on the following:

        • Trainings form
        • Environments form

        You can select single or multi-node (distributed) training. When configuring distributed training, you will need to select a framework from the list. Supported frameworks now include:

        • PyTorch
        • Tensorflow
        • XGBoost
        • MPI

        For Trainings configuration, see Adding trainings. See your Run:ai representative to enable this feature. For Environments configuration, see Creating an Environment.

      • Preview the new Workloads view. Workloads is a new view for jobs that are running in the AI cluster. The Workloads view provides a more advanced UI than the previous Jobs UI. The new table format provides:

        • Improved views of the data
        • Improved filters and search
        • More information

        Use the toggle at the top of the Jobs page to switch to the Workloads view. For more information.

      • Improved support for Kubeflow Notebooks. Run:ai now supports the scheduling of Kubeflow notebooks with fractional GPUs. Kubeflow notebooks are identified automatically and appear with a dedicated icon in the Jobs UI.

      • Improved the Trainings and Workspaces forms. Now the runtime field for Command and Arguments can be edited directly in the new Workspace or Training creation form.
      • Added new functionality to the Run:ai CLI that allows submitting a workload with multiple service types at the same time in a CSV style format. Both the CLI and the UI now offer the same functionality. For more information, see runai submit.
      • Improved functionality in the runai submit command so that the port for the container is specified using the nodeport flag. For more information, see runai submit --service-type nodeport.
      "},{"location":"home/whats-new-2-15/#credentials","title":"Credentials","text":"
      • Improved Credentials creation. A Run:ai scope can now be added to credentials. For more information, see Credentials.
      "},{"location":"home/whats-new-2-15/#environments","title":"Environments","text":"
      • Added support for workload types when creating a new or editing existing environments. Select from single-node or multi-node (distributed) workloads. The environment is available only on feature forms which are relevant to the workload type selected.
      "},{"location":"home/whats-new-2-15/#volumes-and-storage","title":"Volumes and Storage","text":"
      • Added support for Ephemeral volumes in Workspaces. Ephemeral storage is temporary storage that gets wiped out and lost when the workspace is deleted. Adding Ephemeral storage to a workspace ties that storage to the lifecycle of the Workspace to which it was added. Ephemeral storage is added to the Workspace configuration form in the Volume pane. For configuration information, see Create a new workspace.
      "},{"location":"home/whats-new-2-15/#templates","title":"Templates","text":"
      • Added support for a Run:ai Scope in the template form. For configuration information, see Creating templates.
      "},{"location":"home/whats-new-2-15/#deployments","title":"Deployments","text":"
      • Improvements in the New Deployment form include:
        • Support for Tolerations. Tolerations guide the system to which node each pod can be scheduled to or evicted by matching between rules and taints defined for each Kubernetes node.
        • Support for Multi-Process Service (MPS). MPS is a service which allows the running of parallel processes on the same GPU, which are all run by the same userid. To enable MPS support, use the toggle switch on the Deployments form.

        Note

        If you do not use the same userid, the processes will run in serial and could possibly degrade performance.

      "},{"location":"home/whats-new-2-15/#auto-delete-jobs","title":"Auto Delete Jobs","text":"
      • Added new functionality to the UI and CLI that provides configuration options to automatically delete jobs after a specified amount of time upon completion. Auto-deletion provides more efficient use of resources and makes it easier for researchers to manage their jobs. For more configuration options in the UI, see Auto deletion (Step 9) in Create a new workspace. For more information on the CLI flag, see --auto-deletion-time-after-completion.
      "},{"location":"home/whats-new-2-15/#runai-administrator","title":"Run:ai Administrator","text":""},{"location":"home/whats-new-2-15/#authorization","title":"Authorization","text":"
      • Run:ai has now revised and updated the Role Based Access Control (RBAC) mechanism, expanding the scope of Kubernetes. Using the new RBAC mechanism makes it easier for administrators to manage access policies across multiple clusters and to define specific access rules over specific scopes for specific users and groups. Along with the revised RBAC mechanism, new user interface views are introduced to support the management of users, groups, and access rules. For more information, see Role based access control.
      "},{"location":"home/whats-new-2-15/#policies","title":"Policies","text":"
      • During Workspaces and Training creation, assets that do not comply with policies cannot be selected. These assets are greyed out and have a button on the cards when the item does not comply with a configured policy. The button displays information about which policies are non-compliant.
      • Added configuration options to Policies in order to prevent the submission of workloads that use data sources of type host path. This prevents data from being stored on the node, so that data is not lost when a node is deleted. For configuration information, see Prevent Data Storage on the Node.
      • Improved flexibility when creating policies which provide the ability to allocate a min and a max value for CPU and GPU memory. For configuration information, see GPU and CPU memory limits in Configuring policies.
      "},{"location":"home/whats-new-2-15/#nodes-and-node-pools","title":"Nodes and Node Pools","text":"
      • Node pools are now enabled by default. There is no need to enable the feature in the settings.
      "},{"location":"home/whats-new-2-15/#quotas-and-over-quota","title":"Quotas and Over-Quota","text":"
      • Improved control over how over-quota is managed by adding the ability to block over-subscription of the quota in Projects or Departments. For more information, see Limit Over-Quota.
      • Improved the scheduler fairness for departments using the over quota priority switch (in Settings). When the feature flag is disabled, over-quota weights are equal to the deserved quota and any excess resources are divided in the same proportion as the in-quota resources. For more information, see Over Quota Priority.
      • Added new functionality to always guarantee in-quota workloads at the expense of inter-Department fairness. Large distributed workloads from one department may preempt in-quota smaller workloads from another department. This new setting in the RunaiConfig file preserves in-quota workloads, even if the department quota or over-quota-fairness is not preserved. For more information, see Scheduler Fairness.
      "},{"location":"home/whats-new-2-15/#control-and-visibility","title":"Control and Visibility","text":""},{"location":"home/whats-new-2-15/#dashboards","title":"Dashboards","text":"
      • To ease the management of AI CPU and cluster resources, a new CPU-focused dashboard was added for CPU-based environments. The dashboards display specific information for CPU-based nodes, node pools, clusters, or tenants. These dashboards also include additional metrics that are specific to CPU-based environments. This helps optimize visual information by eliminating views of empty GPU dashlets. For more information, see CPU Dashboard.
      • Improved the Consumption report interface by moving the Cost settings to the General settings menu.
      • Added table to the Consumption dashboard that displays the consumption and cost per department. For more information, see Consumption dashboard.
      "},{"location":"home/whats-new-2-15/#nodes","title":"Nodes","text":"
      • Improved the readability of the Nodes table to include more detailed statuses and descriptions. The added information in the table makes it easier to inspect issues that may impact resource availability in the cluster. For more information, see Node and Node Pool Status.
      "},{"location":"home/whats-new-2-15/#ui-enhancements","title":"UI Enhancements","text":"
      • Added the ability to download a CSV file from any page that contains a table. Downloading a CSV provides a snapshot of the page's history over time, and helps with compliance tracking. All the columns that are selected (displayed) in the table are downloaded to the file.
      "},{"location":"home/whats-new-2-15/#installation-and-configuration","title":"Installation and Configuration","text":""},{"location":"home/whats-new-2-15/#cluster-installation-and-configuration","title":"Cluster Installation and configuration","text":"
      • New cluster wizard for adding and installing new clusters to your system.
      "},{"location":"home/whats-new-2-15/#openshift-support","title":"OpenShift Support","text":"
      • Added support for restricted policy for Pod Security Admission (PSA) on OpenShift only. For more information, see [Pod security admission](../admin/runai-setup/cluster-setup/
      • Added the ability, in OpenShift environments, to configure cluster routes created by Run:ai instead of using the OpenShift certificate. For more information, see the table entry Dedicated certificate for the researcher service route.
      "},{"location":"home/whats-new-2-16/","title":"Version 2.16","text":""},{"location":"home/whats-new-2-16/#release-content-january-25-2024","title":"Release Content - January 25, 2024","text":""},{"location":"home/whats-new-2-16/#researcher","title":"Researcher","text":"
      • Added enterprise-level security for researcher tools such as Jupyter Notebooks, VSCode, or any other URL associated with the workload. Using this feature, anyone within the organization requesting access to a specific URL will be redirected to the login page to be authenticated and authorized. This results in protected URLs which cannot be reached from outside the organization. Researchers can enhance URL privacy by using the Private toggle, which means that only the researcher who created the workload is authorized to access it. The Private toggle is available per tool that uses an external URL as a connection type and is located in the workload creation form in the UI, in the Environment section. This toggle sets a flag of isPrivate in the connections section of a policy for the connection type ExternalUrl. For more information, see Creating a new Workspace.
      "},{"location":"home/whats-new-2-16/#jobs-workloads-and-workspaces","title":"Jobs, Workloads, and Workspaces","text":"
      • Added the capability to view and edit policies directly in the project submission form. Pressing Policy opens a window that displays the effective policy. For more information, see Viewing Project Policies.
      • Running machine learning workloads effectively on Kubernetes can be difficult, but Run:ai makes it easy. The new Workloads experience introduces a simpler and more efficient way to manage machine learning workloads, which will appeal to data scientists and engineers alike. The Workloads experience provides a fast, reliable, and easy to use unified interface.

        • Fast-query of data from the new workloads service.
        • Reliable data retrieval and presentation in the CLI, UI, and API.
        • Easy to use single unified view with all workload types in one place.

        For more information, see Workloads Overview.

      • Changed the workload default auto deletion time after completion value from Never to 90 days. This ensures that environments are cleaned of old data. This field is editable by default, allowing researchers to change the value while submitting a workload. Using workload policies, administrators can increase or decrease the value, set the default value to never, or even lock access to this value so researchers cannot edit it when they submit workloads.

      "},{"location":"home/whats-new-2-16/#assets","title":"Assets","text":"
      • When creating an asset such as data sources, credentials, or others, the scope is limited to the cluster selected at the top of the UI.
      "},{"location":"home/whats-new-2-16/#runai-administrator","title":"Run:ai Administrator","text":"
      • Added the capability for administrators to configure messages to users when they log into the platform. Messages are configured using the Message Editor screen. For more information, see Administrator Messages.
      "},{"location":"home/whats-new-2-16/#monitoring-and-analytics","title":"Monitoring and Analytics","text":"
      • Added to the dashboard updated GPU and CPU resource availability.

        • Added a chart displaying the number of free GPUs per node. Free GPUs are GPUs that have not been allocated to a workload.
        • Added a dashlet that displays the total vs. ready resources for GPUs and CPUs. The dashlet indicates how many total nodes are in the platform, and how many are available.
      • Added additional columns to the consumption report for both Projects and Departments tables. The new columns are:

        • GPU Idle allocated hours\u2014the portion of time the GPUs spend idle from the total allocation hours.
        • CPU usage hours\u2014the actual usage time of CPU.
        • Memory usage time\u2014the actual usage time of CPU memory.

        For more information, see Consumption Dashboard.

      "},{"location":"home/whats-new-2-16/#authentication-and-authorization","title":"Authentication and Authorization","text":"
      • SSO users who have logged into the system will now be visible in the Users table. In addition, added a column to the Users table for the type of user that was created (Local or SSO). For more information, see Adding, Updating, and Deleting Users.
      "},{"location":"home/whats-new-2-16/#policies","title":"Policies","text":"
      • Added a new Policy Manager. The new Policy Manager provides administrators the ability to impose restrictions and default values on system resources. The new Policy Manager provides a YAML editor for the configuration of the policies. Administrators can easily add both Workspace and Training policies. The editor makes it easy to see the configuration that has been applied and provides a quick and easy method to edit the policies. The new Policy Editor brings other important policy features, such as the ability to see non-compliant resources in workloads. For more information, see Policies.

      • Added a new policy manager. Enabling the New Policy Manager provides new tools to discover how resources are not compliant. Non-compliant resources will appear greyed out and cannot be selected. To see how a resource is not compliant, press the clipboard icon in the upper right-hand corner of the resource. Policies can also be applied to specific scopes within the Run:ai platform. For more information, see Viewing Project Policies.

      "},{"location":"home/whats-new-2-16/#control-and-visibility","title":"Control and Visibility","text":"
      • Improved the clarity of the status column in the Clusters view. Now users have more insight about the actual status of Run:ai on the cluster. Users can now see extended details about the state of the Run:ai installation and services on the cluster, and its connectivity state. For more information, see Cluster status.
      "},{"location":"home/whats-new-2-16/#deprecation-notifications","title":"Deprecation Notifications","text":"

      Deprecation notifications allow you to plan for future changes in the Run:ai Platform. Deprecated features will be available for two versions ahead of the notification. For questions, see your Run:ai representative.

      "},{"location":"home/whats-new-2-16/#project-migration","title":"Project migration","text":"
      • Run:ai will be deprecating the migration of projects between departments. This affects:

        • API\u2014the departmentId field will be marked as deprecated in the put endpoint in the projects category.
        • User Interface\u2014there will no longer be an option to:
          • migrate projects to another department, when deleting departments.
          • change departments, when editing a project.
      "},{"location":"home/whats-new-2-16/#api-deprecations","title":"API deprecations","text":""},{"location":"home/whats-new-2-16/#removed-apis-and-api-fields-completed-deprecation","title":"Removed APIs and API fields (completed deprecation)","text":"

      The following API endpoints and fields have completed their deprecation process and are therefore changed as follows:

      Endpoint Change /v1/k8s/clusters The endpoint was removed and is replaced by /api/v1/clusters /v1/k8s/clusters/{uuid} The endpoint was removed and is replaced by /api/v1/clusters/{uuid}

      Run:ai does not recommend using API endpoints and fields marked as deprecated and will not add functionality to them. Once an API endpoint or field is marked as deprecated, Run:ai will stop supporting it after 2 major releases for self-hosted deployments, and after 6 months for SaaS deployments.

      For a full explanation of the API Deprecation policy, see the Run:ai API Policy

      "},{"location":"home/whats-new-2-17/","title":"Version 2.17","text":""},{"location":"home/whats-new-2-17/#release-content-april-14-2024","title":"Release Content - April 14, 2024","text":"
      • Deprecation notifications
      • Breaking changes
      "},{"location":"home/whats-new-2-17/#researcher","title":"Researcher","text":""},{"location":"home/whats-new-2-17/#scheduler","title":"Scheduler","text":"
      • Added functionality to configure over-provisioning ratios for node pools running any kind of workload. Over-provisioning assumes that workloads are either underutilizing GPUs or using them intermittently, which indicates that real utilization is lower than the requested GPU allocation. Over-provisioning allows the administrator to condense more workloads onto a single GPU than the requested allocation would otherwise allow. For more information, see Optimize performance with Node Level Scheduler.

      • Added the GPU Resource Optimization feature to the UI. Now you can enable and configure GPU Portion (Fraction) limit and GPU Memory Limit from the UI. For more information, see Compute resources UI with Dynamic Fractions.

      • Added the ability to set Run:ai as the default scheduler for any project or namespace. This provides the administrator the ability to ensure that all workloads in a project or namespace are scheduled using the Run:ai scheduler. For more information, see Setting Run:ai as default scheduler.

      "},{"location":"home/whats-new-2-17/#jobs-workloads-and-workspaces","title":"Jobs, Workloads, and Workspaces","text":"
      • Added to the workload details view the ability to filter by pod. You can now filter metrics and logs per pod or across all pods. Also, the Workloads table now has additional columns, including connections and preemptibility, adding more at-a-glance information about the workload. In addition, using the Copy & edit button, you can submit a new workload via the CLI based on the selected workload. For more information, see Workloads.

      • Added Inference to workload types. Inference workloads can now be created and managed from the unified Workloads table. The Deployments workload type has been deprecated, and replaced with Inference workloads which are submitted using the workload form. For more information, see Inference and for submitting an Inference workload, see Submitting workloads.

      • Added functionality that supports a single workloads submission selection. Now you can submit workloads by pressing + New workloads in the Workloads table. You can submit the following workloads from this table:

        • Workspace
        • Training
        • Inference

        This improvement phases out the previous version's Workspace and Jobs tables. The Jobs table and submission forms have been deprecated but can be reactivated. To re-enable the Jobs table and forms, press Tools & settings, then General, then Workloads, and then toggle the Jobs view and Jobs submission buttons. For more information, see Submitting workloads.

      • Added the ability to configure a Kubernetes readiness probe. The readiness probe detects when a workload is ready to receive traffic.
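
      The probe follows the standard Kubernetes readinessProbe contract. As a rough orientation, the sketch below lists the underlying fields as a Python dict; the values are examples only, and how they are entered in the Run:ai form or API is an assumption here, described fully in the product documentation.

      ```python
      # Standard Kubernetes readinessProbe fields, expressed as a plain dict for illustration.
      # Values are examples; how they map onto the Run:ai submission form is an assumption.
      readiness_probe = {
          "httpGet": {"path": "/healthz", "port": 8080},  # endpoint polled by the kubelet
          "initialDelaySeconds": 10,   # wait before the first check
          "periodSeconds": 5,          # how often to check
          "failureThreshold": 3,       # consecutive failures before the pod is marked not ready
      }
      print(readiness_probe)
      ```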

      "},{"location":"home/whats-new-2-17/#assets","title":"Assets","text":"
      • Added the capability to use a ConfigMap as a data source. The ability to use a ConfigMap as a data source can be configured in the Data sources UI, the CLI, and as part of a policy (a brief sketch of the underlying Kubernetes object follows this list). For more information, see Setup a ConfigMap as a data source and Setup a ConfigMap as a volume using the CLI.

      • Added a Status column to the Credentials table, and the Data sources table. The Status column displays the state of the resource and provides troubleshooting information about that asset. For more information, see the Credentials table and the Data sources table.

      • Added functionality for asset creation that validates the asset based on version compatibility of the cluster or the control plane within a specific scope. At the time of asset creation, invalid scopes appear greyed out and show a pop-up with the reason for the invalidation. This improvement is designed to increase the confidence that an asset is created properly and successfully.
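
      The ConfigMap referenced by such a data source is an ordinary Kubernetes object. As a minimal sketch (the namespace, name, and keys are arbitrary examples, and wiring the ConfigMap into a Run:ai data source is done separately via the UI, CLI, or a policy), it could be created with the official Python client:

      ```python
      # Sketch: create a plain Kubernetes ConfigMap that a Run:ai data source could reference.
      # Namespace, name, and keys are arbitrary examples.
      from kubernetes import client, config

      config.load_kube_config()            # or config.load_incluster_config() inside a cluster
      core = client.CoreV1Api()

      cm = client.V1ConfigMap(
          metadata=client.V1ObjectMeta(name="training-settings"),
          data={"LEARNING_RATE": "0.001", "EPOCHS": "20"},
      )
      core.create_namespaced_config_map(namespace="runai-my-project", body=cm)
      ```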

      "},{"location":"home/whats-new-2-17/#runai-administrator","title":"Run:ai Administrator","text":""},{"location":"home/whats-new-2-17/#configuration-and-administration","title":"Configuration and Administration","text":"
      • Introducing a new Tools & Settings menu. The new Tools & Settings menu provides a streamlined UI for administrators to configure the Run:ai environment. The new UI is divided into categories that easily identify the areas where the administrator can change settings. The new categories include:

        • Analytics\u2014features related to analytics and metrics.
        • Resources\u2014features related to resource configuration and allocation.
        • Workloads\u2014features related to configuration and submission of workloads.
        • Security\u2014features related to configuration of SSO (Single Sign On).
        • Notifications\u2014used for system notifications.
        • Cluster authentication\u2014snippets related to Researcher authentication.

        Some features are now labeled either Experimental or Legacy. Experimental features are new features that may have certain instabilities and may not perform as expected. Legacy features are in the process of being deprecated and may be removed in future versions.

      "},{"location":"home/whats-new-2-17/#clusters","title":"Clusters","text":"
      • Added new columns to the Clusters table to show Kubernetes distribution and version. This helps administrators view potential compatibility issues that may arise.

      • Improved the location of the cluster filter. The cluster filter has been relocated to the filter bar, and the drop-down cluster filter in the header of the page has been removed. This improvement provides the following:

        • Filter assets by cluster in the following tables:

          • Data sources
          • Environments
          • Compute resources
          • Templates
          • Credentials
        • Creating a new asset will automatically display only the scope of the selected cluster.

        • Prevents the account (the topmost level in the scope) from being selected when creating assets.
        • Enforces a cluster-specific scope. This increases the confidence that an asset is created properly and successfully.

        Note

        This feature is only applicable if all the clusters are version 2.17 and above.

      "},{"location":"home/whats-new-2-17/#monitoring-and-analytics","title":"Monitoring and Analytics","text":"
      • Improved GPU Overview dashboard. This improvement provides rich and extensive GPU allocation and performance data and now has interactive tiles that provide direct links to the Nodes, Workloads, and Departments tables. Hover over tiles with graphs to show rich data in the selected time frame filter. Tiles with graphs can be downloaded as CSV files. The new dashboard is enabled by default. Use the Go back to legacy view to return to the previous dashboard style. For more information, see Dashboard analysis.

      • Updated the knative and autoscaler metrics. Run:ai currently supports the following metrics:

        • Throughput
        • Concurrency

        For more information, see Autoscaling metrics.

      • Improved availability of metrics by using Run:ai APIs. Using the API endpoints is now the preferred method to retrieve metrics for use in any application. For more information, see Metrics.

      "},{"location":"home/whats-new-2-17/#authentication-and-authorization","title":"Authentication and Authorization","text":"
      • Added new functionality to the SAML 2.0 identity provider configuration in the Security category of the General settings. The added functionality assists with troubleshooting SSO configuration and authentication issues that may arise. Administrators now have the ability to:

        • View and edit the identity provider settings for SAML 2.0
        • Upload or download the SAML 2.0 identity provider metadata XML file.

      For more information, see SSO UI configuration.

      "},{"location":"home/whats-new-2-17/#deprecation-notifications","title":"Deprecation Notifications","text":"

      Deprecation notifications allow you to plan for future changes in the Run:ai Platform.

      "},{"location":"home/whats-new-2-17/#feature-deprecations","title":"Feature deprecations","text":"

      Deprecated features will be available for two versions ahead of the notification. For questions, see your Run:ai representative. The following features have been marked for deprecation:

      • Jobs\u2014the Jobs feature (submission form and view) has been moved to the Legacy category. To enable it, go to Tools & Settings, General, open the Workloads pane, and then toggle the Jobs view and Job submission switch to the enabled position.
      • Deployments\u2014the Deployments feature has been removed. It has been replaced by Inference workloads. For more information, see Jobs, Workloads, and Workspaces above.
      • Workspaces view\u2014the Workspaces menu has been removed. You can now submit a Workspace workload using the + New workload form from the Workloads table.
      "},{"location":"home/whats-new-2-17/#api-support-and-endpoint-deprecations","title":"API support and endpoint deprecations","text":"

      The endpoints and parameters specified in the API reference are the ones that are officially supported by Run:ai. For more information about Run:ai's API support policy and deprecation process, see Developer overview.

      "},{"location":"home/whats-new-2-17/#deprecated-apis-and-api-fields","title":"Deprecated APIs and API fields","text":"

      The following API endpoints and fields have been marked for deprecation:

      "},{"location":"home/whats-new-2-17/#jobs-and-pods-api","title":"Jobs and Pods API","text":"Deprecated Replacement /v1/k8s/clusters/{uuid}/jobs /api/v1/workloads /v1/k8s/clusters/{uuid}/jobs/count /api/v1/workloads/count /v1/k8s/clusters/{uuid}/jobs/{jobId}/pods /api/v1/workloads/{workloadId}/pods /v1/k8s/clusters/{uuid}/pods /api/v1/workloads/pods"},{"location":"home/whats-new-2-17/#clusters-api","title":"Clusters API","text":"Deprecated Replacement /v1/k8s/clusters/{clusterUuid}/metrics /api/v1/clusters/{clusterUuid}/metrics"},{"location":"home/whats-new-2-17/#authorization-and-authentication-api","title":"Authorization and Authentication API","text":"Deprecated Replacement /v1/k8s/auth/token/exchange /api/v1/token /v1/k8s/auth/oauth/tokens/refresh /api/v1/token /v1/k8s/auth/oauth/apptoken /api/v1/token /v1/k8s/users/roles /api/v1/authorization/roles /v1/k8s/users /api/v1/users /v1/k8s/users/{userId} /api/v1/users/{userId} /v1/k8s/users/{userId}/roles /api/v1/authorization/access-rules /v1/k8s/apps /api/v1/apps /v1/k8s/apps/{clientId} /api/v1/apps/{appId} /v1/k8s/groups /api/v1/authorization/access-rules /v1/k8s/groups/{groupName} /api/v1/authorization/access-rules /v1/k8s/clusters/{clusterId}/departments/{department-id}/access-control /api/v1/authorization/access-rules /api/v1/authorization/access-rules - subjectIdFilter field Use filterBy / sortBy fields /api/v1/authorization/access-rules - scopeType field Use filterBy / sortBy fields /api/v1/authorization/access-rules - roleId field Use filterBy / sortBy fields"},{"location":"home/whats-new-2-17/#projects-api","title":"Projects API","text":"Deprecated Replacement /v1/k8s/clusters/{clusterId}/projects - permissions field /api/v1/authorization/access-rules /v1/k8s/clusters/{clusterId}/projects - resources field Use nodePoolResources field /v1/k8s/clusters/{clusterId}/projects - deservedGpus field Use nodePoolResources field /v1/k8s/clusters/{clusterId}/projects - maxAllowedGpus field Use nodePoolResources field /v1/k8s/clusters/{clusterId}/projects - gpuOverQuotaWeight field Use nodePoolResources field"},{"location":"home/whats-new-2-17/#departments-api","title":"Departments API","text":"Deprecated Replacement /v1/k8s/clusters/{clusterId}/departments - resources field Use nodePoolResources field /v1/k8s/clusters/{clusterId}/departments - deservedGpus field Use nodePoolResources field /v1/k8s/clusters/{clusterId}/departments - allowOverQuota field Use nodePoolResources field /v1/k8s/clusters/{clusterId}/departments - maxAllowedGpus field Use nodePoolResources field"},{"location":"home/whats-new-2-17/#policy-api","title":"Policy API","text":"Deprecated Replacement /api/v1/policy/workspace /api/v2/policy/workspaces /api/v1/policy/training /api/v2/policy/trainings"},{"location":"home/whats-new-2-17/#logo-api","title":"Logo API","text":"Deprecated Replacement /v1/k8s/tenant/{tenantId}/logo /api/v1/logo"},{"location":"home/whats-new-2-17/#removed-apis-and-api-fields-completed-deprecation","title":"Removed APIs and API fields (completed deprecation)","text":"

      The following API endpoints and fields have completed their deprecation process and have been changed as follows:

      "},{"location":"home/whats-new-2-17/#assets-api","title":"Assets API","text":"Endpoint Change /api/v1/asset/compute gpuRequest field was removed and is replaced by the following fields: * gpuDevicesRequest (New and mandatory) * gpuRequestType (New and mandatory if gpuDevicesRequest=1 otherwise optional for values 0 or greater than 1) * gpuPortion was changed to gpuPortionRequest and accepts values between 0 and 1 (for example 0.75) * gpuPortionLimit (New and optional) * gpuMemory was changed to gpuMemoryRequest * gpuMemoryLimit (New and optional)"},{"location":"home/whats-new-2-17/#metrics-deprecations","title":"Metrics deprecations","text":"

      The following metrics are deprecated and replaced by API endpoints. For details about the replacement APIs, see Changed Metrics:

      • runai_active_job_cpu_requested_cores
      • runai_active_job_memory_requested_bytes
      • runai_cluster_cpu_utilization
      • runai_cluster_memory_utilization
      • runai_gpu_utilization_per_pod_per_gpu
      • runai_gpu_utilization_per_workload
      • runai_job_requested_gpu_memory
      • runai_gpu_memory_used_mebibytes_per_workload
      • runai_gpu_memory_used_mebibytes_per_pod_per_gpu
      • runai_active_job_cpu_limits
      • runai_job_cpu_usage
      • runai_active_job_memory_limits
      • runai_job_memory_used_bytes

      Run:ai does not recommend using API endpoints and fields marked as deprecated and will not add functionality to them. Once an API endpoint or field is marked as deprecated, Run:ai will stop supporting it after 2 major releases for self-hosted deployments, and after 6 months for SaaS deployments.

      For a full explanation of the API Deprecation policy, see the Run:ai API Policy

      "},{"location":"home/whats-new-2-17/#breaking-changes","title":"Breaking changes","text":"

      Breaking change notifications allow you to plan around potential changes that may interfere with your current workflow when interfacing with the Run:ai Platform.

      "},{"location":"home/whats-new-2-17/#metrics","title":"Metrics","text":"

      Be aware that some names of metrics have been changed. For more information, see Changed Metrics.

      "},{"location":"home/whats-new-2-18/","title":"Version 2.18","text":""},{"location":"home/whats-new-2-18/#release-content-june-30-2024","title":"Release Content - June 30, 2024","text":"
      • Deprecation notifications
      • Breaking changes
      "},{"location":"home/whats-new-2-18/#researcher","title":"Researcher","text":""},{"location":"home/whats-new-2-18/#jobs-workloads-and-workspaces","title":"Jobs, Workloads, and Workspaces","text":"
      • Added backoff limit functionality for Training and Workspace workloads to the UI. The backoff limit is the maximum number of retry attempts for failed workloads. After reaching the limit, the workload's status changes to Failed. The UI displays the default number of retries based on 6 attempts for each pod in the workload (for example, 6 pods = 36 attempts).

      • Updated the Auto-deletion time default value from never to 30 days. The auto-deletion countdown starts when a Run:ai workload reaches a Completed or Failed status; once it elapses, the workload (including its logs) is automatically deleted. This change only affects new or cloned workloads.

      • Added new Data sources of type Secret to workload form. Data sources of type Secret are used to hide 3rd party access credentials when submitting workloads. For more information, see Submitting Workloads.

      • Added new graphs for Inference workloads. The new graphs provide more information for Inference workloads to help analyze performance of the workloads. New graphs include Latency, Throughput, and number of replicas. For more information, see Workloads View. (Requires minimum cluster version v2.18).

      • Added a latency metric for autoscaling. This feature automatically scales the number of replicas of a Run:ai inference workload up or down based on the threshold set by the ML Engineer. This ensures that response time is kept under the target SLA. (Requires minimum cluster version v2.18).

      • Improved autoscaling for inference models by separating the ChatBot UI from the model images. With the ChatBot UI moved into predefined Environments, autoscaling is more accurate because it takes into account all types of requests (API and ChatBot UI). A ChatBot UI environment preset provided by Run:ai allows AI practitioners to easily connect it to workloads.

      • Added more precision to trigger auto-scaling to zero. Now users can configure a precise consecutive idle threshold custom setting to trigger Run:ai inference workloads to scale-to-zero. (Requires minimum cluster version v2.18).

      • Added Hugging Face catalog integration of community models. Run:ai has added Hugging Face integration directly to the inference workload form, providing the ability to select models (vLLM models) from Hugging Face. This allows organizations to quickly experiment with the latest open source community language models. For more information on how Hugging Face is integrated, see Hugging Face.

      • Improved access permissions to external tools. This improvement now allows more granular control over which personas can access external tools (external URLs) such as Jupyter Notebooks, Chatbot UI, and others. For configuration information, see Submitting workloads. (Requires minimum cluster version v2.18).

      • Added a new API for submitting Run:ai inference workloads. This API allows users to easily submit inference workloads. This new API provides a consistent user experience for workload submission which maintains data integrity across all the user interfaces in the Run:ai platform. (Requires minimum cluster version v2.18).

      "},{"location":"home/whats-new-2-18/#command-line-interface-v2","title":"Command Line Interface V2","text":"
      • Added an improved, researcher-focused Command Line Interface (CLI). The improved CLI brings usability enhancements for the Researcher which include:

        • Support multiple clusters
        • Self-upgrade
        • Interactive mode
        • Align CLI to be data consistent with UI and API
        • Improved usability and performance

        This is an early access feature available for customers to use; however, be aware that there may be functional gaps versus the older, V1 CLI. For more information about installing and using the V2 CLI, see CLI V2. (Requires minimum cluster version v2.18).

      "},{"location":"home/whats-new-2-18/#gpu-memory-swap","title":"GPU memory swap","text":"
      • Added new GPU to CPU memory swap. To ensure efficient usage of an organization\u2019s resources, Run:ai provides multiple features on multiple layers to help administrators and practitioners maximize their existing GPU resource utilization. Run:ai\u2019s GPU memory swap feature helps administrators and AI practitioners further increase the utilization of existing GPU hardware by improving GPU sharing between AI initiatives and stakeholders. This is done by extending the GPU physical memory to the CPU memory, which is typically an order of magnitude larger than that of the GPU. For more information, see GPU Memory Swap. (Requires minimum cluster version v2.18).
      "},{"location":"home/whats-new-2-18/#yaml-workload-reference-table","title":"YAML Workload Reference table","text":"
      • Added a new YAML reference document that contains the value types and workload YAML references. Each table contains the field name, its description, and the supported Run:ai workload types. The YAML field details contain information on the value type and currently available example workload snippets. For more information, see the YAML Reference PDF.
      "},{"location":"home/whats-new-2-18/#email-notifications-workload-status-and-timeouts","title":"Email Notifications - Workload Status and timeouts","text":"
      • Added a new email notification system. AI practitioners can set up the types of workload notifications they want to receive. In order to receive email notifications, you must ensure that the admin has enabled and configured notifications for the tenant. For more information, see Email notifications.
      "},{"location":"home/whats-new-2-18/#assets","title":"Assets","text":"
      • Improved the UI asset creation form by adding a Description field. Asset creators can now add a free-text description (max 250 characters) to any asset created. The description field is intended to help explain the nature and goal of the asset, so that AI practitioners can make better decisions when choosing assets during workload creation.
      "},{"location":"home/whats-new-2-18/#runai-administrator","title":"Run:ai Administrator","text":""},{"location":"home/whats-new-2-18/#data-sources","title":"Data Sources","text":"
      • Added Data Volumes new feature. Data Volumes are snapshots of datasets stored in Kubernetes Persistent Volume Claims (PVCs). They act as a central repository for training data, and offer several key benefits.

        • Managed with dedicated permissions\u2014Data Admins, a new role within Run:ai, have exclusive control over data volume creation, data population, and sharing.
        • Shared between multiple scopes\u2014unlike other Run:ai data sources, data volumes can be shared across projects, departments, or clusters. This promotes data reuse and collaboration within your organization.
        • Coupled to workloads in the submission process\u2014similar to other Run:ai data sources, Data volumes can be easily attached to AI workloads during submission, specifying the data path within the workload environment.

        For more information, see Data Volumes. (Requires minimum cluster version v2.18).

      • Added a new data source of type Secret. Run:ai now allows you to configure a Credential as a data source. A data source of type Secret is best used in workloads so that access credentials for 3rd party interfaces and storage used in containers remain hidden. For more information, see Secrets as a data source.

      • Updated the logic of data source initializing state which keeps the workload in \u201cinitializing\u201d status until S3 data is fully mapped. For more information see Sidecar containers documentation.

      • Additional storage unit sizes MiB, GiB & TiB (mebibyte, gibibyte, and tebibyte respectively) have been added to the UI and API when creating a new data source of type PVC.

      "},{"location":"home/whats-new-2-18/#credentials","title":"Credentials","text":"
      • Added a new Generic secret to Credentials. Credentials had been used only for access to data sources (S3, Git, etc.). However, AI practitioners need to use secrets to access sensitive data (interacting with 3rd party APIs, or other services) without having to put their credentials in their source code. Generic secrets leverage multiple key-value pairs, which helps reduce the number of Kubernetes resources and simplifies resource management by reducing the overhead associated with maintaining multiple Secrets. Generic secrets are best used as a data source of type Secret so that they can be used in containers to keep access credentials hidden. (Requires minimum cluster version v2.18).
      "},{"location":"home/whats-new-2-18/#single-sign-on","title":"Single Sign On","text":"
      • Added support for Single Sign On using OpenShift v4 (OIDC based). When using OpenShift, you must first define an OAuthClient, which interacts with OpenShift's OAuth server to authenticate users and request access tokens (a sketch follows this list). For more information, see Single Sign-On.

      • Added OIDC scopes to authentication requests. OIDC scopes are used to specify what access privileges are being requested for access tokens. The scopes associated with the access tokens determine what resources are available when they are used to access OAuth 2.0 protected endpoints. Protected endpoints may perform different actions and return different information based on the scope values and other parameters used when requesting the presented access token. For more information, see UI configuration.
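
      For orientation, an OpenShift OAuthClient is a cluster-scoped resource. The sketch below expresses one as a Python dict that could be applied with any Kubernetes client; the client name, secret, and redirect URI are placeholders, and the exact values Run:ai expects are described in the Single Sign-On guide.

      ```python
      # Sketch of an OpenShift OAuthClient manifest as a Python dict.
      # Name, secret, and redirectURIs are placeholders; see the Single Sign-On guide
      # for the values expected by the Run:ai control plane.
      oauth_client = {
          "apiVersion": "oauth.openshift.io/v1",
          "kind": "OAuthClient",
          "metadata": {"name": "runai"},                 # placeholder client name
          "secret": "<client-secret>",                   # placeholder secret
          "redirectURIs": ["https://<runai-control-plane>/<redirect-path>"],  # placeholder URI
          "grantMethod": "auto",
      }
      print(oauth_client)
      ```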

      "},{"location":"home/whats-new-2-18/#ownership-protection","title":"Ownership protection","text":"
      • Added new ownership protection feature. Run:ai Ownership Protection ensures that only authorized users can delete or modify workloads. This feature is designed to safeguard important jobs and configurations from accidental or unauthorized modifications by users who did not originally create the workload. For configuration information, see your Run:ai representative.
      "},{"location":"home/whats-new-2-18/#email-notifications","title":"Email notifications","text":"
      • Added new email notifications feature. Email Notifications sends alerts for critical workload life cycle changes empowering data scientists to take necessary actions and prevent delays.

        • System administrators will need to configure the email notifications. For more information, see System notifications.
      "},{"location":"home/whats-new-2-18/#policy-for-distributed-and-inference-workloads-in-the-api","title":"Policy for distributed and inference workloads in the API","text":"
      • Added a new API for creating distributed training workload policies and inference workload policies. These new policies in the API allow administrators to set defaults, enforce rules, and impose setup on distributed training and inference workloads. For distributed policies, worker and master may require different rules due to their different specifications. The new capability is currently available via API only. Documentation on submitting policies will follow shortly.
      "},{"location":"home/whats-new-2-18/#policy-for-distributed-and-inference-workloads-in-the-api_1","title":"Policy for distributed and inference workloads in the API","text":"
      • Added a new API for creating distributed training workload policies and inference workload policies. These new policies in the API allow to set defaults, enforce rules and impose setup on distributed training and inference workloads. For distributed policies, worker and master may require different rules due to their different specifications. The new capability is currently available via API only. Documentation on submitting policies to follow shortly.
      "},{"location":"home/whats-new-2-18/#deprecation-notifications","title":"Deprecation Notifications","text":"

      The existing notifications feature, which requires cluster configuration, is being deprecated in favor of an improved notification system. If you have been using the existing notifications feature in the cluster, you can continue to use it for the next two versions. It is recommended that you change to the new notification system in the Control Plane for better control and improved message granularity.

      "},{"location":"home/whats-new-2-18/#feature-deprecations","title":"Feature deprecations","text":"

      Deprecated features will be available for two versions ahead of the notification. For questions, see your Run:ai representative.

      "},{"location":"home/whats-new-2-18/#api-support-and-endpoint-deprecations","title":"API support and endpoint deprecations","text":"

      The endpoints and parameters specified in the API reference are the ones that are officially supported by Run:ai. For more information about Run:ai's API support policy and deprecation process, see note under Developer overview.

      "},{"location":"home/whats-new-2-18/#deprecated-apis-and-api-fields","title":"Deprecated APIs and API fields","text":""},{"location":"home/whats-new-2-18/#cluster-api-deprecation","title":"Cluster API Deprecation","text":"

      The Run:ai REST API now supports job submission. The older Cluster API is now deprecated.

      "},{"location":"home/whats-new-2-18/#departments-api","title":"Departments API","text":"Deprecated Replacement /v1/k8s/clusters/{clusterId}/departments /api/v1/org-unit/departments /v1/k8s/clusters/{clusterId}/departments/{department-id} /api/v1/org-unit/departments/{departmentId} /v1/k8s/clusters/{clusterId}/departments/{department-id} /api/v1/org-unit/departments/{departmentId}+PUT/PATCH /api/v1/org-unit/departments/{departmentId}/resources"},{"location":"home/whats-new-2-18/#projects-api","title":"Projects API","text":"Deprecated Replacement /v1/k8s/clusters/{clusterId}/projects /api/v1/org-unit/projects /v1/k8s/clusters/{clusterId}/projects/{id} /api/v1/org-unit/projects/{projectId} /v1/k8s/clusters/{clusterId}/projects/{id} /api/v1/org-unit/projects/{projectId} +\u00a0/api/v1/org-unit/projects/{projectId}/resources

      Run:ai does not recommend using API endpoints and fields marked as deprecated and will not add functionality to them. Once an API endpoint or field is marked as deprecated, Run:ai will stop supporting it after 2 major releases for self-hosted deployments, and after 6 months for SaaS deployments.

      For a full explanation of the API Deprecation policy, see the Run:ai API Policy

      "},{"location":"home/whats-new-2-18/#breaking-changes","title":"Breaking changes","text":"

      Breaking change notifications allow you to plan around potential changes that may interfere with your current workflow when interfacing with the Run:ai Platform.

      "},{"location":"home/whats-new-2-19/","title":"What\u2019s New in Version 2.19","text":""},{"location":"home/whats-new-2-19/#release-content","title":"Release Content
      • Deprecation notifications
      ","text":""},{"location":"home/whats-new-2-19/#researchers","title":"Researchers","text":""},{"location":"home/whats-new-2-19/#improved-visibility-into-pending-workloads","title":"Improved visibility into pending workloads","text":"

      For workloads with the status of \"Pending,\" the user can click the \u201ci\u201d icon next to the status to view details of why the workload hasn\u2019t been scheduled. (Requires a minimum cluster version of v2.19)

      "},{"location":"home/whats-new-2-19/#new-workload-events","title":"New workload events","text":"

      There are now new GPU resource optimization-related messages that are viewable as workload events. These events help users understand the decisions made by the Run:ai GPU toolkit while handling Run:ai\u2019s GPU resource optimization features. Run:ai\u2019s GPU resource optimization offers unique capabilities that take GPU utilization to a new level and helps customers increase their productivity while maximizing their return on GPU investment. (Requires a minimum cluster version of v2.19)

      "},{"location":"home/whats-new-2-19/#improved-command-line-interface-autocompletion","title":"Improved command line interface autocompletion","text":"

      CLI V2 now autocompletes nouns such as project names and workload names, giving better data consistency with the UI. It also supports auto-upgrades and an interactive mode.

      "},{"location":"home/whats-new-2-19/#details-pane-in-the-workloads-view","title":"Details pane in the Workloads view","text":"

      A new DETAILS tab for workloads has been added and presents additional workload information, including Container command, Environment variables, and CLI command syntax (if the workload was submitted via CLI).

      "},{"location":"home/whats-new-2-19/#container-path-outside-the-data-source-asset","title":"Container path outside the data source asset","text":"

      AI practitioners can now override the predefined container path for each data source when submitting a workload via the Run:ai UI. While the container path must still be specified as part of the data source asset, researchers can now override the default container path when submitting workloads. (Requires a minimum cluster version of v2.16)

      "},{"location":"home/whats-new-2-19/#node-toleration-for-workloads","title":"Node toleration for workloads","text":"

      Researchers can now optionally set tolerations for workloads, letting them bypass node taints during workload submission via the Run:ai UI. To use this feature, make sure it is activated under General Settings. For more information, refer to the Kubernetes Taints and Tolerations Guide. (Requires a minimum cluster version of v2.19)
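
      Under the hood these map to standard Kubernetes tolerations. The snippet below shows the fields involved as a plain Python dict, purely as a reference for what the UI options correspond to; the taint key and value are examples, not Run:ai defaults.

      ```python
      # Standard Kubernetes toleration fields, shown as a dict for reference.
      # The taint key/value are examples; in Run:ai these values are set via the UI.
      toleration = {
          "key": "dedicated",        # example taint key on the node
          "operator": "Equal",
          "value": "research",
          "effect": "NoSchedule",
      }
      print(toleration)
      ```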

      "},{"location":"home/whats-new-2-19/#topology-aware-scheduling","title":"Topology-aware scheduling","text":"

      When submitting a distributed training workload through the Run:ai UI, researchers can enable topology-aware scheduling. This feature allows an optimized placement within specific placement groups, such as regions, availability zones, or other topologies. To use this, make sure it is activated under General Settings. (Requires a minimum cluster version of v2.19)

      "},{"location":"home/whats-new-2-19/#bulk-deletion-of-workloads","title":"Bulk deletion of workloads","text":"

      Users can now delete workloads in bulk via the Run:ai UI. They\u2019ll be notified if they try to delete workloads for which they don\u2019t have permissions (and those workloads will not be deleted in this process). Multi-selection can also be done using standard keyboard functions. (Requires a minimum cluster version of v2.19)

      "},{"location":"home/whats-new-2-19/#enhanced-policy-representation-in-the-runai-ui","title":"Enhanced policy representation in the Run:ai UI","text":"

      To improve AI practitioners' understanding of administrators\u2019 policy rules and defaults, the UI now presents more clearly the enforced values and the default values for workload fields that are not encapsulated in the asset selection. This update aims to make policy enforcement more intuitive and transparent for practitioners. (Requires a minimum cluster version of v2.18)

      "},{"location":"home/whats-new-2-19/#configuration-of-credentials-as-environment-variables","title":"Configuration of credentials as environment variables","text":"

      Researchers can now easily define pre-configured credentials as environment variables to access private resources. This is available through the Run:ai UI during the workload submission process, specifically under the runtime settings section. (Requires a minimum cluster version of v2.18)

      "},{"location":"home/whats-new-2-19/#expanded-scope-of-configmap-as-data-source","title":"Expanded scope of ConfigMap as data source","text":"

      When creating a data source of type ConfigMap, researchers can now not only select a project but also a cluster or department. (Requires a minimum cluster version of v2.19)

      "},{"location":"home/whats-new-2-19/#improved-workload-scheduling-algorithm","title":"Improved workload scheduling algorithm","text":"

      The Run:ai scheduler algorithm for handling large distributed workloads has been improved, making it more efficient and resulting in better performance. (Requires a minimum cluster version of v2.19)

      "},{"location":"home/whats-new-2-19/#ml-engineer-inference","title":"ML Engineer (Inference)","text":""},{"location":"home/whats-new-2-19/#additional-data-sources-for-inference-workloads","title":"Additional data sources for inference workloads","text":"

      When submitting an inference workload via the UI and API, users can now use NFS and hostPath data sources. (Requires a minimum cluster version of v2.19)

      "},{"location":"home/whats-new-2-19/#hugging-face-integration-improvements","title":"Hugging Face integration improvements","text":"

      To reduce errors when submitting inference workloads, additional validations are done for the Hugging Face integration, ensuring that only valid models are submitted, thus enhancing overall reliability. (Requires a minimum cluster version of v2.19)

      "},{"location":"home/whats-new-2-19/#rolling-inference-updates","title":"Rolling inference updates","text":"

      ML engineers can now roll updates onto existing inference workloads. Once the revised workload (the update) is up and running, request traffic is redirected to the new version of the workload and the previous version is terminated, ensuring that services are not impacted during the update.

      See Inference overview for more information. (Requires a minimum cluster version of v2.19)

      "},{"location":"home/whats-new-2-19/#inference-endpoint-authorization","title":"Inference endpoint authorization","text":"

      When sharing inference endpoints securely using Run:ai, ML engineers can limit access to the endpoint by specifying the authorized users or groups allowed to use the service (i.e., send requests to the endpoint) after being authenticated. This restriction is especially important when handling sensitive information or when you want to manage costs by sharing the service with a controlled group of consumers. (Requires a minimum cluster version of v2.19)

      "},{"location":"home/whats-new-2-19/#runai-developer","title":"Run:ai Developer","text":""},{"location":"home/whats-new-2-19/#metrics-and-telemetry","title":"Metrics and telemetry","text":"

      Additional metrics and telemetry are available via the API. For more information, see the details below and in the Metrics API (a query sketch follows the list):

      • Metrics (over time)
        • Cluster
          • TOTAL_GPU_NODES
          • GPU_UTILIZATION_DISTRIBUTION
          • UNALLOCATED_GPU
        • Nodepool
          • TOTAL_GPU_NODES
          • GPU_UTILIZATION_DISTRIBUTION
          • UNALLOCATED_GPU
        • Workload
          • GPU_ALLOCATION
        • Node
          • GPU_UTILIZATION_PER_GPU
          • GPU_MEMORY_UTILIZATION_PER_GPU
          • GPU_MEMORY_USAGE_BYTES_PER_GPU
          • CPU_USAGE_CORES
          • CPU_UTILIZATION
          • CPU_MEMORY_USAGE_BYTES
          • CPU_MEMORY_UTILIZATION
      • Telemetry (current time)
        • Node
          • ALLOCATED_GPUS
          • TOTAL_CPU_CORES
          • USED_CPU_CORES
          • ALLOCATED_CPU_CORES
          • TOTAL_GPU_MEMORY_BYTES
          • USED_GPU_MEMORY_BYTES
          • TOTAL_CPU_MEMORY_BYTES
          • USED_CPU_MEMORY_BYTES
          • ALLOCATED_CPU_MEMORY_BYTES
          • IDLE_ALLOCATED_GPUS
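
      As a hedged sketch of pulling one of the metrics listed above, the snippet below queries the cluster metrics endpoint referenced elsewhere in these notes; the query parameter names and response format are assumptions for illustration, so treat the Metrics API reference as the source of truth.

      ```python
      # Sketch: query a cluster metric over the Run:ai API.
      # The query parameter names (metricType, start, end) and the response shape are
      # assumptions; see the Metrics API reference for the exact contract.
      import requests

      BASE_URL = "https://my-company.run.ai"      # assumed control-plane URL
      TOKEN = "<api-token>"
      CLUSTER_UUID = "<cluster-uuid>"

      resp = requests.get(
          f"{BASE_URL}/api/v1/clusters/{CLUSTER_UUID}/metrics",
          headers={"Authorization": f"Bearer {TOKEN}"},
          params={
              "metricType": "TOTAL_GPU_NODES",    # one of the metrics listed above
              "start": "2024-12-01T00:00:00Z",
              "end": "2024-12-02T00:00:00Z",
          },
          timeout=30,
      )
      resp.raise_for_status()
      print(resp.json())
      ```
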
      "},{"location":"home/whats-new-2-19/#administrator","title":"Administrator","text":""},{"location":"home/whats-new-2-19/#pagination-in-user-api","title":"Pagination in user API","text":"

      Pagination has been added, removing the limitation on the number of users listed in the Run:ai UI.

      "},{"location":"home/whats-new-2-19/#audit-log","title":"Audit log","text":"

      The audit log has been updated, so system admins can view audit logs directly in the Run:ai UI and download them in CSV or JSON formats, providing flexible options for data analysis and compliance reporting. Version 2.19 reintroduces a fully functional audit log (event history), ensuring comprehensive tracking across projects, departments, access rules, and more. In the new version, all entities are logged except logins and workloads. For more information, see Audit logs.

      "},{"location":"home/whats-new-2-19/#platform-administrator","title":"Platform Administrator","text":""},{"location":"home/whats-new-2-19/#department-scheduling-rules","title":"Department scheduling rules","text":"

      Scheduling rules have been added at the department level. For more information, see scheduling rules.

      "},{"location":"home/whats-new-2-19/#department-node-pool-priority","title":"Department node pool priority","text":"

      Node pool priority has been added at the department level. For more information, see node pools

      "},{"location":"home/whats-new-2-19/#department-and-project-grids","title":"Department and project grids","text":"

      There is now improved filtering and sorting in the Projects and Departments views, including a multi-cluster view and new filters.

      "},{"location":"home/whats-new-2-19/#overview-dashboard","title":"Overview dashboard","text":"

      \u201cIdle allocated GPU devices\u201d has been added to the Overview dashboard.

      "},{"location":"home/whats-new-2-19/#workload-policy-for-distributed-training-workloads-in-the-runai-ui","title":"Workload policy for distributed training workloads in the Run:ai UI","text":"

      Distributed workload policies can now be created via the Run:ai UI. Admins can set defaults, enforce rules, and impose setup on distributed training through the UI YAML, as well as view the distributed policies (both in the policy grid and while submitting workloads). For distributed policies, workers and leaders may require different rules due to their different specifications. (Requires a minimum cluster version of v2.18)

      "},{"location":"home/whats-new-2-19/#reconciliation-of-policy-rules","title":"Reconciliation of policy rules","text":"

      A reconciliation mechanism for policy rules has been added to enhance flexibility in the policy submission process. Previously, if a specific field was governed by a policy for a certain hierarchy, other organizational units couldn\u2019t submit a policy with rules that regarded this specific field. Now, new policies for hierarchies that mention an existing policy field will no longer be blocked. The effective rules are selected based on the following logic:

      1. For the compute and security sections in the workload spec of the Run:ai API, the highest hierarchy is chosen for the effective policy (tenant > cluster > department > project).
      2. For any other fields in the policy, the lowest hierarchy closest to the actual workload becomes effective for the policy (similar to policy defaults).

      Additionally, while viewing the effective policy, each rule displays the source of its origin policy, allowing users to clearly understand the selected hierarchy of the effective policy. (Requires a minimum cluster version of v2.18)

      "},{"location":"home/whats-new-2-19/#infrastructure-administrator","title":"Infrastructure Administrator","text":""},{"location":"home/whats-new-2-19/#support-for-cos-over-gke","title":"Support for COS over GKE","text":"

      With Run:ai version 2.19, the Run:ai cluster on Google Kubernetes Engine (GKE) supports Container-Optimized OS (COS) when NVIDIA GPU Operator 24.6 or newer is installed. This is in addition to the already supported Ubuntu on GKE.

      "},{"location":"home/whats-new-2-19/#runai-and-karpenter","title":"Run:ai and Karpenter","text":"

      Run:ai now supports working with Karpenter. Karpenter is an open-source Kubernetes cluster auto-scaler built for cloud deployments. Karpenter optimizes the cloud cost of a customer\u2019s cluster by moving workloads between different node types, bin-packing nodes, using lower-cost nodes where possible, scaling up new nodes on demand, and shutting down unused nodes with the goal of optimizing and reducing costs. (Requires a minimum cluster version of v2.19)

      Please read the documentation for more information on Run:ai and Karpenter integration considerations.

      "},{"location":"home/whats-new-2-19/#control-and-visibility-ui-changes","title":"Control and Visibility (UI changes)","text":""},{"location":"home/whats-new-2-19/#new-runai-ui-navigation","title":"New Run:ai UI navigation","text":"

      The platform navigation has been updated to offer a more modern design, easier navigation, and address all personas interacting with the UI.

      The left-side menu now has seven categories, each with its own reorganized sub-options that appear in the pane next to the menu options.

      If you close the sub-options pane, you can hover over the categories, and the sub-options float and can be used in the same way.

      The options presented in the menu and categories continue to match each user\u2019s permissions, as in the legacy navigation.

      Below is the full list of menu and sub-options and changes:

      Analytics Displays the Run:ai dashboards, allowing the different users to analyze, plan, and improve system performance and AI workload execution. This category contains the following options:

      • Overview
      • Quota management
      • Analytics
      • Consumption
      • Multi-cluster overview

      Workload manager Enables AI practitioners to develop models, train them, and deploy them into production. All supported tools and capabilities can be found here. This category contains the following options:

      • Workloads
      • Deleted workloads (now separated from current workloads. If not visible, it can be activated from Settings -> Workloads -> Deleted workloads)
      • Templates
      • Assets (these options are visible via a collapsible menu)
        • Models
        • Environments
        • Compute resources
        • Data sources
        • Credentials

      Resources Enables viewing and managing all cluster resources. In the new navigation, nodes and node pools have been split into different grids. This category contains the following options:

      • Clusters
      • Node pools (separated from the Nodes page to its own page)
      • Nodes

      Organization Maps system organizations to ensure that resource allocation and policies align with the organizational structure, business projects, and priorities. This category contains the following options:

      • Departments
      • Projects

      Access Makes it possible to authorize the different system users to perform actions in alignment with their role and scope of projects within the organization. This was moved from the legacy menu, where it appeared in the header of the screen under Tools and Settings. This category contains the following options:

      • Users
      • Applications
      • Roles (separated from the Access rules and roles page to its own page)
      • Access rules (separated from the Access rules and roles page to its own page)

      Policies Presents the tools to enforce controls over the AI infrastructure enabling different users to be effective while working in alignment with organizational policies. This category contains the following options:

      • Workload policies

      Admin Presents all administrator functions of the Run:ai platform. This was moved from the legacy menu where it appeared in the header of the screen under Tools and Settings. This category contains the following options:

      • General settings (previously General)
      • Event history

      For users with more than one cluster, in the legacy version the cluster selection appeared in the header of the page. In the new navigation, the cluster selection is part of the grid and changes only affect the items on that page.

      If a user prefers not to use the new UI navigation, there is an option to switch back to the legacy navigation by clicking the Back to legacy navigation option.

      Installation and configuration

      • Tenant logos can now be uploaded to the Run:ai UI via API. The logo should be in base64 format and should not be white to avoid blending into the background. The logo should be no more than 20px tall. See Upload logo for tenant API.
      • Run:ai now supports NVIDIA GPU Operator version 24.6
      • Run:ai now supports Kubernetes version 1.31
      "},{"location":"home/whats-new-2-19/#deprecation-notifications","title":"Deprecation notifications","text":""},{"location":"home/whats-new-2-19/#feature-deprecations","title":"Feature deprecations","text":""},{"location":"home/whats-new-2-19/#legacy-jobs-view","title":"Legacy Jobs view","text":"

      The legacy Jobs view will be fully deprecated in the Q1/25 release. We recommend that all users adopt the Workloads view, which offers all the capabilities of the legacy Jobs view with additional enhancements. SaaS customers will gradually be transitioned to the Workloads view during Q4/24.

      Note

      Users can still submit workloads via the legacy Jobs submission form.

      "},{"location":"home/whats-new-2-19/#dynamic-mig-deprecation","title":"Dynamic MIG deprecation","text":"

      The Dynamic MIG deprecation process starts with Run:ai v2.19 (Q4/24 release).

      • The feature is still available and MIG Profile APIs still function but are marked as Deprecated. See the table below for more details.
      • In Q1/25 release, \u2018Dynamic MIG\u2019 will not be usable anymore but the APIs will still be accessible.
      • In Q2/25 all \u2018Dynamic MIG\u2019 APIs will be fully deprecated.
      "},{"location":"home/whats-new-2-19/#legacy-navigation-runai-ui","title":"Legacy navigation - Run:ai UI","text":"

      The legacy navigation will be fully deprecated in the Q1/25 release, and during Q1/25 for SaaS customers.

      "},{"location":"home/whats-new-2-19/#api-support-and-endpoint-deprecations","title":"API support and endpoint deprecations","text":"Deprecated Replacement /v1/k8s/audit /api/v1/audit/log /api/v1/asset/compute/spec/migProfile /api/v1/workloads/spec/compute/migProfile /api/v1/workloads/workspaces/spec/compute/migProfile /api/v1/workloads/Trainings/spec/compute/migProfile /api/v1/workloads/Inferences/spec/compute/migProfile /api/v1/workloads/distributed/spec/compute/migProfile /api/v1/workloads/distributed/masterSpec/compute/migProfile

      Run:ai does not recommend using API endpoints and fields marked as deprecated and will not add functionality to them. Once an API endpoint or field is marked as deprecated, Run:ai will stop supporting it after 2 major releases for self-hosted deployments, and after 6 months for SaaS deployments.

      For a full explanation of the API Deprecation policy, see the Run:ai API Policy

      "},{"location":"home/whats-new-2-19/#documentation-enhancements","title":"Documentation enhancements","text":""},{"location":"home/whats-new-2-19/#workload-policy-documentation","title":"Workload policy documentation","text":"

      A comprehensive set of articles detailing the usage and the process of submitting new workload policies has been introduced. It covers the structure, syntax, best practices, and examples for configuring policy YAML files. The new documentation includes step-by-step explanations of how to create a new rule in a policy, together with information of the different value types, rule types, and policy spec sections. For more information, refer to the Policies section.

      "},{"location":"home/whats-new-2-20/","title":"What\u2019s New in Version 2.20","text":""},{"location":"home/whats-new-2-20/#release-content","title":"Release Content

      The Run:ai v2.20 What's New provides a detailed summary of the latest features, enhancements, and updates introduced in this version. It serves as a guide to help users, administrators, and researchers understand the new capabilities and how to leverage them for improved workload management, resource optimization, and more.

      Important

      For a complete list of deprecations, see Deprecation notifications. Deprecated features and capabilities will be available for two versions ahead of the notification.

      ","text":""},{"location":"home/whats-new-2-20/#researchers","title":"Researchers","text":""},{"location":"home/whats-new-2-20/#workloads-workspaces-and-training","title":"Workloads - Workspaces and Training","text":"
      • Stop/run actions for distributed workloads - You can now stop and run distributed workloads from the UI, CLI, and API. Scheduling rules for training workloads also apply to distributed workloads. This enhances control over distributed workloads, enabling greater flexibility and resource management. From cluster v2.20 onward

      • Visibility into idle GPU devices - Idle GPU devices are now displayed in the UI and API showing the number of allocated GPU devices that have been idle for more than 5 minutes. This provides better visibility into resource utilization, enabling more efficient workload management.

      • Configurable workload completion with multiple runs - You can now define the number of runs a training workload must complete to be considered finished directly in the UI, API, and CLI v2. Running training workloads multiple times improves the reliability and validity of training results. Additionally, you can configure how many runs can be scheduled in parallel, helping to significantly reduce training time and simplifying the process of managing jobs that require multiple runs. See Train models using a standard training workload for more details. From cluster v2.20 onward

      • Configurable grace period for workload preemption - You can now set a grace period in the UI, API and CLI v2 providing a buffer time for preempted workloads to reach a safe checkpoint before being forcibly preempted for standard and distributed training workloads. The grace period can be configured between 0 seconds and 5 minutes. This aims to minimize data loss and avoid unnecessary retraining, ensuring the latest checkpoints are saved. From cluster v2.20 onward

      • Pod deletion policy for terminal workloads - You can now specify which pods should be deleted when a distributed workload reaches a terminal state (completed/failed) using cleanPodPolicy in CLI v2 and API. This enhancement provides greater control over resource cleanup and helps maintain a more organized and efficient cluster environment. See cleanPodPolicy for more details.
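
      As a rough illustration only, a distributed-workload submission payload might carry this setting as sketched below. The /api/v1/workloads/distributed path appears in the API references elsewhere in these notes, but the payload structure, the placement of cleanPodPolicy, and its accepted values are assumptions to verify against the Workloads API reference.

      ```python
      # Sketch: submit a distributed training workload with a pod deletion policy.
      # The payload structure, field placement, and the "Running" value for cleanPodPolicy
      # are assumptions; check the Workloads API reference for the actual schema.
      import requests

      BASE_URL = "https://my-company.run.ai"   # assumed control-plane URL
      TOKEN = "<api-token>"

      payload = {
          "name": "dist-train-example",        # example workload name
          "projectId": "<project-id>",
          "clusterId": "<cluster-uuid>",
          "spec": {
              "image": "my-registry/trainer:latest",
              "cleanPodPolicy": "Running",     # assumed value: delete only still-running pods at terminal state
          },
      }
      resp = requests.post(
          f"{BASE_URL}/api/v1/workloads/distributed",
          headers={"Authorization": f"Bearer {TOKEN}"},
          json=payload,
          timeout=30,
      )
      print(resp.status_code, resp.json())
      ```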

      "},{"location":"home/whats-new-2-20/#workload-assets","title":"Workload Assets","text":"
      • Instructions for environment variables - You can now add instructions to environment variables when creating new environments via the UI and API. In addition, Run:ai's environments now include default instructions. Adding instructions provides guidance enabling anyone using the environment to set the environment variable values correctly. From cluster v2.20 onward

      • Enhanced environments and compute resource management - The action bar now contains \"Make a Copy\" and \"Edit\" while the \"Rename\" option has been removed. A new \"Last Updated\" column has also been added for easier tracking of asset modifications. From cluster v2.20 onward

      • Enhanced data sources and credentials tables - Added a new \"Kubernetes name\" column to data sources and credentials tables for visibility into Kubernetes resource associations. The credentials table now includes an \"Environments\" column displaying the environments associated with the credential. From cluster v2.20 onward

      "},{"location":"home/whats-new-2-20/#authentication-and-authorization","title":"Authentication and authorization","text":"
      • User applications for API authentication - You can now create your own applications for API integrations with Run:ai. Each application includes client credentials which can be used to obtain an authentication token to utilize for subsequent API calls. See User applications for more details. From cluster v2.20 onward
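
      A hedged sketch of the token exchange is shown below; /api/v1/token is the token endpoint named in the API tables earlier in these notes, but the request body field names and the response field are assumptions to verify against the API reference.

      ```python
      # Sketch: exchange application client credentials for an API token.
      # The /api/v1/token path appears in the deprecation tables above; the body field
      # names (grantType, clientId, clientSecret) and the response field are assumptions.
      import requests

      BASE_URL = "https://my-company.run.ai"   # assumed control-plane URL

      resp = requests.post(
          f"{BASE_URL}/api/v1/token",
          json={
              "grantType": "client_credentials",
              "clientId": "<application-client-id>",
              "clientSecret": "<application-client-secret>",
          },
          timeout=30,
      )
      resp.raise_for_status()
      token = resp.json().get("accessToken")   # response field name is an assumption
      print(bool(token))
      ```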
      "},{"location":"home/whats-new-2-20/#scheduler","title":"Scheduler","text":"
      • Support for multiple fractional GPUs in a single workload - Run:ai now supports submitting workloads that utilize multiple fractional GPUs within a single workload using the UI and CLI. This feature enhances GPU utilization, increases scheduling probability in shorter timeframes, and allows workloads to consume only the memory they need. It maximizes quota usage and enables more workloads to share the same GPUs effectively. See Multi-GPU fractions and Multi-GPU dynamic fractions for more details. Beta for Dynamic Fractions From cluster v2.20 onward

      • Support for GPU memory swap with multiple GPUs per workload - Run:ai now supports GPU memory swap for workloads utilizing multiple GPUs. By leveraging GPU memory swap, you can maximize GPU utilization and serve more workloads using the same hardware. The swap scheduler on each node ensures that all GPUs of a distributed model run simultaneously, maintaining synchronization across GPUs. Workload configurations combine swap settings with multi-GPU dynamic fractions, providing flexibility and efficiency for managing large-scale workloads. See Multi-GPU memory swap. Beta From cluster v2.20 onward

      "},{"location":"home/whats-new-2-20/#command-line-interface-cli-v2","title":"Command Line Interface (CLI v2)","text":"
      • Support for Windows OS - CLI v2 now supports Windows operating systems, enabling you to leverage the full capabilities of the CLI. From cluster v2.18 onward

      • Unified training command structure - Unified the distributed command into the training command to align with the Run:ai UI. The training command now includes a new sub-command to support distributed workloads, ensuring a more consistent and streamlined user experience across both the CLI v2 and UI.

      • New command for Kubernetes access - Added a new CLI v2 command, runai kubeconfig set, allowing users to set the kubeconfig file with the Run:ai authorization token. This enhancement enables users to gain access to the Kubernetes cluster, simplifying authentication and integration with Run:ai-managed environments.

      • View workload labels - You can now view the labels associated with a workload when using the CLI v2 runai workload describe command for all workload types. This enhancement provides better visibility into workload metadata.

      "},{"location":"home/whats-new-2-20/#ml-engineers","title":"ML Engineers","text":""},{"location":"home/whats-new-2-20/#workloads-inference","title":"Workloads - Inference","text":"
      • Enhanced visibility into rolling updates for inference workloads - Run:ai now provides a phase message with detailed insights into the current state of the update, shown by hovering over the workload's status. This helps users monitor and manage updates more effectively. See Rolling inference updates for more details. From cluster v2.20 onward

      • Inference serving endpoint configuration - You can now define an inference serving endpoint directly within the environment using the Run:ai UI. From cluster v2.19 onward

      • Persistent token management for Hugging Face models - Run:ai allows users to save their Hugging Face tokens persistently as part of their credentials within the Run:ai UI. Once saved, tokens can be easily selected from a list of stored credentials, removing the need to manually enter them each time. This enhancement improves the process of deploying Hugging Face models, making it more efficient and user-friendly. See Deploy inference workloads from Hugging Face for more details. From cluster v2.13 onward

      • Deploy and manage NVIDIA NIM models in inference workloads - Run:ai now supports NVIDIA NIM models, enabling you to easily deploy and manage these models when submitting inference workloads. You can select a NIM model and leverage NVIDIA\u2019s hardware optimizations directly through the Run:ai UI. This feature also allows you to take advantage of Run:ai capabilities such as autoscaling and GPU fractioning. See Deploy inference workloads with NVIDIA NIM for more details.

      • Customizable autoscaling plans for inference workloads - Run:ai allows advanced users practicing autoscaling for inference workloads to fine-tune their autoscaling plans using the Update inference spec API. This feature enables you to achieve optimal behavior to meet fluctuating request demands. Experimental From cluster v2.20 onward

      "},{"location":"home/whats-new-2-20/#platform-administrator","title":"Platform Administrator","text":""},{"location":"home/whats-new-2-20/#analytics","title":"Analytics","text":"
      • New Reports view for analytics - The new Reports view enables generating and organizing large amounts of data in a structured, CSV-formatted layout. With this feature, you can monitor resource consumption, identify trends, and make informed decisions to optimize your AI workloads with greater efficiency.
      "},{"location":"home/whats-new-2-20/#authentication-and-authorization_1","title":"Authentication and authorization","text":"
      • Client credentials for applications - Applications now use client credentials - Client ID and Client secret - to obtain an authentication token, aligned with the OAuth standard. See Applications for more details. From cluster v2.20 onward
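        A hedged sketch of the client-credentials flow follows. The token endpoint path and payload field names are assumptions and should be verified against the Applications and API authentication documentation; only the Client ID / Client secret pairing comes from the note above.

        ```bash
        # Request an API token using application client credentials
        # (endpoint path and field names are assumptions - verify in the API docs)
        curl -s -X POST "https://<company-url>/api/v1/token" \
          -H "Content-Type: application/json" \
          -d '{"grantType": "client_credentials", "clientId": "<CLIENT_ID>", "clientSecret": "<CLIENT_SECRET>"}'
        ```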
      "},{"location":"home/whats-new-2-20/#node-pools","title":"Node pools","text":"
      • Enhanced metric graphs for node pools - Enhanced metric graphs in the DETAILS tab for node pools by aligning these graphs with the dashboard and the node pools API. As part of this improvement, the following columns have been removed from the Node pools table.

        • Node GPU Allocation
        • GPU Utilization Distribution
        • GPU Utilization
        • GPU Memory Utilization
        • CPU Utilization
        • CPU Memory Utilization
      "},{"location":"home/whats-new-2-20/#organizations-projectsdepartments","title":"Organizations - Projects/Departments","text":"
      • Enhanced project deletion - Deleting a project will now attempt to delete the project's associated workloads and assets, allowing better management of your organization's assets. From cluster v2.20 onward

      • Enhanced resource prioritization for projects and departments - Run:ai has introduced advanced prioritization capabilities to manage resources between projects or between departments more effectively using the Projects and Departments APIs. From cluster v2.20 onward

        This feature allows administrators to:

        • Prioritize resource allocation and reclaim between different projects and departments.
        • Prioritize projects within the same department.
        • Set priorities per node-pool for both projects and departments.
        • Implement distinct SLAs by assigning strict priority levels to over-quota resources.
      • Updated over quota naming - Renamed over quota priority to over quota weight to reflect its actual functionality.

      "},{"location":"home/whats-new-2-20/#policy","title":"Policy","text":"
      • Added policy-based default field values - Administrators can now set default values for fields that are automatically calculated based on the values of other fields using defaultFrom. This ensures that critical fields in the workload submission form are populated automatically if not provided by the user. From cluster v2.20 onward

        This feature supports various field types:

        • Integer fields (e.g., cpuCoresRequest)
        • Number fields (e.g., gpuPortionRequest)
        • Quantity fields (e.g., gpuMemoryRequest)
      "},{"location":"home/whats-new-2-20/#data-sources","title":"Data sources","text":"
      • Improved control over data source and storage class visibility - Run:ai now provides administrators with the ability to control the visibility of data source types and storage classes in the UI. Data source types that are restricted by policy will no longer appear during workload submission or when creating new data source assets. Additionally, administrators can configure storage classes as internal using the Storage class configuration API. From cluster v2.20 onward
      "},{"location":"home/whats-new-2-20/#email-notifications","title":"Email notifications","text":"
      • Added email notifications API - Email notifications can now be configured via API in addition to the UI, enabling integration with external tools. See NotificationChannels API for more details.
      "},{"location":"home/whats-new-2-20/#infrastructure-administrator","title":"Infrastructure Administrator","text":""},{"location":"home/whats-new-2-20/#nvidia-data-center-gpus-grace-hopper","title":"NVIDIA Data Center GPUs - Grace-Hopper","text":"
      • Support for ARM-Based Grace-Hopper Superchip (GH200) - Run:ai now supports the ARM-based Grace-Hopper Superchip (GH200). Due to a limitation in version 2.20 with ARM64, the Run:ai control plane services must be scheduled on non-ARM based CPU nodes. This limitation will be addressed in a future release. See Self-Hosted installation over Kubernetes for more details. From cluster v2.20 onward
      "},{"location":"home/whats-new-2-20/#system-requirements","title":"System requirements","text":"
      • Run:ai now supports Kubernetes version 1.32.
      • Run:ai now supports OpenShift version 4.17.
      • Kubernetes version 1.28 is no longer supported.
      • OpenShift versions 4.12 to 4.13 are no longer supported.
      "},{"location":"home/whats-new-2-20/#advanced-cluster-configurations","title":"Advanced cluster configurations","text":"
      • Exclude nodes in mixed node clusters - Run:ai now allows you to exclude specific nodes in a mixed node cluster using the nodeSelectorTerms flag. See Advanced Cluster Configurations for more details. From cluster v2.20 onward

      • Advanced configuration options for cluster services - Introduced new cluster configuration options for setting node affinity and tolerations for Run:ai cluster services. These configurations ensure that the Run:ai cluster services are scheduled on the desired nodes; see the sketch after the list below. See Advanced Cluster Configurations for more details. From cluster v2.20 onward

        • global.affinity
        • global.tolerations
        • daemonSetsTolerations
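        The sketch below shows one way these options might be applied by patching the runaiconfig custom resource. The resource name and namespace (runai/runai) and the exact placement of the keys under .spec are assumptions; verify them against the Advanced Cluster Configurations page before applying.

        ```bash
        # Hedged example: add a toleration for Run:ai cluster services
        # (resource name/namespace and the .spec.global path are assumptions)
        kubectl -n runai patch runaiconfig runai --type merge -p \
          '{"spec":{"global":{"tolerations":[{"key":"dedicated","operator":"Equal","value":"runai","effect":"NoSchedule"}]}}}'
        ```

        The same merge-patch (or kubectl edit) approach would apply to global.affinity, daemonSetsTolerations, and the other options in this section, such as nodeSelectorTerms, gangScheduleArgoWorkflow, gpuMemoryToFractionRatio, and defaultStalenessGracePeriod, with the exact paths taken from the Advanced Cluster Configurations page.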
      • Added Argo workflows auto-pod grouping - Introduced a new cluster configuration option, gangScheduleArgoWorkflow, to modify the default behavior for grouping ArgoWorkflow pods, allowing you to prevent pods from being grouped into a single pod-group. See Advanced Cluster Configurations for more details. Cluster v2.20 and v2.18

      • Added cloud auto-scaling for memory fractions - Run:ai now supports auto-scaling for workloads using memory fractions in cloud environments. Using the gpuMemoryToFractionRatio configuration option allows a failed scheduling attempt for a memory-fraction workload to create Run:ai scaling pods, triggering the auto-scaler. See Advanced Cluster Configurations for more details. From cluster v2.19 onward

      • Added stale gang eviction timeout for improved stability - Run:ai has introduced a default timeout of 60 seconds for gang eviction in gang scheduling workloads using defaultStalenessGracePeriod. This timeout allows both the workload controller and the scheduler sufficient time to remediate the workload, improving the stability of large training jobs. See Advanced Cluster Configurations for more details. From cluster v2.18 onward

      • Added custom labels for built-in alerts - Administrators can now add their own custom labels to the built-in alerts from Prometheus by setting spec.prometheus.additionalAlertLabels in their cluster. See Advanced Cluster Configurations for more details. From cluster v2.20 onward

      • Enhanced configuration flexibility for cluster replica management - Administrators can now use spec.global.replicaCount to manage replicas for Run:ai services. See Advanced Cluster Configurations for more details. From cluster v2.20 onward
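        Based on the field paths named in this note and in the alerts note above, a configuration change might look like the following sketch; the value shapes (a label map and an integer) are assumptions to be checked against the Advanced Cluster Configurations page.

        ```bash
        # Hedged example: add a custom label to built-in alerts and set the
        # replica count for Run:ai services (value shapes are assumptions)
        kubectl -n runai patch runaiconfig runai --type merge -p \
          '{"spec":{"prometheus":{"additionalAlertLabels":{"team":"ml-platform"}},"global":{"replicaCount":2}}}'
        ```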

      "},{"location":"home/whats-new-2-20/#runai-built-in-alerts","title":"Run:ai built-in alerts","text":"
      • Added two new Run:ai built-in alerts for Kubernetes nodes hosting GPU workloads. The unknown state alert notifies when the node's health and readiness cannot be determined, and the low memory alert warns when the node has insufficient memory to support current or upcoming workloads. From cluster v2.20 onward
      "},{"location":"home/whats-new-2-20/#runai-developer","title":"Run:ai Developer","text":""},{"location":"home/whats-new-2-20/#metrics-and-telemetry","title":"Metrics and Telemetry","text":"
      • Additional metrics and telemetry are available via the API; an illustrative query sketch follows the list below. For more details, see Metrics API:

        • Metrics (over time)

          • Project
            • GPU_QUOTA
            • CPU_QUOTA_MILLICORES
            • CPU_MEMORY_QUOTA_MB
            • GPU_ALLOCATION
            • CPU_ALLOCATION_MILLICORES
            • CPU_MEMORY_ALLOCATION_MB
          • Department
            • GPU_QUOTA
            • CPU_QUOTA_MILLICORES
            • CPU_MEMORY_QUOTA_MB
            • GPU_ALLOCATION
            • CPU_ALLOCATION_MILLICORES
            • CPU_MEMORY_ALLOCATION_MB
        • Telemetry (current time)

          • Project
            • GPU_QUOTA
            • CPU_QUOTA
            • MEMORY_QUOTA
            • GPU_ALLOCATION
            • CPU_ALLOCATION
            • MEMORY_ALLOCATION
            • GPU_ALLOCATION_NON_PREEMPTIBLE
            • CPU_ALLOCATION_NON_PREEMPTIBLE
            • MEMORY_ALLOCATION_NON_PREEMPTIBLE
          • Department
            • GPU_QUOTA
            • CPU_QUOTA
            • MEMORY_QUOTA
            • GPU_ALLOCATION
            • CPU_ALLOCATION
            • MEMORY_ALLOCATION
            • GPU_ALLOCATION_NON_PREEMPTIBLE
            • CPU_ALLOCATION_NON_PREEMPTIBLE
            • MEMORY_ALLOCATION_NON_PREEMPTIBLE
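        For illustration only, a hypothetical query using one of the metric types listed above; the base URL, endpoint path, query parameter names, and authentication header format are assumptions, so consult the Metrics API reference for the actual contract.

        ```bash
        # Hypothetical sketch: query GPU allocation over time for a project.
        # $BASE_URL, $TOKEN, the endpoint path and parameter names are assumptions;
        # only the metric type name (GPU_ALLOCATION) comes from the list above.
        curl -s "$BASE_URL/api/v1/projects/<project-id>/metrics?metricType=GPU_ALLOCATION&start=2025-01-01T00:00:00Z&end=2025-01-02T00:00:00Z" \
          -H "Authorization: Bearer $TOKEN"
        ```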
      "},{"location":"home/whats-new-2-20/#deprecation-notifications","title":"Deprecation notifications","text":""},{"location":"home/whats-new-2-20/#ongoing-dynamic-mig-deprecation-process","title":"Ongoing Dynamic MIG deprecation process","text":"

      The Dynamic MIG deprecation process started in version 2.19. Run:ai supports standard MIG profiles as detailed in Configuring NVIDIA MIG profiles.

      • Before upgrading to version 2.20, workloads submitted with Dynamic MIG and their associated node configurations must be removed.
      • In version 2.20, MIG was removed from the Run:ai UI under compute resources.
      • In Q2/25 all \u2018Dynamic MIG\u2019 APIs and CLI commands will be fully deprecated.
      "},{"location":"home/whats-new-2-20/#cli-v1-deprecation","title":"CLI v1 deprecation","text":"

      CLI V1 is deprecated and no new features will be developed for it. It will remain available for use for the next two releases to ensure a smooth transition for all users. We recommend switching to CLI v2, which provides feature parity, backward compatibility, and ongoing support for new enhancements. CLI v2 is designed to deliver a more robust, efficient, and user-friendly experience.

      "},{"location":"home/whats-new-2-20/#legacy-jobs-view-deprecation","title":"Legacy Jobs view deprecation","text":"

      Starting with version 2.20, the legacy Jobs view will be discontinued in favor of the more advanced Workloads view. The legacy submission form will still be accessible via the Workload manager view for a smoother transition.

      "},{"location":"home/whats-new-2-20/#appid-and-appsecret-deprecation","title":"appID and appSecret deprecation","text":"

      The appID and appSecret parameters used for requesting an API token are deprecated. They will remain available for use for the next two releases. To create application tokens, use your client credentials - Client ID and Client secret.

      "},{"location":"home/changelog/hotfixes-2-13/","title":"Changelog Version 2.13","text":"

      The following is a list of the known and fixed issues for Run:ai V2.13.

      "},{"location":"home/changelog/hotfixes-2-13/#version-21348-march-14-2024","title":"Version 2.13.48 - March 14, 2024","text":"Internal ID Description RUN-16787 Fixed an issue after an upgrade to 2.13 where distributed PyTorch jobs were not able to run due to PVCs being assigned to only worker pods. RUN-16626 Fixed an issue in SSO environments, where Workspaces created using a template were assigned the template creator's UID/GID and not the Workspace creator's UID/GID. RUN-16357 Fixed an issue where pressing the Project link in Jobs screen redirects the view to the Projects of a different cluster in multi-cluster environments."},{"location":"home/changelog/hotfixes-2-13/#version-21343-february-15-2024","title":"Version 2.13.43 - February 15, 2024","text":"Internal ID Description RUN-14946 Fixed an issue where Dashboards are displaying the hidden Grafana path."},{"location":"home/changelog/hotfixes-2-13/#version-21337","title":"Version 2.13.37","text":"Internal ID Description RUN-13300 Fixed an issue where projects will appear with a status of empty while waiting for the project controller to update its status. This was caused because the cluster-sync works faster than the project controller."},{"location":"home/changelog/hotfixes-2-13/#version-21335-december-19-2023","title":"Version 2.13.35 - December 19, 2023","text":""},{"location":"home/changelog/hotfixes-2-13/#release-content","title":"Release content","text":"
      • Added the ability to set node affinity for Prometheus.
      "},{"location":"home/changelog/hotfixes-2-13/#fixed-issues","title":"Fixed issues","text":"Internal ID Description RUN-14472 Fixed an issue where template updates were not being applied to the workload. RUN-14434 Fixed an issue where runai_allocated_gpu_count_per_gpu was multiplied by seven. RUN-13956 Fixed an issue where editing templates failed. RUN-13825 Fixed an issue when deleting a job that is allocated a fraction of a GPU, an associated configmap is not deleted. RUN-13343 Fixed an issue in pod status calculation."},{"location":"home/changelog/hotfixes-2-13/#version-21331","title":"Version 2.13.31","text":"Internal ID Description RUN-11367 Fixed an issue where a double click on SSO Users redirects to a blank screen. RUN-10560 Fixed an issue where the RunaiDaemonSetRolloutStuck alert did not work."},{"location":"home/changelog/hotfixes-2-13/#version-21325","title":"Version 2.13.25","text":"Internal ID Description RUN-13171 Fixed an issue when a cluster is not connected the actions in the Workspace and Training pages are still enabled. After the corrections, the actions will be disabled."},{"location":"home/changelog/hotfixes-2-13/#version-21321","title":"Version 2.13.21","text":"Internal ID Description RUN-12563 Fixed an issue where users are unable to login after upgrading the control plane from 2.9.16 to 2.13.16. To correct the issue, secrets need to be upgraded manually in keycloak."},{"location":"home/changelog/hotfixes-2-13/#version-21320-september-28-2023","title":"Version 2.13.20 - September 28, 2023","text":""},{"location":"home/changelog/hotfixes-2-13/#release-content_1","title":"Release content","text":"
      • Added the prevention of selecting tenant or department scopes for credentials, and the prevention of selecting s3, PVC, and Git data sources if the cluster version does not support these.
      • Quota management is now enabled by default.
      Internal ID Description RUN-12923 Fixed an issue in upgrading due to a misconfigured Docker image for airgapped systems in 2.13.19. The helm chart contained an error, and the image is not used even though it is packaged as part of the tar. RUN-12928, RUN-12968 Fixed an issue in upgrading Prometheus due to a misconfigured image for airgapped systems in 2.13.19. The helm chart contained an error, and the image is not used even though it is packaged as part of the tar. RUN-12751 Fixed an issue when upgrading from 2.9 to 2.13 results with a missing engine-config file. RUN-12717 Fixed an issue where the user that is logged in as researcher manager can't see the clusters. RUN-12642 Fixed an issue where assets-sync could not restart due to failing to get token from control plane. RUN-12191 Fixed an issue where there was a timeout while waiting for the runai_allocated_gpu_count_per_project metric to return values. RUN-10474 Fixed an issue where the runai-conatiner-toolkit-exporter DaemonSet fails to start."},{"location":"home/changelog/hotfixes-2-13/#version-21319-september-27-2023","title":"Version 2.13.19 - September 27, 2023","text":""},{"location":"home/changelog/hotfixes-2-13/#release-content_2","title":"Release content","text":"
      • Added the ability to identify Kubeflow notebooks and display them in the Jobs table.
      • Added the ability to schedule Kubeflow workloads.
      • Added functionality that displays Jobs that only belong to the user that is logged in.
      • Added and refined alerts to the state of Run:ai components, schedule latency, and warnings for out of memory on Jobs.
      • Added the ability to work with restricted PSA policy.
      "},{"location":"home/changelog/hotfixes-2-13/#fixed-issues_1","title":"Fixed issues","text":"Internal ID Description RUN-12650 Fixed an issue that used an incorrect metric in analytics GPU ALLOCATION PER NODE panel. Now the correct allocation is in percentage. RUN-12602 Fixed an issue in runaiconfig where the WorkloadServices spec has memory requests/limits and cpu requests/limits and gets overwritten with the system default. RUN-12585 Fixed an issue where the workload-controller creates a delay in running jobs. RUN-12031 Fixed an issue when upgrading from 2.9 to 2.13 where the Scheduler pod fails to upgrade due to the change of owner. RUN-11091 Fixed an issue where the Departments feature is disabled, you are not able to schedule non-preemable jobs."},{"location":"home/changelog/hotfixes-2-13/#version-21313","title":"Version 2.13.13","text":"Internal ID Description RUN-11321 Fixed an issue where metrics always showed CPU Memory Utilization and CPU Compute Utilization as 0. RUN-11307 Fixed an issue where node affinity might change mid way through a job. Node affinity in now calculated only once at job submission. RUN-11129 Fixed an issue where CRDs are not automatically upgraded when upgrading from 2.9 to 2.13."},{"location":"home/changelog/hotfixes-2-13/#version-21312-august-7-2023","title":"Version 2.13.12 - August 7, 2023","text":"Internal ID Description RUN-11476 Fixed an issue with analytics node pool filter in Allocated GPUs per Project panel."},{"location":"home/changelog/hotfixes-2-13/#version-21311","title":"Version 2.13.11","text":"Internal ID Description RUN-11408 Added to the Run:ai job-controller 2 configurable parameters QPS and Burst which are applied as environment variables in the job-controller Deployment object."},{"location":"home/changelog/hotfixes-2-13/#version-2137-july-2023","title":"Version 2.13.7 - July 2023","text":""},{"location":"home/changelog/hotfixes-2-13/#release-content_3","title":"Release content","text":"
      • Added filters to the historic quota ratio widget on the Quota management dashboard.
      "},{"location":"home/changelog/hotfixes-2-13/#fixed-issues_2","title":"Fixed issues","text":"Internal ID Description RUN-11080 Fixed an issue in OpenShift environments where log in via SSO with the kubeadmin user, gets blank pages for every page. RUN-11119 Fixed an issue where values that should be the Order of priority column are in the wrong column. RUN-11120 Fixed an issue where the Projects table does not show correct metrics when Run:ai version 2.13 is paired with a Run:ai 2.8 cluster. RUN-11121 Fixed an issue where the wrong over quota memory alert is shown in the Quota management pane in project edit form. RUN-11272 Fixed an issue in OpenShift environments where the selection in the cluster drop down in the main UI does not match the cluster selected on the login page."},{"location":"home/changelog/hotfixes-2-13/#version-2134","title":"Version 2.13.4","text":""},{"location":"home/changelog/hotfixes-2-13/#release-date","title":"Release date","text":"

      July 2023

      "},{"location":"home/changelog/hotfixes-2-13/#fixed-issues_3","title":"Fixed issues","text":"Internal ID Description RUN-11089 Fixed an issue when creating an environment, commands in the Runtime settings pane and are not persistent and cannot be found in other assets (for example in a new Training)."},{"location":"home/changelog/hotfixes-2-13/#version-2131-july-2023","title":"Version 2.13.1 - July 2023","text":""},{"location":"home/changelog/hotfixes-2-13/#release-content_4","title":"Release content","text":"
      • Made an improvement so that occurrences of labels that are not in use anymore are deleted.
      "},{"location":"home/changelog/hotfixes-2-13/#fixed-issues_4","title":"Fixed issues","text":"

      N/A

      "},{"location":"home/changelog/hotfixes-2-15/","title":"Changelog Version 2.15","text":"

      The following is a list of the known and fixed issues for Run:ai V2.15.

      "},{"location":"home/changelog/hotfixes-2-15/#version-2159-february-5-2024","title":"Version 2.15.9 - February 5, 2024","text":""},{"location":"home/changelog/hotfixes-2-15/#fixed-issues","title":"Fixed issues","text":"Internal ID Description RUN-15296 Fixed an issue where the resources parameter was deprecated in the Projects and Departments API."},{"location":"home/changelog/hotfixes-2-15/#version-2154-january-5-2024","title":"Version 2.15.4 - January 5, 2024","text":""},{"location":"home/changelog/hotfixes-2-15/#fixed-issues_1","title":"Fixed issues","text":"Internal ID Description RUN-15026 Fixed an issue in workloads that were built on a cluster that does not support the NFS field. RUN-14907 Fixed an issue after an upgrade where the Analytics dashboard was missing the time ranges from before the upgrade. RUN-14903 Fixed an issue where internal operations were exposed to the customer audit log. RUN-14062 Fixed an issue in the Overview dashboard where the content for the Running Workload per Type panel did not fit."},{"location":"home/changelog/hotfixes-2-15/#version-2152-february-5-2024","title":"Version 2.15.2 - February 5, 2024","text":""},{"location":"home/changelog/hotfixes-2-15/#fixed-issues_2","title":"Fixed issues","text":"Internal ID Description RUN-14434 Fixed an issue where the Allocated GPUs metric was multiplied by seven."},{"location":"home/changelog/hotfixes-2-15/#version-2151-december-17-2023","title":"Version 2.15.1 - December 17, 2023","text":""},{"location":"home/changelog/hotfixes-2-15/#release-content","title":"Release content","text":"
      • Added environment variables for customizable QPS and burst support.

      • Added the ability to support running multiple Prometheus replicas.

      "},{"location":"home/changelog/hotfixes-2-15/#fixed-issues_3","title":"Fixed issues","text":"Internal ID Description RUN-14292 Fixed an issue where BCM installations were failing due to missing create cluster permissions. RUN-14289 Fixed an issue where metrics were not working due to an incorrect parameter in the cluster-config file. RUN-14198 Fixed an issue in services where multi nodepool jobs were not scheduled due to an unassigned nodepool status. RUN-14191 Fixed an issue where a consolidation failure would cause unnecessary evictions. RUN-14154 Fixed an issue in the New cluster form, whefre the dropdown listed versions that were incompatible with the installed control plane. RUN-13956 Fixed an issue in the Jobs table where templates were not edited successfully. RUN-13891 Fixed an issue where Ray job statuses were shown as empty. RUN-13825 Fixed an issue where GPU sharing configmaps were not deleted. RUN-13628 Fixed an issue where the pre-install pod failed to run pre-install tasks due to the request being denied (Unauthorized). RUN-13550 Fixed an issue where environments were not recovering from a node restart due to a missing GPU runtime class for containerized nodes. RUN-11895 Fixed an issue where the wrong amount of GPU memory usage was shown (is now MB). RUN-11681 Fixed an issue in OpenShift environments where some metrics were not shown on dashboards when the GPU Operator from the RedHat marketplace was installed."},{"location":"home/changelog/hotfixes-2-15/#version-2150","title":"Version 2.15.0","text":""},{"location":"home/changelog/hotfixes-2-15/#fixed-issues_4","title":"Fixed issues","text":"Internal ID Description RUN-13456 Fixed an issue where the Researcher L1 role did not have permissions to create and manage credentials. RUN-13282 Fixed an issue where Workspace logs crashed unexpectedly after restarting. RUN-13121 Fixed an issue in not being able to launch jobs using the API after an upgrade overrode a change in keycloak for applications which have a custom mapping to an email. RUN-13103 Fixed an issue in the Workspaces and Trainings table where the action buttons were not greyed out for users with only the view role. RUN-12993 Fixed an issue where Prometheus was reporting metrics even though the cluster was disconnected. RUN-12978 Fixed an issue after an upgrade, where permissions fail to sync to a project due to a missing application name in the CRD. RUN-12900 Fixed an issue in the Projects table, when sorting by Allocated GPUs, the projects were displayed alphabetically and not numerically. RUN-12846 Fixed an issue after a control-plane upgrade, where GPU, CPU, and Memory Cost fields (in the Consumption Reports) were missing when not using Grafana. RUN-12824 Fixed an issue where airgapped environments tried to pull an image from gcr.io (Internet). RUN-12769 Fixed an issue where SSO users were unable to see projects in Job Form unless the group they belong to was added directly to the project. RUN-12602 Fixed an issue in the documentation where the WorkloadServices configuration in the runaiconfig file was incorrect. RUN-12528 Fixed an issue where the Workspace duration scheduling rule was suspending workspaces regardless of the configured duration. RUN-12298 Fixed an issue where projects were not shown in the Projects table due to the API not sanitizing the project name at time of creation. RUN-12157 Fixed an issue where querying pods completion time returned a negative number. 
RUN-10560 Fixed an issue where no Prometheus alerts were sent due to a misconfiguration of the parameter RunaiDaemonSetRolloutStuck."},{"location":"home/changelog/hotfixes-2-16/","title":"Changelog Version 2.16","text":"

      The following is a list of the known and fixed issues for Run:ai V2.16.

      "},{"location":"home/changelog/hotfixes-2-16/#version-21665","title":"Version 2.16.65","text":"Internal ID Description RUN-21448 Fixed an issue with degraded workload so the condition would reflect the actual state. RUN-20680 Fixed an issue where the workload page did not present the requested GPU."},{"location":"home/changelog/hotfixes-2-16/#version-21657","title":"Version 2.16.57","text":"Internal ID Description RUN-20388 Fixed an issue where cluster-sync caused a memory leak."},{"location":"home/changelog/hotfixes-2-16/#version-21625","title":"Version 2.16.25","text":"Internal ID Description RUN-17241 Fixed an issue where the nodes page showed nodes as not ready due to \"tookit not installed\"."},{"location":"home/changelog/hotfixes-2-16/#version-21621","title":"Version 2.16.21","text":"Internal ID Description RUN-16463 Fixed an issue after a cluster upgrade to v2.16, where some metrics of pre-existing workloads were displayed incorrectly in the Overview Dashboard."},{"location":"home/changelog/hotfixes-2-16/#version-21618","title":"Version 2.16.18","text":"Internal ID Description RUN-16486 Fixed an issue in the Workloads creation form where the GPU fields of the compute resource tiles were showing no data."},{"location":"home/changelog/hotfixes-2-16/#version-21616","title":"Version 2.16.16","text":"Internal ID Description RUN-16340 Fixed an issue in the Workloads table where filters were not saved correctly."},{"location":"home/changelog/hotfixes-2-16/#version-21615","title":"Version 2.16.15","text":""},{"location":"home/changelog/hotfixes-2-16/#release-content","title":"Release content","text":"
      • Implemented a new Workloads API to support the Workloads feature.
      "},{"location":"home/changelog/hotfixes-2-16/#fixed-issues","title":"Fixed issues","text":"Internal ID Description RUN-16070 Fixed an issue where missing metrics caused the Nodepools table to appear empty."},{"location":"home/changelog/hotfixes-2-16/#version-21614","title":"Version 2.16.14","text":""},{"location":"home/changelog/hotfixes-2-16/#release-content_1","title":"Release content","text":"

      • Improved overall performance by reducing the metrics update frequency from every 10 seconds to every 30 seconds.

      "},{"location":"home/changelog/hotfixes-2-16/#fixed-issues_1","title":"Fixed issues","text":"Internal ID Description RUN-16255 Fixed an issue in the Analytics dashboard where the GPU Allocation per Node and GPU Memory Allocation per Node panels were displaying incorrect data. RUN-16035 Fixed an issue in the Workloads table where completed pods continue to be counted in the requested resources column."},{"location":"home/changelog/hotfixes-2-16/#version-21612","title":"Version 2.16.12","text":""},{"location":"home/changelog/hotfixes-2-16/#fixed-issues_2","title":"Fixed issues","text":"Internal ID Description RUN-16110 Fixed an issue where creating a training workload (single or multi-node) with a new PVC or Volume, resulted in the Workloads table showing the workload in the Unknown/Pending status. RUN-16086 Fixed an issue in airgapped environments where incorrect installation commands were shown when upgrading to V2.15."},{"location":"home/changelog/hotfixes-2-16/#version-21611","title":"Version 2.16.11","text":"

      N/A

      "},{"location":"home/changelog/hotfixes-2-16/#version-2169","title":"Version 2.16.9","text":"

      N/A

      "},{"location":"home/changelog/hotfixes-2-16/#version-2168","title":"Version 2.16.8","text":""},{"location":"home/changelog/hotfixes-2-16/#release-content_2","title":"Release content","text":"

      N/A

      "},{"location":"home/changelog/hotfixes-2-16/#version-2167","title":"Version 2.16.7","text":""},{"location":"home/changelog/hotfixes-2-16/#release-content_3","title":"Release content","text":"
      • Added an API endpoint that retrieves data from a workload's pod.
      "},{"location":"home/changelog/hotfixes-2-16/#fixed-issues_3","title":"Fixed issues","text":"

      N/A

      "},{"location":"home/changelog/hotfixes-2-16/#version-2166","title":"Version 2.16.6","text":"

      N/A

      "},{"location":"home/changelog/hotfixes-2-17/","title":"Changelog Version 2.17","text":"

      The following is a list of the known and fixed issues for Run:ai V2.17.

      "},{"location":"home/changelog/hotfixes-2-17/#version-21763","title":"Version 2.17.63","text":"Internal ID Description RUN-21448 Fixed an issue where a degraded workload was stuck and could not be released."},{"location":"home/changelog/hotfixes-2-17/#version-21746","title":"Version 2.17.46","text":"Internal ID Description RUN-20136 Updated postgres version."},{"location":"home/changelog/hotfixes-2-17/#version-21743","title":"Version 2.17.43","text":"Internal ID Description RUN-19949 Fixed an issue where runai submit arguments were not parsed correctly to the command."},{"location":"home/changelog/hotfixes-2-17/#version-21741","title":"Version 2.17.41","text":"Internal ID Description RUN-19870 Added debug logs to cluster-sync"},{"location":"home/changelog/hotfixes-2-17/#version-21726","title":"Version 2.17.26","text":"Internal ID Description RUN-19189 Fixed an issue in cluster-sync that sometimes caused unnecessary sync process to the control-plane."},{"location":"home/changelog/hotfixes-2-17/#version-21725","title":"Version 2.17.25","text":"Internal ID Description RUN-16357 Fixed an issue where the Project button in the Jobs screen redirects to the Projects page but on the wrong cluster."},{"location":"home/changelog/hotfixes-2-17/#version-21710","title":"Version 2.17.10","text":"Internal ID Description RUN-18065 Fixed an issue where the legacy job sumbission configuration was not available in the Settings page"},{"location":"home/changelog/hotfixes-2-17/#version-2170","title":"Version 2.17.0","text":"Internal ID Description RUN-20010 Fixed an issue of reduced permissions that run:ai grants users"},{"location":"home/changelog/hotfixes-2-18/","title":"Changelog Version 2.18","text":"

      The following is a list of the known and fixed issues for Run:ai V2.18.

      "},{"location":"home/changelog/hotfixes-2-18/#hotfixes","title":"Hotfixes","text":"Internal ID Hotfix # Description RUN-24521 2.18.83 Fixed a security vulnerability in golang.org.x.crypto with CVE CVE-2024-45337 with severity HIGH. RUN-24733 2.18.83 Fixed an issue where department admins were unable to load the quota management page. RUN-25094 2.18.82 Fixed an issue where OpenShift could not be upgraded due to a broken 3rd binary. RUN-24921 2.18.80 Fixed a security vulnerability in golang.org.x.net and golang.org.x.crypto. RUN-24632 2.18.80 Fixed an issue where an existing monitoring Prometheus setup deployed in an unexpected namespace was reported as missing, causing Run:ai installation to fail on the cluster. The installation mechanism now searches for the monitoring prerequisite in additional relevant namespaces. RUN-24693 2.18.80 Fixed an issue where users were unable to provide metric store authentication details using secret references. RUN-24752 2.18.79 Fixed an issue where a workload would move to a failed state when created with a custom NodePort that was already allocated. RUN-24649 2.18.79 Fixed an issue where submitting a workload with existingPvc=false and not providing a claimName resulted in auto-generating a claimName that included both upper and lower case letters. Since Kubernetes rejects uppercase letters, the workload would fail. The behavior has been updated to generate names using only lowercase letters. RUN-24595 2.18.78 Fixed an issue where the new CLI did not parse master and worker commands/args simultaneously for distributed workloads. RUN-23914 2.18.78 Fixed an issue where unexpected behavior could occur if an application was capturing a graph while memory was being swapped in as part of the GPU memory swap feature. RUN-24020 2.18.77 Fixed a security vulnerability in k8s.io.kubernetes with CVE CVE-2024-0793. RUN-24021 2.18.77 Fixed a security vulnerability in pam with CVE CVE-2024-10963. RUN-23798 2.18.75 Fixed an issue in distributed PyTorch workloads where the worker pods are deleted immediately after completion, not allowing logs to be viewed. RUN-23838 2.18.74 Fixed an issue where the command-line interface could not access resources when configured as single-sign on in a self-hosted environment. RUN-23561 2.18.74 Fixed an issue where the frontend in airgapped environment attempted to download font resources from the internet. RUN-23789 2.18.73 Fixed an issue where in some cases, it was not possible to download the latest version of the command line interface. RUN-23790 2.18.73 Fixed an issue where in some cases it was not possible to download the Windows version of the command line interface. RUN-23855 2.18.73 Fixed an issue where the pods list in the UI showed past pods. RUN-23909 2.18.73 Fixed an issue where users based on group permissions cannot see dashboards. RUN-23857 2.18.72 Dashboard to transition from Grafana v9 to v10. RUN-24010 2.18.72 Fixed an infinite loop issue in the cluster-sync service. RUN-23040 2.18.72 Fixed an edge case where the Run:ai container toolkit hangs when user is spawning hundreds of sub-processes. RUN-23802 2.18.70 Fixed an issue where new scheduling rules were not applied to existing workloads, if those new rules were set on existing projects which had no scheduling rules before. RUN-23211 2.18.70 Fixed an issue where workloads were stuck at \"Pending\" when the command-line interface flag --gpu-memory was set to zero. 
RUN-23778 2.18.68 Fixed an issue where in single-sign-on configuration, the mapping of UID and other properties would sometimes disappear. RUN-23762 2.18.68 Fixed an issue where the wrong version of a Grafana dashboard was displayed in the UI. RUN-21198 2.18.66 Fixed an issue where creating a training workload via yaml (kubectl apply -f) and specifying spec.namePrefix, created infinite jobs. RUN-23541 2.18.65 Fixed an issue where in some cases workload authorization did not work properly due to wrong oidc configuration. RUN-23291 2.18.64 CLI change text to be user friendly RUN-23283 2.18.64 Fixed a permissions issue with the Analytics dashboard post upgrade for SSO Users RUN-23420 2.18.63 Replaced Redis with Keydb RUN-23140 2.18.63 Fixed an issue where distributed workloads were created with the wrong types RUN-23130 2.18.63 Fixed an issue where inference-workload-controller crashed when WorkloadOwnershipProtection was enabled RUN-23334 2.18.62 Updated core Dockerfiles to ubi9 RUN-23296 2.18.62 Fixed an issue in the CLI where runai attach did not work with auto-complete RUN-23215 2.18.62 Fixed an issue where metrics requests from backend to mimir failed for certain tenants. RUN-22138 2.18.62 Fixed an issue where private URL user(s) input was an email and not a string. RUN-23282 2.18.61 CLI documentation fixes RUN-23055 2.18.60 Fixed unified Distributed and Training CLI commands RUN-23243 2.18.59 Fixed an issue where the scope tree wasn't calculating permissions correctly RUN-22463 2.18.59 Fixed an error in CLI bash command RUN-22314 2.18.59 Fixed distributed framework filtering in API commands RUN-23142 2.18.58 Fixed an issue where advanced GPU metrics per-gpu don't have gpu label RUN-23001 2.18.58 Fixed an issue of false overcommit on out-of-memory killed in the \u201cswap\u201d feature. RUN-22851 2.18.58 Fixed an issue where client may get stuck on device lock acquired during \u201cswap\u201d out-migration RUN-22758 2.18.58 Fixed an issue where inference workload showed wrong status when submission failed. RUN-22544 2.18.58 Updated Grafana version for security vulnerabilities. RUN-23055 2.18.57 Fixed the unified Distributed and Training CLI commands RUN-23014 2.18.56 Fixed an issue where node-scale-adjuster might not create a scaling pod if it is in cool-down and the pod was not updated after that. RUN-22660 2.18.56 Fixed an issue where workload charts have an unclear state RUN-22457 2.18.55 Fixed an issue where in rare edge cases the cluster-sync pod was out of memory. RUN-21825 2.18.55 Fixed all CVEs in Run:ai's Goofys-based image used for S3 integration. RUN-22871 2.18.55 Fixed an issue in runai-container-toolkit where in certain cases when a process is preempted, OOMKill metrics were not published correctly. RUN-22250 2.18.55 Fixed an issue where workloads trying to use an ingress URL which is already in use were behaving inconsistentyly instead of failing immediately. RUN-22880 2.18.55 Fixed an issue where the minAvailable field for training-operator CRDs did not consider all possible replica specs. RUN-22073 2.18.55 Fixed an issue where runai-operator failed to parse cluster URLs ending with '/'. RUN-22453 2.18.55 Fixed an issue where in rare edge cases the workload-overseer pod experienced a crash. RUN-22763 2.18.55 Fixed an issue where in rare edge cases an 'attach' command from CLI-V2 caused a crash in the cluster-api service. RUN-21948 2.18.49 Fixed an issue where in rare edge cases workload child resources could have duplicate names, causing inconsistent behavior. 
RUN-22623 2.18.49 Fixed an issue in Openshift where workloads were not suspended when reaching their idle GPU time limit. RUN-22600 2.18.49 Fixed an issue in AWS EKS clusters where the V1-CLI returned an empty table when listing all projects as an administrator. RUN-21878 2.18.49 Added a label to disable container toolkit from running on certain nodes run.ai/container-toolkit-enabled. RUN-22452 2.18.47 Fixed an issue where the scheduler has signature errors if TopologySpreadConstraints was partially defined. RUN-22570 2.18.47 Updated git-sync image to version v4.3.0. RUN-22054 2.18.46 Fixed an issue where users could not attach to jobs. RUN-22377 2.18.46 Removed uncached client from accessrule-controller. RUN-21697 2.18.46 Fixed an issue where client may deadlock on suspension during allocation request. RUN-20073 2.18.45 Fixed an issue where it wasn't possible to authenticate with user credentials in the CLI. RUN-21957 2.18.45 Fixed an issue where there was a missing username-loader container in inference workloads. RUN-22276 2.18.39 Fixed an issue where Knative external URL was missing from the Connections modal. RUN-22280 2.18.39 Fixed an issue when setting scale to zero - there was no pod counter in the Workload grid. RUN-19811 2.18.39 Added an option to set k8s tolerations to run:ai daemonsets (container-toolkit, runai-device-plugin, mig-parted, node-exporter, etc..) . RUN-22128 2.18.39 Added GID, UID, Supplemental groups to the V1 CLI. RUN-21800 2.18.37 Fixed an issue with old workloads residing in the cluster. RUN-21907 2.18.34 Fixed an issue where the SSO user credentials contain supplementary groups as string instead of int. RUN-21272 2.18.31 Fixed an issue with multi-cluster credinatils creation, specifically with the same name in different clusters. RUN-20680 2.18.29 Fixed an issue where workloads page do not present requested GPU. RUN-21200 2.18.29 Fixed issues with upgrades and connections from v2.13. RUN-20970 2.18.27 Fixed an issue with PUT APIs. RUN-20927 2.18.26 Fixed an issue where node affinity was not updated correctly in projects edit. RUN-20084 2.18.26 Fixed an issue where default department were deleted instead of a message being displayed. RUN-21062 2.18.26 Fixed issues with the API documentation. RUN-20434 2.18.25 Fixed an issue when creating a Project/Department with memory resources requires 'units'. RUN-20923 2.18.25 Fixed an issue with projects/departments page loading slowly. RUN-19872 2.18.23 Fixed an issue where the Toolkit crashes and fails to create and replace the publishing binaries. RUN-20861 2.18.22 Fixed an issue where a pod is stuck on pending due to a missing resource reservation pod. RUN-20842 2.18.22 Fixed an issue of illegal model name with \".\" in hugging face integration. RUN-20791 2.18.22 Fix an issue where notifications froze after startup. RUN-20865 2.18.22 Fixed an issue where default departments are not deleted when a cluster is deleted. RUN-20698 2.18.21 Fixed an issue where 2 processes requests a device at the same time received the same GPU, causing failures. RUN-20760 2.18.18 Fixed an issue where workload protection UI shows wrong status. RUN-20612 2.18.15 Fixed an issue where it was impossible with the use-table-data to hide node pool columns when there is only one default node pool. RUN-20735 2.18.15 Fixed an issue where nodePool.name is undefined RUN-20721 2.18.12 Added error handling to nodes pages. RUN-20578 2.18.10 Fixed an issue regarding policy enforcement. 
RUN-20188 2.18.10 Fixed issue with defining SSO in OpenShift identity provider. RUN-20673 2.18.9 Fixed an issue where a researcher uses a distributed elastic job, it is possible that in a specific flow it is scheduled on more than one node-pools. RUN-20360 2.18.7 Fixed an issue where the workload network status was misleading. RUN-22107 2.18.7 Fixed an issue where passwords containing $ were removed from the configuration. RUN-20510 2.18.5 Fixed an issue with external workloads - argocd workflow failed to be updated. RUN-20516 2.18.4 Fixed an issue when after deploying to prod, the cluster-service and authorization-service got multiple OOMKilled every ~1 hour. RUN-20485 2.18.2 Changed policy flags to Beta. RUN-20005 2.18.1 Fixed an issue where a sidecar container failure failed the workload. RUN-20169 2.18.1 Fixed an issue allowing the addition of annotations and labels to workload resources. RUN-20108 2.18.1 Fixed an issue exposing service node ports to workload status. RUN-20160 2.18.1 Fixed an issue with version display when installing a new cluster in an airgapped environment. RUN-19874 2.18.1 Fixed an issue when copying and editing a workload with group access to a tool and the group wasn't removed when selecting users option. RUN-19893 2.18.1 Fixed an issue when using a float number in the scale to zero inactivity value - custom which sometimes caused the submission to fail. RUN-20087 2.18.1 Fixed an issue where inference graphs should be displayed only for minimum cluster versions. RUN-10733 2.18.1 Fixed an issue where we needed to minify and obfuscate our code in production. RUN-19962 2.18.1 Fixed an issue to fix sentry domains regex and map them to relevant projects. RUN-20104 2.18.1 Fixed an issue where frontend Infinite loop on keycloak causes an error. RUN-19906 2.18.1 Fixed an issue where inference workload name validation fails with 2.16 cluster. RUN-19605 2.18.1 Fixed an issue where authorized users should support multiple users (workload-controller) . RUN-19903 2.18.1 Fixed an issue where inference chatbot creation fails with 2.16 cluster. RUN-20409 2.18.1 Fixed an issue where clicking on create new compute during the runai model flow did nothing. RUN-11224 2.18.1 Fixed an issue where ruani-adm collect all logs was not collecting all logs. RUN-20478 2.18.1 Improved workloads error status in overview panel. RUN-19850 2.18.1 Fixed an issue where an application administrator could not submit a job with CLI. RUN-19863 2.18.1 Fixed an issue where department admin received 403 on get tenants and cannot login to UI. RUN-19904 2.18.1 Fixed an issue when filtering by allocatedGPU in get workloads with operator returns incorrect result. RUN-19925 2.18.1 Fixed an issue when upgrade from v2.16 to v2.18 failed on worklaods migrations. RUN-19887 2.18.1 Fixed an issue in the UI when there is a scheduling rule of timeout, the form opened with the rules collapsed and written \"none\". RUN-19941 2.18.1 Fixed an issue where completed and failed jobs were shown in view pods in nodes screen. RUN-19940 2.18.1 Fixed an issue where setting gpu quota failed because the department quota was taken from wrong department. RUN-19890 2.18.1 Fixed an issue where editing a project by removing its node-affinity stuck updating. RUN-20120 2.18.1 Fixed an issue where project update fails when there is no cluster version. RUN-20113 2.18.1 Fixed an issue in the Workloads table where a researcher does not see other workloads once they clear their filters. 
RUN-19915 2.18.1 Fixed an issue when turning departments toggles on on cluster v2.11+ the gpu limit is -1 and there is ui error. RUN-20178 2.18.1 Fixed an issue where dashboard CPU tabs appeared in new overview. RUN-20247 2.18.1 Fixed an issue where you couldn't create a workload with namespace of a deleted project. RUN-20138 2.18.1 Fixed an issue where the system failed to create node-type on override-backend env. RUN-18994 2.18.1 Fixed an issue where some limitations for department administrator are not working as expected. RUN-19830 2.18.1 Fixed an issue where resources (GPU, CPU, Memory) units were added to k8s events that are published by run:ai scheduler making our messages more readable."},{"location":"home/changelog/hotfixes-2-18/#version-2180-fixes","title":"Version 2.18.0 Fixes","text":"Internal ID Description RUN-20734 Fixed an issue where the enable/disable toggle for the feature was presenting wrong info. RUN-19895 Fixed an issue of empty state for deleted workloads which is incorrect. RUN-19507 Fixed an issue in V1 where get APIs are missing required field in swagger leading to omit empty. RUN-20246 Fixed an issue in Departments v1 org unit where if unrecognizable params are sent, an error is returned. RUN-19947 Fixed an issue where pending multi-nodepool podgroups got stuck after cluster upgrade. RUN-20047 Fixed an issue where Workload status shows as \"deleting\" rather than \"deleted\" in side panel. RUN-20163 Fixed an issue when a DV is shared with a department and a new project is added to this dep - no pvc/pv is created. RUN-20484 Fixed an issue where Create Projects Requests Returned 500 - services is not a valid ResourceType. RUN-20354 Fixed an issue when deleting a department with projects resulted in projects remaining in environment with the status NotReady."},{"location":"home/changelog/hotfixes-2-19/","title":"Changelog Version 2.19","text":"

      The following is a list of the known and fixed issues for Run:ai V2.19.

      "},{"location":"home/changelog/hotfixes-2-19/#hotfixes","title":"Hotfixes","text":"Internal ID Hotfix # Description RUN-17284 2.19.49 Fixed an issue where workloads were suspended when set with the termination after preemption option. RUN-25290 2.19.49 Fixed a security vulnerability in golang.org/x/net v0.33.0 with CVE-2024-45338 with severity HIGH. RUN-25234 2.19.49 Fixed security vulnerabilities by updating oauth2 proxy image to the latest. RUN-25234 2.19.48 Fixed an authentication issue in CLI V1. RUN-25062 2.19.45 Fixed a security vulnerability in github.com.go-git.go-git.v5 with CVE CVE-2025-21614 with severity HIGH. RUN-25061 2.19.45 Fixed a security vulnerability in github.com.go-git.go-git.v5 with CVE CVE-2025-21613 with severity HIGH. RUN-24857 2.19.45 Fixed a security vulnerability in golang.org.x.net with CVE CVE-2024-45338 with severity HIGH. RUN-24733 2.19.45 Fixed an issue where users were unable to load the quota management dashboard. RUN-25094 2.19.44 Fixed an issue where OpenShift could not be upgraded due to a broken 3rd binary. RUN-24026 2.19.40 Fixed a security vulnerability in krb5-libs with CVE CVE-2024-3596. RUN-24649 2.19.40 Fixed an issue where submitting a workload with existingPvc=false and not providing a claimName resulted in auto-generating a claimName that included both upper and lower case letters. Since Kubernetes rejects uppercase letters, the workload would fail. The behavior has been updated to generate names using only lowercase letters. RUN-24632 2.19.40 Fixed an issue where an existing Prometheus monitoring setup deployed in an unexpected namespace was reported as missing, causing Run:ai installation to fail on the cluster. The installation mechanism now searches for the monitoring prerequisite in additional relevant namespaces. RUN-24693 2.19.40 Fixed an issue where users were unable to provide metric store authentication details using secret references. RUN-23744 2.19.40 Fixed an issue where refreshing some pages (such as the settings, policy, and access rules) removed the side navigation. RUN-24715 2.19.40 Fixed an issue in the templates form where selecting Secret as a data source got stuck in an infinite loading page. RUN-24831 2.19.40 Fixed an issue where some edge cases triggered consolidation without it actually being necessary. RUN-24873 2.19.40 Fixed an issue where users were unable to configure email notifications regarding workload statuses. RUN-24921 2.19.40 Fixed a security vulnerability in golang.org.x.net and golang.org.x.crypto. RUN-23914 2.19.38 Fixed an issue where unexpected behavior could occur if an application was capturing a graph while memory was being swapped in as part of the GPU memory swap feature. RUN-24521 2.19.36 Fixed a security vulnerability in golang.org.x.crypto with CVE CVE-2024-45337 with severity HIGH. RUN-24595 2.19.36 Fixed an issue where the new command-line interface did not parse master and worker commands/args simultaneously for distributed workloads. RUN-24565 2.19.34 Fixed an issue where the UI was hanging at times during Hugging Face model memory calculation. RUN-24021 2.19.33 Fixed a security vulnerability in pam with CVE-2024-10963. RUN-24506 2.19.33 Fixed a security vulnerability in krb5-libs with CVE-2024-3596. RUN-24259 2.19.31 Fixed an issue where the option to reset a local user password is sometimes not available. RUN-23798 2.19.30 Fixed an issue in distributed PyTorch workloads where the worker pods are deleted immediately after completion, not allowing logs to be viewed. 
RUN-24184 2.19.28 Fixed an issue in database migration when upgrading from 2.16 to 2.19. RUN-23752 2.19.27 Fixed an issue in the distributed training submission form when a policy on the master pod was applied. RUN-23040 2.19.27 Fixed an edge case where the Run:ai container toolkit hangs when user is spawning hundreds of sub-processes. RUN-23211 2.19.27 Fixed an issue where workloads were stuck at \"Pending\" when the command-line interface flag --gpu-memory was set to zero. RUN-23561 2.19.27 Fixed an issue where the frontend in airgapped environment attempted to download font resources from the internet. RUN-23789 2.19.27 Fixed an issue where in some cases, it was not possible to download the latest version of the command-line interface. RUN-23790 2.19.27 Fixed an issue where in some cases it was not possible to download the Windows version of the command-line interface. RUN-23802 2.19.27 Fixed an issue where new scheduling rules were not applied to existing workloads, if those new rules were set on existing projects which had no scheduling rules before. RUN-23838 2.19.27 Fixed an issue where the command-line interface could not access resources when configured as single-sign on in a self-hosted environment. RUN-23855 2.19.27 Fixed an issue where the pods list in the UI showed past pods. RUN-23857 2.19.27 Dashboard to transition from Grafana v9 to v10. RUN-24010 2.19.27 Fixed an infinite loop issue in the cluster-sync service. RUN-23669 2.19.25 Fixed an issue where export function of consumption Grafana dashboard was not showing. RUN-23778 2.19.24 Fixed an issue where mapping of UID and other properties disappears. RUN-23770 2.19.24 Fixed an issue where older overview dashboard does not filter on cluster, even though a cluster is selected. RUN-23762 2.19.24 Fixed an issue where the wrong version of a Grafana dashboard was displayed in the UI. RUN-23752 2.19.24 Fixed an issue in the distributed training submission form when a policy on the master pod was applied. RUN-23664 2.19.24 Fixed an issue where the GPU quota numbers on the department overview page did not mach the department edit page. RUN-21198 2.19.22 Fixed an issue where creating a training workload via yaml (kubectl apply -f) and specifying spec.namePrefix, created infinite jobs. RUN-23583 2.19.21 Fixed an issue where the new UI navigation bar sometimes showed multiple selections. RUN-23541 2.19.21 Fixed an issue where authorization was not working properly in SaaS due to wrong oidc URL being used. RUN-23376 2.19.21 Fixed an issue where the new command-line interface required re-login after 10 minutes. RUN-23162 2.19.21 Fixed an issue where older audit logs did not show on the new audit log UI. RUN-23385 2.19.20 Fixed an issue where calls to api/v1/notifications/config/notifications would return 502 RUN-23382 2.19.20 Fixed an issue where all nodepools were deleted on cluster upgrade RUN-23374 2.19.20 Fixed an issue where \"ghost\" nodepool in project settings prevents workload creation via UI/API RUN-23291 2.19.20 CLI - change text to be user friendly RUN-23283 2.19.20 Fixed a permissions issue with the Analytics dashboard post upgrade for SSO Users RUN-23208 2.19.20 Upload the source map to sentry only RUN-22642 2.19.20 infw-controller service tests for the reconcile RUN-23373 2.19.19 Fixed an issue where a new data source couldn't be created from the \"New Workload\" form. RUN-23368 2.19.19 Fixed an issue where the getProjects v1 API returned a list of users which was not always in the same order. 
RUN-23333 2.19.19 Fixed an issue where node pool with overProvisioningRatio greater than 1 cannot be created. RUN-23215 2.19.18 Fixed an issue where metrics requests from backend to mimir failed for certain tenants. RUN-23334 2.19.17 Updated some dockerfiles to the latest ubi9 image for security vulnerabilities. RUN-23318 2.19.16 Fixed an issue where some projects held faulty data which caused the getProjectById API to fail RUN-23140 2.19.16 Fixed an issue where distributed workloads were created with the wrong types RUN-22069 2.19.16 Fixed an issue where JWT parse with claims failed to parse token without Keyfunc. RUN-23321 2.19.15 Fixed an issue where the GetProjectById wrapper API of the org-unit client in the runai-common-packages ignored errors RUN-23296 2.19.15 Fixed an issue in the CLI where runai attach did not work with auto-complete RUN-23282 2.19.15 CLI documentation fixes RUN-23245 2.19.15 Fixed an issue where the binder service didn't update the pod status RUN-23057 2.19.15 OCP 2.19 upgrade troubleshooting RUN-22138 2.19.15 Fixed an issue where private URL user(s) input was an email and not a string. RUN-23243 2.19.14 Fixed an issue where the scope tree wasn't calculating permissions correctly RUN-23208 2.19.14 Upload the source map to sentry only RUN-23198 2.19.14 Fixed an issue where external-workload-integrator sometimes crashed for RayJob RUN-23191 2.19.13 Fixed an issue where creating workloads in the UI returned only the first 50 projects RUN-23142 2.19.12 Fixed an issue where advanced GPU metrics per-gpu did not have gpu label RUN-23139 2.19.12 Fixed an issue where inference workload showed wrong status. RUN-23027 2.19.12 Deprecated migProfiles API fields RUN-23001 2.19.12 Fixed an issue of false overcommit on out-of-memory kills in the Swap feature. RUN-22851 2.19.12 Fixed an issue where client may get stuck on device lock acquired during \u201cswap\u201d out-migration RUN-22771 2.19.12 Fixed an issue where get cluster by id with metadata verbosity returned zero values RUN-22742 2.19.12 Fixed user experience issue in inference autoscaling RUN-22725 2.19.12 Fixed an issue where the cloud operator failed to get pods in nodes UI. RUN-22720 2.19.12 Fixed an issue where the cloud operator failed to get projects in node pools UI. RUN-22700 2.19.12 Added auto refresh to the overview dashboard, Pods modal in the Workloads page, and Event history page RUN-22544 2.19.12 Updated Grafana version for security vulnerabilities. RUN-23083 2.19.11 Fixed an issue where workload actions were blocked in the UI when the cluster had any issues RUN-22771 2.19.11 Fixed an issue where the getClusterById API with metadata verbosity returned zero values"},{"location":"home/changelog/hotfixes-2-19/#version-2190-fixes","title":"Version 2.19.0 Fixes","text":"Internal ID Description RUN-21756 Fixed an issue where the NFS mount path doesn\u2019t accept \u201c{}\u201d characters RUN-21475 Fixed an issue where users failed to select the compute resource from UI if the compute resource is last in the list and has a long name"},{"location":"home/changelog/hotfixes-2-20/","title":"Changelog Version 2.20","text":"

      This section provides details on all hotfixes available for version 2.20. Hotfixes are critical updates released between our major and minor versions to address specific issues or vulnerabilities. These updates ensure the system remains secure, stable, and optimized without requiring a full version upgrade.

      "},{"location":"home/changelog/hotfixes-2-20/#hotfixes","title":"Hotfixes","text":"Version Date Internal ID Description 2.20.15 24/01/2025 RUN-24354 Fixed an issue where migrating workloads failed due to slow network connection. 2.20.14 23/01/2025 RUN-24754 Fixed an issue where the status of training and interactive workloads was not updated correctly. 2.20.14 23/01/2025 RUN-24838 Fixed an issue where an environment asset could not be created if it included an environment variable with no value specified. 2.20.11 21/01/2025 RUN-25303 Fixed an issue where submitting with the --attach flag was supported only in a workspace workload. 2.20.11 21/01/2025 RUN-25291 Fixed a security vulnerability in golang.org/x/net v0.33.0 with CVE-2024-45338 with severity HIGH. 2.20.10 20/01/2025 RUN-25234 Fixed an authentication issue in CLI V1. 2.20.9 19/01/2025 RUN-25032 Fixed an issue where inference workloads with large container sizes skipped the Initializing state. 2.20.9 19/01/2025 RUN-24752 Fixed an issue where a workload would move to a failed state when created with a custom NodePort that was already allocated. 2.20.9 19/01/2025 RUN-25031 Fixed an issue in the Templates form where existing credentials in the environment variables section were not displayed. 2.20.5 14/01/2025 RUN-25061 Fixed a security vulnerability in github.com.go-git.go-git.v5 with CVE CVE-2025-21613 with severity HIGH."},{"location":"platform-admin/overview/","title":"Overview: Platform Administrator","text":"

      The Platform Administrator is responsible for the day-to-day administration of the product.

      As part of the Platform Administrator documentation, you will learn how to:

      • Provide the right access level to users.
      • Configure Run:ai meta-data such as Projects, Departments, Node pools etc.
      • Understand Researcher Workloads and set up Workload Policies and Assets.
      • Review possible integrations with third-party products.
      • Analyze system performance and perform suggested actions.
      "},{"location":"platform-admin/aiinitiatives/overview/","title":"Adapting AI initiatives to your organization","text":"

      AI initiatives refer to advancing research, development, and implementation of AI technologies. These initiatives represent your business needs and involve collaboration between individuals, teams, and other stakeholders. AI initiatives require compute resources and a methodology to effectively and efficiently use those compute resources and split them among the different AI initiatives stakeholders. The building blocks of AI compute resources are GPUs, CPUs, and CPU memory, which are built into nodes (servers) and can be further grouped into node pools. Nodes and node pools are part of a Kubernetes Cluster.

      To manage AI initiatives in Run:ai you should:

      • Map your organization and initiatives to projects and optionally departments
      • Map compute resources (node pools and quotas) to projects and optionally departments
      • Assign users (e.g. AI practitioners, ML engineers, Admins) to projects and departments
      "},{"location":"platform-admin/aiinitiatives/overview/#mapping-your-organization","title":"Mapping your organization","text":"

      The way you map your AI initiatives and organization into Run:ai projects and departments should reflect your organization's structure and project management practices. There are multiple options. Below are three examples of typical ways to map your organization, initiatives, and users into Run:ai, but any other mapping that suits your requirements is also acceptable.

      "},{"location":"platform-admin/aiinitiatives/overview/#based-on-individuals","title":"Based on individuals","text":"

      A typical use case would be students (individual practitioners) within a faculty (business unit) - an individual practitioner may be involved in one or more initiatives. In this example, the resources are accounted for by the student (project) and aggregated per faculty (department). Department = business unit / Project = individual practitioner

      "},{"location":"platform-admin/aiinitiatives/overview/#based-on-business-units","title":"Based on business units","text":"

      A typical use case would be an AI service (business unit) split into AI capabilities (initiatives) - an individual practitioner may be involved in several initiatives. In this example, the resources are accounted for by Initiative (project) and aggregated per AI service (department).

      Department = business unit / Project = initiative

      "},{"location":"platform-admin/aiinitiatives/overview/#based-on-the-organizational-structure","title":"Based on the organizational structure","text":"

      A typical use case would be a business unit split into teams - an individual practitioner is involved in a single team (project) but the team may be involved in several AI initiatives. In this example, the resources are accounted for by team (project) and aggregated per business unit (department).

      Department = business unit / Project = team

      "},{"location":"platform-admin/aiinitiatives/overview/#mapping-your-resources","title":"Mapping your resources","text":"

      AI initiatives require compute resources such as GPUs and CPUs to run. Compute resources in any organization are limited, whether by the number of servers (nodes) the organization owns, the budget it can spend to lease resources in the cloud, or the budget for in-house servers. Every organization strives to optimize the usage of its resources by maximizing utilization and meeting the needs of all users. Therefore, the organization needs to split resources according to its internal priorities and budget constraints. Even after splitting the resources, the orchestration layer should still provide fairness between resource consumers and allow access to unused resources, to minimize scenarios of idle resources.

      Another aspect of resource management is how to group your resources effectively, especially in large environments or environments made up of heterogeneous hardware types, where some users need specific hardware types and other users should be kept from occupying critical hardware needed by certain users or initiatives.

      Run:ai assists you with all of these complex issues by allowing you to map your cluster resources to node pools, assign each Project and Department a quota allocation per node pool, and set access rights to unused resources (over quota) per node pool.

      "},{"location":"platform-admin/aiinitiatives/overview/#grouping-your-resources","title":"Grouping your resources","text":"

      There are several reasons why you would group resources (nodes) into node pools:

      • Control the GPU type to use in a heterogeneous hardware environment - in many cases, AI models are optimized for the hardware type they run on, e.g. a training workload that is optimized for an H100 does not necessarily run optimally on an A100, and vice versa. Segmenting nodes into node pools, each with a different hardware type, gives the AI researcher and ML engineer better control of where to run.
      • Quota control - splitting into node pools allows the admin to set a specific quota per hardware type, e.g. give a high-priority project guaranteed access to advanced GPU hardware, while keeping a lower-priority project with a lower quota, or even with no quota at all for that high-end GPU and "best-effort" access only (i.e. only when the high-priority, guaranteed project is not using those resources).
      • Multi-region or multi-availability-zone cloud environments - if some or all of your clusters run in the cloud (or even on-premises) and use different physical locations or different topologies (e.g. racks), you probably want to segment your resources per region/zone/topology so you can control where your workloads run and how much quota to assign to specific environments (per project and per department), even if all those locations use the same hardware type. This methodology can also improve workload performance thanks to the locality of distributed workloads, local storage, etc. (see the node label example after this list).
      • Explainability and predictability - large environments are complex to understand, and this becomes even harder when the environment is heavily loaded. Segmenting your cluster into smaller pools can significantly help maintain users' satisfaction and their understanding of the resource state, and keep the chances of your workloads being scheduled predictable.
      • Scale - Run:ai's implementation of node pools has many benefits, one of the main ones being scale. Each node pool has its own scheduler instance, allowing the cluster to handle more nodes and schedule workloads faster when segmented into node pools vs. one large cluster. To allow your workloads to use any resource within a cluster that is split into node pools, a second-level Scheduler is in charge of scheduling workloads between node pools according to your preferences and resource availability.
      • Prevent mutual exclusion - some AI workloads consume CPU-only resources. To prevent those workloads from consuming the CPU resources of GPU nodes and thus blocking GPU workloads from using those nodes, it is recommended to group CPU-only nodes into a dedicated node pool(s) and assign quota for CPU projects to CPU node pools only, while keeping GPU node pools with zero quota and, optionally, "best-effort" over-quota access for CPU-only projects.
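
      If you plan to group nodes by location or hardware type, a quick way to review what is available is to list the relevant node labels before defining node pool boundaries. The labels below (topology.kubernetes.io/zone, node.kubernetes.io/instance-type, and nvidia.com/gpu.product) are commonly set by cloud providers and by NVIDIA GPU feature discovery; this is an illustrative sketch only, and on-premises clusters may use different labels:

        kubectl get nodes -L topology.kubernetes.io/zone -L node.kubernetes.io/instance-type
        kubectl get nodes -L nvidia.com/gpu.product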
      "},{"location":"platform-admin/aiinitiatives/overview/#grouping-examples","title":"Grouping Examples","text":"

      Set out below are illustrations of different grouping options.

      Example: grouping nodes by topology

      Example: grouping nodes by hardware type

      "},{"location":"platform-admin/aiinitiatives/overview/#assigning-your-resources","title":"Assigning your resources","text":"

      After the initial grouping of resources, it is time to associate resources with AI initiatives. This is performed by assigning quotas to projects and, optionally, to departments. Assigning GPU quota to a project on a node pool basis means that the workloads submitted by that project are entitled to use those GPUs as guaranteed resources and can use them for all workload types.

      However, what happens if the project requires more resources than its quota? This depends on the type of workloads that the user wants to submit. If the user requires more resources for non-preemptible workloads, then the quota must be increased, because non-preemptible workloads require guaranteed resources. On the other hand, if the workload is preemptible - for example, a model training workload - the project can exploit unused resources of other projects, as long as the other projects don't need them. Over-quota is set per project on a node pool basis, and per department.

      Administrators can use quota allocations to prioritize resources between users, teams, and AI initiatives. The administrator can completely prevent the use of certain node pools by a project or department by setting the node pool quota to 0 and disabling over-quota for that node pool, or keep the quota at 0 and enable over-quota for that node pool, allowing access based on resource availability only (e.g. unused GPUs). However, when a project with a non-zero quota needs to use those resources, the Scheduler reclaims them and preempts the preemptible workloads of over-quota projects. As an administrator, you can also influence the amount of over-quota resources a project or department uses.

      It is essential to make sure that the sum of all projects' quotas does NOT surpass that of their department, and that the sum of all departments' quotas does not surpass the number of physical resources, per node pool and for the entire cluster (surpassing these limits is called 'over-subscription'). Over-subscription is not recommended because it may produce unexpected scheduling decisions, especially ones that preempt 'non-preemptible' workloads or fail to schedule workloads that are within quota, whether non-preemptible or preemptible, so that quota can no longer be considered 'guaranteed'. Admins can opt in to a system flag that helps prevent over-subscription scenarios.
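
      The following is a minimal, illustrative sketch (with hypothetical quota numbers) of the check described above - verifying that the sum of project quotas under a department does not exceed the department's own quota. The same comparison applies between the sum of department quotas and the physical resources of a node pool or cluster:

        # Hypothetical values - replace with the actual GPU quotas from your Projects and Departments pages
        DEPARTMENT_GPU_QUOTA=10
        PROJECT_GPU_QUOTAS="4 3 2"    # GPU quota of each project under the department

        # Sum the per-project quotas (awk handles fractional quotas as well)
        TOTAL=$(echo "$PROJECT_GPU_QUOTAS" | awk '{s=0; for (i=1; i<=NF; i++) s+=$i; print s}')

        if awk -v t="$TOTAL" -v d="$DEPARTMENT_GPU_QUOTA" 'BEGIN {exit !(t > d)}'; then
          echo "Over-subscribed: $TOTAL GPUs are promised to projects, department quota is $DEPARTMENT_GPU_QUOTA"
        else
          echo "OK: $TOTAL GPUs promised to projects, within the department quota of $DEPARTMENT_GPU_QUOTA"
        fi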

      Example: assigning resources to projects

      "},{"location":"platform-admin/aiinitiatives/overview/#assigning-users-to-projects-and-departments","title":"Assigning users to projects and departments","text":"

      The Run:ai system uses Role-Based Access Control (RBAC) to manage users' access rights to the different objects of the system, its resources, and the set of allowed actions. To allow AI researchers, ML engineers, Project Admins, or any other stakeholders of your AI initiatives to access projects and use AI compute resources for their AI initiatives, the administrator needs to assign users to projects. After a user is assigned to a project with the proper role, e.g. 'L1 Researcher', the user can submit and monitor their workloads under that project. Assigning users to departments is usually done to assign a 'Department Admin' to manage a specific department. Other roles, such as 'L1 Researcher', can also be assigned to departments; this gives the researcher access to all projects within that department.

      "},{"location":"platform-admin/aiinitiatives/overview/#scopes-in-the-organization","title":"Scopes in the organization","text":"

      This is an example of an organization, as represented in the Run:ai platform:

      The organizational tree is structured top-down under a single root node, the account. The account is made up of clusters, departments, and projects.

      Note

      Different roles and permissions can be granted to specific clusters, departments and projects within an organization.

      After mapping and building your hierarchical organizational structure as shown above, you can assign or associate various Run:ai components (e.g. workloads, roles, assets, policies, and more) to different parts of the organization - these organizational parts are the scopes. The following organizational example consists of 5 optional scopes:

      Note

      When a scope is selected, that unit, including all of its subordinates (both existing and any added in the future), is selected as well.

      "},{"location":"platform-admin/aiinitiatives/overview/#next-steps","title":"Next Steps","text":"

      Now that resources are grouped into node pools, organizational units or business initiatives are mapped into projects and departments, projects\u2019 quota parameters are set per node pool, and users are assigned to projects, you can finally submit workloads from a project and use compute resources to run your AI initiatives.

      "},{"location":"platform-admin/aiinitiatives/org/departments/","title":"Departments","text":"

      This article explains the procedure for managing departments.

      Departments are a grouping of projects. By grouping projects into a department, you can set quota limitations for a set of projects, create policies that are applied to the department, and create assets that can be scoped to the whole department or to a partial group of descendant projects.

      For example, in an academic environment, a department can be the Physics Department grouping various projects (AI Initiatives) within the department, or grouping projects where each project represents a single student.

      "},{"location":"platform-admin/aiinitiatives/org/departments/#departments","title":"Departments","text":"

      The Departments table can be found under Organization in the Run:ai platform.

      Note

      Departments are disabled by default. If you cannot see Departments in the menu, it must be enabled by your Administrator, under General settings → Resources → Departments.

      The Departments table lists all departments defined for a specific cluster and allows you to manage them. You can switch between clusters by selecting your cluster using the filter at the top.

      The Departments table consists of the following columns:

      Column Description Department The name of the department Node pool(s) with quota The node pools associated with this department. By default, all node pools within a cluster are associated with each department. Administrators can change the node pools\u2019 quota parameters for a department. Click the values under this column to view the list of node pools with their parameters (as described below) GPU quota GPU quota associated with the department Total GPUs for projects The sum of all projects\u2019 GPU quotas associated with this department Project(s) List of projects associated with this department Subject(s) The users, SSO groups, or applications with access to the project. Click the values under this column to view the list of subjects with their parameters (as described below). This column is only viewable if your role in Run:ai platform allows you those permissions. Allocated GPUs The total number of GPUs allocated by successfully scheduled workloads in projects associated with this department GPU allocation ratio The ratio of Allocated GPUs to GPU quota. This number reflects how well the department\u2019s GPU quota is utilized by its descendant projects. A number higher than 100% means the department is using over-quota GPUs. A number lower than 100% means not all projects are utilizing their quotas. A quota becomes allocated once a workload is successfully scheduled. Creation time The timestamp for when the department was created Workload(s) The list of workloads under projects associated with this department. Click the values under this column to view the list of workloads with their resource parameters (as described below) Cluster The cluster that the department is associated with"},{"location":"platform-admin/aiinitiatives/org/departments/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
      "},{"location":"platform-admin/aiinitiatives/org/departments/#node-pools-with-quota-associated-with-the-department","title":"Node pools with quota associated with the department","text":"

      Click one of the values of Node pool(s) with quota column, to view the list of node pools and their parameters

      Column Description Node pool The name of the node pool is given by the administrator during node pool creation. All clusters have a default node pool created automatically by the system and named 'default'. GPU quota The amount of GPU quota the administrator dedicated to the department for this node pool (floating number, e.g. 2.3 means 230% of a GPU capacity). CPU (Cores) The amount of CPU (cores) quota the administrator has dedicated to the department for this node pool (floating number, e.g. 1.3 Cores = 1300 milli-cores). The 'unlimited' value means the CPU (Cores) quota is not bounded and workloads using this node pool can use as many CPU (Cores) resources as they need (if available). CPU memory The amount of CPU memory quota the administrator has dedicated to the department for this node pool (floating number, in MB or GB). The 'unlimited' value means the CPU memory quota is not bounded and workloads using this node pool can use as much CPU memory resource as they need (if available). Allocated GPUs The total amount of GPUs allocated by workloads using this node pool under projects associated with this department. The number of allocated GPUs may temporarily surpass the GPU quota of the department if over-quota is used. Allocated CPU (Cores) The total amount of CPUs (cores) allocated by workloads using this node pool under all projects associated with this department. The number of allocated CPUs (cores) may temporarily surpass the CPUs (Cores) quota of the department if over-quota is used. Allocated CPU memory The actual amount of CPU memory allocated by workloads using this node pool under all projects associated with this department. The amount of allocated CPU memory may temporarily surpass the CPU memory quota if over-quota is used."},{"location":"platform-admin/aiinitiatives/org/departments/#subjects-authorized-for-the-project","title":"Subjects authorized for the project","text":"

      Click one of the values of the Subject(s) column to view the list of subjects and their parameters. This column is only viewable if your role in the Run:ai system affords you those permissions.

      Column Description Subject A user, SSO group, or application assigned with a role in the scope of this department Type The type of subject assigned to the access rule (user, SSO group, or application). Scope The scope of this department within the organizational tree. Click the name of the scope to view the organizational tree diagram, you can only view the parts of the organizational tree for which you have permission to view. Role The role assigned to the subject, in this department\u2019s scope Authorized by The user who granted the access rule Last updated The last time the access rule was updated

      Note

      A role given in a certain scope, means the role applies to this scope and any descendant scopes in the organizational tree.

      "},{"location":"platform-admin/aiinitiatives/org/departments/#adding-a-new-department","title":"Adding a new department","text":"

      To create a new Department:

      1. Click +NEW DEPARTMENT
      2. Select a scope. By default, the field contains the scope of the current UI context cluster, viewable at the top left side of your screen. You can change the current UI context cluster by clicking the \u2018Cluster: cluster-name\u2019 field and applying another cluster as the UI context. Alternatively, you can choose another cluster within the \u2018+ New Department\u2019 form by clicking the organizational tree icon on the right side of the scope field, opening the organizational tree and selecting one of the available clusters.
      3. Enter a name for the department. Department names must start with a letter and can only contain lowercase Latin letters, numbers, or a hyphen (-).
      4. Under Quota Management, select a quota for the department. The Quota management section may contain different fields depending on pre-created system configuration. Possible system configurations are:
        • Existence of Node Pools
        • CPU Quota - Allow setting a quota for CPU resources.

      When no node pools are configured, you can set the following quota parameters:

      • GPU Devices The number of GPUs you want to allocate for this department (decimal number). This quota is consumed by the department's subordinated projects.
      • CPUs (cores) (when CPU quota is set) The number of CPU cores you want to allocate for this department (decimal number). This quota is consumed by the department\u2019s subordinated projects
      • CPUs memory (when CPU quota is set) The amount of CPU memory you want to allocate for this department (in Megabytes or Gigabytes). This quota is consumed by the department\u2019s subordinated projects

      When node pools are enabled, it is possible to set the above quota parameters for each node-pool separately.

      • Order of priority This column is displayed only if more than one node pool exists. This is the default order in which the Scheduler uses node pools to schedule a workload: the Scheduler first tries to allocate resources using the highest-priority node pool, followed by the next in priority, until it reaches the lowest-priority node pool, and then starts from the highest priority again. The Scheduler uses the department's list of prioritized node pools only if the order of priority of node pools is not set in the project or in the workload during submission (either by an admin policy or by the user). An empty value indicates that the node pool is not part of the department's default node pool priority list, but a node pool can still be chosen by the admin policy or a user during workload submission. The department's node pool priority sets defaults for its subordinate projects but does not enforce them, meaning projects are free to change their own priority.
      • In addition, you can decide whether to allow a department to go over-quota. Allowing over-quota at the department level means that one department can receive more resources than its quota when not required by other departments. If the over-quota is disabled, workloads running under subordinated projects are not able to use more resources than the department\u2019s quota, but each project can still go over-quota (if enabled at the project level) up to the department\u2019s quota.

      Unlimited CPU (Cores) and CPU memory quotas are an exception - in this case, workloads of subordinated projects can consume available resources up to the physical limitation of the cluster or any of the node pools.

      Example of Quota management:

      1. Click CREATE DEPARTMENT
      "},{"location":"platform-admin/aiinitiatives/org/departments/#adding-an-access-rule-to-a-department","title":"Adding an access rule to a department","text":"

      To create a new access rule for a department:

      1. Select the department you want to add an access rule for
      2. Click ACCESS RULES
      3. Click +ACCESS RULE
      4. Select a subject
      5. Select or enter the subject identifier:
        • User Email for a local user created in Run:ai or for SSO user as recognized by the IDP
        • Group name as recognized by the IDP
        • Application name as created in Run:ai
      6. Select a role
      7. Click SAVE RULE
      8. Click CLOSE
      "},{"location":"platform-admin/aiinitiatives/org/departments/#deleting-an-access-rule-from-a-department","title":"Deleting an access rule from a department","text":"

      To delete an access rule from a department:

      1. Select the department you want to remove an access rule from
      2. Click ACCESS RULES
      3. Find the access rule you would like to delete
      4. Click on the trash icon
      5. Click CLOSE
      "},{"location":"platform-admin/aiinitiatives/org/departments/#editing-a-department","title":"Editing a department","text":"
      1. Select the Department you want to edit
      2. Click EDIT
      3. Update the Department and click SAVE
      "},{"location":"platform-admin/aiinitiatives/org/departments/#viewing-a-departments-policy","title":"Viewing a department\u2019s policy","text":"

      To view the policy of a department:

      1. Select the department for which you want to view its policies. This option is only active if the department has defined policies in place.
      2. Click VIEW POLICY and select the workload type for which you want to view the policies: a. Workspace workload type policy with its set of rules b. Training workload type policies with its set of rules
      3. In the Policy form, view the workload rules that are enforced on your department for the selected workload type, as well as the defaults:
        • Parameter - The workload submission parameter that Rule and Default is applied on
        • Type (applicable for data sources only) - The data source type (Git, S3, nfs, pvc etc.)
        • Default - The default value of the Parameter
        • Rule - Set up constraints on workload policy fields
        • Source - The origin of the applied policy (cluster, department or project)

      Notes

      • The policy affecting the department consists of rules and defaults. Some of these rules and defaults may be derived from the policies of a parent cluster (source). You can see the source of each rule in the policy form.
      • A policy set for a department affects all subordinated projects and their workloads, according to the policy workload type
      "},{"location":"platform-admin/aiinitiatives/org/departments/#deleting-a-department","title":"Deleting a department","text":"
      1. Select the department you want to delete
      2. Click DELETE
      3. On the dialog, click DELETE to confirm the deletion

      Note

      Deleting a department permanently deletes its subordinated projects, as well as any assets created in the scope of this department or of any of its subordinated projects, such as compute resources, environments, data sources, templates, and credentials. However, workloads running within the department's subordinated projects, and the policies defined for this department or its subordinated projects, remain intact and running.

      "},{"location":"platform-admin/aiinitiatives/org/departments/#reviewing-a-department","title":"Reviewing a department","text":"
      1. Select the department you want to review
      2. Click REVIEW
      3. Review and click CLOSE
      "},{"location":"platform-admin/aiinitiatives/org/departments/#using-api","title":"Using API","text":"

      Go to the Departments API reference to view the available actions

      "},{"location":"platform-admin/aiinitiatives/org/projects/","title":"Projects","text":"

      This article explains the procedure to manage Projects.

      Researchers submit AI workloads. To streamline resource allocation and prioritize work, Run:ai introduces the concept of Projects. Projects are the tool to implement resource allocation policies as well as the segregation between different initiatives. A project may represent a team, an individual, or an initiative that shares resources or has a specific resource quota. Projects may be aggregated in Run:ai departments.

      For example, you may have several people involved in a specific face-recognition initiative collaborating under one project named \u201cface-recognition-2024\u201d. Alternatively, you can have a project per person in your team, where each member receives their own quota.

      "},{"location":"platform-admin/aiinitiatives/org/projects/#projects-table","title":"Projects table","text":"

      The Projects table can be found under Organization in the Run:ai platform.

      The Projects table provides a list of all projects defined for a specific cluster, and allows you to manage them. You can switch between clusters by selecting your cluster using the filter at the top.

      The Projects table consists of the following columns:

      Column Description Project The name of the project Department The name of the parent department. Several projects may be grouped under a department. Status The project creation status. Projects are manifested as Kubernetes namespaces. The project status represents the namespace creation status. Node pool(s) with quota The node pools associated with the project. By default, a new project is associated with all node pools within its associated cluster. Administrators can change the node pools' quota parameters for a project. Click the values under this column to view the list of node pools with their parameters (as described below) Subject(s) The users, SSO groups, or applications with access to the project. Click the values under this column to view the list of subjects with their parameters (as described below). This column is only viewable if your role in the Run:ai platform allows you those permissions. Allocated GPUs The total number of GPUs allocated by successfully scheduled workloads under this project GPU allocation ratio The ratio of Allocated GPUs to GPU quota. This number reflects how well the project's GPU quota is utilized by its descendant workloads. A number higher than 100% indicates the project is using over-quota GPUs. GPU quota The GPU quota allocated to the project. This number represents the sum of all node pools' GPU quota allocated to this project. Allocated CPUs (Cores) The total number of CPU cores allocated by workloads submitted within this project. (This column is only available if the CPU Quota setting is enabled, as described below). Allocated CPU memory The total amount of CPU memory allocated by successfully scheduled workloads under this project. (This column is only available if the CPU Quota setting is enabled, as described below). CPU quota (Cores) CPU quota allocated to this project. (This column is only available if the CPU Quota setting is enabled, as described below). This number represents the sum of all node pools' CPU quota allocated to this project. The 'unlimited' value means the CPU (cores) quota is not bounded and workloads using this project can use as many CPU (cores) resources as they need (if available). CPU memory quota CPU memory quota allocated to this project. (This column is only available if the CPU Quota setting is enabled, as described below). This number represents the sum of all node pools' CPU memory quota allocated to this project. The 'unlimited' value means the CPU memory quota is not bounded and workloads using this project can use as much CPU memory as they need (if available). CPU allocation ratio The ratio of Allocated CPUs (cores) to CPU quota (cores). This number reflects how well the project's CPU quota is utilized by its descendant workloads. A number higher than 100% indicates the project is using over-quota CPU cores. CPU memory allocation ratio The ratio of Allocated CPU memory to CPU memory quota. This number reflects how well the project's CPU memory quota is utilized by its descendant workloads. A number higher than 100% indicates the project is using over-quota CPU memory. Node affinity of training workloads The list of Run:ai node-affinities. Any training workload submitted within this project must specify one of those Run:ai node affinities, otherwise it is not submitted. Node affinity of interactive workloads The list of Run:ai node-affinities. 
Any interactive (workspace) workload submitted within this project must specify one of those Run:ai node affinities, otherwise it is not submitted. Idle time limit of training workloads The time in days:hours:minutes after which the project stops a training workload not using its allocated GPU resources. Idle time limit of preemptible workloads The time in days:hours:minutes after which the project stops a preemptible interactive (workspace) workload not using its allocated GPU resources. Idle time limit of non-preemptible workloads The time in days:hours:minutes after which the project stops a non-preemptible interactive (workspace) workload not using its allocated GPU resources. Interactive workloads time limit The duration in days:hours:minutes after which the project stops an interactive (workspace) workload Training workloads time limit The duration in days:hours:minutes after which the project stops a training workload Creation time The timestamp for when the project was created Workload(s) The list of workloads associated with the project. Click the values under this column to view the list of workloads with their resource parameters (as described below). Cluster The cluster that the project is associated with"},{"location":"platform-admin/aiinitiatives/org/projects/#node-pools-with-quota-associated-with-the-project","title":"Node pools with quota associated with the project","text":"

      Click one of the values of Node pool(s) with quota column, to view the list of node pools and their parameters

      Column Description Node pool The name of the node pool is given by the administrator during node pool creation. All clusters have a default node pool created automatically by the system and named 'default'. GPU quota The amount of GPU quota the administrator dedicated to the project for this node pool (floating number, e.g. 2.3 means 230% of GPU capacity). CPU (Cores) The amount of CPUs (cores) quota the administrator has dedicated to the project for this node pool (floating number, e.g. 1.3 Cores = 1300 milli-cores). The 'unlimited' value means the CPU (Cores) quota is not bounded and workloads using this node pool can use as many CPU (Cores) resources as they require (if available). CPU memory The amount of CPU memory quota the administrator has dedicated to the project for this node pool (floating number, in MB or GB). The 'unlimited' value means the CPU memory quota is not bounded and workloads using this node pool can use as much CPU memory resource as they need (if available). Allocated GPUs The actual amount of GPUs allocated by workloads using this node pool under this project. The number of allocated GPUs may temporarily surpass the GPU quota if over-quota is used. Allocated CPU (Cores) The actual amount of CPUs (cores) allocated by workloads using this node pool under this project. The number of allocated CPUs (cores) may temporarily surpass the CPUs (Cores) quota if over-quota is used. Allocated CPU memory The actual amount of CPU memory allocated by workloads using this node pool under this project. The amount of allocated CPU memory may temporarily surpass the CPU memory quota if over-quota is used. Order of priority The default order in which the Scheduler uses node pools to schedule a workload. This is used only if the order of priority of node pools is not set in the workload during submission, either by an admin policy or the user. An empty value means the node pool is not part of the project's default list, but can still be chosen by an admin policy or the user during workload submission"},{"location":"platform-admin/aiinitiatives/org/projects/#subjects-authorized-for-the-project","title":"Subjects authorized for the project","text":"

      Click one of the values in the Subject(s) column to view the list of subjects and their parameters. This column is only viewable if your role in the Run:ai system affords you those permissions.

      Column Description Subject A user, SSO group, or application assigned with a role in the scope of this Project Type The type of subject assigned to the access rule (user, SSO group, or application) Scope The scope of this project in the organizational tree. Click the name of the scope to view the organizational tree diagram, you can only view the parts of the organizational tree for which you have permission to view. Role The role assigned to the subject, in this project\u2019s scope Authorized by The user who granted the access rule Last updated The last time the access rule was updated"},{"location":"platform-admin/aiinitiatives/org/projects/#workloads-associated-with-the-project","title":"Workloads associated with the project","text":"

      Click one of the values of Workload(s) column, to view the list of workloads and their parameters

      Column Description Workload The name of the workload, given during its submission. Optionally, an icon describing the type of workload is also visible Type The type of the workload, e.g. Workspace, Training, Inference Status The state of the workload and time elapsed since the last status change Created by The subject that created this workload Running/ requested pods The number of running pods out of the number of requested pods for this workload. e.g. a distributed workload requesting 4 pods but may be in a state where only 2 are running and 2 are pending Creation time The date and time the workload was created GPU compute request The amount of GPU compute requested (floating number, represents either a portion of the GPU compute, or the number of whole GPUs requested) GPU memory request The amount of GPU memory requested (floating number, can either be presented as a portion of the GPU memory, an absolute memory size in MB or GB, or a MIG profile) CPU memory request The amount of CPU memory requested (floating number, presented as an absolute memory size in MB or GB) CPU compute request The amount of CPU compute requested (floating number, represents the number of requested Cores)"},{"location":"platform-admin/aiinitiatives/org/projects/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
      "},{"location":"platform-admin/aiinitiatives/org/projects/#adding-a-new-project","title":"Adding a new project","text":"

      To create a new Project:

      1. Click +NEW PROJECT
      2. Select a scope, you can only view clusters if you have permission to do so - within the scope of the roles assigned to you
      3. Enter a name for the project. Project names must start with a letter and can only contain lowercase Latin letters, numbers, or a hyphen (-).
      4. Namespace associated with the project. Each project has an associated (Kubernetes) namespace in the cluster. All workloads under this project use this namespace. a. By default, Run:ai creates a namespace based on the project name (in the form of runai-<name>) b. Alternatively, you can choose an existing namespace created for you by the cluster administrator
      5. In the Quota management section, you can set the quota parameters and prioritize resources

        • Order of priority This column is displayed only if more than one node pool exists. This is the default order in which the Scheduler uses node pools to schedule a workload: the Scheduler first tries to allocate resources using the highest-priority node pool, then the next in priority, until it reaches the lowest-priority node pool, and then starts from the highest priority again. The Scheduler uses the project's list of prioritized node pools only if the order of priority of node pools is not set in the workload during submission, either by an admin policy or by the user. An empty value means the node pool is not part of the project's default node pool priority list, but a node pool can still be chosen by the admin policy or a user during workload submission
        • Node pool This column is displayed only if more than one node pool exists. It represents the name of the node pool.
        • GPU devices The number of GPUs you want to allocate for this project in this node pool (decimal number).
        • CPUs (Cores) This column is displayed only if CPU quota is enabled via the General settings. Represents the number of CPU cores you want to allocate for this project in this node pool (decimal number).
        • CPU memory This column is displayed only if CPU quota is enabled via the General settings. The amount of CPU memory you want to allocate for this project in this node pool (in Megabytes or Gigabytes).

        • Over quota / Over quota weight - If over-quota weight is enabled via the General settings then over-quota weight is presented, otherwise over-quota is presented

          • Over quota When enabled, the project can use non-guaranteed overage resources above its quota in this node pool. The amount of the non-guaranteed overage resources for this project is calculated proportionally to the project quota in this node pool. When disabled, the project cannot use more resources than the guaranteed quota in this node pool.
          • Over quota weight - Represents a weight used to calculate the amount of non-guaranteed overage resources a project can get on top of its quota in this node pool. All unused resources are split between projects that require the use of overage resources:
            • Medium The default value. The Admin can change the default to any of the following values: High, Low, Lowest, or None.
            • None When set, the project cannot use more resources than the guaranteed quota in this node pool.
            • Lowest Over-quota weight 'lowest' has a unique behavior: because its weight is 0, the project can only use over-quota (unused overage) resources if no other project needs them, and any project with a higher over-quota weight can take those resources at any time.

      Note

      Setting the quota to 0 (either GPU, CPU, or CPU memory) and the over-quota to \u2018disabled\u2019 or over-quota weight to \u2018none\u2019 means the project is blocked from using those resources on this node pool.

      When no node pools are configured, you can set the same parameters for the whole project instead of per node pool.

      After node pools are created, you can set the above parameters for each node-pool separately.

      1. Set Scheduling rules as required. You can have a scheduling rule for:

        • Idle GPU timeout Preempt a workload that does not use GPUs for more than a specified duration. You can apply a single rule per workload type - Preemptible Workspaces, Non-preemptible Workspaces, and Training.

        Note

        To make 'Idle GPU timeout' effective, it must be set to a shorter duration than the workload duration of the same workload type.

        • Workspace duration Preempt workspaces after a specified duration. This applies to both preemptible and non-preemptible Workspaces.
        • Training duration Preempt a training workload after a specified duration.
        • Node type (Affinity) Node type is used to select a group of nodes, usually with specific characteristics such as a hardware feature, storage type, fast networking interconnection, etc. The scheduler uses node type as an indication of which nodes should be used for your workloads, within this project. Node type is a label in the form of run.ai/type and a value (e.g. run.ai/type = dgx200) that the administrator uses to tag a set of nodes. Adding the node type to the project\u2019s scheduling rules enables the user to submit workloads with any node type label/value pairs in this list, according to the workload type - Workspace or Training. The Scheduler then schedules workloads using a node selector, targeting nodes tagged with the Run:ai node type label/value pair. Node pools and a node type can be used in conjunction with each other. For example, specifying a node pool and a smaller group of nodes from that node pool that includes a fast SSD memory or other unique characteristics.
      2. Click CREATE PROJECT

      "},{"location":"platform-admin/aiinitiatives/org/projects/#adding-an-access-rule-to-a-project","title":"Adding an access rule to a project","text":"

      To create a new access rule for a project:

      1. Select the project you want to add an access rule for
      2. Click ACCESS RULES
      3. Click +ACCESS RULE
      4. Select a subject
      5. Select or enter the subject identifier:
        • User Email for a local user created in Run:ai or for SSO user as recognized by the IDP
        • Group name as recognized by the IDP
        • Application name as created in Run:ai
      6. Select a role
      7. Click SAVE RULE
      8. Click CLOSE
      "},{"location":"platform-admin/aiinitiatives/org/projects/#deleting-an-access-rule-from-a-project","title":"Deleting an access rule from a project","text":"

      To delete an access rule from a project:

      1. Select the project you want to remove an access rule from
      2. Click ACCESS RULES
      3. Find the access rule you want to delete
      4. Click on the trash icon
      5. Click CLOSE
      "},{"location":"platform-admin/aiinitiatives/org/projects/#editing-a-project","title":"Editing a project","text":"

      To edit a project:

      1. Select the project you want to edit
      2. Click EDIT
      3. Update the Project and click SAVE
      "},{"location":"platform-admin/aiinitiatives/org/projects/#viewing-a-projects-policy","title":"Viewing a project\u2019s policy","text":"

      To view the policy of a project:

      1. Select the project for which you want to view its policies. This option is only active for projects with defined policies in place.
      2. Click VIEW POLICY and select the workload type for which you want to view the policies: a. Workspace workload type policy with its set of rules b. Training workload type policies with its set of rules
      3. In the Policy form, view the workload rules that are enforced on your project for the selected workload type, as well as the defaults:
        • Parameter - The workload submission parameter that Rules and Defaults are applied to
        • Type (applicable for data sources only) - The data source type (Git, S3, nfs, pvc etc.)
        • Default - The default value of the Parameter
        • Rule - Set up constraints on workload policy fields
        • Source - The origin of the applied policy (cluster, department or project)

      Note

      The policy affecting the project consists of rules and defaults. Some of these rules and defaults may be derived from policies of a parent cluster and/or department (source). You can see the source of each rule in the policy form.

      "},{"location":"platform-admin/aiinitiatives/org/projects/#deleting-a-project","title":"Deleting a project","text":"

      To delete a project:

      1. Select the project you want to delete
      2. Click DELETE
      3. On the dialog, click DELETE to confirm

      Note

      • Clusters < v2.20

        Deleting a project does not delete its associated namespace, any of the running workloads using this namespace, or the policies defined for this project. However, any assets created in the scope of this project such as compute resources, environments, data sources, templates and credentials, are permanently deleted from the system.

      • Clusters >=v2.20

        Deleting a project does not delete its associated namespace, but will attempt to delete its associated workloads and assets. Any assets created in the scope of this project such as compute resources, environments, data sources, templates and credentials, are permanently deleted from the system.
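
      If you want to confirm what remains after deleting a project, you can inspect its namespace directly with kubectl. The namespace name below assumes the default runai-<project-name> naming convention described in this article; substitute your actual namespace if a pre-existing one was used:

        # The namespace itself is expected to remain after the project is deleted
        kubectl get namespace runai-<project-name>

        # List any pods still present in that namespace
        kubectl get pods -n runai-<project-name>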

      "},{"location":"platform-admin/aiinitiatives/org/projects/#using-api","title":"Using API","text":"

      Go to the Projects API reference to view the available actions

      "},{"location":"platform-admin/aiinitiatives/org/scheduling-rules/","title":"Scheduling Rules","text":"

      This article explains the procedure for configuring and managing scheduling rules. Scheduling rules are restrictions applied to workloads. These restrictions apply either to the resources (nodes) on which workloads can run or to the duration of the workload's run time. Scheduling rules are set for projects or departments and apply to a specific workload type. Once scheduling rules are set, all matching workloads associated with the project (or its subordinate projects, in the case of a department) have the restrictions as defined when the workload was submitted. Newly added scheduling rules are not applied to workloads that were already created under that project/department.

      "},{"location":"platform-admin/aiinitiatives/org/scheduling-rules/#scheduling-rules","title":"Scheduling Rules","text":"

      There are 3 types of scheduling rules:

      "},{"location":"platform-admin/aiinitiatives/org/scheduling-rules/#workload-duration-time-limit","title":"Workload duration (time limit)","text":"

      This rule limits the duration of a workload's run time. Workload run time is calculated as the total time in which the workload was in status Running. You can apply a single rule per workload type - Preemptible Workspaces, Non-preemptible Workspaces, and Training.

      "},{"location":"platform-admin/aiinitiatives/org/scheduling-rules/#idle-gpu-time-limit","title":"Idle GPU time limit","text":"

      This rule limits the total GPU idle time of a workload. Workload idle time is counted from the first time the workload is in status Running and the GPU is idle. Idleness is calculated using the runai_gpu_idle_seconds_per_workload metric. This metric determines the total duration of zero GPU utilization within each 30-second interval. If the GPU remains idle throughout the 30-second window, 30 seconds are added to the idleness sum; otherwise, the idleness count is reset. You can apply a single rule per workload type - Preemptible Workspaces, Non-preemptible Workspaces, and Training.
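
      If your cluster metrics are collected by Prometheus, you can inspect this metric directly to see how idleness accumulates for each workload. The Prometheus address below is hypothetical - substitute the endpoint reachable in your environment:

        # Query the raw idle-seconds metric for all workloads (hypothetical Prometheus address)
        curl -s 'http://<prometheus-host>:9090/api/v1/query' \
          --data-urlencode 'query=runai_gpu_idle_seconds_per_workload'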

      Note

      To make Idle GPU timeout effective, it must be set to a shorter duration than the workload duration of the same workload type.

      "},{"location":"platform-admin/aiinitiatives/org/scheduling-rules/#node-type-affinity","title":"Node type (Affinity)","text":"

      Node type is used to select a group of nodes, typically with specific characteristics such as a hardware feature, storage type, fast networking interconnection, etc. The scheduler uses node type as an indication of which nodes should be used for your workloads, within this project.

      Node type is a label in the form of run.ai/type and a value (e.g. run.ai/type = dgx200) that the administrator uses to tag a set of nodes. Adding the node type to the project's scheduling rules requires the user to submit workloads with a node type label/value pair from this list, according to the workload type - Workspace or Training. The Scheduler then schedules workloads using a node selector, targeting nodes tagged with the Run:ai node type label/value pair. Node pools and a node type can be used in conjunction. For example, specifying a node pool and a smaller group of nodes from that node pool that includes fast SSD memory or other unique characteristics.

      "},{"location":"platform-admin/aiinitiatives/org/scheduling-rules/#labelling-nodes-for-node-types-grouping","title":"Labelling nodes for node types grouping","text":"

      The administrator should use a node label with the key run.ai/type and any value paired with it.

      To assign a label to nodes you want to group, set the \u2018node type (affinity)\u2019 on each relevant node:

      1. Obtain the list of nodes and their current labels by copying the following to your terminal:

        kubectl get nodes --show-labels\n

      2. Apply a new label to a specific node by copying the following to your terminal:

        kubectl label node <node-name> run.ai/type=<value>\n

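      To verify the label was applied, or to list all nodes already grouped under a given node type, you can filter by the same label; the value dgx200 below reuses the example value from this article:

        # List only the nodes labelled with the given node type
        kubectl get nodes -l run.ai/type=dgx200

        # Remove the label from a node if it was applied by mistake
        kubectl label node <node-name> run.ai/type-
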
      "},{"location":"platform-admin/aiinitiatives/org/scheduling-rules/#adding-a-scheduling-rule-to-a-projectdepartment","title":"Adding a scheduling rule to a project/department","text":"

      To add a scheduling rule:

      1. Select the project/department for which you want to add a scheduling rule
      2. Click EDIT
      3. In the Scheduling rules section click +RULE
      4. Select the rule type
      5. Select the workload type and time limitation period
      6. For Node type, choose one or more labels for the desired nodes.
      7. Click SAVE

      Note

      You can review the defined rules in the Projects table in the relevant column.

      "},{"location":"platform-admin/aiinitiatives/org/scheduling-rules/#editing-the-projectdepartment-scheduling-rule","title":"Editing the project/department scheduling rule","text":"

      To edit a scheduling rule:

      1. Select the project/department for which you want to edit its scheduling rule
      2. Click EDIT
      3. Find the scheduling rule you would like to edit
      4. Edit the rule
      5. Click SAVE

      Note

      When editing an inherited rule on a project/department (a rule defined by the department), you can only restrict the rule limitation.

      "},{"location":"platform-admin/aiinitiatives/org/scheduling-rules/#deleting-the-projectdepartment-scheduling-rule","title":"Deleting the project/department scheduling rule","text":"

      To delete a scheduling rule:

      1. Select the project/department from which you want to delete a scheduling rule
      2. Click EDIT
      3. Find the scheduling rule you would like to delete
      4. Click on the x icon
      5. Click SAVE

      Note

      You cannot delete rules inherited from the department from the project's set of rules.

      "},{"location":"platform-admin/aiinitiatives/org/scheduling-rules/#using-api","title":"Using API","text":"

      Go to the Projects API reference to view the available actions

      "},{"location":"platform-admin/aiinitiatives/resources/configuring-mig-profiles/","title":"Configuring NVIDIA MIG Profiles","text":"

      NVIDIA\u2019s Multi-Instance GPU (MIG) enables splitting a GPU into multiple logical GPU devices, each with its own memory and compute portion of the physical GPU.

      NVIDIA provides two MIG strategies for splitting a GPU:

      • Single - A GPU can be divided evenly. This means all MIG profiles are the same.
      • Mixed - A GPU can be divided into different profiles.

      The Run:ai platform supports running workloads using NVIDIA MIG. Administrators can set the Kubernetes nodes to their preferred MIG strategy and configure the appropriate MIG profiles for researchers and MLOps engineers to use.

      This guide explains how to configure MIG in each strategy to submit workloads. It also outlines the individual implications of each strategy and best practices for administrators.

      Note

      • Starting from v2.19, the Dynamic MIG feature entered a deprecation process and is no longer supported. With Dynamic MIG, the Run:ai platform automatically configured MIG profiles according to on-demand user requests for different MIG profiles or memory fractions.
      • GPU fractions and memory fractions are not supported with MIG profiles.
      • Single strategy supports both Run:ai and third-party workloads. Mixed strategy can only be used with third-party workloads. For more details on Run:ai and third-party workloads, see Introduction to workloads.
      "},{"location":"platform-admin/aiinitiatives/resources/configuring-mig-profiles/#before-you-start","title":"Before you start","text":"

      To use MIG single and mixed strategy effectively, make sure to familiarize yourself with the following NVIDIA resources:

      • NVIDIA Multi-Instance GPU
      • MIG User Guide
      • GPU Operator with MIG
      "},{"location":"platform-admin/aiinitiatives/resources/configuring-mig-profiles/#configuring-single-mig-strategy","title":"Configuring single MIG strategy","text":"

      When deploying MIG using single strategy, all GPUs within a node are configured with the same profile. For example, a node might have GPUs configured with 3 MIG slices of profile type 1g.20gb, or 7 MIG slices of profile 1g.10gb. With this strategy, MIG profiles are displayed as whole GPU devices by CUDA.

      The Run:ai platform discovers these MIG profiles as whole GPU devices as well, ensuring MIG devices are transparent to the end-user (practitioner). For example, a node that consists of 8 physical GPUs, each split into 3 MIG slices of 2g.20gb, is discovered by the Run:ai platform as a node with 24 GPU devices.

      Users can submit workloads by requesting a specific number of GPU devices (X GPU) and Run:ai will allocate X MIG slices (logical devices). The Run:ai platform deducts X GPUs from the workload\u2019s Project quota, regardless of whether this \u2018logical GPU\u2019 represents 1/3 of a physical GPU device or 1/7 of a physical GPU device.
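
      For illustration, the sketch below shows a minimal third-party pod that requests two GPU devices on a node configured with single strategy; each requested device is expected to map to one MIG slice, as described above. The pod name and image are placeholders, not part of the Run:ai documentation:

        apiVersion: v1
        kind: Pod
        metadata:
          name: mig-single-example   # hypothetical name
        spec:
          restartPolicy: Never
          containers:
          - name: main
            image: nvidia/cuda:12.2.0-base-ubuntu22.04   # illustrative image
            command: ["nvidia-smi", "-L"]                # prints the GPU/MIG devices visible to the container
            resources:
              limits:
                nvidia.com/gpu: 2   # under single strategy, each unit corresponds to one MIG slice

      Save it to a file (for example mig-single-example.yaml) and apply it with kubectl apply -f mig-single-example.yaml.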

      "},{"location":"platform-admin/aiinitiatives/resources/configuring-mig-profiles/#configuring-mixed-mig-strategy","title":"Configuring mixed MIG strategy","text":"

      When deploying MIG using mixed strategy, each GPU in a node can be configured with a different combination of MIG profiles such as 2\u00d72g.20gb and 3\u00d71g.10gb. For details on supported combinations per GPU type, refer to Supported MIG Profiles.

      In mixed strategy, physical GPU devices continue to be displayed as physical GPU devices by CUDA, and each MIG profile is shown individually. The Run:ai platform identifies the physical GPU devices normally; however, MIG profiles are not visible in the UI or the node APIs.

      When submitting third-party workloads with this strategy, the user should explicitly specify the exact requested MIG profile (for example, nvidia.com/gpu.product: A100-SXM4-40GB-MIG-3g.20gb). The Run:ai Scheduler finds a node that can provide this specific profile and binds it to the workload.
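
      As a hedged sketch of that request, the pod below combines a node selector on the label from the example above with a MIG-profile resource request. The resource name nvidia.com/mig-3g.20gb is an assumption about how the NVIDIA device plugin advertises profiles in mixed strategy; check the resources actually reported by your nodes (for example with kubectl describe node) before relying on it:

        apiVersion: v1
        kind: Pod
        metadata:
          name: mig-mixed-example   # hypothetical name
        spec:
          restartPolicy: Never
          nodeSelector:
            # Label value taken from the example above; adjust to your GPU type
            nvidia.com/gpu.product: A100-SXM4-40GB-MIG-3g.20gb
          containers:
          - name: main
            image: nvidia/cuda:12.2.0-base-ubuntu22.04   # illustrative image
            command: ["nvidia-smi", "-L"]
            resources:
              limits:
                # Assumed MIG resource name; verify against your node's advertised resources
                nvidia.com/mig-3g.20gb: 1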

      A third-party workload submitted with a MIG profile of type Xg.Ygb (e.g. 3g.40gb or 2g.20gb) is considered as consuming X GPUs. These X GPUs will be deducted from the workload\u2019s project quota of GPUs. For example, a 3g.40gb profile deducts 3 GPUs from the associated Project\u2019s quota, while 2g.20gb deducts 2 GPUs from the associated Project\u2019s quota. This is done to maintain a logical ratio according to the characteristics of the MIG profile.

      "},{"location":"platform-admin/aiinitiatives/resources/configuring-mig-profiles/#best-practices-for-administrators","title":"Best practices for administrators","text":""},{"location":"platform-admin/aiinitiatives/resources/configuring-mig-profiles/#single-strategy","title":"Single strategy","text":"
      • Configure proper and uniform sizes of MIG slices (profiles) across all GPUs within a node.
      • Set the same MIG profiles on all nodes of a single node pool.
      • Create separate node pools with different MIG profile configurations allowing users to select the pool that best matches their workloads\u2019 needs.
      • Ensure Project quotas are allocated according to the MIG profile sizes.
      "},{"location":"platform-admin/aiinitiatives/resources/configuring-mig-profiles/#mixed-strategy","title":"Mixed strategy","text":"
      • Use mixed strategy with workloads that require diverse resources. Make sure to evaluate the workload requirements and plan accordingly.
      • Configure individual MIG profiles on each node by using a limited set of MIG profile combinations to minimize complexity. Make sure to evaluate your requirements and node configurations.
      • Ensure Project quotas are allocated according to the MIG profile sizes.

      Note

      Since MIG slices are a fixed size, once configured, changing MIG profiles requires administrative intervention.

      "},{"location":"platform-admin/aiinitiatives/resources/node-pools/","title":"Node Pools","text":"

      This article explains the procedure for managing Node pools.

      Node pools assist in managing heterogeneous resources effectively. A node pool is a Run:ai construct representing a set of nodes grouped into a bucket of resources using a predefined node label (e.g. NVIDIA GPU type) or an administrator-defined node label (any key/value pair).

      Typically, the grouped nodes share a common feature or property, such as GPU type or other HW capability (such as Infiniband connectivity), or represent a proximity group (i.e. nodes interconnected via a local ultra-fast switch). Researchers and ML Engineers would typically use node pools to run specific workloads on specific resource types.

      Platform administrators can create, view, edit, and delete node pools. Creating a new node pool creates a new instance of the Run:ai scheduler; workloads submitted to a node pool are scheduled using that node pool's designated scheduler instance.

      Once a new node pool is created, it is automatically assigned to all Projects and Departments with a quota of zero GPU resources, unlimited CPU resources, and over-quota enabled (Medium priority if over-quota weight is enabled). This allows any Project and Department to use any node pool when over-quota is enabled, even if the administrator has not assigned a quota for a specific node pool in a Project or Department.

      Workloads can be submitted with a prioritized list of node pools. The node pool selector picks one node pool at a time (according to the prioritized list), and the designated node pool scheduler instance handles the submission request, trying to match the requested resources within that node pool. If the scheduler cannot find resources to satisfy the submitted workload, the node pool selector moves the request to the next node pool in the prioritized list. If no node pool satisfies the request, the node pool selector starts again from the first node pool, until one of the node pools satisfies the request.

      "},{"location":"platform-admin/aiinitiatives/resources/node-pools/#node-pools-table","title":"Node pools table","text":"

      The Node pools table can be found under Resources in the Run:ai platform.

      The Node pools table lists all the node pools defined in the Run:ai platform and allows you to manage them.

      Note

      By default, the Run:ai platform includes a single node pool named \u2018default\u2019. When no other node pool is defined, all existing and new nodes are associated with the \u2018default\u2019 node pool. When deleting a node pool, if no other node pool matches any of the nodes\u2019 labels, the node will be included in the default node pool.

      The Node pools table consists of the following columns:

      Column Description Node pool The node pool name, set by the administrator during its creation (the node pool name cannot be changed after its creation). Status Node pool status. A \u2018Ready\u2019 status means the scheduler can use this node pool to schedule workloads. \u2018Empty\u2019 status means no nodes are currently included in that node pool. Label key Label value The node pool controller will use this node-label key-value pair to match nodes into this node pool. Node(s) List of nodes included in this node pool. Click the field to view details (the details are in the Nodes article). GPU devices The total number of GPU devices installed into nodes included in this node pool. For example, a node pool that includes 12 nodes each with 8 GPU devices would show a total number of 96 GPU devices. GPU memory The total amount of GPU memory included in this node pool. The total amount of GPU memory installed in nodes included in this node pool. For example, a node pool that includes 12 nodes, each with 8 GPU devices, and each device with 80 GB of memory would show a total memory amount of 7.68 TB. Allocated GPUs The total allocation of GPU devices in units of GPUs (decimal number). For example, if 3 GPUs are 50% allocated, the field prints out the value 1.50. This value represents the portion of GPU memory consumed by all running pods using this node pool. \u2018Allocated GPUs\u2019 can be larger than \u2018Projects\u2019 GPU quota\u2019 if over-quota is used by workloads, but not larger than GPU devices. GPU resource optimization ratio Shows the Node Level Scheduler mode. CPUs (Cores) The number of CPU cores installed on nodes included in this node CPU memory The total amount of CPU memory installed on nodes using this node pool Allocated CPUs (Cores) The total allocation of CPU compute in units of Cores (decimal number). This value represents the amount of CPU cores consumed by all running pods using this node pool. \u2018Allocated CPUs\u2019 can be larger than \u2018Projects\u2019 GPU quota\u2019 if over-quota is used by workloads, but not larger than CPUs (Cores). Allocated CPU memory The total allocation of CPU memory in units of TB/GB/MB (decimal number). This value represents the amount of CPU memory consumed by all running pods using this node pool. \u2018Allocated CPUs\u2019 can be larger than \u2018Projects\u2019 CPU memory quota\u2019 if over-quota is used by workloads, but not larger than CPU memory. GPU placement strategy Sets the Scheduler strategy for the assignment of pods requesting both GPU and CPU resources to nodes, which can be either Bin-pack or Spread. By default, Bin-Pack is used, but can be changed to Spread by editing the node pool. When set to Bin-pack the scheduler will try to fill nodes as much as possible before using empty or sparse nodes, when set to spread the scheduler will try to keep nodes as sparse as possible by spreading workloads across as many nodes as it succeeds. CPU placement strategy Sets the Scheduler strategy for the assignment of pods requesting only CPU resources to nodes, which can be either Bin-pack or Spread. By default, Bin-Pack is used, but can be changed to Spread by editing the node pool. When set to Bin-pack the scheduler will try to fill nodes as much as possible before using empty or sparse nodes, when set to spread the scheduler will try to keep nodes as sparse as possible by spreading workloads across as many nodes as it succeeds. 
Last update The date and time when the node pool was last updated Creation time The date and time when the node pool was created Workload(s) List of workloads running on nodes included in this node pool, click the field to view details (described below in this article)"},{"location":"platform-admin/aiinitiatives/resources/node-pools/#workloads-associated-with-the-node-pool","title":"Workloads associated with the node pool","text":"

      Click one of the values in the Workload(s) column, to view the list of workloads and their parameters.

      Note

      This column is viewable only if your role in the Run:ai platform grants you read access to workloads. Even if you are allowed to view workloads, you can only see the workloads within your allowed scope. This means there might be more workloads running on the nodes of this node pool than appear in the list you are viewing.

      Column Description Workload The name of the workload. If the workloads\u2019 type is one of the recognized types (for example: Pytorch, MPI, Jupyter, Ray, Spark, Kubeflow, and many more), an appropriate icon is printed. Type The Run:ai platform type of the workload - Workspace, Training, or Inference Status The state of the workload. The Workloads state is described in the \u2018Run:ai Workloads\u2019 article. Created by The User or Application created this workload Running/requested pods The number of running pods out of the number of requested pods within this workload. Creation time The workload\u2019s creation date and time Allocated GPU compute The total amount of GPU compute allocated by this workload. A workload with 3 Pods, each allocating 0.5 GPU, will show a value of 1.5 GPUs for the workload. Allocated GPU memory The total amount of GPU memory allocated by this workload. A workload with 3 Pods, each allocating 20GB, will show a value of 60 GB for the workload. Allocated CPU compute (cores) The total amount of CPU compute allocated by this workload. A workload with 3 Pods, each allocating 0.5 Core, will show a value of 1.5 Cores for the workload. Allocated CPU memory The total amount of CPU memory allocated by this workload. A workload with 3 Pods, each allocating 5 GB of CPU memory, will show a value of 15 GB of CPU memory for the workload."},{"location":"platform-admin/aiinitiatives/resources/node-pools/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then Click Download as CSV. Export to CSV is limited to 20,000 rows.
      • Show/Hide details - Click to view additional information on the selected row
      "},{"location":"platform-admin/aiinitiatives/resources/node-pools/#showhide-details","title":"Show/Hide details","text":"

      Select a row in the Node pools table and then click Show details in the upper-right corner of the action bar. The details window appears, presenting metrics graphs for the whole node pool:

      • Node GPU allocation - This graph shows an overall sum of the Allocated, Unallocated, and Total number of GPUs for this node pool, over time. From observing this graph, you can learn about the occupancy of GPUs in this node pool, over time.

      • GPU Utilization Distribution - This graph shows the distribution of GPU utilization in this node pool over time. Observing this graph, you can learn how many GPUs are utilized up to 25%, 25%-50%, 50%-75%, and 75%-100%. This information helps to understand how many available resources you have in this node pool, and how well those resources are utilized by comparing the allocation graph to the utilization graphs, over time.

      • GPU Utilization - This graph shows the average GPU utilization in this node pool over time. Comparing this graph with the GPU Utilization Distribution helps to understand the actual distribution of GPU occupancy over time.

      • GPU Memory Utilization - This graph shows the average GPU memory utilization in this node pool over time, for example an average of all nodes\u2019 GPU memory utilization over time.

      • CPU Utilization - This graph shows the average CPU utilization in this node pool over time, for example, an average of all nodes\u2019 CPU utilization over time.

      • CPU Memory Utilization - This graph shows the average CPU memory utilization in this node pool over time, for example an average of all nodes\u2019 CPU memory utilization over time.

      "},{"location":"platform-admin/aiinitiatives/resources/node-pools/#adding-a-new-node-pool","title":"Adding a new node pool","text":"

      To create a new node pool:

      1. Click +NEW NODE POOL
      2. Enter a name for the node pool. Node pool names must start with a letter and can only contain lowercase Latin letters, numbers, or a hyphen ('-')
      3. Enter the node pool label: The node pool controller will use this node-label key-value pair to match nodes into this node pool.

        • Key is the unique identifier of a node label.

          • The key must fit the following regular expression: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?/?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$
          • The administrator can use an automatically preset label, such as nvidia.com/gpu.product which labels the GPU type, or any other key from a node label.
        • Value is the value of that label identifier (key). The same key may have different values; in this case, they are considered different labels.

          • Value must fit the following regular expression: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ (see the validation example after these steps)
        • A node pool is defined by a single key-value pair. Do not define different node pools using different labels that are set on the same node; this may lead to unexpected results.
      4. Set the GPU placement strategy:

        • Bin-pack - Place as many workloads as possible in each GPU and node to use fewer resources and maximize GPU and node vacancy.
        • Spread - Spread workloads across as many GPUs and nodes as possible to minimize the load and maximize the available resources per workload.
        • GPU workloads are workloads that request both GPU and CPU resources
      5. Set the CPU placement strategy:

        • Bin-pack - Place as many workloads as possible in each CPU and node to use fewer resources and maximize CPU and node vacancy.
        • Spread - Spread workloads across as many CPUs and nodes as possible to minimize the load and maximize the available resources per workload.
        • CPU workloads are workloads that request purely CPU resources
      6. Click CREATE NODE POOL
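
      As an optional sanity check before creating the node pool, you can test a candidate key and value against the regular expressions above from your terminal. The key and value below are examples only:

        # Validate a candidate label key against the key pattern
        echo "nvidia.com/gpu.product" | grep -Eq '^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?/?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$' && echo "valid key" || echo "invalid key"

        # Validate a candidate label value against the value pattern
        echo "dgx200" | grep -Eq '^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$' && echo "valid value" || echo "invalid value"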

      "},{"location":"platform-admin/aiinitiatives/resources/node-pools/#labeling-nodes-for-node-pool-grouping","title":"Labeling nodes for node-pool grouping:","text":"

      The Infrastructure Administrator can use a preset node label such as the nvidia.com/gpu.product that labels the GPU type, or configure any other node label (e.g. faculty=physics).

      To assign a label to nodes you want to group into a node pool, set a node label on each node:

      1. Get the list of nodes and their current labels using the following command:

        kubectl get nodes --show-labels\n

      2. Label a specific node with a new label using the following command:

        kubectl label node <node-name> <key>=<value>\n
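
      Optionally, verify the label before creating the node pool. The -L flag prints the label value as an extra column, and -l filters to nodes that carry a specific value (replace the placeholders accordingly):

        kubectl get nodes -L <key>
        kubectl get nodes -l <key>=<value>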

      "},{"location":"platform-admin/aiinitiatives/resources/node-pools/#editing-a-node-pool","title":"Editing a node pool","text":"
      1. Select the node pool you want to edit
      2. Click EDIT
      3. Update the node pool and click SAVE
      "},{"location":"platform-admin/aiinitiatives/resources/node-pools/#deleting-a-node-pool","title":"Deleting a node pool","text":"
      1. Select the node pool you want to delete
      2. Click DELETE
      3. On the dialog, click DELETE to confirm the deletion

      Note

      The default node pool cannot be deleted. When deleting a node pool, if no other node pool matches any of the nodes\u2019 labels, the node will be included in the default node pool.

      "},{"location":"platform-admin/aiinitiatives/resources/node-pools/#using-api","title":"Using API","text":"

      Go to the Node pools API reference to view the available actions

      "},{"location":"platform-admin/aiinitiatives/resources/nodes/","title":"Nodes","text":"

      This article explains the procedure for managing Nodes.

      Nodes are Kubernetes elements automatically discovered by the Run:ai platform. Once a node is discovered by the Run:ai platform, an associated instance is created in the Nodes table, administrators can view the Node\u2019s relevant information, and Run:ai scheduler can use the node for Scheduling.

      "},{"location":"platform-admin/aiinitiatives/resources/nodes/#nodes-table","title":"Nodes table","text":"

      The Nodes table can be found under Resources in the Run:ai platform.

      The Nodes table displays a list of predefined nodes available to users in the Run:ai platform.

      Note

      • It is not possible to create additional nodes, or edit, or delete existing nodes.
      • Only users with relevant permissions can view the table.

      The Nodes table consists of the following columns:

      Column Description Node The Kubernetes name of the node Status The state of the node. Nodes in the Ready state are eligible for scheduling. If the state is Not ready then the main reason appears in parenthesis on the right side of the state field. Hovering the state lists the reasons why a node is Not ready. Node pool The name of the associated node pool. By default, every node in the Run:ai platform is associated with the default node pool, if no other node pool is associated GPU type The GPU model, for example, H100, or V100 GPU devices The number of GPU devices installed on the node. Clicking this field pops up a dialog with details per GPU (described below in this article) Free GPU devices The current number of fully vacant GPU devices GPU memory The total amount of GPU memory installed on this node. For example, if the number is 640GB and the number of GPU devices is 8, then each GPU is installed with 80GB of memory (assuming the node is assembled of homogenous GPU devices) Allocated GPUs The total allocation of GPU devices in units of GPUs (decimal number). For example, if 3 GPUs are 50% allocated, the field prints out the value 1.50. This value represents the portion of GPU memory consumed by all running pods using this node Used GPU memory The actual amount of memory (in GB or MB) used by pods running on this node. GPU compute utilization The average compute utilization of all GPU devices in this node GPU memory utilization The average memory utilization of all GPU devices in this node CPU (Cores) The number of CPU cores installed on this node CPU memory The total amount of CPU memory installed on this node Allocated CPU (Cores) The number of CPU cores allocated by pods running on this node (decimal number, e.g. a pod allocating 350 mili-cores shows an allocation of 0.35 cores). Allocated CPU memory The total amount of CPU memory allocated by pods running on this node (in GB or MB) Used CPU memory The total amount of actually used CPU memory by pods running on this node. Pods may allocate memory but not use all of it, or go beyond their CPU memory allocation if using Limit > Request for CPU memory (burstable workload) CPU compute utilization The utilization of all CPU compute resources on this node (percentage) CPU memory utilization The utilization of all CPU memory resources on this node (percentage) Used swap CPU memory The amount of CPU memory (in GB or MB) used for GPU swap memory (* future) Pod(s) List of pods running on this node, click the field to view details (described below in this article)"},{"location":"platform-admin/aiinitiatives/resources/nodes/#gpu-devices-for-node","title":"GPU devices for node","text":"

      Click one of the values in the GPU devices column, to view the list of GPU devices and their parameters.

      Column Description Index The GPU index, read from the GPU hardware. The same index is used when accessing the GPU directly Used memory The amount of memory used by pods and drivers using the GPU (in GB or MB) Compute utilization The portion of time the GPU is being used by applications (percentage) Memory utilization The portion of the GPU memory that is being used by applications (percentage) Idle time The elapsed time since the GPU was used (i.e. the GPU is being idle for \u2018Idle time\u2019)"},{"location":"platform-admin/aiinitiatives/resources/nodes/#pods-associated-with-node","title":"Pods associated with node","text":"

      Click one of the values in the Pod(s) column, to view the list of pods and their parameters.

      Note

      This column is viewable only if your role in the Run:ai platform grants you read access to workloads. Even if you are allowed to view workloads, you can only see the workloads within your allowed scope. This means there might be more pods running on this node than appear in the list you are viewing.

      Column Description Pod The Kubernetes name of the pod. Usually, the pod's name is made of the name of its parent workload (if there is one) and an index that is unique for that pod instance within the workload Status The state of the pod. In a steady state, this should be Running, together with the amount of time the pod has been running Project The Run:ai project name the pod belongs to. Clicking this field takes you to the Projects table filtered by this project name Workload The workload name the pod belongs to. Clicking this field takes you to the Workloads table filtered by this workload name Image The full path of the image used by the main container of this pod Creation time The pod's creation date and time"},{"location":"platform-admin/aiinitiatives/resources/nodes/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then Click Download as CSV. Export to CSV is limited to 20,000 rows.
      • Show/Hide details - Click to view additional information on the selected row
      "},{"location":"platform-admin/aiinitiatives/resources/nodes/#showhide-details","title":"Show/Hide details","text":"

      Click a row in the Nodes table and then click the Show details button at the upper right side of the action bar. The details screen appears, presenting the following metrics graphs:

      • GPU utilization - A per-GPU graph and an average-of-all-GPUs graph, shown on the same chart over an adjustable period, allow you to see the trends of GPU compute utilization (percentage of GPU compute) in this node.
      • GPU memory utilization - A per-GPU graph and an average-of-all-GPUs graph, shown on the same chart over an adjustable period, allow you to see the trends of GPU memory usage (percentage of GPU memory) in this node.
      • CPU compute utilization - A graph of the average compute utilization of all CPU cores, shown over an adjustable period, allows you to see the trends of CPU compute utilization (percentage of CPU compute) in this node.
      • CPU memory utilization - A single graph of the utilization of all CPU memory, shown over an adjustable period, allows you to see the trends of CPU memory utilization (percentage of CPU memory) in this node.
      • CPU memory usage - A single graph of the usage of all CPU memory, shown over an adjustable period, allows you to see the trends of CPU memory usage (in GB or MB of CPU memory) in this node.

      • For GPUs charts - Click the GPU legend on the right-hand side of the chart, to activate or deactivate any of the GPU lines.

      • You can click the date picker to change the presented period
      • You can use your mouse to mark a sub-period in the graph for zooming in, and use the \u2018Reset zoom\u2019 button to go back to the preset period
      • Changes in the period affect all graphs on this screen.
      "},{"location":"platform-admin/aiinitiatives/resources/nodes/#using-api","title":"Using API","text":"

      Go to the Nodes API reference to view the available actions

      "},{"location":"platform-admin/authentication/accessrules/","title":"Access Rules","text":"

      This article explains the procedure to manage Access rules.

      Access rules provide users, groups, or applications privileges to system entities.

      An access rule is the assignment of a role to a subject in a scope: <Subject> is a <Role> in a <Scope>.

      For example, user user@domain.com is a department admin in department A.

      "},{"location":"platform-admin/authentication/accessrules/#access-rules-table","title":"Access rules table","text":"

      The Access rules table can be found under Access in the Run:ai platform.

      The Access rules table provides a list of all the access rules defined in the platform and allows you to manage them.

      Note

      Flexible management

      It is also possible to manage access rules directly for a specific user, application, project, or department.

      The Access rules table consists of the following columns:

      Column Description Type The type of subject assigned to the access rule (user, SSO group, or application). Subject The user, SSO group, or application assigned with the role Role The role assigned to the subject Scope The scope to which the subject has access. Click the name of the scope to see the scope and its subordinates Authorized by The user who granted the access rule Creation time The timestamp for when the rule was created Last updated The last time the access rule was updated"},{"location":"platform-admin/authentication/accessrules/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then Click Download as CSV. Export to CSV is limited to 20,000 rows.
      "},{"location":"platform-admin/authentication/accessrules/#adding-new-access-rules","title":"Adding new access rules","text":"

      To add a new access rule:

      1. Click +NEW ACCESS RULE
      2. Select a subject User, SSO Group, or Application
      3. Select or enter the subject identifier:
        • User Email for a local user created in Run:ai or for SSO user as recognized by the IDP
        • Group name as recognized by the IDP
        • Application name as created in Run:ai
      4. Select a role
      5. Select a scope
      6. Click SAVE RULE

      Note

      An access rule consists of a single subject with a single role in a single scope. To assign multiple roles or multiple scopes to the same subject, multiple access rules must be added.

      "},{"location":"platform-admin/authentication/accessrules/#editing-an-access-rule","title":"Editing an access rule","text":"

      Access rules cannot be edited. To change an access rule, you must delete the rule, and then create a new rule to replace it.

      "},{"location":"platform-admin/authentication/accessrules/#deleting-an-access-rule","title":"Deleting an access rule","text":"
      1. Select the access rule you want to delete
      2. Click DELETE
      3. On the dialog, click DELETE to confirm the deletion
      "},{"location":"platform-admin/authentication/accessrules/#using-api","title":"Using API","text":"

      Go to the Access rules API reference to view the available actions

      "},{"location":"platform-admin/authentication/applications/","title":"Applications","text":"

      This article explains the procedure to manage your organization's applications.

      Applications are used for API integrations with Run:ai. An application contains a client ID and a client secret. With the client credentials, you can obtain a token as detailed in API authentication and use it within subsequent API calls.
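
      For illustration only, obtaining a token with the client credentials typically looks like the sketch below. The URL, grant type, and field names are assumptions rather than the authoritative contract; follow the API authentication article for the exact request:

        # Hypothetical request; verify the endpoint and payload in the API authentication article
        curl -X POST 'https://<company-url>/api/v1/token' \
          -H 'Content-Type: application/json' \
          -d '{"grantType": "client_credentials", "clientId": "<CLIENT_ID>", "clientSecret": "<CLIENT_SECRET>"}'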

      Applications are assigned with Access Rules to manage permissions. For example, application ci-pipeline-prod is assigned with a Researcher role in Cluster: A.

      "},{"location":"platform-admin/authentication/applications/#applications-table","title":"Applications table","text":"

      The Applications table can be found under Access in the Run:ai platform.

      The Applications table provides a list of all the applications defined in the platform, and allows you to manage them.

      The Applications table consists of the following columns:

      Column Description Application The name of the application Client ID The client ID of the application Access rule(s) The access rules assigned to the application Last login The timestamp for the last time the application signed in Created by The user who created the application Creation time The timestamp for when the application was created Last updated The last time the application was updated"},{"location":"platform-admin/authentication/applications/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then Click Download as CSV. Export to CSV is limited to 20,000 rows.
      "},{"location":"platform-admin/authentication/applications/#creating-an-application","title":"Creating an application","text":"

      To create an application:

      1. Click +NEW APPLICATION
      2. Enter the application\u2019s name
      3. Click CREATE
      4. Copy the Client ID and Client secret and store them securely
      5. Click DONE

      Note

      The client secret is visible only at the time of creation. It cannot be recovered but can be regenerated.

      "},{"location":"platform-admin/authentication/applications/#adding-an-access-rule-to-an-application","title":"Adding an access rule to an application","text":"

      To create an access rule:

      1. Select the application you want to add an access rule for
      2. Click ACCESS RULES
      3. Click +ACCESS RULE
      4. Select a role
      5. Select a scope
      6. Click SAVE RULE
      7. Click CLOSE
      "},{"location":"platform-admin/authentication/applications/#deleting-an-access-rule-from-an-application","title":"Deleting an access rule from an application","text":"

      To delete an access rule:

      1. Select the application you want to remove an access rule from
      2. Click ACCESS RULES
      3. Find the access rule assigned to the user you would like to delete
      4. Click on the trash icon
      5. Click CLOSE
      "},{"location":"platform-admin/authentication/applications/#regenerating-client-secret","title":"Regenerating client secret","text":"

      To regenerate a client secret:

      1. Locate the application you want to regenerate its client secret
      2. Click REGENERATE CLIENT SECRET
      3. Click REGENERATE
      4. Copy the New client secret and store it securely
      5. Click DONE

      Warning

      Regenerating a client secret revokes the previous one.

      "},{"location":"platform-admin/authentication/applications/#deleting-an-application","title":"Deleting an application","text":"
      1. Select the application you want to delete
      2. Click DELETE
      3. On the dialog, click DELETE to confirm
      "},{"location":"platform-admin/authentication/applications/#using-api","title":"Using API","text":"

      Go to the Applications, Access rules API reference to view the available actions

      "},{"location":"platform-admin/authentication/roles/","title":"Roles","text":"

      This article explains the available roles in the Run:ai platform.

      A role is a set of permissions that can be assigned to a subject in a scope.

      A permission is a set of actions (View, Edit, Create and Delete) over a Run:ai entity (e.g. projects, workloads, users).

      "},{"location":"platform-admin/authentication/roles/#roles-table","title":"Roles table","text":"

      The Roles table can be found under Access in the Run:ai platform.

      The Roles table displays a list of predefined roles available to users in the Run:ai platform. It is not possible to create additional roles, or to edit or delete existing roles.

      The Roles table consists of the following columns:

      Column Description Role The name of the role Created by The name of the role creator Creation time The timestamp when the role was created"},{"location":"platform-admin/authentication/roles/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then Click Download as CSV. Export to CSV is limited to 20,000 rows.
      "},{"location":"platform-admin/authentication/roles/#reviewing-a-role","title":"Reviewing a role","text":"
      1. To review a role click the role name on the table
      2. In the role form review the following:
        • Role name The name of the role
        • Entity A system-managed object that can be viewed, edited, created or deleted by a user based on their assigned role and scope
        • Actions The actions that the role assignee is authorized to perform for each entity
          • View If checked, an assigned user with this role can view instances of this type of entity within their defined scope
          • Edit If checked, an assigned user with this role can change the settings of an instance of this type of entity within their defined scope
          • Create If checked, an assigned user with this role can create new instances of this type of entity within their defined scope
          • Delete If checked, an assigned user with this role can delete instances of this type of entity within their defined scope
      "},{"location":"platform-admin/authentication/roles/#roles-in-runai","title":"Roles in Run:ai","text":"

      Run:ai supports the following roles and their permissions: Under each role is a detailed list of the actions that the role assignee is authorized to perform for each entity.

      Compute resource administrator

      Data source administrator

      Data volume administrator

      Department administrator

      Department viewer

      Editor

      Environment administrator

      L1 researcher

      L2 researcher

      ML engineer

      Research manager

      System administrator

      Template administrator

      Viewer

      Notes

      Keep the following in mind when upgrading from versions 2.13 or earlier:

      • Admin becomes System Admin with full access to all managed objects and scopes
      • Research Manager is not automatically assigned to all projects, but to projects set by the relevant Admin when assigning this role to a user, group or app
      • To preserve backwards compatibility, users with the role of Research Manager are assigned to all current projects, but not to new projects
      • To allow the Department Admin to assign a Researcher role to a user, group or app, the Department Admin must have VECD (View, Edit, Create and Delete) permissions for jobs and workspaces. This creates a broader span of managed objects
      • To preserve backwards compatibility, users with the role of Editor, are assigned to the same scope they had before the upgrade. However, with new user assignments, the Admin can limit the scope to only part of the organizational scope.
      "},{"location":"platform-admin/authentication/roles/#permitted-workloads","title":"Permitted workloads","text":"

      When assigning a role with either one, all or any combination of the View, Edit, Create and Delete permissions for workloads, the subject has permissions to manage not only Run:ai native workloads (Workspace, Training, Inference), but also a list of 3rd party workloads:

      • k8s: StatefulSet
      • k8s: ReplicaSet
      • k8s: Pod
      • k8s: Deployment
      • batch: Job
      • batch: CronJob
      • machinelearning.seldon.io: SeldonDeployment
      • kubevirt.io: VirtualMachineInstance
      • kubeflow.org: TFJob
      • kubeflow.org: PyTorchJob
      • kubeflow.org: XGBoostJob
      • kubeflow.org: MPIJob
      • kubeflow.org: Notebook
      • kubeflow.org: ScheduledWorkflow
      • amlarc.azureml.com: AmlJob
      • serving.knative.dev: Service
      • workspace.devfile.io: DevWorkspace
      • ray.io: RayCluster
      • ray.io: RayJob
      • ray.io: RayService
      • tekton.dev: TaskRun
      • tekton.dev: PipelineRun
      • argoproj.io: Workflow
      "},{"location":"platform-admin/authentication/roles/#using-api","title":"Using API","text":"

      Go to the Roles API reference to view the available actions.

      "},{"location":"platform-admin/authentication/users/","title":"Users","text":"

      This article explains the procedure to manage users and their permissions.

      Users can be managed locally or via the identity provider, and are assigned Access Rules to manage their permissions.

      For example, user user@domain.com is a department admin in department A.

      "},{"location":"platform-admin/authentication/users/#users-table","title":"Users table","text":"

      The Users table can be found under Access in the Run:ai platform.

      The users table provides a list of all the users in the platform. You can manage local users and manage user permissions (access rules) for both local and SSO users.

      Note

      Single Sign-On users

      SSO users are managed by the identity provider and appear once they have signed in to Run:ai

      The Users table consists of the following columns:

      Column Description User The unique identity of the user (email address) Type The type of the user - SSO / local Last login The timestamp for the last time the user signed in Access rule(s) The access rules assigned to the user Created By The user who created the user Creation time The timestamp for when the user was created Last updated The last time the user was updated"},{"location":"platform-admin/authentication/users/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then Click Download as CSV. Export to CSV is limited to 20,000 rows.
      "},{"location":"platform-admin/authentication/users/#creating-a-local-user","title":"Creating a local user","text":"

      To create a local user:

      1. Click +NEW LOCAL USER
      2. Enter the user\u2019s Email address
      3. Click CREATE
      4. Review and copy the user\u2019s credentials:
        • User Email
        • Temporary password to be used on first sign-in
      5. Click DONE

      Note

      The temporary password is visible only at the time of user\u2019s creation, and must be changed after the first sign-in

      "},{"location":"platform-admin/authentication/users/#adding-an-access-rule-to-a-user","title":"Adding an access rule to a user","text":"

      To create an access rule:

      1. Select the user you want to add an access rule for
      2. Click ACCESS RULES
      3. Click +ACCESS RULE
      4. Select a role
      5. Select a scope
      6. Click SAVE RULE
      7. Click CLOSE
      "},{"location":"platform-admin/authentication/users/#deleting-users-access-rule","title":"Deleting user\u2019s access rule","text":"

      To delete an access rule:

      1. Select the user you want to remove an access rule from
      2. Click ACCESS RULES
      3. Find the access rule assigned to the user you would like to delete
      4. Click on the trash icon
      5. Click CLOSE
      "},{"location":"platform-admin/authentication/users/#resetting-a-user-password","title":"Resetting a user password","text":"

      To reset a user\u2019s password:

      1. Select the user whose password you want to reset
      2. Click RESET PASSWORD
      3. Click RESET
      4. Review and copy the user\u2019s credentials:
        • User Email
        • Temporary password to be used on next sign-in
      5. Click DONE
      "},{"location":"platform-admin/authentication/users/#deleting-a-user","title":"Deleting a user","text":"
      1. Select the user you want to delete
      2. Click DELETE
      3. In the dialog, click DELETE to confirm the deletion

      Note

      To ensure administrative operations are always available, at least one local user with System Administrator role should exist.

      "},{"location":"platform-admin/authentication/users/#using-api","title":"Using API","text":"

      Go to the Users, Access rules API reference to view the available actions

      "},{"location":"platform-admin/integrations/integration-overview/","title":"Integrations with Run:ai","text":"

      The table below summarizes the integration capabilities with various third-party products.

      "},{"location":"platform-admin/integrations/integration-overview/#integration-support","title":"Integration support","text":"

      Support for integrations varies. Where mentioned below, the integration is supported out of the box with Run:ai. For other integrations, our customer success team has previous experience integrating with the third-party software, and in many cases the community portal contains additional reference documentation provided on an as-is basis.

      The Run:ai community portal is password protected and access is provided to customers and partners.

      "},{"location":"platform-admin/integrations/integration-overview/#integrations","title":"Integrations","text":"Tool Category Run:ai support details Additional Information Triton Orchestration Supported Usage via docker base image. Quickstart inference example Spark Orchestration Community Support It is possible to schedule Spark workflows with the Run:ai scheduler. For details, please contact Run:ai customer support. Sample code can be found in the Run:ai customer success community portal: https://runai.my.site.com/community/s/article/How-to-Run-Spark-jobs-with-Run-AI Kubeflow Pipelines Orchestration Community Support It is possible to schedule kubeflow pipelines with the Run:ai scheduler. For details please contact Run:ai customer support. Sample code can be found in the Run:ai customer success community portalhttps://runai.my.site.com/community/s/article/How-to-integrate-Run-ai-with-Kubeflow Apache Airflow Orchestration Community Support It is possible to schedule Airflow workflows with the Run:ai scheduler. For details, please contact Run:ai customer support. Sample code can be found in the Run:ai customer success community portal: https://runai.my.site.com/community/s/article/How-to-integrate-Run-ai-with-Apache-Airflow Argo workflows Orchestration Community Support It is possible to schedule Argo workflows with the Run:ai scheduler. For details, please contact Run:ai customer support. Sample code can be found in the Run:ai customer success community portal https://runai.my.site.com/community/s/article/How-to-integrate-Run-ai-with-Argo-Workflows SeldonX Orchestration Community Support It is possible to schedule Seldon Core workloads with the Run:ai scheduler. For details, please contact Run:ai customer success. Sample code can be found in the Run:ai customer success community portal: https://runai.my.site.com/community/s/article/How-to-integrate-Run-ai-with-Seldon-Core Jupyter Notebook Development Supported Run:ai provides integrated support with Jupyter Notebooks. Quickstart example: https://docs.run.ai/latest/Researcher/Walkthroughs/quickstart-jupyter/ Jupyter Hub Development Community Support It is possible to submit Run:ai workloads via JupyterHub. For more information please contact Run:ai customer support PyCharm Development Supported Containers created by Run:ai can be accessed via PyCharm. PyCharm example VScode Development Supported - Containers created by Run:ai can be accessed via Visual Studio Code. example - You can automatically launch Visual Studio code web from the Run:ai console. example. Kubeflow notebooks Development Community Support It is possible to launch a kubeflow notebook with the Run:ai scheduler. For details please contact Run:ai customer support Sample code can be found in the Run:ai customer success community portal:https://runai.my.site.com/community/s/article/How-to-integrate-Run-ai-with-Kubeflow Ray training, inference, data processing. Community Support It is possible to schedule Ray jobs with the Run:ai scheduler. Sample code can be found in the Run:ai customer success community portal https://runai.my.site.com/community/s/article/How-to-Integrate-Run-ai-with-Ray TensorBoard Experiment tracking Supported Run:ai comes with a preset Tensorboard Environment asset. TensorBoard example. Additional sample Weights & Biases Experiment tracking Community Support It is possible to schedule W&B workloads with the Run:ai scheduler. For details, please contact Run:ai customer success. 
ClearML Experiment tracking Community Support It is possible to schedule ClearML workloads with the Run:ai scheduler. For details, please contact Run:ai customer success. MLFlow Model Serving Community Support It is possible to use ML Flow together with the Run:ai scheduler. For details, please contact Run:ai customer support. Sample code can be found in the Run:ai customer success community portal: https://runai.my.site.com/community/s/article/How-to-integrate-Run-ai-with-MLflow Additional MLFlow sample Hugging Face Repositories Supported Run:ai provides an out of the box integration with Hugging Face Docker Registry Repositories Supported Run:ai allows using a docker registry as a Credentials asset. S3 Storage Supported Run:ai communicates with S3 by defining a data source asset. Github Storage Supported Run:ai communicates with GitHub by defining it as a data source asset Tensorflow Training Supported Run:ai provides out of the box support for submitting TensorFlow workloads via API or by submitting workloads via user interface. Pytorch Training Supported Run:ai provides out of the box support for submitting PyTorch workloads via API or by submitting workloads via user interface. Kubeflow MPI Training Supported Run:ai provides out of the box support for submitting MPI workloads via API or by submitting workloads via user interface XGBoost Training Supported Run:ai provides out of the box support for submitting XGBoost workloads via API or by submitting workloads via user interface Karpenter Cost Optimization Supported Run:ai provides out of the box support for Karpenter to save cloud costs. Integration notes with Karpenter can be found here"},{"location":"platform-admin/integrations/integration-overview/#kubernetes-workloads-integration","title":"Kubernetes Workloads Integration","text":"

      Kubernetes has several built-in resources that encapsulate running Pods. These are called Kubernetes Workloads and should not be confused with Run:ai Workloads.

      Examples of such resources are a Deployment that manages a stateless application, or a Job that runs tasks to completion.

      Run:ai natively runs Run:ai Workloads. A Run:ai workload encapsulates all the resources needed to run, creates them, and deletes them together. However, Run:ai, being an open platform, also allows the scheduling of any Kubernetes Workload.

      For more information see Kubernetes Workloads Integration.

      "},{"location":"platform-admin/integrations/karpenter/","title":"Working with Karpenter","text":"

      Karpenter is an open-source, Kubernetes cluster autoscaler built for cloud deployments. Karpenter optimizes the cloud cost of a customer\u2019s cluster by moving workloads between different node types, consolidating workloads into fewer nodes, using lower-cost nodes where possible, scaling up new nodes when needed, and shutting down unused nodes.

      Karpenter\u2019s main goal is cost optimization. Unlike Karpenter, Run:ai\u2019s scheduler optimizes for fairness and resource utilization. Therefore, there are a few potential friction points when using both on the same cluster.

      "},{"location":"platform-admin/integrations/karpenter/#friction-points-using-karpenter-with-runai","title":"Friction points using Karpenter with Run:ai","text":"
      1. Karpenter looks for \u201cunschedulable\u201d pending workloads and may try to scale up new nodes to make those workloads schedulable. However, in some scenarios, these workloads may exceed their quota parameters, and the Run:ai scheduler will put them into a pending state.
      2. Karpenter is not aware of the Run:ai fractions mechanism and may try to interfere incorrectly.
      3. Karpenter preempts any type of workload (i.e., high-priority, non-preemptible workloads will potentially be interrupted and moved to save cost).
      4. Karpenter has no pod-group (i.e., workload) notion or gang scheduling awareness, meaning that Karpenter is unaware that a set of "arbitrary" pods is a single workload. This may cause Karpenter to schedule those pods into different node pools (in the case of multi-node-pool workloads) or scale up or down the wrong mix of nodes.
      "},{"location":"platform-admin/integrations/karpenter/#mitigating-the-friction-points","title":"Mitigating the friction points","text":"

      Run:ai scheduler mitigates the friction points using the following techniques (each numbered bullet below corresponds to the related friction point listed above):

      1. Karpenter uses a \u201cnominated node\u201d to recommend a node for the scheduler. The Run:ai scheduler treats this as a \u201cpreferred\u201d recommendation, meaning it will try to use this node, but it\u2019s not required and it may choose another node.
      2. Fractions - Karpenter won\u2019t consolidate nodes with one or more pods that cannot be moved. The Run:ai reservation pod is marked as \u2018do not evict\u2019 to allow the Run:ai scheduler to control the scheduling of fractions.
      3. Non-preemptible workloads - Run:ai marks non-preemptible workloads as \u2018do not evict\u2019 and Karpenter respects this annotation.
      4. Run:ai node pools (single-node-pool workloads) - Karpenter respects the \u2018node affinity\u2019 that Run:ai sets on a pod, so Karpenter uses the node affinity for its recommended node. For the gang-scheduling/pod-group (workload) notion, Run:ai scheduler considers Karpenter directives as preferred recommendations rather than mandatory instructions and overrides Karpenter instructions where appropriate.
      "},{"location":"platform-admin/integrations/karpenter/#deployment-considerations","title":"Deployment Considerations","text":"
      • Using multi-node-pool workloads
        • Workloads may include a list of optional nodepools. Karpenter is not aware that only a single node pool should be selected out of that list for the workload. It may therefore recommend putting pods of the same workload into different node pools and may scaleup nodes from different node pools to serve a \u201cmulti-node-pool\u201d workload instead of nodes on the selected single node pool.
  • If this becomes an issue (i.e., if Karpenter scales up the wrong node types), users can set an inter-pod affinity using the node pool label or another common label as a \u2018topology\u2019 identifier. This forces Karpenter to choose nodes from a single node pool per workload, selected from any of the node pools the workload allows (see the sketch after this list).
  • An alternative approach is to use a single node pool per workload instead of multiple node pools.
      • Consolidation
  • To make Karpenter\u2019s consolidation function more effective, users should consider separating preemptible and non-preemptible workloads, for example by using node pools, node affinities, taints/tolerations, or inter-pod anti-affinity.
  • If users don\u2019t separate preemptible and non-preemptible workloads (i.e., make them run on different nodes), Karpenter\u2019s ability to consolidate (bin-pack) and shut down nodes is reduced, but it remains effective.
      • Conflicts between binpacking and spread policies
  • If Run:ai is used with a scheduling spread policy, it clashes with Karpenter\u2019s default bin-packing/consolidation policy, and the outcome may be a deployment that is not optimized for either policy.
  • Spread is usually used for inference, which is non-preemptible and therefore not controlled by Karpenter (the Run:ai scheduler marks those workloads as \u2018do not evict\u2019 for Karpenter), so this should not present a real deployment issue for customers.
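
A minimal sketch of the inter-pod affinity workaround mentioned in the first bullet above, assuming the node pool is exposed as a node label. The label key run.ai/node-pool, the workload label, and all names below are placeholders and should be adjusted to your cluster:

  apiVersion: v1
  kind: Pod
  metadata:
    name: training-pod-0                  # placeholder
    labels:
      workload-name: my-training          # placeholder label shared by all pods of the workload
  spec:
    affinity:
      podAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchLabels:
                workload-name: my-training
            # Treat the node pool label as the topology domain so all pods of the
            # workload land on nodes from a single node pool.
            topologyKey: run.ai/node-pool  # assumed node pool label key
    containers:
      - name: main
        image: ubuntu                      # placeholder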
      "},{"location":"platform-admin/performance/dashboard-analysis/","title":"Introduction","text":"

The Run:ai Administration User Interface provides a set of dashboards that help you monitor Clusters, Cluster Nodes, Projects, and Workloads. This document describes the key metrics to monitor, how to assess them, and suggested actions.

      Dashboards are used by system administrators to analyze and diagnose issues that relate to:

      • Physical Resources.
      • Organization resource allocation and utilization.
      • Usage characteristics.

      System administrators need to know important information about the physical resources that are currently being used. Important information such as:

      • Resource health.
      • Available resources and their distribution.
• Whether there is a lack of resources.
• Whether resources are being utilized correctly.

      With this information, system administrators can hone in on:

      • How resources are allocated across the organization.
• How the different organizational units utilize quotas and the resources within those quotas.
      • The actual performance of the organizational units.

      These dashboards give system administrators the ability to drill down to see details of the different types of workloads that each of the organizational units is running. These usage and performance metrics ensure that system administrators can then take actions to correct issues that affect performance.

      There are 5 dashboards:

      • GPU/CPU Overview dashboard\u2014Provides information about what is happening right now in the cluster.
      • Quota Management dashboard\u2014Provides information about quota utilization.
      • Analytics dashboard\u2014Provides long term analysis of cluster behavior.
      • Multi-Cluster Overview dashboard\u2014Provides a more holistic, multi-cluster view of what is happening right now. The dashboard is intended for organizations that have more than one connected cluster.
      • Consumption dashboard\u2014Provides information about resource consumption.
      "},{"location":"platform-admin/performance/dashboard-analysis/#gpucpu-overview-dashboard-new-and-legacy","title":"GPU/CPU Overview Dashboard (New and legacy)","text":"

      The Overview dashboard provides information about what is happening right now in the cluster. Administrators can view high-level information on the state of the cluster. The dashboard has two tabs that change the display to provide a focused view for GPU Dashboards (default view) and CPU Dashboards.

      "},{"location":"platform-admin/performance/dashboard-analysis/#gpu-dashboard","title":"GPU Dashboard","text":"

The GPU dashboard displays specific information for GPU-based nodes, node pools, clusters, or tenants. These dashboards also include additional metrics that are specific to GPU-based environments. The dashboard contains tiles that show information about specific resource allocation and performance metrics. The tiles are interactive, allowing you to link directly to the assets or drill down to specific scopes. Use the time frame selector to choose a time frame for all the tiles in the dashboard.

      The dashboard has the following tiles:

      • Ready nodes\u2014displays GPU nodes that are in the ready state.
      • Ready GPU devices\u2014displays the number of GPUs in nodes that are in the ready state.
      • Allocated GPU compute\u2014displays the total number of GPUs allocated from all the nodes.
      • Idle allocated GPU devices\u2014displays the number of allocated GPU devices that have been idle for more than 5 minutes.
      • Running workloads\u2014displays the number of running workloads.
      • Pending workloads\u2014displays the number of workloads in the pending status.
• Allocation ratio by node pool\u2014displays the percentage of GPUs allocated per node pool. Hover over the bar for detailed information. Use the scope selector at the bottom of the graph to drill down for more details.
• Free resources by node pool\u2014displays the amount of free resources per node pool. Press an entry in the graph for more details. Hover over the resource bubbles for specific details about the workers in the node. Use the ellipsis to download the graph as a CSV file.
• Resource allocation by workload type\u2014displays the resource allocation by workload type. Hover over the bar for detailed information. Use the scope selector at the bottom of the graph to drill down for more details. Use the ellipsis to download the graph as a CSV file.
• Workload by status\u2014displays the number of workloads for each status in the workloads table. Hover over the bar for detailed information. Use the scope selector at the bottom of the graph to drill down for more details. Use the ellipsis to download the graph as a CSV file.
      • Resources utilization\u2014displays the resource utilization over time. The right pane of the graph shows the average utilization of the selected time frame of the dashboard. Hover over the graph to see details of a specific time in the graph. Use the ellipsis to download the graph as a CSV file.
      • Resource allocation\u2014displays the resource allocation over time. The right pane of the graph shows the average allocation of the selected time frame of the dashboard. Hover over the graph to see details of a specific time in the graph. Use the ellipsis to download the graph as a CSV file.
      "},{"location":"platform-admin/performance/dashboard-analysis/#cpu-dashboard","title":"CPU Dashboard","text":"

The CPU dashboards display specific information for CPU-based nodes, node pools, clusters, or tenants. These dashboards also include additional metrics that are specific to CPU-based environments.

      To enable CPU Dashboards:

      1. Click General settings
      2. Open the Analytics pane and toggle the Show CPU dashboard switch to enable the feature.

Toggle the switch off to disable the CPU Dashboards option.

      The dashboard contains the following tiles:

• Total CPU Nodes\u2014displays the total number of CPU nodes.
• Ready CPU nodes\u2014displays the total number of CPU nodes in the ready state.
• Total CPUs\u2014displays the total number of CPUs.
• Ready CPUs\u2014displays the total number of CPUs in the ready state.
• Allocated CPUs\u2014displays the number of allocated CPUs.
• Running workloads\u2014displays the number of workloads in the running state.
• Pending workloads\u2014displays the number of workloads in the pending state.
• Allocated CPUs per project\u2014displays the number of CPUs allocated per project.
• Active projects\u2014displays the active projects with their CPU allocation and the number of running and pending workloads.
      • Utilization per resource type\u2014displays the CPU compute and CPU memory utilization over time.
      • CPU compute utilization\u2014displays the current CPU compute utilization.
      • CPU memory utilization\u2014displays the current CPU memory utilization.
      • Pending workloads\u2014displays the requested resources and wait time for workloads in the pending status.
• Workloads with error\u2014displays the number of workloads that are currently not running due to an error.
• Workload Count per CPU Compute Utilization
• 5 longest running workloads\u2014displays up to 5 workloads with the longest running time.

      Analysis and Suggested actions:

• Interactive Workloads are too frequently idle\u2014Consider setting time limits for interactive Workloads through the Projects tab. Consider also reducing GPU/CPU quotas for specific Projects to encourage users to run more training Workloads as opposed to interactive Workloads (note that interactive Workloads cannot use more than the GPU/CPU quota assigned to their Project).
• Training Workloads are too frequently idle\u2014Identify and notify the relevant users and work with them to improve the utilization of their training scripts.
"},{"location":"platform-admin/performance/dashboard-analysis/#workloads-with-an-error","title":"Workloads with an Error","text":"

      Search for Workloads with an error status. These Workloads may be holding GPUs/CPUs without actually using them.

      Analysis and Suggested actions:

      Search for workloads with an Error status on the Workloads view and discuss with the Job owner. Consider deleting these Workloads to free up the resources for other users.

      "},{"location":"platform-admin/performance/dashboard-analysis/#workloads-with-a-long-duration","title":"Workloads with a Long Duration","text":"

View the list of the 5 longest-running Workloads.

      Analysis and Suggested actions:

• Training Workloads run for too long\u2014Ask users to view their Workloads and analyze whether useful work is being done. If needed, stop their Workloads.
• Interactive Workloads run for too long\u2014Consider setting time limits for interactive Workloads via the Project editor.
"},{"location":"platform-admin/performance/dashboard-analysis/#job-queue","title":"Job Queue","text":"

      Identify queueing bottlenecks.

      Analysis and Suggested actions:

• Cluster is fully loaded\u2014Go over the table of active Projects and check that fairness between Projects was enforced by reviewing the number of allocated GPUs/CPUs for each Project, ensuring each Project was allocated its fair-share portion of the cluster.
• Cluster is not fully loaded\u2014Go to the Workloads view to review the resources requested for that Job (CPU, CPU memory, GPU, GPU memory). Go to the Nodes view to verify that there is no Node with enough free resources that can host that Job.

      Also, check the command that the user used to submit the job. The Researcher may have requested a specific Node for that Job.

      "},{"location":"platform-admin/performance/dashboard-analysis/#analytics-dashboard","title":"Analytics Dashboard","text":"

The Analytics dashboard provides the means to view historical data on cluster information, such as:

      • Utilization across the cluster
      • GPU usage by different Projects, including allocation and utilization, broken down into interactive and training Workloads
• Breakdown of running Workloads into interactive, training, and GPU versus CPU-only Workloads, including information on queueing (number of pending Workloads and requested GPUs).
      • Status of Nodes in terms of availability and allocated and utilized resources.

The dashboard has dropdown filters for node pools and departments. From each dropdown, select one or more items. The default setting is all.

      The information presented in Analytics can be used in different ways for identifying problems and fixing them. Below are a few examples.

      "},{"location":"platform-admin/performance/dashboard-analysis/#node-downtime","title":"Node Downtime","text":"

      View the overall available resources per Node and identify cases where a Node is down and there was a reduction in the number of available resources.

      How to: view the following panel.

      Analysis and Suggested actions:

Filter according to the time range to understand how long the Node has been down.

      "},{"location":"platform-admin/performance/dashboard-analysis/#gpu-allocation","title":"GPU Allocation","text":"

      Track GPU allocation across time.

      How to: view the following panels.

      The panel on the right-hand side shows the cluster-wide GPU allocation and utilization versus time, whereas the panels on the left-hand side show the cluster-wide GPU allocation and utilization averaged across the filtered time range.

      Analysis and Suggested actions:

      If the allocation is too low for a long period, work with users to run more workloads and to better utilize the Cluster.

      "},{"location":"platform-admin/performance/dashboard-analysis/#track-gpu-utilization","title":"Track GPU utilization","text":"

      Track whether Researchers efficiently use the GPU resources they have allocated for themselves.

      How to: view the following panel:

      Analysis and Suggested actions:

      If utilization is too low for a long period, you will want to identify the source of the problem:

      • Go to \u201cAverage GPU Allocation & Utilization\u201d
• Look for Projects with large GPU allocations for interactive Workloads, or Projects that poorly utilize GPUs in their training Workloads. Users tend to poorly utilize their GPUs in interactive sessions because of the dev & debug nature of their work, which is typically an iterative process with long GPU idle time. On many occasions, users also don\u2019t shut down their interactive Workloads, holding their GPUs idle and preventing others from using them.
• Low GPU utilization is due to interactive Workloads being used too frequently\u2014Consider setting time limits for interactive Workloads through the Projects tab or reducing GPU quotas to encourage users to run more training Workloads as opposed to interactive Workloads (note that interactive Workloads cannot use more than the GPU quota assigned to their Project).
• Low GPU utilization is due to users poorly utilizing their GPUs in training sessions\u2014Identify Projects with poor GPU utilization in training Workloads, notify the users, and work with them to improve their code and the way they utilize their GPUs.
"},{"location":"platform-admin/performance/dashboard-analysis/#training-vs-interactive-researcher-maturity","title":"Training vs. Interactive -- Researcher maturity","text":"

      Track the number of running Workloads and the breakdown into interactive, training, and CPU-only Workloads.

      How to: view the following panel:

      Analysis and Suggested actions:

We want to encourage users to run more training Workloads than interactive Workloads, as this is key to achieving high GPU utilization across the Cluster:

      • Training Workloads run to completion and free up their resources automatically when training ends
      • Training Workloads can be preempted, queued, and resumed automatically by the Run:ai system according to predefined policies which increases fairness and Cluster utilization.
      "},{"location":"platform-admin/performance/dashboard-analysis/#pending-queue-size","title":"Pending Queue Size","text":"

Track the length of the queue of pending Workloads.

      How to: view the following panels:

      Analysis and Suggested actions:

      Consider buying more GPUs:

• When too many Workloads are waiting in the queue for too long.
      • With a large number of requested GPUs.
      • While the Cluster is fully loaded and well utilized.
      "},{"location":"platform-admin/performance/dashboard-analysis/#cpu-memory-utilization","title":"CPU & Memory Utilization","text":"

      Track CPU and memory Node utilization and identify times where the load on specific Nodes is high.

      How to: view the following panel:

      Analysis and Suggested actions:

      If the load on specific Nodes is too high, it may cause problems with the proper operation of the Cluster and the way workloads are running.

      Consider adding more CPUs, or adding additional CPU-only nodes for Workloads that do only CPU processing.

      "},{"location":"platform-admin/performance/dashboard-analysis/#multi-cluster-overview-dashboard","title":"Multi-Cluster overview dashboard","text":"

      Provides a holistic, aggregated view across Clusters, including information about Cluster and Node utilization, available resources, and allocated resources. With this dashboard, you can identify Clusters that are down or underutilized and go to the Overview of that Cluster to explore further.

      "},{"location":"platform-admin/performance/dashboard-analysis/#consumption-dashboard","title":"Consumption dashboard","text":"

This dashboard enables users and admins to view resource consumption when using Run:ai services. The dashboard provides views based on configurable filters and timelines. It also provides cost analysis for GPU, CPU, and memory usage in the system.

      The dashboard has 4 tiles for:

      • Cumulative GPU allocation per Project or Department
      • Cumulative CPU allocation per Project or Department
      • Cumulative memory allocation per Project or Department
      • Consumption types

Use the dropdown menus at the top of the dashboard to apply filters for:

      • Project or department
      • Per project (single, multiple, or all)
      • Per department (single, multiple or all)
      • Per cluster (single, multiple, all)

      To enable the Consumption Dashboard:

1. Press the General settings icon, then press General.
2. Open the Analytics pane and toggle the Consumption switch to enable the feature.
3. Enter the cost of:
  • GPU compute / Hour
  • CPU compute / Hour
  • CPU memory / Hour

      Use the time picker dropdown to select relative time range options and set custom absolute time ranges. You can change the Timezone and fiscal year settings from the time range controls by clicking the Change time settings button.

      Note

      Dashboard data updates once an hour.

You can change the refresh interval using the refresh interval dropdown.

The dashboard has 2 consumption tables that display the total consumption of resources. Hover over an entry in the table to filter it in or out of the table.

      The Total consumption table includes consumption details based on the filters selected. Fields include:

      • Project
      • Department
      • GPU hours
      • CPU hours
      • Memory hours
• GPU Idle allocated hours\u2014the portion of the total allocation hours during which the GPUs were idle.
• CPU usage hours\u2014the actual CPU usage time.
• Memory usage time\u2014the actual CPU memory usage time.
• GPU cost (only when configured)
• CPU cost (only when configured)
• CPU memory cost (only when configured)

      The Total department consumption table includes consumption details for each department, or details for departments selected in the filters. Fields include:

      • Department
      • GPU hours
      • CPU hours
      • Memory hours
• GPU Idle allocated hours\u2014the portion of the total allocation hours during which the GPUs were idle.
• CPU usage hours\u2014the actual CPU usage time.
• Memory usage time\u2014the actual CPU memory usage time.
• GPU cost (only when configured)
• CPU cost (only when configured)
• CPU memory cost (only when configured)

      The dashboard has a graph of the GPU allocation over time.


      The dashboard has a graph of the Project over-quota GPU consumption.


      "},{"location":"platform-admin/performance/dashboard-analysis/#quota-management-dashboard","title":"Quota management dashboard","text":"

      The Quota management dashboard provides an efficient means to monitor and manage resource utilization within the AI cluster. The dashboard is divided into sections with essential metrics and data visualizations to identify resource usage patterns, potential bottlenecks, and areas for optimization. The sections of the dashboard include:

      • Add Filter
      • Quota / Total
      • Allocated / Quota
      • Pending workloads
      • Quota by node pool
      • Allocation by node pool
      • Pending workloads by node pool
      • Departments with lowest allocation by node pool
      • Projects with lowest allocation ratio by node pool
      • Over time allocation / quota
      "},{"location":"platform-admin/performance/dashboard-analysis/#add-filter","title":"Add Filter","text":"

      Use the Add Filter dropdown to select filters for the dashboard. The filters will change the data shown on the dashboard. Available filters are:

      • Departments
      • Projects
      • Nodes

Select a filter from the dropdown, then select an item from the list, and press apply.

      Note

      You can create a filter with multiple categories, but you can use each category and item only once.

      "},{"location":"platform-admin/performance/dashboard-analysis/#quota-total","title":"Quota / Total","text":"

      This section shows the number of GPUs that are in the quota based on the filter selection. The quota of GPUs is the number of GPUs that are reserved for use.

      "},{"location":"platform-admin/performance/dashboard-analysis/#allocated-quota","title":"Allocated / Quota","text":"

      This section shows the number of GPUs that are allocated based on the filter selection. Allocated GPUs are the number of GPUs that are being used.

      "},{"location":"platform-admin/performance/dashboard-analysis/#pending-workloads","title":"Pending workloads","text":"

This section shows the number of workloads that are pending based on the filter selection. Pending workloads are workloads that have not yet started.

      "},{"location":"platform-admin/performance/dashboard-analysis/#quota-by-node-pool","title":"Quota by node pool","text":"

      This section shows the quota of GPUs by node pool based on the filter. The quota is the number of GPUs that are reserved for use. You can drill down into the data in this section by pressing on the graph or the link at the bottom of the section.

      "},{"location":"platform-admin/performance/dashboard-analysis/#allocation-by-node-pool","title":"Allocation by node pool","text":"

      This section shows the allocation of GPUs by node pool based on the filter. The allocation is the number of GPUs that are being used. You can drill down into the data in this section by pressing on the graph or the link at the bottom of the section.

      "},{"location":"platform-admin/performance/dashboard-analysis/#pending-workloads-by-node-pool","title":"Pending workloads by node pool","text":"

      This section shows the number of pending workloads by node pool. You can drill down into the data in this section by pressing on the graph or the link at the bottom of the section.

      "},{"location":"platform-admin/performance/dashboard-analysis/#departments-with-lowest-allocation-by-node-pool","title":"Departments with lowest allocation by node pool","text":"

      This section shows the departments with the lowest allocation of GPUs by percentage relative to the total number of GPUs.

      "},{"location":"platform-admin/performance/dashboard-analysis/#projects-with-lowest-allocation-ratio-by-node-pool","title":"Projects with lowest allocation ratio by node pool","text":"

This section shows the projects with the lowest allocation of GPUs by percentage relative to the total number of GPUs.

      "},{"location":"platform-admin/performance/dashboard-analysis/#over-time-allocation-quota","title":"Over time allocation / quota","text":"

      This section shows the allocation of GPUs from the quota over a period of time.

      "},{"location":"platform-admin/performance/reports/","title":"Reports","text":"

      This article explains the procedure of managing reports in Run:ai.

      Reports allow users to access and organize large amounts of data in a clear, CSV-formatted layout. They enable users to monitor resource consumption, analyze trends, and make data-driven decisions to optimize their AI workloads effectively.

      Note

The Reports feature is enabled by default for SaaS tenants. To enable the feature for self-hosted tenants, additional configuration is required. See Enabling Reports for self-hosted accounts.

      "},{"location":"platform-admin/performance/reports/#report-types","title":"Report types","text":"

Currently, only \u201cConsumption\u201d reports are available. These provide insights into the consumption of resources such as GPU, CPU, and CPU memory across organizational units.

      "},{"location":"platform-admin/performance/reports/#reports-table","title":"Reports table","text":"

      The Reports table can be found under Analytics in the Run:ai platform.

      The Reports table provides a list of all the reports defined in the platform and allows you to manage them.

      Users are able to access the reports they have generated themselves. Users with project viewing permissions throughout the tenant can access all reports within the tenant.

      The Reports table comprises the following columns:

• Report\u2014The name of the report
• Description\u2014The description of the report
• Status\u2014The different lifecycle phases and representation of the report condition
• Type\u2014The type of the report, e.g., consumption
• Created by\u2014The user who created the report
• Creation time\u2014The timestamp of when the report was created
• Collection period\u2014The period in which the data was collected
"},{"location":"platform-admin/performance/reports/#reports-status","title":"Reports status","text":"

      The following table describes the reports' condition and whether they were created successfully:

• Ready\u2014The report is ready and can be downloaded as a CSV
• Pending\u2014The report is in the queue and waiting to be processed
• Failed\u2014The report couldn\u2019t be created
• Processing...\u2014The report is being created
"},{"location":"platform-admin/performance/reports/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      "},{"location":"platform-admin/performance/reports/#creating-a-new-report","title":"Creating a\u00a0new report","text":"

      Before you start, make sure you have a project.

      To create a new report:

      1. Click +NEW REPORT
      2. Enter a name for the report (if the name already exists, you will need to choose a different one)
      3. Optional: Provide a description of the report
      4. Set the report\u2019s data collection period
        • Start date - The date at which the report data commenced
        • End date - The date at which the report data concluded
      5. Set the report segmentation and filters
        • Filters - Filter by project or department name
        • Segment by - Data is collected and aggregated based on the segment
      6. Click CREATE REPORT
      "},{"location":"platform-admin/performance/reports/#deleting-a-report","title":"Deleting a report","text":"
      1. Select the report you want to delete
      2. Click DELETE
      3. On the dialog, click DELETE to confirm
      "},{"location":"platform-admin/performance/reports/#downloading-a-report","title":"Downloading a report","text":"

      Note

To download a report, it must be in the \u201cReady\u201d status.

      1. Select the report you want to download
      2. Click DOWNLOAD CSV
      "},{"location":"platform-admin/performance/reports/#using-api","title":"Using API","text":"

      To view the available actions, go to the Reports API reference.

      "},{"location":"platform-admin/performance/reports/#enabling-reports-for-self-hosted-accounts","title":"Enabling Reports for self-hosted accounts","text":"

Reports must be saved in an S3-compatible storage solution. To activate this feature for self-hosted accounts, the storage must be linked to the account by adding the configuration to two ConfigMap objects in the Control Plane.

      1. Edit the runai-backend-org-unit-service ConfigMap:

  kubectl edit cm runai-backend-org-unit-service -n runai-backend

      2. Add the following lines to the file:

  S3_ENDPOINT: <S3_END_POINT_URL>
  S3_ACCESS_KEY_ID: <S3_ACCESS_KEY_ID>
  S3_ACCESS_KEY: <S3_ACCESS_KEY>
  S3_USE_SSL: "true"
  S3_BUCKET: <BUCKET_NAME>

      3. Edit the runai-backend-metrics-service ConfigMap:

  kubectl edit cm runai-backend-metrics-service -n runai-backend

      4. Add the following lines to the file:

  S3_ENDPOINT: <S3_END_POINT_URL>
  S3_ACCESS_KEY_ID: <S3_ACCESS_KEY_ID>
  S3_ACCESS_KEY: <S3_ACCESS_KEY>
  S3_USE_SSL: "true"

5. In addition, in the same file, under the config.yaml section, add the following right after log_level: \"Info\"\n:

  reports:\n s3_config:\n bucket: \"<BUCKET_NAME>\"\n

6. Restart the deployments (an optional verification sketch follows these steps):

  kubectl rollout restart deployment runai-backend-metrics-service runai-backend-org-unit-service -n runai-backend

      7. Refresh the page to see Reports under Analytics in the Run:ai platform.
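
Optionally, before refreshing the page, you can verify that the restart in step 6 has completed, for example:

  kubectl rollout status deployment runai-backend-metrics-service -n runai-backend
  kubectl rollout status deployment runai-backend-org-unit-service -n runai-backend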

      "},{"location":"platform-admin/workloads/assets/compute/","title":"Compute Resources","text":"

      This article explains what compute resources are and how to create and use them.

      Compute resources are one type of workload asset. A compute resource is a template that simplifies how workloads are submitted and can be used by AI practitioners when they submit their workloads.

      A compute resource asset is a preconfigured building block that encapsulates all the specifications of compute requirements for the workload including:

      • GPU devices and GPU memory
      • CPU memory and CPU compute
      "},{"location":"platform-admin/workloads/assets/compute/#compute-resource-table","title":"Compute resource table","text":"

      The Compute resource table can be found under Workload manager in the Run:ai UI.

      The Compute resource table provides a list of all the compute resources defined in the platform and allows you to manage them.

      The Compute resource table consists of the following columns:

• Compute resource\u2014The name of the compute resource
• Description\u2014A description of the essence of the compute resource
• GPU devices request per pod\u2014The number of requested physical devices per pod of the workload that uses this compute resource
• GPU memory request per device\u2014The amount of GPU memory per requested device that is granted to each pod of the workload that uses this compute resource
• CPU memory request\u2014The minimum amount of CPU memory per pod of the workload that uses this compute resource
• CPU memory limit\u2014The maximum amount of CPU memory per pod of the workload that uses this compute resource
• CPU compute request\u2014The minimum number of CPU cores per pod of the workload that uses this compute resource
• CPU compute limit\u2014The maximum number of CPU cores per pod of the workload that uses this compute resource
• Scope\u2014The scope of this compute resource within the organizational tree. Click the name of the scope to view the organizational tree diagram
• Workload(s)\u2014The list of workloads associated with the compute resource
• Template(s)\u2014The list of workload templates that use this compute resource
• Created by\u2014The name of the user who created the compute resource
• Creation time\u2014The timestamp of when the compute resource was created
• Last updated\u2014The timestamp of when the compute resource was last updated
• Cluster\u2014The cluster that the compute resource is associated with
"},{"location":"platform-admin/workloads/assets/compute/#workloads-associated-with-the-compute-resource","title":"Workloads associated with the compute resource","text":"

      Click one of the values in the Workload(s) column to view the list of workloads and their parameters.

• Workload\u2014The workload that uses the compute resource
• Type\u2014Workspace/Training/Inference
• Status\u2014Represents the workload lifecycle. See the full list of workload statuses.
"},{"location":"platform-admin/workloads/assets/compute/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      "},{"location":"platform-admin/workloads/assets/compute/#adding-new-compute-resource","title":"Adding new compute resource","text":"

      To add a new compute resource:

      1. Go to the Compute resource table
      2. Click +NEW COMPUTE RESOURCE
      3. Select under which cluster to create the compute resource
      4. Select a scope
      5. Enter a name for the compute resource. The name must be unique.
      6. Optional: Provide a description of the essence of the compute resource
      7. Set the resource types needed within a single node (the Run:ai scheduler tries to match a single node that complies with the compute resource for each of the workload\u2019s pods)

        • GPU

      • GPU devices per pod\u2014The number of devices (physical GPUs) per pod (for example, if you request 3 devices per pod and the running workload using this compute resource consists of 3 pods, 9 physical GPU devices are used in total)

          Note

      • When set to zero, the workload using this compute resource neither requests nor uses GPU resources while running
      • You can set any number of GPU devices and specify the memory requirement either as a portion of a device (1..100) or as a memory size in GB or MB units per device
          • GPU memory per device
            • Select the memory request format
              • % (of device) - Fraction of a GPU device\u2019s memory
              • MB (memory size) - An explicit GPU memory unit
              • GB (memory size) - An explicit GPU memory unit
      • Set the memory Request - The minimum amount of GPU memory that is provisioned per device. This means that any pod of a running workload that uses this compute resource receives this amount of GPU memory for each device the pod utilizes
      • Optional: Set the memory Limit - The maximum amount of GPU memory that is provisioned per device. This means that any pod of a running workload that uses this compute resource receives at most this amount of GPU memory for each device the pod utilizes. To set a Limit, first enable the limit toggle. The limit value must be equal to or higher than the request.

          Note

          • GPU memory limit is disabled by default. If you cannot see the Limit toggle in the compute resource form, then it must be enabled by your Administrator, under General settings \u2192 Resources \u2192 GPU resource optimization
          • When a Limit is set and is bigger than the Request, the scheduler allows each pod to reach the maximum amount of GPU memory in an opportunistic manner (only upon availability).
      • If the GPU memory Limit is bigger than the Request, the pod is prone to being killed by the Run:ai toolkit (out-of-memory signal). The greater the difference between the GPU memory used and the request, the higher the risk of being killed
          • If GPU resource optimization is turned off, the minimum and maximum are in fact equal
        • CPU

          • CPU compute per pod
            • Select the units for the CPU compute (Cores / Millicores)
            • Set the CPU compute Request - the minimum amount of CPU compute that is provisioned per pod. This means that any pod of a running workload that uses this compute resource, receives this amount of CPU compute for each pod.
            • Optional: Set the CPU compute Limit - The maximum amount of CPU compute that is provisioned per pod. This means that any pod of a running workload that uses this compute resource, receives at most this amount of CPU compute. To set a Limit, first enable the limit toggle. The limit value must be equal to or higher than the request. By default, the limit is set to \u201cUnlimited\u201d - which means that the pod may consume all the node's free CPU compute resources.
          • CPU memory per pod
            • Select the units for the CPU memory (MB / GB)
            • Set the CPU memory Request - The minimum amount of CPU memory that is provisioned per pod. This means that any pod of a running workload that uses this compute resource, receives this amount of CPU memory for each pod.
            • Optional: Set the CPU memory Limit - The maximum amount of CPU memory that is provisioned per pod. This means that any pod of a running workload that uses this compute resource, receives at most this amount of CPU memory. To set a Limit, first enable the limit toggle. The limit value must be equal to or higher than the request. By default, the limit is set to \u201cUnlimited\u201d - Meaning that the pod may consume all the node's free CPU memory resources.

          Note

      If the CPU memory Limit is bigger than the Request, the pod is prone to being killed by the operating system (out-of-memory signal). The greater the difference between the CPU memory used and the request, the higher the risk of being killed.

      8. Optional: More settings

  • Increase shared memory size\u2014When enabled, the shared memory size available to the pod is increased from the default 64MB to the node's total available memory, or to the CPU memory limit if set above.
  • Set extended resource(s)\u2014Click +EXTENDED RESOURCES to add resource/quantity pairs. For more information on how to set extended resources, see the Extended resources and Quantity guides.
      9. Click CREATE COMPUTE RESOURCE

        Note

        It is also possible to add compute resources directly when creating a specific Workspace, training or inference workload.

      "},{"location":"platform-admin/workloads/assets/compute/#editing-a-compute-resource","title":"Editing a compute resource","text":"

      To edit a compute resource:

      1. Select the compute resource you want to edit
      2. Click Edit
      3. Click SAVE COMPUTE RESOURCE

      Note

      The already bound workload that is using this asset will not be affected.

      "},{"location":"platform-admin/workloads/assets/compute/#copying-a-compute-resource","title":"Copying a compute resource","text":"

      To make a copy of an existing compute resource:

      1. Select the compute resource you want to copy
      2. Click MAKE A COPY
3. Enter a name for the compute resource. The name must be unique.
4. Update the compute resource as needed
      5. Click CREATE COMPUTE RESOURCE
      "},{"location":"platform-admin/workloads/assets/compute/#deleting-a-compute-resource","title":"Deleting a compute resource","text":"
      1. Select the compute resource you want to delete
      2. Click DELETE
      3. On the dialog, click DELETE to confirm

      Note

      The already bound workload that is using this asset will not be affected.

      "},{"location":"platform-admin/workloads/assets/compute/#using-api","title":"Using API","text":"

      Go to the Compute resources API reference to view the available actions

      "},{"location":"platform-admin/workloads/assets/credentials/","title":"Credentials","text":"

      This article explains what credentials are and how to create and use them.

Credentials are a type of workload asset that simplifies the complexities of Kubernetes secrets. They store and mask sensitive access information, such as passwords, tokens, and access keys, which are necessary for gaining access to various resources.

      Credentials are crucial for the security of AI workloads and the resources they require, as they restrict access to authorized users, verify identities, and ensure secure interactions. By enforcing the protection of sensitive data, credentials help organizations comply with industry regulations, fostering a secure environment overall.

      Essentially, credentials enable AI practitioners to access relevant protected resources, such as private data sources and Docker images, thereby streamlining the workload submission process.

      "},{"location":"platform-admin/workloads/assets/credentials/#credentials-table","title":"Credentials table","text":"

      The Credentials table can be found under Workload manager in the Run:ai User interface.

      The Credentials table provides a list of all the credentials defined in the platform and allows you to manage them.

      The Credentials table comprises the following columns:

• Credentials\u2014The name of the credentials
• Description\u2014A description of the credentials
• Type\u2014The type of credentials, e.g., Docker registry
• Status\u2014The different lifecycle phases and representation of the credentials\u2019 condition
• Scope\u2014The scope of these credentials within the organizational tree. Click the name of the scope to view the organizational tree diagram
• Kubernetes name\u2014The unique name of the credentials\u2019 Kubernetes resource as it appears in the cluster
• Environment(s)\u2014The environment(s) that are associated with the credentials
• Data source(s)\u2014The private data source(s) that are accessed using the credentials
• Created by\u2014The user who created the credentials
• Creation time\u2014The timestamp of when the credentials were created
• Cluster\u2014The cluster with which the credentials are associated
"},{"location":"platform-admin/workloads/assets/credentials/#credentials-status","title":"Credentials status","text":"

      The following table describes the credentials\u2019 condition and whether they were created successfully for the selected scope.

• No issues found\u2014No issues were found while creating the credentials (this status may change while propagating the credentials to the selected scope)
• Issues found\u2014Issues were found while propagating the credentials
• Issues found\u2014Failed to access the cluster
• Creating\u2026\u2014Credentials are being created
• Deleting\u2026\u2014Credentials are being deleted
• No status\u2014When the credentials\u2019 scope is an account, or the current version of the cluster is not up to date, the status cannot be displayed
"},{"location":"platform-admin/workloads/assets/credentials/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then click \u2018Download as CSV\u2019. Export to CSV is limited to 20,000 rows.
      • Refresh - Click REFRESH to update the table with the latest data
      "},{"location":"platform-admin/workloads/assets/credentials/#adding-new-credentials","title":"Adding new credentials","text":"

      Creating credentials is limited to specific roles.

      To add a new credential:

      1. Go to the Credentials table:
      2. Click +NEW CREDENTIALS
      3. Select the credential type from the list Follow the step-by-step guide for each credential type:
      "},{"location":"platform-admin/workloads/assets/credentials/#docker-registry","title":"Docker registry","text":"

      These credentials allow users to authenticate and pull images from a Docker registry, enabling access to containerized applications and services.

After the credentials are created, they are used automatically when pulling images.

      1. Select a scope.
      2. Enter a name for the credential. The name must be unique.
      3. Optional: Provide a description of the credentials
      4. Set how the credential is created
        • Existing secret (in the cluster) This option applies when the purpose is to create credentials based on an existing secret
          • Select a secret from the list (The list is empty if no secrets were created in advance)
        • New secret (recommended) A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
          • Enter the username, password, and Docker registry URL
      5. Click CREATE CREDENTIALS

      After the credentials are created, check their status to monitor their proper creation across the selected scope.

      "},{"location":"platform-admin/workloads/assets/credentials/#access-key","title":"Access key","text":"

      These credentials are unique identifiers used to authenticate and authorize access to cloud services or APIs, ensuring secure communication between applications. They typically consist of two parts:

      • An access key ID
      • A secret access key

      The purpose of this credential type is to allow access to restricted data.

      1. Select a scope.
      2. Enter a name for the credential. The name must be unique.
      3. Optional: Provide a description of the credential
      4. Set how the credential is created
        • Existing secret (in the cluster) This option applies when the purpose is to create credentials based on an existing secret
          • Select a secret from the list (The list is empty if no secrets were created in advance)
        • New secret (recommended) A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
          • Enter the Access key and Access secret
      5. Click CREATE CREDENTIALS

      After the credentials are created, check their status to monitor their proper creation across the selected scope.

      "},{"location":"platform-admin/workloads/assets/credentials/#username-password","title":"Username & password","text":"

      These credentials require a username and corresponding password to access various resources, ensuring that only authorized users can log in.

      The purpose of this credential type is to allow access to restricted data.

      1. Select a scope
      2. Enter a name for the credential. The name must be unique.
      3. Optional: Provide a description of the credentials
      4. Set how the credential is created
        • Existing secret (in the cluster) This option applies when the purpose is to create credentials based on an existing secret
          • Select a secret from the list (The list is empty if no secrets were created in advance)
        • New secret (recommended) A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
          • Enter the username and password
      5. Click CREATE CREDENTIALS

      After the credentials are created, check their status to monitor their proper creation across the selected scope.

      "},{"location":"platform-admin/workloads/assets/credentials/#generic-secret","title":"Generic secret","text":"

      These credentials are a flexible option that consists of multiple keys & values and can store various sensitive information, such as API keys or configuration data, to be used securely within applications.

      The purpose of this credential type is to allow access to restricted data.

      1. Select a scope
      2. Enter a name for the credential. The name must be unique.
      3. Optional: Provide a description of the credentials
      4. Set how the credential is created
        • Existing secret (in the cluster) This option applies when the purpose is to create credentials based on an existing secret
          • Select a secret from the list (The list is empty if no secrets were created in advance)
        • New secret (recommended) A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
          • Click +KEY & VALUE - to add key/value pairs to store in the new secret
      5. Click CREATE CREDENTIALS
      "},{"location":"platform-admin/workloads/assets/credentials/#editing-credentials","title":"Editing credentials","text":"

      To rename a credential:

      1. Select the credential from the table
      2. Click Rename to edit its name and description
      "},{"location":"platform-admin/workloads/assets/credentials/#deleting-credentials","title":"Deleting credentials","text":"

      To delete a credential:

      1. Select the credential you want to delete
      2. Click DELETE
      3. In the dialog, click DELETE to confirm

      Note

Credentials cannot be deleted if they are being used by a workload or a template.

      "},{"location":"platform-admin/workloads/assets/credentials/#using-credentials","title":"Using credentials","text":"

You can use credentials (secrets) in various ways within the system:

      "},{"location":"platform-admin/workloads/assets/credentials/#access-private-data-sources","title":"Access private data sources","text":"

      To access private data sources, attach credentials to data sources of the following types: Git, S3 Bucket

      "},{"location":"platform-admin/workloads/assets/credentials/#use-directly-within-the-container","title":"Use directly within the container","text":"

      To use the secret directly from within the container, you can choose between the following options

      1. Get the secret mounted to the file system by using the Generic secret data source
      2. Get the secret as an environment variable injected into the container. There are two equivalent ways to inject the environment variable.

  a. By adding it to the Environment asset.
  b. By adding it ad hoc as part of the workload (see the sketch after this list).
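
For reference, option (b) above ultimately corresponds to a standard Kubernetes environment variable sourced from a secret. A minimal sketch of a container spec fragment, assuming a secret named my-generic-secret with a key named API_KEY (both names are hypothetical):

  env:
    - name: API_KEY                       # environment variable name seen inside the container
      valueFrom:
        secretKeyRef:
          name: my-generic-secret         # hypothetical secret backing the credentials
          key: API_KEY                    # hypothetical key stored in that secret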

      "},{"location":"platform-admin/workloads/assets/credentials/#creating-secrets-in-advance","title":"Creating secrets in advance","text":"

      Add secrets in advance to be used when creating credentials via the Run:ai UI.

Follow the steps below for each required scope (a cluster-scope example sketch follows the steps):

Cluster scope:
      1. Create the secret in the Run:ai namespace (runai)
      2. To authorize Run:ai to use the secret, label it: run.ai/cluster-wide: \"true\"
      3. Label the secret with the correct credential type:
        1. Docker registry - run.ai/resource: \"docker-registry\"
        2. Access key - run.ai/resource: \"access-key\"
        3. Username and password - run.ai/resource: \"password\"
  4. Generic secret - run.ai/resource: \"generic\"
Department scope:
1. Create the secret in the Run:ai namespace (runai)
      2. To authorize Run:ai to use the secret, label it: run.ai/department: \"<department id>\"
      3. Label the secret with the correct credential type:
        1. Docker registry - run.ai/resource: \"docker-registry\"
        2. Access key - run.ai/resource: \"access-key\"
        3. Username and password - run.ai/resource: \"password\"
        4. Generic secret - run.ai/resource: \"generic\"
Project scope:
1. Create the secret in the project\u2019s namespace
      2. Label the secret with the correct credential type:
        1. Docker registry - run.ai/resource: \"docker-registry\"
        2. Access key - run.ai/resource: \"access-key\"
        3. Username and password - run.ai/resource: \"password\"
        4. Generic secret - run.ai/resource: \"generic\"

      The secret is now displayed for that scope in the list of existing secrets.
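
As an example, the cluster-scope steps above could be carried out as follows for Docker registry credentials; the secret name, registry URL, username, and password are placeholders:

  kubectl create secret docker-registry my-registry-secret -n runai \
    --docker-server=<REGISTRY_URL> \
    --docker-username=<USERNAME> \
    --docker-password=<PASSWORD>
  kubectl label secret my-registry-secret -n runai \
    run.ai/cluster-wide="true" \
    run.ai/resource="docker-registry"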

      "},{"location":"platform-admin/workloads/assets/credentials/#using-api","title":"Using API","text":"

      To view the available actions, go to the Credentials API reference

      "},{"location":"platform-admin/workloads/assets/data-volumes/","title":"Data Volumes","text":"

      Data volumes offer a powerful solution for storing, managing, and sharing AI training data within the Run:ai platform. They promote collaboration, simplify data access control, and streamline the AI development lifecycle.

      Data volumes are snapshots of datasets stored in Kubernetes Persistent Volume Claims (PVCs). They act as a central repository for training data.

      "},{"location":"platform-admin/workloads/assets/data-volumes/#why-use-a-data-volume","title":"Why use a data volume?","text":"
1. Sharing with multiple scopes\u2014Unlike other Run:ai data sources, data volumes can be shared across projects, departments, or clusters, encouraging data reuse and collaboration within the organization.
2. Storage saving\u2014A single copy of the data can be used across multiple scopes.
      "},{"location":"platform-admin/workloads/assets/data-volumes/#typical-use-cases","title":"Typical use cases","text":"
1. Sharing large data sets\u2014In large organizations, data is often stored in a remote location, which can be a barrier to large model training. Even if the data is transferred into the cluster, sharing it easily with multiple users is still challenging. Data volumes help share the data seamlessly, with maximum security and control.
2. Sharing data with colleagues\u2014When training results, generated data sets, or other artifacts need to be shared with team members, data volumes can help make the data available easily.
      "},{"location":"platform-admin/workloads/assets/data-volumes/#prerequisites","title":"Prerequisites","text":"

      To create a data volume, there must be a project with a PVC in its namespace.

      Working with data volumes is currently available using the API. To view the available actions, go to the Data volumes API reference.

      "},{"location":"platform-admin/workloads/assets/data-volumes/#adding-a-new-data-volume","title":"Adding a new data volume","text":"

      Data volume creation is limited to specific roles

      "},{"location":"platform-admin/workloads/assets/data-volumes/#adding-scopes-for-a-data-volume","title":"Adding scopes for a data volume","text":"

      Data volume sharing (adding scopes) is limited to specific roles

      Once created, the data volume is available to its originating project (see the prerequisites above).

      Data volumes can be shared with additional scopes in the organization.

      "},{"location":"platform-admin/workloads/assets/data-volumes/#who-can-use-a-data-volume","title":"Who can use a data volume?","text":"

      Data volumes are used when submitting workloads. Any user, application or SSO group with a role that has permissions to create workloads can also use data volumes.

      Researchers can list available data volumes within their permitted scopes for easy selection.

      "},{"location":"platform-admin/workloads/assets/datasources/","title":"Data Sources","text":"

      This article explains what data sources are and how to create and use them.

      Data sources are a type of workload asset and represent a location where data is actually stored. They may represent a remote data location, such as NFS, Git, or S3, or a Kubernetes local resource, such as PVC, ConfigMap, HostPath, or Secret.

      This configuration simplifies the mapping of the data into the workload\u2019s file system and handles the mounting process during workload creation for reading and writing. These data sources are reusable and can be easily integrated and used by AI practitioners while submitting workloads across various scopes.

      "},{"location":"platform-admin/workloads/assets/datasources/#data-sources-table","title":"Data sources table","text":"

      The data sources table can be found under Workload manager in the Run:ai platform.

      The data sources table provides a list of all the data sources defined in the platform and allows you to manage them.

      The data sources table comprises the following columns:

• Data source\u2014The name of the data source
• Description\u2014A description of the data source
• Type\u2014The type of data source connected, e.g., S3 bucket, PVC, or others
• Status\u2014The different lifecycle phases and representation of the data source condition
• Scope\u2014The scope of the data source within the organizational tree. Click the scope name to view the organizational tree diagram
• Kubernetes name\u2014The unique name of the data source\u2019s Kubernetes resource as it appears in the cluster
• Workload(s)\u2014The list of existing workloads that use the data source
• Template(s)\u2014The list of workload templates that use the data source
• Created by\u2014The user who created the data source
• Creation time\u2014The timestamp for when the data source was created
• Cluster\u2014The cluster that the data source is associated with
"},{"location":"platform-admin/workloads/assets/datasources/#data-sources-status","title":"Data sources status","text":"

      The following table describes the data sources' condition and whether they were created successfully for the selected scope.

| Status | Description |
| --- | --- |
| No issues found | No issues were found while creating the data source |
| Issues found | Issues were found while propagating the data source credentials |
| Issues found | The data source couldn\u2019t be created at the cluster |
| Creating\u2026 | The data source is being created |
| No status / \u201c-\u201d | When the data source\u2019s scope is an account, the current version of the cluster is not up to date, or the asset is not a cluster-syncing entity, the status can\u2019t be displayed |

"},{"location":"platform-admin/workloads/assets/datasources/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then click \u2018Download as CSV\u2019
      • Refresh - Click REFRESH to update the table with the latest data
      "},{"location":"platform-admin/workloads/assets/datasources/#adding-a-new-data-source","title":"Adding a new data source","text":"

      To create a new data source:

      1. Click +NEW DATA SOURCE
      2. Select the data source type from the list. Follow the step-by-step guide for each data source type:
      "},{"location":"platform-admin/workloads/assets/datasources/#nfs","title":"NFS","text":"

      A Network File System (NFS) is a Kubernetes concept used for sharing storage in the cluster among different pods. Like a PVC, the NFS volume\u2019s content remains preserved, even outside the lifecycle of a single pod. However, unlike PVCs, which abstract storage management, NFS provides a method for network-based file sharing. The NFS volume can be pre-populated with data and can be mounted by multiple pod writers simultaneously. At Run:ai, an NFS-type data source is an abstraction that is mapped directly to a Kubernetes NFS volume. This integration allows multiple workloads under various scopes to mount and present the NFS data source.
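For reference, the minimal sketch below shows the underlying Kubernetes NFS volume concept that an NFS-type data source maps to. The server, path, and mount path are placeholder values, and the exact resources Run:ai generates when mounting the data source may differ.

# Minimal sketch of a Kubernetes NFS volume mount (illustrative only).
# Server, path, and mount path are placeholder values.
apiVersion: v1
kind: Pod
metadata:
  name: nfs-example
spec:
  containers:
    - name: main
      image: busybox
      command: ["sleep", "infinity"]
      volumeMounts:
        - name: nfs-data
          mountPath: /data            # maps to the container path set in the steps below
          readOnly: true              # corresponds to "Prevent data modification"
  volumes:
    - name: nfs-data
      nfs:
        server: nfs.example.com       # NFS server (host name or host IP)
        path: /exports/datasets       # NFS path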

      1. Select the cluster under which to create this data source
      2. Select a scope
      3. Enter a name for the data source. The name must be unique.
      4. Optional: Provide a description of the data source
      5. Set the data origin
        • Enter the NFS server (host name or host IP)
        • Enter the NFS path
      6. Set the data target location
        • Container path
      7. Optional: Restrictions
        • Prevent data modification - When enabled, the data will be mounted with read-only permissions
      8. Click CREATE DATA SOURCE
      "},{"location":"platform-admin/workloads/assets/datasources/#pvc","title":"PVC","text":"

      A Persistent Volume Claim (PVC) is a Kubernetes concept used for managing storage in the cluster, which can be provisioned by an administrator or dynamically by Kubernetes using a StorageClass. PVCs allow users to request specific sizes and access modes (read/write once, read-only many). Run:ai ensures that data remains consistent and accessible across various scopes and workloads, beyond the lifecycle of individual pods, which is efficient while working with large datasets typically associated with AI projects.
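For reference, the sketch below shows a standard Kubernetes PVC that matches the options described in the steps that follow. The name, namespace, storage class, and size are placeholder values.

# Minimal sketch of a Kubernetes PersistentVolumeClaim (illustrative only).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: my-dataset-pvc
  namespace: runai-team-a        # the project's namespace (placeholder)
spec:
  storageClassName: standard     # "Custom storage class" option; placeholder name
  accessModes:
    - ReadWriteMany              # "Read-write by many nodes"
  volumeMode: Filesystem         # or Block
  resources:
    requests:
      storage: 100Gi             # claim size and units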

      1. Select the cluster under which to create this data source
      2. Select a scope
      3. Enter a name for the data source. The name must be unique.
      4. Optional: Provide a description of the data source
      5. Select PVC:
        • Existing PVC This option is relevant when the purpose is to create a PVC-type data source based on an existing PVC in the cluster
          • Select a PVC from the list - (The list is empty if no existing PVCs were created in advance)
        • New PVC - creates a new PVC in the cluster. New PVCs are not added to the Existing PVCs list. When creating a PVC-type data source and selecting the \u2018New PVC\u2019 option, the PVC is immediately created in the cluster (even if no workload has requested this PVC).
      6. Select the storage class
        • None - Proceed without defining a storage class
        • Custom storage class - This option applies when selecting a storage class based on existing storage classes. To add new storage classes to the storage class list, and for additional information, check Kubernetes storage classes
      7. Select the access mode(s) (multiple modes can be selected)
        • Read-write by one node - The volume can be mounted as read-write by a single node.
        • Read-only by many nodes - The volume can be mounted as read-only by many nodes.
        • Read-write by many nodes - The volume can be mounted as read-write by many nodes.
      8. Set the claim size and its units
      9. Select the volume mode
        • File system (default) - allows the volume to be mounted as a filesystem, enabling the usage of directories and files.
        • Block - exposes the volume as a block storage, which can be formatted or used by applications directly without a filesystem.
      10. Set the data target location
        • container path
      11. Optional: Prevent data modification - When enabled, the data will be mounted with read-only permission.
      12. Click CREATE DATA SOURCE

      After the data source is created, check its status to monitor its proper creation across the selected scope.

      "},{"location":"platform-admin/workloads/assets/datasources/#s3-bucket","title":"S3 Bucket","text":"

      The S3 bucket data source enables the mapping of a remote S3 bucket into the workload\u2019s file system. Similar to a PVC, this mapping remains accessible across different workload executions, extending beyond the lifecycle of individual pods. However, unlike PVCs, data stored in an S3 bucket resides remotely, which may lead to decreased performance during the execution of heavy machine learning workloads. As part of the Run:ai connection to the S3 bucket, you can create credentials in order to access and map private buckets.

      1. Select the cluster under which to create this data source
      2. Select a scope
      3. Enter a name for the data source. The name must be unique.
      4. Optional: Provide a description of the data source
      5. Set the data origin
        • Set the S3 service URL
        • Select the credentials
          • None - for public buckets
          • Credential names - This option is relevant for private buckets based on existing credentials that were created for the scope. To add new credentials to the credentials list, and for additional information, check the Credentials article.
        • Enter the bucket name
      6. Set the data target location
        • container path
      7. Click CREATE DATA SOURCE

      After a private data source is created, check its status to monitor its proper creation across the selected scope.

      "},{"location":"platform-admin/workloads/assets/datasources/#git","title":"Git","text":"

A Git-type data source is a Run:ai integration that enables code to be copied from a Git branch into a dedicated folder in the container. It is mainly used to provide the workload with the latest code repository. As part of the integration with Git, in order to access private repositories, you can add predefined credentials to the data source mapping.

      1. Select the cluster under which to create this data source
      2. Select a scope
      3. Enter a name for the data source. The name must be unique.
      4. Optional: Provide a description of the data source
      5. Set the data origin
        • Set the Repository URL
• Set the Revision (branch, tag, or hash) - If left empty, the 'HEAD' (latest) revision is used
        • Select the credentials
          • None - for public repositories
          • Credential names - This option applies to private repositories based on existing credentials that were created for the scope. To add new credentials to the credentials list, and for additional information, check the Credentials article.
      6. Set the data target location
        • container path
      7. Click CREATE DATA SOURCE

      After a private data source is created, check its status to monitor its proper creation across the selected scope.

      "},{"location":"platform-admin/workloads/assets/datasources/#host-path","title":"Host path","text":"

      A Host path volume is a Kubernetes concept that enables mounting a host path file or a directory on the workload\u2019s file system. Like a PVC, the host path volume\u2019s data persists across workloads under various scopes. It also enables data serving from the hosting node.
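For reference, a minimal sketch of the Kubernetes hostPath volume concept that a host-path data source maps to. The paths below are placeholder values, and the exact resources Run:ai generates may differ.

# Minimal sketch of a Kubernetes hostPath volume mount (illustrative only).
apiVersion: v1
kind: Pod
metadata:
  name: hostpath-example
spec:
  containers:
    - name: main
      image: busybox
      command: ["sleep", "infinity"]
      volumeMounts:
        - name: host-data
          mountPath: /data             # container path (data target location)
  volumes:
    - name: host-data
      hostPath:
        path: /mnt/datasets            # host path (data origin)
        type: Directory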

      1. Select the cluster under which to create this data source
      2. Select a scope
      3. Enter a name for the data source. The name must be unique.
      4. Optional: Provide a description of the data source
      5. Set the data origin
        • host path
      6. Set the data target location
        • container path
      7. Optional: Prevent data modification - When enabled, the data will be mounted with read-only permissions.
      8. Click CREATE DATA SOURCE
      "},{"location":"platform-admin/workloads/assets/datasources/#configmap","title":"ConfigMap","text":"

A ConfigMap data source is a Run:ai abstraction for the Kubernetes ConfigMap concept. The ConfigMap is used mainly for storage that can be mounted on the workload container for non-confidential data. It is usually represented in key-value pairs (e.g., environment variables, command-line arguments, etc.). It allows you to decouple environment-specific system configurations from your container images, so that your applications are easily portable. ConfigMaps must be created on the cluster prior to being used within the Run:ai system.
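Since the ConfigMap must already exist in the cluster, a minimal sketch of one is shown below; the name, namespace, and keys are placeholder values. Apply it with kubectl before selecting it as the data origin.

# Minimal sketch of a Kubernetes ConfigMap created ahead of time (illustrative only).
apiVersion: v1
kind: ConfigMap
metadata:
  name: training-config
  namespace: runai-team-a        # the project's namespace (placeholder)
data:
  LOG_LEVEL: "info"              # simple key-value pair
  app-config.yaml: |             # file-like entry, mounted as a file at the container path
    batch_size: 32
    epochs: 10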

      1. Select the cluster under which to create this data source
      2. Select a scope
      3. Enter a name for the data source. The name must be unique.
      4. Optional: Provide a description of the data source
      5. Set the data origin
        • Select the ConfigMap name (The list is empty if no existing ConfigMaps were created in advance).
      6. Set the data target location
        • container path
      7. Click CREATE DATA SOURCE
      "},{"location":"platform-admin/workloads/assets/datasources/#secret","title":"Secret","text":"

      A secret-type data source enables the mapping of a credential into the workload\u2019s file system. Credentials are a workload asset that simplify the complexities of Kubernetes Secrets. The credentials mask sensitive access information, such as passwords, tokens, and access keys, which are necessary for gaining access to various resources.
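For context, credentials ultimately wrap standard Kubernetes Secrets such as the minimal sketch below; the name, namespace, and keys are placeholder values, and in practice you manage them through the Credentials asset rather than directly.

# Minimal sketch of the kind of Kubernetes Secret a credential wraps (illustrative only).
apiVersion: v1
kind: Secret
metadata:
  name: s3-access
  namespace: runai-team-a        # the project's namespace (placeholder)
type: Opaque
stringData:
  AWS_ACCESS_KEY_ID: "replace-me"
  AWS_SECRET_ACCESS_KEY: "replace-me"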

      1. Select the cluster under which to create this data source
      2. Select a scope
      3. Enter a name for the data source. The name must be unique.
      4. Optional: Provide a description of the data source
      5. Set the data origin
• Select the credentials. To add new credentials, and for additional information, check the Credentials article.
      6. Set the data target location
        • container path
      7. Click CREATE DATA SOURCE

      After the data source is created, check its status to monitor its proper creation across the selected scope.

      Note

      It is also possible to add data sources directly when creating a specific workspace, training or inference workload

      "},{"location":"platform-admin/workloads/assets/datasources/#editing-a-data-source","title":"Editing a data source","text":"

      To edit a data source:

      1. Select the data source from the table
      2. Click Rename to provide it with a new name
      3. Click Copy & Edit to make any changes to the data source
      "},{"location":"platform-admin/workloads/assets/datasources/#deleting-a-data-source","title":"Deleting a data source","text":"

      To delete a data source:

      1. Select the data source you want to delete
      2. Click DELETE
      3. Confirm you want to delete the data source

      Note

It is not possible to delete a data source that is being used by an existing workload or template.

      "},{"location":"platform-admin/workloads/assets/datasources/#using-api","title":"Using API","text":"

      To view the available actions, go to the Data sources API reference.

      "},{"location":"platform-admin/workloads/assets/environments/","title":"Environments","text":"

      This article explains what environments are and how to create and use them.

      Environments are one type of workload asset. An environment consists of a configuration that simplifies how workloads are submitted and can be used by AI practitioners when they submit their workloads.

      An environment asset is a preconfigured building block that encapsulates aspects for the workload such as:

      • Container image and container configuration
      • Tools and connections
      • The type of workload it serves
      "},{"location":"platform-admin/workloads/assets/environments/#environments-table","title":"Environments table","text":"

      The Environments table can be found under Workload manager in the Run:ai platform.

The Environments table provides a list of all the environments defined in the platform and allows you to manage them.

      The Environments table consists of the following columns:

| Column | Description |
| --- | --- |
| Environment | The name of the environment |
| Description | A description of the environment |
| Scope | The scope of this environment within the organizational tree. Click the name of the scope to view the organizational tree diagram |
| Image | The application or service to be run by the workload |
| Workload Architecture | This can be either standard for running workloads on a single node, or distributed for running distributed workloads on multiple nodes |
| Tool(s) | The tools and connection types the environment exposes |
| Workload(s) | The list of existing workloads that use the environment |
| Workload types | The workload types that can use the environment (Workspace / Training / Inference) |
| Template(s) | The list of workload templates that use this environment |
| Created by | The user who created the environment. By default, the Run:ai UI comes with preinstalled environments created by Run:ai |
| Creation time | The timestamp of when the environment was created |
| Last updated | The timestamp of when the environment was last updated |
| Cluster | The cluster with which the environment is associated |

"},{"location":"platform-admin/workloads/assets/environments/#tools-associated-with-the-environment","title":"Tools associated with the environment","text":"

      Click one of the values in the tools column to view the list of tools and their connection type.

| Column | Description |
| --- | --- |
| Tool name | The name of the tool or application the AI practitioner can set up within the environment |
| Connection type | The method by which you can access and interact with the running workload. It's essentially the \"doorway\" through which you can reach and use the tools the workload provides (e.g., node port, external URL, etc.) |

"},{"location":"platform-admin/workloads/assets/environments/#workloads-associated-with-the-environment","title":"Workloads associated with the environment","text":"

      Click one of the values in the Workload(s) column to view the list of workloads and their parameters.

| Column | Description |
| --- | --- |
| Workload | The workload that uses the environment |
| Type | The workload type (Workspace/Training/Inference) |
| Status | Represents the workload lifecycle. See the full list of workload statuses |

"},{"location":"platform-admin/workloads/assets/environments/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
• Download table - Click MORE and then click \u2018Download as CSV\u2019
      "},{"location":"platform-admin/workloads/assets/environments/#environments-created-by-runai","title":"Environments created by Run:ai","text":"

When installing Run:ai, you automatically get the environments created by Run:ai to ease the onboarding process and support different use cases out of the box. These environments are created at the scope of the account.

| Environment | Image |
| --- | --- |
| Jupyter-lab | jupyter/scipy-notebook |
| jupyter-tensorboard | gcr.io/run-ai-demo/jupyter-tensorboard |
| tensorboard | tensorflow/tensorflow:latest |
| llm-server | runai.jfrog.io/core-llm/runai-vllm:v0.6.4-0.10.0 |
| chatbot-ui | runai.jfrog.io/core-llm/llm-app |
| gpt2 | runai.jfrog.io/core-llm/quickstart-inference:gpt2-cpu |

"},{"location":"platform-admin/workloads/assets/environments/#adding-a-new-environment","title":"Adding a new environment","text":"

      Environment creation is limited to specific roles

      To add a new environment:

      1. Go to the Environments table
      2. Click +NEW ENVIRONMENT
      3. Select under which cluster to create the environment
      4. Select a scope
      5. Enter a name for the environment. The name must be unique.
      6. Optional: Provide a description of the essence of the environment
7. Enter the Image URL. If a token or secret is required to pull the image, it is possible to create it via credentials of type docker registry. These credentials are used automatically when the image is pulled (which happens when the workload is submitted)
      8. Set the image pull policy - the condition for when to pull the image from the registry
      9. Set the workload architecture:
        • Standard Only standard workloads can use the environment. A standard workload consists of a single process.
        • Distributed Only distributed workloads can use the environment. A distributed workload consists of multiple processes working together. These processes can run on different nodes.
        • Select a framework from the list.
      10. Set the workload type:
        • Workspace
        • Training
        • Inference
        • When inference is selected, define the endpoint of the model by providing both the protocol and the container\u2019s serving port
      11. Optional: Set the connection for your tool(s). The tools must be configured in the image. When submitting a workload using the environment, it is possible to connect to these tools
• Select the tool from the list (the available tools vary and include IDEs, experiment tracking tools, and more, including a custom tool of your choice)
        • Select the connection type
          • External URL
            • Auto generate A unique URL is automatically created for each workload using the environment
            • Custom URL The URL is set manually
          • Node port
            • Auto generate A unique port is automatically exposed for each workload using the environment
• Custom port Set the port manually
          • Set the container port
      12. Optional: Set a command and arguments for the container running the pod
        • When no command is added, the default command of the image is used (the image entrypoint)
        • The command can be modified while submitting a workload using the environment
        • The argument(s) can be modified while submitting a workload using the environment
      13. Optional: Set the environment variable(s)
        • Click +ENVIRONMENT VARIABLE
        • Enter a name
        • Select the source for the environment variable
        • Custom
          • Enter a value
          • Leave empty
          • Add instructions for the expected value if any
        • Credentials - Select existing credentials as the environment variable
• Select a credential name. To add new credentials to the credentials list, and for additional information, see Credentials.
          • Select a secret key
        • The environment variables can be modified and new variables can be added while submitting a workload using the environment
      14. Optional: Set the container\u2019s working directory to define where the container\u2019s process starts running. When left empty, the default directory is used.
15. Optional: Set where the UID, GID, and supplementary groups are taken from. This can be:
        • From the image
• From the IdP token (only available in SSO installations)
• Custom (manually set) - decide whether the submitter can modify these values upon submission.
        • Set the User ID (UID), Group ID (GID) and the supplementary groups that can run commands in the container
          • Enter UID
          • Enter GID
          • Add Supplementary groups (multiple groups can be added, separated by commas)
• Disable \u2018Allow the values above to be modified within the workload\u2019 if you want the values above to be used as the default
16. Optional: Select Linux capabilities - Grant certain privileges to a container without granting all the privileges of the root user (see the sketch after these steps).
      17. Click CREATE ENVIRONMENT
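For reference, a minimal sketch of how the UID/GID, supplementary groups, and Linux capabilities from steps 15 and 16 are expressed at the Kubernetes level; the values are placeholders, and the exact spec Run:ai renders from the environment may differ.

# Minimal sketch of Kubernetes security context settings (illustrative only).
apiVersion: v1
kind: Pod
metadata:
  name: security-context-example
spec:
  securityContext:
    runAsUser: 1000              # UID
    runAsGroup: 1000             # GID
    supplementalGroups: [2000]   # supplementary groups (comma-separated in the UI)
  containers:
    - name: main
      image: busybox
      command: ["sleep", "infinity"]
      securityContext:
        capabilities:
          add: ["SYS_PTRACE"]    # example Linux capability; grant only what is needed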

      Note

      It is also possible to add environments directly when creating a specific workspace, training or inference workload.

      "},{"location":"platform-admin/workloads/assets/environments/#editing-an-environment","title":"Editing an environment","text":"

      To edit an environment:

      1. Select the environment you want to edit
      2. Click Edit
      3. Click SAVE ENVIRONMENT

      Note

• Workloads that are already using this asset will not be affected.
      • llm-server and chatbot-ui environments cannot be edited.
      "},{"location":"platform-admin/workloads/assets/environments/#copying-an-environment","title":"Copying an environment","text":"

      To make a copy of an existing environment:

      1. Select the environment you want to copy
      2. Click MAKE A COPY
      3. Enter a name for the environment. The name must be unique.
      4. Update the environment
      5. Click CREATE ENVIRONMENT
      "},{"location":"platform-admin/workloads/assets/environments/#deleting-an-environment","title":"Deleting an environment","text":"

      To delete an environment:

      1. Select the environment you want to delete
      2. Click DELETE
      3. On the dialog, click DELETE to confirm

      Note

Workloads that are already using this asset will not be affected.

      "},{"location":"platform-admin/workloads/assets/environments/#using-api","title":"Using API","text":"

      Go to the Environment API reference to view the available actions

      "},{"location":"platform-admin/workloads/assets/overview/","title":"Overview","text":"

      Workload assets enable organizations to:

• Create and reuse preconfigured setups for code, data, storage, and resources, used by AI practitioners to simplify the process of submitting workloads
      • Share the preconfigured setup with a wide audience of AI practitioners with similar needs

      Note

• The creation of assets is possible only via the API and the Run:ai UI
• The submission of workloads using assets is possible only via the Run:ai UI
      "},{"location":"platform-admin/workloads/assets/overview/#workload-asset-types","title":"Workload asset types","text":"

      There are four workload asset types used by the workload:

      • Environments The container image, tools and connections for the workload
      • Data sources The type of data, its origin and the target storage location such as PVCs or cloud storage buckets where datasets are stored
      • Compute resources The compute specification, including GPU and CPU compute and memory
      • Credentials The secrets to be used to access sensitive data, services, and applications such as docker registry or S3 buckets
      "},{"location":"platform-admin/workloads/assets/overview/#asset-scope","title":"Asset scope","text":"

      When a workload asset is created, a scope is required. The scope defines who in the organization can view and/or use the asset.

      Note

When an asset is created via the API, the scope can be the entire account. This is currently an experimental feature.

      "},{"location":"platform-admin/workloads/assets/overview/#who-can-create-an-asset","title":"Who can create an asset?","text":"

      Any subject (user, application, or SSO group) with a role that has permissions to Create an asset, can do so within their scope.

      "},{"location":"platform-admin/workloads/assets/overview/#who-can-use-an-asset","title":"Who can use an asset?","text":"

      Assets are used when submitting workloads. Any subject (user, application or SSO group) with a role that has permissions to Create workloads, can also use assets.

      "},{"location":"platform-admin/workloads/assets/overview/#who-can-view-an-asset","title":"Who can view an asset?","text":"

      Any subject (user, application, or SSO group) with a role that has permission to View an asset, can do so within their scope.

      "},{"location":"platform-admin/workloads/assets/templates/","title":"Workspace Templates","text":"

      This article explains the procedure to manage templates.

A template is a pre-set configuration that is used to quickly configure and submit workloads using existing assets. A template consists of all the assets a workload needs, allowing researchers to submit a workload in a single click or to make subtle adjustments that differentiate workloads from one another.

      "},{"location":"platform-admin/workloads/assets/templates/#workspace-templates-table","title":"Workspace templates table","text":"

      The Templates table can be found under Workload manager in the Run:ai User interface.

      The Templates table provides a list of all the templates defined in the platform, and allows you to manage them.

      Flexible Management

      It is also possible to manage templates directly for a specific user, application, project, or department.

      The Templates table consists of the following columns:

| Column | Description |
| --- | --- |
| Scope | The scope to which the subject has access. Click the name of the scope to see the scope and its subordinates |
| Environment | The name of the environment related to the workspace template |
| Compute resource | The name of the compute resource connected to the workspace template |
| Data source(s) | The name of the data source(s) connected to the workspace template |
| Created by | The subject that created the template |
| Creation time | The timestamp for when the template was created |
| Cluster | The cluster name containing the template |

"},{"location":"platform-admin/workloads/assets/templates/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
      • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
      • Refresh (optional) - Click REFRESH to update the table with the latest data
      • Show/Hide details (optional) - Click to view additional information on the selected row
      "},{"location":"platform-admin/workloads/assets/templates/#adding-a-new-workspace-template","title":"Adding a new workspace template","text":"

      To add a new template:

      1. Click +NEW TEMPLATE
      2. Set the scope for the template
      3. Enter a name for the template
      4. Select the environment for your workload
      5. Select the node resources needed to run your workload - or - Click +NEW COMPUTE RESOURCE

      6. Set the volume needed for your workload

      7. Create a new data source
      8. Set auto-deletion, annotations and labels, as required
      9. Click CREATE TEMPLATE
      "},{"location":"platform-admin/workloads/assets/templates/#editing-a-template","title":"Editing a template","text":"

      To edit a template:

      1. Select the template from the table
      2. Click Rename to provide it with a new name
      3. Click Copy & Edit to make any changes to the template
      "},{"location":"platform-admin/workloads/assets/templates/#deleting-a-template","title":"Deleting a template","text":"

      To delete a template:

      1. Select the template you want to delete
      2. Click DELETE
      3. Confirm you want to delete the template
      "},{"location":"platform-admin/workloads/assets/templates/#using-api","title":"Using API**","text":"

      Go to the Workload template API reference to view the available actions

      "},{"location":"platform-admin/workloads/overviews/introduction-to-workloads/","title":"Introduction to Workloads","text":"

Run:ai enhances visibility and simplifies management by monitoring, presenting, and orchestrating all AI workloads in the clusters it is installed on. Workloads are the fundamental building blocks for consuming resources, enabling AI practitioners such as researchers, data scientists and engineers to efficiently support the entire life cycle of an AI initiative.

      "},{"location":"platform-admin/workloads/overviews/introduction-to-workloads/#workloads-across-the-ai-lifecycle","title":"Workloads across the AI lifecycle","text":"

      A typical AI initiative progresses through several key stages, each with distinct workloads and objectives. With Run:ai, research and engineering teams can host and manage all these workloads to achieve the following:

      • Data preparation: Aggregating, cleaning, normalizing, and labeling data to prepare for training.
      • Training: Conducting resource-intensive model development and iterative performance optimization.
      • Fine-tuning: Adapting pre-trained models to domain-specific data sets while balancing efficiency and performance.
      • Inference: Deploying models for real-time or batch predictions with a focus on low latency and high throughput.
      • Monitoring and optimization: Ensuring ongoing performance by addressing data drift, usage patterns, and retraining as needed.
      "},{"location":"platform-admin/workloads/overviews/introduction-to-workloads/#what-is-a-workload","title":"What is a workload?","text":"

      A workload runs in the cluster, is associated with a namespace, and operates to fulfill its targets, whether that is running to completion for a batch job, allocating resources for experimentation in an integrated development environment (IDE)/notebook, or serving inference requests in production.

      The workload, defined by the AI practitioner, consists of:

      • Container images: This includes the application, its dependencies, and the runtime environment.
      • Compute resources: CPU, GPU, and RAM to execute efficiently and address the workload\u2019s needs.
      • Data sets: The data needed for processing, such as training data sets or input from external databases.
      • Credentials: The access to certain data sources or external services, ensuring proper authentication and authorization.
      "},{"location":"platform-admin/workloads/overviews/introduction-to-workloads/#workload-scheduling-and-orchestration","title":"Workload scheduling and orchestration","text":"

      Run:ai\u2019s core mission is to optimize AI resource usage at scale. This is achieved through efficient scheduling and orchestrating of all cluster workloads using the Run:ai Scheduler. The Scheduler allows the prioritization of workloads across different departments and projects within the organization at large scales, based on the resource distribution set by the system administrator.

      "},{"location":"platform-admin/workloads/overviews/introduction-to-workloads/#runai-and-third-party-workloads","title":"Run:ai and third-party workloads","text":"
      • Run:ai workloads: These workloads are submitted via the Run:ai platform. They are represented by Kubernetes Custom Resource Definitions (CRDs) and APIs. When using Run:ai workloads, a complete Workload and Scheduling Policy solution is offered for administrators to ensure optimizations, governance and security standards are applied.
      • Third-party workloads: These workloads are submitted via third-party applications that use the Run:ai Scheduler. The Run:ai platform manages and monitors these workloads. They enable seamless integrations with external tools, allowing teams and individuals flexibility.
      "},{"location":"platform-admin/workloads/overviews/introduction-to-workloads/#levels-of-support","title":"Levels of support","text":"

Different types of workloads have different levels of support. It is important to understand which capabilities are needed before selecting the workload type to work with. The table below details the level of support for each workload type in Run:ai. Run:ai workloads are fully supported with all of Run:ai\u2019s advanced features and capabilities, while third-party workloads are partially supported. The list of capabilities can change between different Run:ai versions.

| Functionality | Run:ai workloads: Training - Standard | Run:ai workloads: Workspace | Run:ai workloads: Inference | Run:ai workloads: Training - distributed | Third-party workloads |
| --- | --- | --- | --- | --- | --- |
| Fairness | v | v | v | v | v |
| Priority and preemption | v | v | v | v | v |
| Over quota | v | v | v | v | v |
| Node pools | v | v | v | v | v |
| Bin packing / Spread | v | v | v | v | v |
| Multi-GPU fractions | v | v | v | v | v |
| Multi-GPU dynamic fractions | v | v | v | v | v |
| Node level scheduler | v | v | v | v | v |
| Multi-GPU memory swap | v | v | v | v | v |
| Elastic scaling | NA | NA | v | v | v |
| Gang scheduling | v | v | v | v | v |
| Monitoring | v | v | v | v | v |
| RBAC | v | v | v | v | |
| Workload awareness | v | v | v | v | |
| Workload submission | v | v | v | v | |
| Workload actions (stop/run) | v | v | v | v | |
| Workload Policies | v | v | v | v | |
| Scheduling rules | v | v | v | v | |

      Note

      Workload awareness

      Specific workload-aware visibility, so that different pods are identified and treated as a single workload (for example GPU utilization, workload view, dashboards).

      "},{"location":"platform-admin/workloads/overviews/managing-workloads/","title":"Workloads","text":"

      This article explains the procedure for managing workloads.

      "},{"location":"platform-admin/workloads/overviews/managing-workloads/#workloads-table","title":"Workloads table","text":"

      The Workloads table can be found under Workload manager in the Run:ai platform.

The Workloads table provides a list of all the workloads scheduled on the Run:ai Scheduler and allows you to manage them.

      The Workloads table consists of the following columns:

| Column | Description |
| --- | --- |
| Workload | The name of the workload |
| Type | The workload type |
| Preemptible | Is the workload preemptible |
| Status | The different phases in a workload life cycle |
| Project | The project in which the workload runs |
| Department | The department that the workload is associated with. This column is visible only if the department toggle is enabled by your administrator |
| Created by | The user who created the workload |
| Running/requested pods | The number of running pods out of the requested |
| Creation time | The timestamp for when the workload was created |
| Completion time | The timestamp when the workload reached a terminal state (failed/completed) |
| Connection(s) | The method by which you can access and interact with the running workload. It's essentially the \"doorway\" through which you can reach and use the tools the workload provides (e.g., node port, external URL, etc.). Click one of the values in the column to view the list of connections and their parameters |
| Data source(s) | Data resources used by the workload |
| Environment | The environment used by the workload |
| Workload architecture | Standard or distributed. A standard workload consists of a single process. A distributed workload consists of multiple processes working together. These processes can run on different nodes |
| GPU compute request | Amount of GPU devices requested |
| GPU compute allocation | Amount of GPU devices allocated |
| GPU memory request | Amount of GPU memory requested |
| GPU memory allocation | Amount of GPU memory allocated |
| Idle GPU devices | The number of allocated GPU devices that have been idle for more than 5 minutes |
| CPU compute request | Amount of CPU cores requested |
| CPU compute allocation | Amount of CPU cores allocated |
| CPU memory request | Amount of CPU memory requested |
| CPU memory allocation | Amount of CPU memory allocated |
| Cluster | The cluster that the workload is associated with |

"},{"location":"platform-admin/workloads/overviews/managing-workloads/#workload-status","title":"Workload status","text":"

      The following table describes the different phases in a workload life cycle. The UI provides additional details for some of the below workload statuses which can be viewed by clicking the icon next to the status.

| Status | Description | Entry Condition | Exit Condition |
| --- | --- | --- | --- |
| Creating | Workload setup is initiated in the cluster. Resources and pods are now provisioning. | A workload is submitted. | A multi-pod group is created. |
| Pending | Workload is queued and awaiting resource allocation. | A pod group exists. | All pods are scheduled. |
| Initializing | Workload is retrieving images, starting containers, and preparing pods. | All pods are scheduled. | All pods are initialized or a failure to initialize is detected. |
| Running | Workload is currently in progress with all pods operational. | All pods initialized (all containers in pods are ready). | Workload completion or failure. |
| Degraded | Pods may not align with specifications, network services might be incomplete, or persistent volumes may be detached. Check your logs for specific details. | Pending - All pods are running but have issues. Running - All pods are running with no issues. | Running - All resources are OK. Completed - Workload finished with fewer resources. Failed - Workload failure or user-defined rules. |
| Deleting | Workload and its associated resources are being decommissioned from the cluster. | Deleting the workload. | Resources are fully deleted. |
| Stopped | Workload is on hold and resources are intact but inactive. | Stopping the workload without deleting resources. | Transitioning back to the initializing phase or proceeding to deleting the workload. |
| Failed | Image retrieval failed or containers experienced a crash. Check your logs for specific details. | An error occurs preventing the successful completion of the workload. | Terminal state. |
| Completed | Workload has successfully finished its execution. | The workload has finished processing without errors. | Terminal state. |

"},{"location":"platform-admin/workloads/overviews/managing-workloads/#pods-associated-with-workload","title":"Pods Associated with Workload","text":"

      Click one of the values in the Running/requested pods column, to view the list of pods and their parameters.

| Column | Description |
| --- | --- |
| Pod | Pod name |
| Status | Pod lifecycle stages |
| Node | The node on which the pod resides |
| Node pool | The node pool in which the pod resides (applicable if node pools are enabled) |
| Image | The pod\u2019s main image |
| GPU compute allocation | Amount of GPU devices allocated for the pod |
| GPU memory allocation | Amount of GPU memory allocated for the pod |

"},{"location":"platform-admin/workloads/overviews/managing-workloads/#connections-associated-with-workload","title":"Connections Associated with Workload","text":"

      A connection refers to the method by which you can access and interact with the running workloads. It is essentially the \"doorway\" through which you can reach and use the applications (tools) these workloads provide.

      Click one of the values in the Connection(s) column, to view the list of connections and their parameters. Connections are network interfaces that communicate with the application running in the workload. Connections are either the URL the application exposes or the IP and the port of the node that the workload is running on.

| Column | Description |
| --- | --- |
| Name | The name of the application running on the workload |
| Connection type | The network connection type selected for the workload |
| Access | Who is authorized to use this connection (everyone, specific groups/users) |
| Address | The connection URL |
| Copy button | Copy URL to clipboard |
| Connect button | Enabled only for supported tools |

"},{"location":"platform-admin/workloads/overviews/managing-workloads/#data-sources-associated-with-workload","title":"Data Sources Associated with Workload","text":"

      Click one of the values in the Data source(s) column, to view the list of data sources and their parameters.

| Column | Description |
| --- | --- |
| Data source | The name of the data source mounted to the workload |
| Type | The data source type |

"},{"location":"platform-admin/workloads/overviews/managing-workloads/#customizing-the-table-view","title":"Customizing the table view","text":"
      • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
      • Search - Click SEARCH and type the value to search by
      • Sort - Click each column header to sort by
      • Column selection - Click COLUMNS and select the columns to display in the table
• Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
      • Refresh - Click REFRESH to update the table with the latest data
      • Show/Hide details - Click to view additional information on the selected row
      "},{"location":"platform-admin/workloads/overviews/managing-workloads/#showhide-details","title":"Show/Hide details","text":"

      Click a row in the Workloads table and then click the SHOW DETAILS button at the upper-right side of the action bar. The details pane appears, presenting the following tabs:

      "},{"location":"platform-admin/workloads/overviews/managing-workloads/#event-history","title":"Event History","text":"

Displays the workload status over time, including events describing the workload lifecycle and alerts on notable events. Use the filter to search the history for specific events.

      "},{"location":"platform-admin/workloads/overviews/managing-workloads/#metrics","title":"Metrics","text":"
• GPU utilization - A per-GPU graph and an average-of-all-GPUs graph, shown on the same chart over an adjustable period, let you see the trends of GPU compute utilization (percentage of GPU compute).
• GPU memory utilization - A per-GPU graph and an average-of-all-GPUs graph, shown on the same chart over an adjustable period, let you see the trends of GPU memory usage (percentage of GPU memory).
• CPU compute utilization - A graph of the average compute utilization across all CPU cores, over an adjustable period, lets you see the trend of CPU compute utilization (percentage of CPU compute).
• CPU memory utilization - A single graph of the utilization of all CPU memory, over an adjustable period, lets you see the trend of CPU memory utilization (percentage of CPU memory).
• CPU memory usage - A single graph of the usage of all CPU memory, over an adjustable period, lets you see the trend of CPU memory usage (in GB or MB of CPU memory).

• For GPU charts - Click the GPU legend on the right-hand side of the chart to activate or deactivate any of the GPU lines.

      • You can click the date picker to change the presented period
      • You can use your mouse to mark a sub-period in the graph for zooming in, and use Reset zoom to go back to the preset period
      • Changes in the period affect all graphs on this screen.
      "},{"location":"platform-admin/workloads/overviews/managing-workloads/#logs","title":"Logs","text":"

      Workload events are ordered in chronological order. The logs contain events from the workload\u2019s lifecycle to help monitor and debug issues.

      "},{"location":"platform-admin/workloads/overviews/managing-workloads/#adding-new-workload","title":"Adding new workload","text":"

      Before starting, make sure you have created a project or have one created for you to work with workloads.

      To create a new workload:

      1. Click +NEW WORKLOAD
      2. Select a workload type - Follow the links below to view the step-by-step guide for each workload type:
        • Workspace. Used for data preparation and model-building tasks.
        • Training. Used for standard training tasks of all sorts
        • Distributed Training. Used for distributed tasks of all sorts
        • Inference. Used for inference and serving tasks
        • Job (legacy). This type is displayed only if enabled by your Administrator, under General settings \u2192 Workloads \u2192 Workload policies
      3. Click CREATE WORKLOAD
      "},{"location":"platform-admin/workloads/overviews/managing-workloads/#stopping-a-workload","title":"Stopping a workload","text":"

      Stopping a workload kills the workload pods and releases the workload resources.

      1. Select the workload you want to stop
      2. Click STOP
      "},{"location":"platform-admin/workloads/overviews/managing-workloads/#running-a-workload","title":"Running a workload","text":"

Running a workload spins up new pods and resumes the workload after it was stopped.

      1. Select the workload you want to run again
      2. Click RUN
      "},{"location":"platform-admin/workloads/overviews/managing-workloads/#connecting-to-a-workload","title":"Connecting to a workload","text":"

      To connect to an application running in the workload (for example, Jupyter Notebook)

1. Select the workload you want to connect to
      2. Click CONNECT
      3. Select the tool from the drop-down list
      4. The selected tool is opened in a new tab on your browser
      "},{"location":"platform-admin/workloads/overviews/managing-workloads/#deleting-a-workload","title":"Deleting a workload","text":"
      1. Select the workload you want to delete
      2. Click DELETE
      3. On the dialog, click DELETE to confirm the deletion

      Note

      Once a workload is deleted you can view it in the Deleted tab in the workloads view. This tab is displayed only if enabled by your Administrator, under General settings \u2192 Workloads \u2192 Deleted workloads

      "},{"location":"platform-admin/workloads/overviews/managing-workloads/#copy-edit-a-workload","title":"Copy & Edit a workload","text":"
      1. Select the workload you want to copy and edit
      2. Click COPY & EDIT
      3. Update the workload and click CREATE WORKLOAD
      "},{"location":"platform-admin/workloads/overviews/managing-workloads/#using-api","title":"Using API","text":"

      Go to the Workloads API reference to view the available actions

      "},{"location":"platform-admin/workloads/overviews/managing-workloads/#troubleshooting","title":"Troubleshooting","text":"

To understand the condition of the workload, review the workload status in the Workloads table. For more information, check the workload\u2019s event history.

      Listed below are a number of known issues when working with workloads and how to fix them:

| Issue | Mediation |
| --- | --- |
| Cluster connectivity issues (there are issues with your connection to the cluster error message) | Verify that you are on a network that has been granted access to the cluster. Reach out to your cluster admin for instructions on verifying this. If you are an admin, see the troubleshooting section in the cluster documentation |
| Workload in \u201cInitializing\u201d status for some time | Check that you have access to the container image registry. Check the statuses of the pods in the pods\u2019 modal. Check the event history for more details |
| Workload has been pending for some time | Check that you have the required quota. Check the project\u2019s available quota in the project dialog. Check that all services needed to run are bound to the workload. Check the event history for more details |
| PVCs created using the K8s API or kubectl are not visible or mountable in Run:ai | This is by design. Create a new data source of type PVC in the Run:ai UI. In the Data mount section, select Existing PVC, and select the PVC you created via the K8s API. You are now able to select and mount this PVC in your Run:ai submitted workloads |
| Workload is not visible in the UI | Check that the workload hasn\u2019t been deleted. See the \u201cDeleted\u201d tab in the workloads view |

"},{"location":"platform-admin/workloads/overviews/workload-types/","title":"Run:ai Workload Types","text":"

      In the world of machine learning (ML), the journey from raw data to actionable insights is a complex process that spans multiple stages. Each stage of the AI lifecycle requires different tools, resources, and frameworks to ensure optimal performance. Run:ai simplifies this process by offering specialized workload types tailored to each phase, facilitating a smooth transition across various stages of the ML workflows.

      The ML lifecycle usually begins with the experimental work on data and exploration of different modeling techniques to identify the best approach for accurate predictions. At this stage, resource consumption is usually moderate as experimentation is done on a smaller scale. As confidence grows in the model's potential and its accuracy, the demand for compute resources increases. This is especially true during the training phase, where vast amounts of data need to be processed, particularly with complex models such as large language models (LLMs), with their huge parameter sizes, that often require distributed training across multiple GPUs to handle the intensive computational load.

      Finally, once the model is ready, it moves to the inference stage, where it is deployed to make predictions on new, unseen data. Run:ai's workload types are designed to correspond with the natural stages of this lifecycle. They are structured to align with the specific resource and framework requirements of each phase, ensuring that AI researchers and data scientists can focus on advancing their models without worrying about infrastructure management.

      Run:ai offers three workload types that correspond to a specific phase of the researcher\u2019s work:

      • Workspaces \u2013 For experimentation with data and models.
      • Training \u2013 For resource-intensive tasks such as model training and data preparation.
      • Inference \u2013 For deploying and serving the trained model.
      "},{"location":"platform-admin/workloads/overviews/workload-types/#workspaces-the-experimentation-phase","title":"Workspaces: the experimentation phase","text":"

      The Workspace is where data scientists conduct initial research, experiment with different data sets, and test various algorithms. This is the most flexible stage in the ML lifecycle, where models and data are explored, tuned, and refined. The value of workspaces lies in the flexibility they offer, allowing the researcher to iterate quickly without being constrained by rigid infrastructure.

      • Framework flexibility

        Workspaces support a variety of machine learning frameworks, as researchers need to experiment with different tools and methods.

      • Resource requirements

        Workspaces are often lighter on resources compared to the training phase, but they still require significant computational power for data processing, analysis, and model iteration.

Hence, by default, Run:ai schedules workspaces without the ability to preempt them once their resources have been allocated. However, this non-preemptible state does not allow a workspace to utilize resources beyond the project\u2019s deserved quota.

      See Running workspaces to learn more about how to submit a workspace via the Run:ai platform. For quick starts, see Running Jupyter Notebook using workspaces.

      "},{"location":"platform-admin/workloads/overviews/workload-types/#training-scaling-resources-for-model-development","title":"Training: scaling resources for model development","text":"

      As models mature and the need for more robust data processing and model training increases, Run:ai facilitates this shift through the Training workload. This phase is resource-intensive, often requiring distributed computing and high-performance clusters to process vast data sets and train models.

      • Training architecture

For training workloads, Run:ai allows you to specify the architecture - standard or distributed. The distributed architecture is relevant for larger data sets and more complex models that require utilizing multiple nodes. For the distributed architecture, Run:ai allows you to specify different configurations for the master and workers and select which framework to use - PyTorch, XGBoost, MPI, or TensorFlow. In addition, as part of the distributed configuration, Run:ai enables researchers to schedule their distributed workloads on nodes within the same region, zone, placement group, or any other topology.

      • Resource requirements

Training tasks demand high memory, compute power, and storage. Run:ai ensures that the allocated resources match the scale of the task and allows those workloads to utilize more compute resources than the project\u2019s deserved quota. If you do not want your training workload to be preempted, make sure to request a number of GPUs that is within your quota.

      See Standard training and Distributed training to learn more about how to submit a training workload via the Run:ai UI. For quick starts, see Run your first standard training and Run your first distributed training.

      "},{"location":"platform-admin/workloads/overviews/workload-types/#inference-deploying-and-serving-models","title":"Inference: deploying and serving models","text":"

      Once a model is trained and validated, it moves to the Inference stage, where it is deployed to make predictions (usually in a production environment). This phase is all about efficiency and responsiveness, as the model needs to serve real-time or batch predictions to end-users or other systems.

      • Inference-specific use cases

Inference workloads naturally need to change and adapt to ever-changing demand in order to meet SLAs. For example, additional replicas may be deployed, manually or automatically, to increase compute resources as part of a horizontal scaling approach, or a new version of the deployment may need to be rolled out without affecting the running services.

      • Resource requirements

        Inference models differ in size and purpose, leading to varying computational requirements. For example, small OCR models can run efficiently on CPUs, whereas LLMs typically require significant GPU memory for deployment and serving. Inference workloads are considered production-critical and are given the highest priority to ensure compliance with SLAs. Additionally, Run:ai ensures that inference workloads cannot be preempted, maintaining consistent performance and reliability.

      See Deploy a custom inference workload to learn more about how to submit an inference workload via the Run:ai UI.

      "},{"location":"platform-admin/workloads/policies/old-policies/","title":"Policies (YAML-based)","text":"

      Warning

      The below describes the old V1 Policies. While these still work, they have been replaced with Control-plane-based v2 policies which are accessible via API and user interface. For a description of the new policies, see API-based Policies.

      "},{"location":"platform-admin/workloads/policies/old-policies/#what-are-policies","title":"What are Policies?","text":"

      Policies allow administrators to impose restrictions and set default values for Researcher Workloads. For example:

      1. Restrict researchers from requesting more than 2 GPUs, or less than 1GB of memory for an interactive workload.
      2. Set the default memory of each training job to 1GB, or mount a default volume to be used by any submitted Workload.

      Policies are stored as Kubernetes custom resources.

Policies are specific to the workload type; as such, there are several kinds of Policies:

| Workload Type | Kubernetes Workload Name | Kubernetes Policy Name |
| --- | --- | --- |
| Interactive | InteractiveWorkload | InteractivePolicy |
| Training | TrainingWorkload | TrainingPolicy |
| Distributed Training | DistributedWorkload | DistributedPolicy |
| Inference | InferenceWorkload | InferencePolicy |

      A Policy can be created per Run:ai Project (Kubernetes namespace). Additionally, a Policy resource can be created in the runai namespace. This special Policy will take effect when there is no project-specific Policy for the relevant workload kind.
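For example, a minimal sketch of a project-scoped TrainingPolicy, modeled on the schema used in the examples further below; the namespace runai-team-a and the GPU limits are placeholder values. Defining the same resource in the runai namespace instead would make it the fallback policy for projects without their own.

apiVersion: run.ai/v2alpha1
kind: TrainingPolicy
metadata:
  name: training-policy-team-a
  namespace: runai-team-a   # project namespace; use "runai" for the fallback policy
spec:
  gpu:
    rules:
      max: "2"              # GPU values are quoted, as they can be non-integer
    value: "1"              # default GPU allocation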

When researchers create a new interactive workload or workspace, they see a list of available node pools and their priority. Priority is set by dragging and dropping the node pools in the desired order of priority. When the node pool priority list is locked by an administrator policy, the node pool list isn't editable by the Researcher, even if the workspace is created from a template or copied from another workspace.

      Note

      The policies on this page cannot be added to platform version 2.16 or higher when the New Policy Manager is enabled.

      "},{"location":"platform-admin/workloads/policies/old-policies/#creating-a-policy","title":"Creating a Policy","text":""},{"location":"platform-admin/workloads/policies/old-policies/#creating-your-first-policy","title":"Creating your First Policy","text":"

      To create a sample InteractivePolicy, prepare a file (e.g. gpupolicy.yaml) containing the following YAML:

      gpupolicy.yaml
      apiVersion: run.ai/v2alpha1\nkind: InteractivePolicy\nmetadata:\n  name: interactive-policy1\n  namespace: runai-team-a # (1)\nspec:\n  gpu:\n    rules:\n      required: true\n      min: \"1\"  # (2)\n      max: \"4\"  \n    value: \"1\"\n
      1. Set the Project namespace here.
      2. GPU values are quoted as they can contain non-integer values.

      The policy places a default and limit on the available values for GPU allocation. To apply this policy, run:

      kubectl apply -f gpupolicy.yaml \n

      Now, try the following command:

      runai submit --gpu 5 --interactive -p team-a\n

      The following message will appear:

      gpu: must be no greater than 4\n

      A similar message appears in the New Job form of the Run:ai user interface when attempting to enter a number of GPUs that is outside the range allowed by the policy.

      "},{"location":"platform-admin/workloads/policies/old-policies/#gpu-and-cpu-memory-limits","title":"GPU and CPU memory limits","text":"

      The following policy places a default and limit on the available values for CPU and GPU memory allocation.

      gpumemorypolicy.yaml
      apiVersion: run.ai/v2alpha1\nkind: TrainingPolicy\nmetadata:\n  name: training-policy\n  namespace: runai\nspec:\n  gpuMemory:\n    rules:\n      min: 100M\n      max: 2G\n  memory:\n    rules:\n      min: 100M\n      max: 2G\n
      "},{"location":"platform-admin/workloads/policies/old-policies/#read-only-values","title":"Read-only values","text":"

      When you do not want the user to be able to change a value, you can force the corresponding user interface control to become read-only by using the canEdit key. For example,

      runasuserpolicy.yaml
      apiVersion: run.ai/v2alpha1\nkind: TrainingPolicy\nmetadata:\n  name: train-policy1\n  namespace: runai-team-a # (1) \n\nspec:\n  runAsUser:\n    rules:\n      required: true  # (2)\n      canEdit: false  # (3)\n    value: true # (4)\n
      1. Set the Project namespace here.
      2. The field is required.
      3. The field will be shown as read-only in the user interface.
      4. The field value is true.
      "},{"location":"platform-admin/workloads/policies/old-policies/#complex-values","title":"Complex Values","text":"

      The example above illustrated rules for parameters of \"primitive\" types, such as GPU allocation, CPU memory, working directory, etc. These parameters contain a single value.

      Other workload parameters, such as ports or volumes, are \"complex\", in the sense that they may contain multiple values: a workload may contain multiple ports and multiple volumes.

      The following is an example of a policy containing ports, which is a complex value. The ports flag typically contains two values: an external port that is mapped to an internal container port. Multiple port tuples can be defined for a single Workload:

      apiVersion: run.ai/v2alpha1\nkind: InteractivePolicy\nmetadata:\n  name: interactive-policy\n  namespace: runai\nspec:\n  ports:\n    rules:\n      canAdd: true\n    itemRules:\n      container:\n        min: 30000\n        max: 32767\n      external:\n        max: 32767\n    items:\n      admin-port-a:\n        rules:\n          canRemove: false\n          canEdit: false\n        value:\n          container: 30100\n          external: 8080\n      admin-port-b:\n        value:\n          container: 30101\n          external: 8081\n

      A policy for a complex field is composed of three parts:

      • Rules: Rules apply to the ports parameter as a whole. In this example, the administrator specifies canAdd rule with true value, indicating that a researcher submitting an interactive job can add additional ports to the ports listed by the policy (true is the default for canAdd, so it actually could have been omitted from the policy above). When canAdd is set to false, the researcher will not be able to add any additional port except those already specified by the policy.
      • itemRules: itemRules impose restrictions on the data members of each item, in this case - container and external. In the above example, the administrator has limited the value of container to 30000-32767, and the value of external to a maximum of 32767.
      • Items: Specifies a list of default ports. Each port is an item in the ports list and given a label (e.g. admin-port-b). The administrator can also specify whether a researcher can change/delete ports from the submitted workload. In the above example, admin-port-a is hardwired and cannot be changed or deleted, while admin-port-b can be changed or deleted by the researcher when submitting the Workload. It is possible to specify a label using the reserved name of DEFAULTS. This item provides the defaults for all other items.

      The following is an example of a complex policy for PVCs which contains DEFAULTS.

      apiVersion: run.ai/v2alpha1\nkind: TrainingPolicy\nmetadata:\n  name: tp # use your name.\n  namespace: runai-team-a # use your namespace\nspec:\n  pvcs:\n    itemRules:\n      existingPvc:\n        canEdit: false\n      claimName:\n        required: true\n    items:\n      DEFAULTS:\n        value:\n          existingPvc: true\n          path: nil\n
      "},{"location":"platform-admin/workloads/policies/old-policies/#syntax","title":"Syntax","text":"

      The complete syntax of the policy YAML can be obtained using the explain command of kubectl. For example:

      kubectl explain trainingpolicy.spec\n
      Should provide the list of all possible fields in the spec of training policies:

      KIND:     TrainingPolicy\nVERSION:  run.ai/v2alpha1\n\nRESOURCE: spec <Object>\n\nDESCRIPTION:\nThe specifications of this TrainingPolicy\n\nFIELDS:\nannotations <Object>\nSpecifies annotations to be set in the container running the created\nworkload.\n\narguments   <Object>\nIf set, the arguments are sent along with the command which overrides the\nimage's entry point of the created workload.\n\ncommand <Object>\nIf set, overrides the image's entry point with the supplied command.\n...\n

      You can further drill down to get the syntax for ports by running:

      kubectl explain trainingpolicy.spec.ports\n
      KIND:     TrainingPolicy\nVERSION:  run.ai/v2alpha1\n\nRESOURCE: ports <Object>\n\nDESCRIPTION:\n     Specify the set of ports exposed from the container running the created\n     workload. Used together with --service-type.\n\nFIELDS:\n   itemRules    <Object>\n\n   items    <map[string]Object>\n\n   rules    <Object>\n     these rules apply to a value of type map (=non primitive) as a whole\n     additionally there are rules which apply for specific items of the map\n

      Drill down into the ports.rules object by running:

      kubectl explain trainingpolicy.spec.ports.rules\n
      KIND:     TrainingPolicy\nVERSION:  run.ai/\n\nRESOURCE: rules <Object>\n\nDESCRIPTION:\n     these rules apply to a value of type map (=non primitive) as a whole\n     additionally there are rules which apply for specific items of the map\n\nFIELDS:\n   canAdd   <boolean>\n     is it allowed for a workload to add items to this map\n\n   required <boolean>\n     if the map as a whole is required\n

      Note that each kind of policy has a slightly different set of parameters. For example, an InteractivePolicy has a jupyter parameter that is not available under TrainingPolicy.

      "},{"location":"platform-admin/workloads/policies/old-policies/#using-secrets-for-environment-variables","title":"Using Secrets for Environment Variables","text":"

      It is possible to add values from Kubernetes secrets as the value of environment variables included in the policy. The secret will be extracted from the secret object when the Job is created. For example:

        environment:\n    items:\n      MYPASSWORD:\n        value: \"SECRET:my-secret,password\"\n

      When submitting a workload that is affected by this policy, the created container will have an environment variable called MYPASSWORD whose value is the key password residing in Kubernetes secret my-secret which has been pre-created in the namespace where the workload runs.

      "},{"location":"platform-admin/workloads/policies/old-policies/#prevent-data-storage-on-the-node","title":"Prevent Data Storage on the Node","text":"

      You can configure policies to prevent the submission of workloads that use data sources based on a host path. This setting prevents data from being stored on the node, where it would be lost if the node is deleted.

      Example of a policy that rejects workloads requesting a host path:

      spec:\n  volumes:\n    itemRules:\n      nfsServer:\n        required: true\n
      "},{"location":"platform-admin/workloads/policies/old-policies/#terminate-runai-training-jobs-after-preemption-policy","title":"Terminate Run:ai training Jobs after preemption policy","text":"

      Administrators can set a 'termination after preemption' policy for Run:ai training jobs. After this policy is applied, a training job is terminated once it has been preempted for any reason. For example, when a training job that is using over-quota resources (e.g. GPUs) is preempted because the owner of those GPUs reclaims them, the job typically goes back to the pending queue. With the termination policy applied, the job is terminated instead of reinstated as pending. The termination-after-preemption policy can be set as a cluster-wide policy (applicable to all namespaces/projects) or per project/namespace.

      To use this feature, the administrator should configure either a cluster-wide or a namespace-specific policy.

      For cluster-wide enforcement (all namespaces/projects), use this YAML-based policy:

      apiVersion: run.ai/v2alpha1\nkind: TrainingPolicy\nmetadata:\n  name: training-policy\n  namespace: runai\nspec:\n  terminateAfterPreemption:\n    value: true\n

      For a specific namespace (project), use this YAML-based policy:

      apiVersion: run.ai/v2alpha1\nkind: TrainingPolicy\nmetadata:\n  name: training-policy\n  namespace: runai-<PROJECT_NAME>\nspec:\n  terminateAfterPreemption:\n    value: false\n
      "},{"location":"platform-admin/workloads/policies/old-policies/#modifyingdeleting-policies","title":"Modifying/Deleting Policies","text":"

      Use the standard kubectl get/apply/delete commands to modify and delete policies.

      For example, to view the global interactive policy:

      kubectl get interactivepolicies -n runai\n

      Should return the following:

      NAME                 AGE\ninteractive-policy   2d3h\n

      To delete this policy:

      kubectl delete InteractivePolicy interactive-policy -n runai\n

      To access project-specific policies, replace the -n runai parameter with the namespace of the relevant project.
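      For example, to view the training policies of the project team-a used earlier in this article:

      kubectl get trainingpolicies -n runai-team-a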

      "},{"location":"platform-admin/workloads/policies/old-policies/#see-also","title":"See Also","text":"
      • For creating workloads based on policies, see the Run:ai documentation on submitting workloads
      "},{"location":"platform-admin/workloads/policies/overview/","title":"Overview","text":"

      Run:ai administrators can access a suite of tools designed to facilitate efficient account management. This article focuses on two key features: workload policies and workload scheduling rules. These features empower admins to establish default values and implement restrictions, allowing enhanced control, ensuring compatibility with organizational policies, and optimizing resource usage and utilization.

      "},{"location":"platform-admin/workloads/policies/overview/#workload-policies","title":"Workload policies","text":"

      A workload policy is an end-to-end solution for AI managers and administrators to control and simplify how workloads are submitted. This solution allows them to set best practices, enforce limitations, and standardize processes for the submission of workloads for AI projects within their organization. It acts as a key guideline for data scientists, researchers, ML & MLOps engineers by standardizing submission practices and simplifying the workload submission process.

      "},{"location":"platform-admin/workloads/policies/overview/#older-and-newer-policy-technologies","title":"Older and Newer Policy technologies","text":"

      Run:ai provides two policy technologies.

      YAML-Based policies are the older policies. These policies:

      • Require access to Kubernetes to view or change.
      • Require contacting Run:ai support to convert them to the new V2 policies format.

      API-based policies are the newer policies. These policies:

      • Are shown in the Run:ai user interface.
      • Can be viewed and modified via the user interface and the Control-plane API.
      • Enable new rules addressing differences between project, department and cluster policies.
      • Are only available with Run:ai clusters of version 2.18 and up.
      "},{"location":"platform-admin/workloads/policies/overview/#why-use-a-workload-policy","title":"Why use a workload policy?","text":"

      Implementing workload policies is essential when managing complex AI projects within an enterprise for several reasons:

      1. Resource control and management: Defining or limiting the use of costly resources across the enterprise via a centralized management system to ensure efficient allocation and prevent overuse.
      2. Setting best practices: Provide managers with the ability to establish guidelines and standards to follow, reducing errors amongst AI practitioners within the organization.
      3. Security and compliance: Define and enforce permitted and restricted actions to uphold organizational security and meet compliance requirements.
      4. Simplified setup: Conveniently allow setting defaults and streamline the workload submission process for AI practitioners.
      5. Scalability and diversity
        1. Multi-purpose clusters with various workload types that may have different requirements and characteristics for resource usage.
        2. The organization has multiple hierarchies, each with distinct goals, objectives and degrees of flexibility.
        3. Manage multiple users and projects with distinct requirements and methods, ensuring appropriate utilization of resources.
      "},{"location":"platform-admin/workloads/policies/overview/#understanding-the-mechanism","title":"Understanding the mechanism","text":"

      The following sections provide details of how the workload policy mechanism works.

      "},{"location":"platform-admin/workloads/policies/overview/#cross-interface-enforcement","title":"Cross-interface enforcement","text":"

      Policies are enforced on workloads regardless of whether they were submitted via the UI, CLI, REST APIs, or Kubernetes YAMLs.

      "},{"location":"platform-admin/workloads/policies/overview/#policy-types","title":"Policy types","text":"

      Run:ai policies are enforced on Run:ai workloads. There is a policy type per Run:ai workload type, which allows administrators to set different policies for each workload type.

      Policy type | Workload type | Kubernetes name
      Workspace | Workspace | Interactive workload
      Training Standard | Training Standard | Training workload
      Distributed | Distributed | Distributed workload
      Inference* | Inference | Inference workload

      * The submission of this policy type is currently supported via API only

      "},{"location":"platform-admin/workloads/policies/overview/#policy-structure-rules-defaults-and-imposed-assets","title":"Policy structure - rules, defaults, and imposed assets","text":"

      A policy consists of rules that limit and control the values of workload fields. In addition to rules, a policy can provide defaults for different workload fields. These default values are not rules; they simply suggest values that can be overridden during workload submission.

      Furthermore, policies allow the enforcement of workload assets. For example, as an admin, you can impose a data source of type PVC to be used by any workload submitted.

      For more information see rules, defaults and imposed assets.
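      As a brief sketch of how the three building blocks fit together, using the V2 YAML format detailed in the Policies Examples and Policies Reference articles below (the asset ID is a placeholder):

      defaults:
        imagePullPolicy: IfNotPresent     # suggested value, can be overridden at submission
      rules:
        compute:
          gpuDevicesRequest:
            max: 4                        # restriction, cannot be bypassed by the submitter
      imposedAssets:
        - <data-source-asset-id>          # placeholder ID of an imposed asset, e.g. a PVC data source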

      "},{"location":"platform-admin/workloads/policies/overview/#scope-of-effectiveness","title":"Scope of effectiveness","text":"

      Numerous teams working on various projects require different tools, requirements, and safeguards, so one policy may not suit all teams. Hence, administrators can select the scope in which the policy is effective. When a scope is selected, all of its subordinate units are also affected. As a result, all workloads submitted within the selected scope are controlled by the policy.

      For example, if a policy is set for Department A, all workloads submitted by any of the projects within this department are controlled.

      A scope for a policy can be:

          The entire account *  \n        L Specific cluster  \n            L Specific department  \n                L Specific project\n

      * The policy submission to the entire account scope is supported via API only

      The different scoping of policies also allows responsibility to be divided between different administrators, delegating ownership between different levels within the organization. The policies, containing rules and defaults, propagate* down the organizational tree, forming an "effective" policy that is enforced on any workload submitted by users within the project.

      "},{"location":"platform-admin/workloads/policies/overview/#policy-rules-reconciliation","title":"Policy rules reconciliation","text":"

      For situations where a rule or a default for a specific field is already governed by a policy, newly submitted policies for additional organizational units mentioning this existing field are not blocked from submission. For those instances, the effective rules and defaults are selected based on the following logic:

      • For policy defaults - The lowest organizational hierarchy \u201cclosest\u201d to the actual workload becomes the effective policy defaults (project defaults > department defaults > cluster defaults > tenant defaults).
      • For policy rules -
      • If the rule belongs to the compute and security sections in the workload spec of the Run:ai API, the highest hierarchy is chosen for the effective policy for the field (tenant rules > cluster rules > department rules > project rules).
      • If the rule does not belong to the compute or security sections, the lowest hierarchy \u201cclosest\u201d to the actual workload becomes the effective policy for the field (similar to defaults).

      While viewing the effective policy, for each rule and default the source of the policy origin is visible, allowing users to clearly understand the selected hierarchy of the effective policy.
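      As a hypothetical worked example, assume a cluster-scope policy and a project-scope policy both govern the same two fields:

      Cluster-scope policy:

      defaults:
        imagePullPolicy: Always
      rules:
        compute:
          gpuDevicesRequest:
            max: 8

      Project-scope policy:

      defaults:
        imagePullPolicy: IfNotPresent
      rules:
        compute:
          gpuDevicesRequest:
            max: 4

      For workloads submitted in that project, the effective default for imagePullPolicy is IfNotPresent (defaults are taken from the hierarchy closest to the workload), while the effective rule for gpuDevicesRequest is max: 8 (rules in the compute and security sections are taken from the highest hierarchy).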

      "},{"location":"platform-admin/workloads/policies/overview/#runai-policies-vs-kyverno-policies","title":"Run:ai Policies vs. Kyverno Policies","text":"

      Kyverno runs as a dynamic admission controller in a Kubernetes cluster. Kyverno receives validating and mutating admission webhook HTTP callbacks from the Kubernetes API server and applies matching policies to return results that enforce admission policies or reject requests. Kyverno policies can match resources using the resource kind, name, label selectors, and much more. For more information, see How Kyverno Works.

      "},{"location":"platform-admin/workloads/policies/policy-examples/","title":"Policies Examples","text":"

      This article provides examples of:

      1. Creating a new rule within a policy
      2. Best practices for adding sections to a policy
      3. A full example of a policy.
      "},{"location":"platform-admin/workloads/policies/policy-examples/#creating-a-new-rule-within-a-policy","title":"Creating a new rule within a policy","text":"

      This example shows how to add a new limitation to the GPU usage for workloads of type workspace:

      1. Check the workload API fields documentation and select the field(s) that are most relevant for GPU usage.

        {\n\"spec\": {\n    \"compute\": {\n    \"gpuDevicesRequest\": 1,\n    \"gpuRequestType\": \"portion\",\n    \"gpuPortionRequest\": 0.5,\n    \"gpuPortionLimit\": 0.5,\n    \"gpuMemoryRequest\": \"10M\",\n    \"gpuMemoryLimit\": \"10M\",\n    \"migProfile\": \"1g.5gb\",\n    \"cpuCoreRequest\": 0.5,\n    \"cpuCoreLimit\": 2,\n    \"cpuMemoryRequest\": \"20M\",\n    \"cpuMemoryLimit\": \"30M\",\n    \"largeShmRequest\": false,\n    \"extendedResources\": [\n        {\n        \"resource\": \"hardware-vendor.example/foo\",\n        \"quantity\": 2,\n        \"exclude\": false\n        }\n    ]\n    },\n}\n}\n
      2. Search for the field in the Policy YAML fields reference table. For example, gpuDevicesRequest appears under the Compute fields sub-table as follows:

      Fields | Description | Value type | Supported Run:ai workload type
      gpuDeviceRequest | Specifies the number of GPUs to allocate for the created workload. Only if gpuDeviceRequest = 1, the gpuRequestType can be defined. | integer | Workspace & Training
      3. Use the value type of the gpuDevicesRequest field indicated in the table (integer) and navigate to the Value types table to view the possible rules that can be applied to this value type.

        for integer, the options are:

        • canEdit
        • required
        • min
        • max
        • step
      4. Proceed to the Rule types table, select the rule required to limit the field (for example, max), and use the example syntax to indicate the maximum number of GPU devices requested.

      compute:\n    gpuDevicesRequest:\n        max: 2\n
      "},{"location":"platform-admin/workloads/policies/policy-examples/#policy-yaml-best-practices","title":"Policy YAML best practices","text":"Create a policy that has multiple defaults and rules

      Best practice description: Presentation of the syntax while adding a set of defaults and rules

      defaults:\n  createHomeDir: true\n  environmentVariables:\n    instances:\n    - name: MY_ENV\n      value: my_value\n  security:\n    allowPrivilegeEscalation: false\n\nrules:\n  storage:\n    s3:\n      attributes:\n        url:\n          options:\n            - value: https://www.google.com\n              displayed: https://www.google.com\n            - value: https://www.yahoo.com\n              displayed: https://www.yahoo.com\n
      Allow only single selection out of many

      Best practice description: Block the option to create all types of data sources except the one that is allowed.

      rules:\n  storage:\n    dataVolume:\n      instances:\n        canAdd: false\n    hostPath:\n      instances:\n        canAdd: false\n    pvc:\n      instances:\n        canAdd: false\n    git:\n      attributes:\n        repository:\n          required: true\n        branch:\n          required: true\n        path:\n          required: true\n    nfs:\n      instances:\n        canAdd: false\n    s3:\n      instances:\n        canAdd: false\n
      Create a robust set of guidelines

      Best practice description: Set rules for specific compute resource usage, addressing most relevant spec fields

      rules:\n  compute:\n    cpuCoreRequest:\n      required: true\n      min: 0\n      max: 8\n    cpuCoreLimit:\n      min: 0\n      max: 8\n    cpuMemoryRequest:\n      required: true\n      min: '0'\n      max: 16G\n    cpuMemoryLimit:\n      min: '0'\n      max: 8G\n    migProfile:\n      canEdit: false\n    gpuPortionRequest:\n      min: 0\n      max: 1\n    gpuMemoryRequest:\n      canEdit: false\n    extendedResources:\n      instances:\n        canAdd: false\n
      Environment creation (specific section)
      rules:\n  imagePullPolicy:\n    required: true\n    options:\n    - value: Always\n      displayed: Always\n    - value: Never\n      displayed: Never\n  createHomeDir:\n    canEdit: false\n
      Setting security measures (specific section)
      rules:\n  security:\n    runAsUid:\n      min: 1\n      max: 32700\n    allowPrivilegeEscalation:\n      canEdit: false\n
      Policy for distributed training workloads (specific section)

      Best practice description: Set rules and defaults for a distributed training workload with different settings for master and worker

      defaults:\n  worker:\n    command: my-command-worker-1\n    environmentVariables:\n      instances:\n        - name: LOG_DIR\n          value: policy-worker-to-be-ignored\n        - name: ADDED_VAR\n          value: policy-worker-added\n    security:\n      runAsUid: 500\n    storage:\n      s3:\n        attributes:\n          bucket: bucket1-worker\n  master:\n    command: my-command-master-2\n    environmentVariables:\n      instances:\n        - name: LOG_DIR\n          value: policy-master-to-be-ignored\n        - name: ADDED_VAR\n          value: policy-master-added\n    security:\n      runAsUid: 800\n    storage:\n      s3:\n        attributes:\n          bucket: bucket1-master\nrules:\n  worker:\n    command:\n      options:\n        - value: my-command-worker-1\n          displayed: command1\n        - value: my-command-worker-2\n          displayed: command2\n    storage:\n      nfs:\n        instances:\n          canAdd: false\n      s3:\n        attributes:\n          bucket:\n            options:\n              - value: bucket1-worker\n              - value: bucket2-worker\n  master:\n    command:\n      options:\n        - value: my-command-master-1\n          displayed: command1\n        - value: my-command-master-2\n          displayed: command2\n    storage:\n      nfs:\n        instances:\n          canAdd: false\n      s3:\n        attributes:\n          bucket:\n            options:\n              - value: bucket1-master\n              - value: bucket2-master\n
      Impose an asset (specific section)
      defaults: null\nrules: null\nimposedAssets:\n  - f12c965b-44e9-4ff6-8b43-01d8f9e630cc\n
      "},{"location":"platform-admin/workloads/policies/policy-examples/#example-of-a-full-policy","title":"Example of a full policy","text":"
      defaults:\n  createHomeDir: true\n  imagePullPolicy: IfNotPresent\n  nodePools:\n    - node-pool-a\n    - node-pool-b\n  environmentVariables:\n    instances:\n      - name: WANDB_API_KEY\n        value: REPLACE_ME!\n      - name: WANDB_BASE_URL\n        value: https://wandb.mydomain.com\n  compute:\n    cpuCoreRequest: 0.1\n    cpuCoreLimit: 20\n    cpuMemoryRequest: 10G\n    cpuMemoryLimit: 40G\n    largeShmRequest: true\n  security:\n    allowPrivilegeEscalation: false\n  storage:\n    git:\n      attributes:\n        repository: https://git-repo.my-domain.com\n        branch: master\n    hostPath:\n      instances:\n        - name: vol-data-1\n          path: /data-1\n          mountPath: /mount/data-1\n        - name: vol-data-2\n          path: /data-2\n          mountPath: /mount/data-2\nrules:\n  createHomeDir:\n    canEdit: false\n  imagePullPolicy:\n    canEdit: false\n  environmentVariables:\n    instances:\n      locked:\n        - WANDB_BASE_URL\n  compute:\n    cpuCoreRequest:\n      max: 32\n    cpuCoreLimit:\n      max: 32\n    cpuMemoryRequest:\n      min: 1G\n      max: 20G\n    cpuMemoryLimit:\n      min: 1G\n      max: 40G\n    largeShmRequest:\n      canEdit: false\n    extendedResources:\n      instances:\n        canAdd: false\n  security:\n    allowPrivilegeEscalation:\n      canEdit: false\n    runAsUid:\n      min: 1\n  storage:\n    hostPath:\n      instances:\n        locked:\n          - vol-data-1\n          - vol-data-2\nimposedAssets:\n  - 4ba37689-f528-4eb6-9377-5e322780cc27\n
      "},{"location":"platform-admin/workloads/policies/policy-reference/","title":"Policies Reference","text":"

      A workload policy is an end-to-end solution for AI managers and administrators to control and simplify how workloads are submitted, setting best practices, enforcing limitations, and standardizing processes for AI projects within their organization.

      This article explains the policy YAML fields and the possible rules and defaults that can be set for each field.

      "},{"location":"platform-admin/workloads/policies/policy-reference/#policy-yaml-fields-reference-table","title":"Policy YAML fields - reference table","text":"

      The policy fields are structured in a similar format to the workload API fields. The following tables represent a structured guide designed to help you understand and configure policies in a YAML format. It provides the fields, descriptions, defaults and rules for each workload type.

      Click the link to view the value type of each field.

      Fields Description Value type Supported Run:ai workload type args When set, contains the arguments sent along with the command. These override the entry point of the image in the created workload string Workspace Training command A command to serve as the entry point of the container running the workspace string Workspace Training createHomeDir Instructs the system to create a temporary home directory for the user within the container. Data stored in this directory is not saved when the container exists. When the runAsUser flag is set to true, this flag defaults to true as well boolean Workspace Training environmentVariables Set of environmentVariables to populate the container running the workspace array Workspace Training image Specifies the image to use when creating the container running the workload string Workspace Training imagePullPolicy Specifies the pull policy of the image when starting t a container running the created workload. Options are: always, ifNotPresent, or never string Workspace Training workingDir Container\u2019s working directory. If not specified, the container runtime default is used, which might be configured in the container image string Workspace Training nodeType Nodes (machines) or a group of nodes on which the workload runs string Workspace Training nodePools A prioritized list of node pools for the scheduler to run the workspace on. The scheduler always tries to use the first node pool before moving to the next one when the first is not available. array Workspace Training annotations Set of annotations to populate into the container running the workspace itemized Workspace Training labels Set of labels to populate into the container running the workspace itemized Workspace Training terminateAfterPreemtpion Indicates whether the job should be terminated, by the system, after it has been preempted boolean Workspace Training autoDeletionTimeAfterCompletionSeconds Specifies the duration after which a finished workload (Completed or Failed) is automatically deleted. If this field is set to zero, the workload becomes eligible to be deleted immediately after it finishes. integer Workspace Training backoffLimit Specifies the number of retries before marking a workload as failed integer Workspace Training cleanPodPolicy

      Specifies which pods will be deleted when the workload reaches a terminal state (completed/failed). The policy can be one of the following values:

      • Running - Only pods still running when a job completes (for example, parameter servers) will be deleted immediately. Completed pods will not be deleted so that the logs will be preserved. (Default).
      • All - All (including completed) pods will be deleted immediately when the job finishes.
      • None - No pods will be deleted when the job completes. It will keep running pods that consume GPU, CPU and memory over time. It is recommended to set to None only for debugging and obtaining logs from running pods.
      string Distributed completions Used with Hyperparameter Optimization. Specifies the number of successful pods the job should reach to be completed. The Job is marked as successful once the specified amount of pods has succeeded. integer Workspace Training parallelism Used with Hyperparameters Optimization. Specifies the maximum desired number of pods the workload should run at any given time. itemized Workspace Training exposeUrls Specifies a set of exported URL (e.g. ingress) from the container running the created workload. itemized Workspace Training largeShmRequest Specifies a large /dev/shm device to mount into a container running the created workload. SHM is a shared file system mounted on RAM. boolean Workspace Training PodAffinitySchedulingRule Indicates if we want to use the Pod affinity rule as: the \u201chard\u201d (required) or the \u201csoft\u201d (preferred) option. This field can be specified only if PodAffinity is set to true. string Workspace Training podAffinityTopology Specifies the Pod Affinity Topology to be used for scheduling the job. This field can be specified only if PodAffinity is set to true. string Workspace Training ports Specifies a set of ports exposed from the container running the created workload. More information in Ports fields below. itemized Workspace Training probes Specifies the ReadinessProbe to use to determine if the container is ready to accept traffic. More information in Probes fields below - Workspace Training tolerations Toleration rules which apply to the pods running the workload. Toleration rules guide (but do not require) the system to which node each pod can be scheduled to or evicted from, based on matching between those rules and the set of taints defined for each Kubernetes node. itemized Workspace Training priorityClass Priority class of the workload. The values for workspace are build (default) or interactive-preemptible. For training only, use train. Enum: \"build\", \"train\", \"interactive-preemptible\" string Workspace storage Contains all the fields related to storage configurations. More information in Storage fields below. - Workspace Training security Contains all the fields related to security configurations. More information in Security fields below. - Workspace Training compute Contains all the fields related to compute configurations. More information in Compute fields below. - Workspace Training"},{"location":"platform-admin/workloads/policies/policy-reference/#ports-fields","title":"Ports fields","text":"Fields Description Value type Supported Run:ai workload type container The port that the container running the workload exposes. string Workspace Training serviceType Specifies the default service exposure method for ports. the default shall be sued for ports which do not specify service type. Options are: LoadBalancer, NodePort or ClusterIP. For more information see the External Access to Containers guide. string Workspace Training external The external port which allows a connection to the container port. If not specified, the port is auto-generated by the system. integer Workspace Training toolType The tool type that runs on this port. string Workspace Training toolName A name describing the tool that runs on this port. 
string Workspace Training"},{"location":"platform-admin/workloads/policies/policy-reference/#probes-fields","title":"Probes fields","text":"Fields Description Value type Supported Run:ai workload type readiness Specifies the Readiness Probe to use to determine if the container is ready to accept traffic. - Workspace Training Readiness field details Spec fields readiness Description Specifies the Readiness Probe to use to determine if the container is ready to accept traffic Supported Run:ai workload types Workspace Training Value type itemized Spec Readiness fields Description Value type initialDelaySeconds Number of seconds after the container has started before liveness or readiness probes are initiated. integer periodSeconds How often (in seconds) to perform the probe. integer timeoutSeconds Number of seconds after which the probe times out integer successThreshold Minimum consecutive successes for the probe to be considered successful after having failed. integer failureThreshod When a probe fails, the number of times to try before giving up. integer

      Example workload snippet:

      defaults:\n  probes:\n    readiness:\n        initialDelaySeconds: 2\n
      "},{"location":"platform-admin/workloads/policies/policy-reference/#security-fields","title":"Security fields","text":"Fields Description Value type Supported Run:ai workload type uidGidSource Indicates the way to determine the user and group ids of the container. The options are: fromTheImage - user and group IDs are determined by the docker image that the container runs. This is the default option. custom - user and group IDs can be specified in the environment asset and/or the workspace creation request. idpToken - user and group IDs are determined according to the identity provider (idp) access token. This option is intended for internal use of the environment UI form. For more information, see Non-root containers string Workspace Training capabilities The capabilities field allows adding a set of unix capabilities to the container running the workload. Capabilities are Linux distinct privileges traditionally associated with superuser which can be independently enabled and disabled Array Workspace Training seccompProfileType Indicates which kind of seccomp profile is applied to the container. The options are: RuntimeDefault - the container runtime default profile should be used Unconfined - no profile should be applied string Workspace Training runAsNonRoot Indicates that the container must run as a non-root user. boolean Workspace Training readOnlyRootFilesystem If true, mounts the container's root filesystem as read-only. boolean Workspace Training runAsUid Specifies the Unix user id with which the container running the created workload should run. integer Workspace Training runasGid Specifies the Unix Group ID with which the container should run. integer Workspace Training supplementalGroups Comma separated list of groups that the user running the container belongs to, in addition to the group indicated by runAsGid. string Workspace Training allowPrivilegeEscalation Allows the container running the workload and all launched processes to gain additional privileges after the workload starts boolean Workspace Training hostIpc Whether to enable hostIpc. Defaults to false. boolean Workspace Training hostNetwork Whether to enable host network. boolean Workspace Training"},{"location":"platform-admin/workloads/policies/policy-reference/#compute-fields","title":"Compute fields","text":"Fields Description Value type Supported Run:ai workload type cpuCoreRequest CPU units to allocate for the created workload (0.5, 1, .etc). The workload receives at least this amount of CPU. Note that the workload is not scheduled unless the system can guarantee this amount of CPUs to the workload. number Workspace Training cpuCoreLimit Limitations on the number of CPUs consumed by the workload (0.5, 1, .etc). The system guarantees that this workload is not able to consume more than this amount of CPUs. number Workspace Training cpuMemoryRequest The amount of CPU memory to allocate for this workload (1G, 20M, .etc). The workload receives at least this amount of memory. Note that the workload is not scheduled unless the system can guarantee this amount of memory to the workload quantity Workspace Training cpuMemoryLimit Limitations on the CPU memory to allocate for this workload (1G, 20M, .etc). The system guarantees that this workload is not be able to consume more than this amount of memory. The workload receives an error when trying to allocate more memory than this limit. 
quantity Workspace Training largeShmRequest A large /dev/shm device to mount into a container running the created workload (shm is a shared file system mounted on RAM). boolean Workspace Training gpuRequestType Sets the unit type for GPU resources requests to either portion, memory or mig profile. Only if gpuDeviceRequest = 1, the request type can be stated as portion, memory or migProfile. string Workspace Training migProfile Specifies the memory profile to be used for workload running on NVIDIA Multi-Instance GPU (MIG) technology. string Workspace Training (Deprecated) gpuPortionRequest Specifies the fraction of GPU to be allocated to the workload, between 0 and 1. For backward compatibility, it also supports the number of gpuDevices larger than 1, currently provided using the gpuDevices field. number Workspace Training gpuDeviceRequest Specifies the number of GPUs to allocate for the created workload. Only if gpuDeviceRequest = 1, the gpuRequestType can be defined. integer Workspace Training gpuPortionLimit When a fraction of a GPU is requested, the GPU limit specifies the portion limit to allocate to the workload. The range of the value is from 0 to 1. number Workspace Training gpuMemoryRequest Specifies GPU memory to allocate for the created workload. The workload receives this amount of memory. Note that the workload is not scheduled unless the system can guarantee this amount of GPU memory to the workload. quantity Workspace Training gpuMemoryLimit Specifies a limit on the GPU memory to allocate for this workload. Should be no less than the gpuMemory. quantity Workspace Training extendedResources Specifies values for extended resources. Extended resources are third-party devices (such as high-performance NICs, FPGAs, or InfiniBand adapters) that you want to allocate to your Job. itemized Workspace Training"},{"location":"platform-admin/workloads/policies/policy-reference/#storage-fields","title":"Storage fields","text":"Fields Description Value type Supported Run:ai workload type dataVolume Set of data volumes to use in the workload. Each data volume is mapped to a file-system mount point within the container running the workload. itemized Workspace Training hostPath Maps a folder to a file-system mount point within the container running the workload. itemized Workspace Training git Details of the git repository and items mapped to it. itemized Workspace Training pvc Specifies persistent volume claims to mount into a container running the created workload. itemized Workspace Training nfs Specifies NFS volume to mount into the container running the workload. itemized Workspace Training s3 Specifies S3 buckets to mount into the container running the workload. itemized Workspace Training configMapVolumes Specifies ConfigMaps to mount as volumes into a container running the created workload. itemized Workspace Training secretVolume Set of secret volumes to use in the workload. A secret volume maps a secret resource in the cluster to a file-system mount point within the container running the workload. itemized Workspace Training Storage field details Spec fields hostPath Description Maps a folder to a file system mount oint within the container running the workload Supported Run:ai workload types Workspace Training Value type itemized Git fields Description Value type name Unique name to identify the instance. primarily used for policy locked rules. string path Local path within the controller to which the host volume is mapped. 
string readOnly Force the volume to be mounted with read-only permissions. Defaults to false. boolean mountPath The path that the host volume is mounted to when in use. string mountPropagation Enum: \"None\" \"HostToContainer\" Share this volume mount with other containers. If set to HostToContainer, this volume mount receives all subsequent mounts that are mounted to this volume or any of its subdirectories. In case of multiple hostPath entries, this field should have the same value for all of them string

      Example workload snippet:

      defaults:\n  storage:\n    hostPath:\n      instances:\n        - path: h3-path-1\n          mountPath: h3-mount-1\n        - path: h3-path-2\n          mountPath: h3-mount-2\n      attributes:\n        readOnly: true\n
      Spec fields: git
      Description: Details of the git repository and items mapped to it.
      Supported Run:ai workload types: Workspace, Training
      Value type: itemized

      Git fields | Description | Value type
      repository | URL to a remote git repository. The content of this repository is mapped to the container running the workload | string
      revision | Specific revision to synchronize the repository from | string
      path | Local path within the workspace to which the git repository is mapped. | string
      secretName | Optional name of Kubernetes secret that holds your git username and password. | string
      username | If secretName is provided, this field should contain the key, within the provided Kubernetes secret, which holds the value of your git username. Otherwise, this field should specify your git username in plain text (example: myuser). | string

      Example workload snippet:

      defaults:\n  storage:\n    git:\n      attributes:\n        repository: https://runai.public.github.com\n      instances:\n        - branch: \"master\"\n          path: /container/my-repository\n          passwordSecret: my-password-secret\n
      Spec fields: pvc
      Description: Specifies persistent volume claims to mount into a container running the created workload
      Supported Run:ai workload types: Workspace, Training
      Value type: itemized

      Spec PVC fields | Description | Value type
      claimName (mandatory) | A given name for the PVC. Allows referencing it across workspaces. | string
      ephemeral | Use true to set the PVC to ephemeral. If set to true, the PVC is deleted when the workspace is stopped. | boolean
      path | Local path within the workspace to which the PVC is mapped. | string
      readonly | Permits read only from the PVC, prevents additions or modifications to its content. | boolean
      readWriteOnce | Requesting claim that can be mounted in read/write mode to exactly 1 host. If none of the modes are specified, the default is readWriteOnce. | boolean
      size | Requested size for the PVC. Mandatory when existing PVC is false. | string
      storageClass | Storage class name to associate with the PVC. This parameter may be omitted if there is a single storage class in the system, or you are using the default storage class. Further details at Kubernetes storage classes. | string
      readOnlyMany | Requesting claim that can be mounted in read-only mode to many hosts. | boolean
      readWriteMany | Requesting claim that can be mounted in read/write mode to many hosts. | boolean

      Example workload snippet:

      defaults:\n  storage:\n    pvc:\n      instances:\n        - claimName: pvc-staging-researcher1-home\n          existingPvc: true\n          path: /myhome\n          readOnly: false\n          claimInfo:\n            accessModes:\n              readWriteMany: true\n
      Spec fields: nfs
      Description: Specifies NFS volume to mount into the container running the workload
      Supported Run:ai workload types: Workspace, Training
      Value type: itemized

      Spec NFS fields | Description | Value type
      mountPath | The path that the NFS volume is mounted to when in use. | string
      path | Path that is exported by the NFS server. | string
      readOnly | Whether to force the NFS export to be mounted with read-only permissions. | boolean
      nfsServer | The hostname or IP address of the NFS server. | string

      Example workload snippet:

      defaults:\n  storage:\n    nfs:\n      instances:\n        - path: nfs-path\n          readOnly: true\n          server: nfs-server\n          mountPath: nfs-mount\nrules:\n  storage:\n    nfs:\n      instances:\n        canAdd: false\n
      Spec fields: s3
      Description: Specifies S3 buckets to mount into the container running the workload
      Supported Run:ai workload types: Workspace, Training
      Value type: itemized

      Spec S3 fields | Description | Value type
      bucket | The name of the bucket | string
      path | Local path within the workspace to which the S3 bucket is mapped | string
      url | The URL of the S3 service provider. The default is the URL of the Amazon AWS S3 service | string

      Example workload snippet:

      defaults:\n  storage:\n    s3:\n      instances:\n        - bucket: bucket-opt-1\n          path: /s3/path\n          accessKeySecret: s3-access-key\n          secretKeyOfAccessKeyId: s3-secret-id\n          secretKeyOfSecretKey: s3-secret-key\n      attributes:\n        url: https://amazonaws.s3.com\n
      "},{"location":"platform-admin/workloads/policies/policy-reference/#value-types","title":"Value types","text":"

      Each field has a specific value type. The following value types are supported.

      Value type | Description | Supported rule type | Defaults
      Boolean | A binary value that can be either True or False | canEdit, required | true/false
      String | A sequence of characters used to represent text. It can include letters, numbers, symbols, and spaces | canEdit, required, options | abc
      Itemized | An ordered collection of items (objects), which can be of different types (all items in the list are of the same type). For further information see the chapter below the table. | canAdd, locked | See below
      Integer | An Integer is a whole number without a fractional component. | canEdit, required, min, max, step, defaultFrom | 100
      Number | Capable of having non-integer values | canEdit, required, min, defaultFrom | 10.3
      Quantity | Holds a string composed of a number and a unit representing a quantity | canEdit, required, min, max, defaultFrom | 5M
      Array | Set of values that are treated as one, as opposed to Itemized in which each item can be referenced separately. | canEdit, required | node-a node-b node-c
      "},{"location":"platform-admin/workloads/policies/policy-reference/#itemized","title":"Itemized","text":"

      Workload fields of type itemized have multiple instances; however, in comparison to arrays, each instance can be referenced by a key field. The key field is defined for each itemized field.

                    Consider the following workload spec:

                    spec:\n  image: ubuntu\n  compute:\n    extendedResources:\n      - resource: added/cpu\n        quantity: 10\n      - resource: added/memory\n        quantity: 20M\n

                    In this example, extendedResources have two instances, each has two attributes: resource (the key attribute) and quantity.

      In a policy, the defaults and rules for itemized fields have two sub-sections:

                    • Instances: default items to be added to the policy or rules which apply to an instance as a whole.
                    • Attributes: defaults for attributes within an item or rules which apply to attributes within each item.

                    Consider the following example:

                    defaults:\n  compute:\n    extendedResources:\n      instances: \n        - resource: default/cpu\n          quantity: 5\n        - resource: default/memory\n          quantity: 4M\n      attributes:\n        quantity: 3\nrules:\n  compute:\n    extendedResources:\n      instances:\n        locked: \n          - default/cpu\n      attributes:\n        quantity: \n          required: true\n

                    Assume the following workload submission is requested:

                    spec:\n  image: ubuntu\n  compute:\n    extendedResources:\n      - resource: default/memory\n        exclude: true\n      - resource: added/cpu\n      - resource: added/memory\n        quantity: 5M\n

                    The effective policy for the above mentioned workload has the following extendedResources instances:

      Resource | Source of the instance | Quantity | Source of the attribute quantity
      default/cpu | Policy defaults | 5 | The default of this instance in the policy defaults section
      added/cpu | Submission request | 3 | The default of the quantity attribute from the attributes section
      added/memory | Submission request | 5M | Submission request

                    Note

      The default/memory instance is not populated to the workload because it has been excluded from the workload using "exclude: true".

      A workload submission request cannot exclude the default/cpu resource, as this key is included in the locked rules under the instances section.

                    "},{"location":"platform-admin/workloads/policies/policy-reference/#rule-types","title":"Rule types","text":"Rule types Description Supported value types Rule type example canAdd Whether the submission request can add items to an itemized field other than those listed in the policy defaults for this field. itemized storage: hostPath: instances: canAdd: false locked Set of items that the workload is unable to modify or exclude. In this example, a workload policy default is given to HOME and USER, that the submission request cannot modify or exclude from the workload. itemized storage: hostPath: Instances: locked: - HOME - USER canEdit Whether the submission request can modify the policy default for this field. In this example, it is assumed that the policy has default for imagePullPolicy. As canEdit is set to false, submission requests are not able to alter this default.
                    • string
                    • boolean
                    • integer
                    • number
                    • quantity
                    • array imagePullPolicy: canEdit: false required When set to true, the workload must have a value for this field. The value can be obtained from policy defaults. If no value specified in the policy defaults, a value must be specified for this field in the submission request.
                      • string
                      • boolean
                      • integer
                      • number
                      • quantity
                      • array image: required: true min The minimal value for the field.
                        • integer
                        • number
                        • quantity compute: gpuDevicesRequest: min: 3 max The maximal value for the field.
                          • integer
                          • number
                          • quantity compute: gpuMemoryRequest: max: 2G step The allowed gap between values for this field. In this example the allowed values are: 1, 3, 5, 7
                            • integer
                            • number compute: cpuCoreRequest: min: 1 max: 7 Step: 2 options Set of allowed values for this field. string image: options: - value: image-1 - value: image-2 defaultFrom Set a default value for a field that will be calculated based on the value of another field.
                              • integer
                              • number
                              • quantity computeCoreRequest: defaultFrom: field:compute.cpuCoreLimit factor:0.5"},{"location":"platform-admin/workloads/policies/policy-reference/#policy-spec-sections","title":"Policy Spec Sections","text":"

                                For each field of a specific policy, you can specify both rules and defaults. A policy spec consists of the following sections:

                                • Rules
                                • Defaults
                                • Imposed Assets
                                "},{"location":"platform-admin/workloads/policies/policy-reference/#rules","title":"Rules","text":"

                                Rules set up constraints on workload policy fields. For example, consider the following policy:

                                rules:\n  compute:\n    gpuDevicesRequest: \n      max: 8\n  security:\n    runAsUid: \n      min: 500\n

                                Such a policy restricts the maximum value for gpuDeviceRequests to 8, and the minimal value for runAsUid, provided in the security section to 500.

                                "},{"location":"platform-admin/workloads/policies/policy-reference/#defaults","title":"Defaults","text":"

                                The defaults section is used for providing defaults for various workload fields. For example, consider the following policy:

                                defaults:\n  imagePullPolicy: Always\n  security:\n    runAsNonRoot: true\n    runAsUid: 500\n

                                Assume a submission request with the following values:

                                • Image: ubuntu
                                • runAsUid: 501

                                The effective workload that runs has the following set of values:

Field Value Source image ubuntu Submission request imagePullPolicy Always Policy defaults security.runAsNonRoot true Policy defaults security.runAsUid 501 Submission request
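
Expressed as an illustrative sketch (not an actual file that is submitted), the effective workload is the merge of the policy defaults and the submission request; the comments indicate the source of each value:

image: ubuntu            # from the submission request\nimagePullPolicy: Always  # from the policy defaults\nsecurity:\n  runAsNonRoot: true     # from the policy defaults\n  runAsUid: 501          # the submission request overrides the default of 500\n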

                                Note

It is possible to specify a rule for each field, which states whether a submission request is allowed to change the policy default for that given field, for example:

                                defaults:\n  imagePullPolicy: Always\n  security:\n    runAsNonRoot: true\n    runAsUid: 500\nrules:\n  security:\n    runAsUid:\n      canEdit: false\n

If this policy is applied, the submission request above fails, as it attempts to change the value of security.runAsUid from 500 (the policy default) to 501 (the value provided in the submission request), which is forbidden because the canEdit rule is set to false for this field.
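
If the intent is to let submission requests raise runAsUid while still enforcing a lower bound, a min rule could be used instead of canEdit: false. With a policy such as the following sketch, the submission request above (runAsUid: 501) would be accepted:

defaults:\n  imagePullPolicy: Always\n  security:\n    runAsNonRoot: true\n    runAsUid: 500\nrules:\n  security:\n    runAsUid:\n      min: 500\n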

                                "},{"location":"platform-admin/workloads/policies/policy-reference/#imposed-assets","title":"Imposed Assets","text":"

                                Default instances of a storage field can be provided using a datasource containing the details of this storage instance. To add such instances in the policy, specify those asset IDs in the imposedAssets section of the policy.

                                defaults: null\nrules: null\nimposedAssets:\n  - f12c965b-44e9-4ff6-8b43-01d8f9e630cc\n

Assets that reference credential assets (for example, a private S3 data source that references an AccessKey asset) cannot be used as imposedAssets.
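
Since a policy spec consists of rules, defaults, and imposed assets, the three sections can appear together in the same policy. The following sketch reuses values from the sections above:

defaults:\n  imagePullPolicy: Always\nrules:\n  imagePullPolicy:\n    canEdit: false\nimposedAssets:\n  - f12c965b-44e9-4ff6-8b43-01d8f9e630cc\n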

                                "},{"location":"platform-admin/workloads/policies/workspaces-policy/","title":"Policies","text":"

                                This article explains the procedure to manage workload policies.

                                "},{"location":"platform-admin/workloads/policies/workspaces-policy/#workload-policies-table","title":"Workload policies table","text":"

                                The Workload policies table can be found under Policies in the Run:ai platform.

                                Note

Workload policies are disabled by default. If you cannot see Workload policies in the menu, they must be enabled by your administrator under General settings \u2192 Workloads \u2192 Policies.

                                The Workload policies table provides a list of all the policies defined in the platform, and allows you to manage them.

                                The Workload policies table consists of the following columns:

                                Column Description Policy The policy name which is a combination of the policy scope and the policy type Type The policy type is per Run:ai workload type. This allows administrators to set different policies for each workload type. Status Representation of the policy lifecycle (one of the following - \u201cCreating\u2026\u201d, \u201cUpdating\u2026\u201d, \u201cDeleting\u2026\u201d, Ready or Failed) Scope The scope the policy affects. Click the name of the scope to view the organizational tree diagram. You can only view the parts of the organizational tree for which you have permission to view. Created by The user who created the policy Creation time The timestamp for when the policy was created Last updated The last time the policy was updated"},{"location":"platform-admin/workloads/policies/workspaces-policy/#customizing-the-table-view","title":"Customizing the table view","text":"
                                • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                • Search - Click SEARCH and type the value to search by
                                • Sort - Click each column header to sort by
                                • Column selection - Click COLUMNS and select the columns to display in the table
                                • Refresh - Click REFRESH to update the table with the latest data
                                "},{"location":"platform-admin/workloads/policies/workspaces-policy/#adding-a-policy","title":"Adding a policy","text":"

                                To create a new policy:

                                1. Click +NEW POLICY
                                2. Select a scope
                                3. Select the workload type
                                4. Click +POLICY YAML
5. In the YAML editor, type or paste a YAML policy with defaults and rules (a minimal example is sketched after this list). You can use the following references and examples:
  • Policy YAML reference
  • Policy YAML examples
6. Click SAVE POLICY
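
For example, a minimal policy YAML that could be pasted into the editor might look like the following sketch (the field names follow the Policy YAML reference; adjust the values to your needs):

defaults:\n  imagePullPolicy: Always\nrules:\n  compute:\n    gpuDevicesRequest:\n      max: 8\n    cpuCoreRequest:\n      min: 1\n
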
                                "},{"location":"platform-admin/workloads/policies/workspaces-policy/#editing-a-policy","title":"Editing a policy","text":"
                                1. Select the policy you want to edit
                                2. Click EDIT
                                3. Update the policy and click APPLY
                                4. Click SAVE POLICY
                                "},{"location":"platform-admin/workloads/policies/workspaces-policy/#troubleshooting","title":"Troubleshooting","text":"

                                Listed below are issues that might occur when creating or editing a policy via the YAML Editor:

Issue Message Mitigation Cluster connectivity issues There's no communication from cluster \u201ccluster_name\u201d. Actions may be affected, and the data may be stale. Verify that you are on a network that has been allowed access to the cluster. Reach out to your cluster administrator for instructions on verifying the issue. Policy can\u2019t be applied due to a rule that is occupied by a different policy Field \u201cfield_name\u201d already has rules in cluster: \u201ccluster_id\u201d Remove the rule from the new policy or adjust the old policy for the specific rule. Policy is not visible in the UI - Check that the policy hasn\u2019t been deleted. Policy syntax is not valid Add a valid policy YAML;json: unknown field \"field_name\" For correct syntax, check the Policy YAML reference or the Policy YAML examples. Policy can\u2019t be saved for some reason The policy couldn't be saved due to a network or other unknown issue. Download your draft and try pasting and saving it again later. Possible cluster connectivity issues. Try updating the policy once again at a different time. Policies were submitted before version 2.18, you upgraded to version 2.18 or above and wish to submit new policies If you have policies and want to create a new one, first contact Run:ai support to prevent potential conflicts Contact Run:ai support. R&D can migrate your old policies to the new version."},{"location":"platform-admin/workloads/policies/workspaces-policy/#viewing-a-policy","title":"Viewing a policy","text":"

                                To view a policy:

1. Select the policy you want to view.
                                2. Click VIEW POLICY
                                3. In the Policy form per workload section, view the workload rules and defaults:
  • Parameter - The workload submission parameter that the Rules and Defaults are applied to
  • Type (applicable for data sources only) - The data source type (Git, S3, NFS, PVC, etc.)
  • Default - The default value of the Parameter
  • Rule - The constraint set on the workload policy field
  • Source - The origin of the applied policy (cluster, department or project)

                                Note

                                Some of the rules and defaults may be derived from policies of a parent cluster and/or department. You can see the source of each rule in the policy form. For more information, check the Scope of effectiveness documentation

                                "},{"location":"platform-admin/workloads/policies/workspaces-policy/#deleting-a-policy","title":"Deleting a policy","text":"
                                1. Select the policy you want to delete
                                2. Click DELETE
                                3. On the dialog, click DELETE to confirm the deletion
                                "},{"location":"platform-admin/workloads/policies/workspaces-policy/#using-api","title":"Using API","text":"

                                Go to the Policies API reference to view the available actions.

                                "}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"Researcher/overview-researcher/","title":"Overview: Researcher Documentation","text":"

                                Researchers, or AI practitioners, use Run:ai to submit Workloads.

                                As part of the Researcher documentation you will find:

                                • Quickstart Guides which provide step-by-step guides to Run:ai technology.
                                • Command line interface reference documentation.
                                • Best Practices for Deep Learning with Run:ai.
                                • Information about the Run:ai Scheduler.
                                • Using Run:ai with various developer tools.
                                "},{"location":"Researcher/use-cases/","title":"Use Cases","text":"

                                This is a collection of various client-requested use cases. Each use case is accompanied by a short live-demo video, along with all the files used.

                                Note

                                For the most up-to-date information, check out the official Run:ai use-cases GitHub page.

                                • MLflow with Run:ai: experiment management is important for Data Scientists. This is a demo of how to set up and use MLflow with Run:ai.
                                • Introduction to Docker: Run:ai runs using Docker images. This is a brief introduction to Docker, image creation, and how to use them in the context of Run:ai. Please also check out the Persistent Environments use case if you wish to keep the creation of Docker images to a minimum.
• Tensorboard with Jupyter (ResNet demo): Many Data Scientists like to use Tensorboard to keep an eye on their current training experiments. They also like to have it side-by-side with Jupyter. In this demo, we will show how to integrate Tensorboard and Jupyter Lab within the context of Run:ai.
                                • Persistent Environments (with Conda/Mamba & Jupyter): Some Data Scientists find creating Docker images for every single one of their environments a bit of a hindrance. They would often prefer the ability to create and alter environments on the fly and to have those environments remain, even after an image has finished running in a job. This demo shows users how they can create and persist Conda/Mamba environments using an NFS.
• Weights & Biases with Run:ai: W&B (Weights & Biases) is one of the best tools for experiment tracking and management. W&B is an official Run:ai partner. In this tutorial, we will demo how to use W&B alongside Run:ai.
                                "},{"location":"Researcher/Walkthroughs/quickstart-inference/","title":"Quickstart: Launch an Inference Workload","text":""},{"location":"Researcher/Walkthroughs/quickstart-inference/#introduction","title":"Introduction","text":"

                                Machine learning (ML) inference refers to the process of using a trained machine learning model to make predictions or generate outputs based on new, unseen data. After a model has been trained on a dataset, inference involves applying this model to new examples to produce results such as classifications, predictions, or other types of insights.

                                The quickstart below shows an inference server running the model and an inference client.

                                There are various ways to submit a Workload:

                                • Run:ai command-line interface (CLI)
                                • Run:ai user interface
                                • Run:ai API

                                At this time, Inference services cannot be created via the CLI. The CLI can be used for creating a client to query the inference service.

                                "},{"location":"Researcher/Walkthroughs/quickstart-inference/#prerequisites","title":"Prerequisites","text":"

                                To complete this Quickstart, the Infrastructure Administrator will need to install some optional inference prerequisites as described here.

                                To complete this Quickstart, the Platform Administrator will need to provide you with:

• ML Engineer access to a Project in Run:ai named "team-a"
                                • The project should be assigned a quota of at least 1 GPU.
                                • The URL of the Run:ai Console. E.g. https://acme.run.ai.

As described, the inference client can be created via the CLI. To do this, you will need to have the Run:ai CLI installed on your machine. There are two available CLI variants:

                                • The older V1 CLI. See installation here
                                • A newer V2 CLI, supported with clusters of version 2.18 and up. See installation here
                                "},{"location":"Researcher/Walkthroughs/quickstart-inference/#step-by-step-walkthrough","title":"Step by Step Walkthrough","text":""},{"location":"Researcher/Walkthroughs/quickstart-inference/#login","title":"Login","text":"CLI V1CLI V2User InterfaceAPI

                                Run runai login and enter your credentials.

                                Run runai login and enter your credentials.

                                Browse to the provided Run:ai user interface and log in with your credentials.

                                To use the API, you will need to obtain a token. Please follow the api authentication article.

                                "},{"location":"Researcher/Walkthroughs/quickstart-inference/#create-an-inference-server-environment","title":"Create an Inference Server Environment","text":"

                                To complete this Quickstart via the UI, you will need to create a new Inference Server Environment asset.

                                This is a one-time step for all Inference workloads using the same image.

Under Environments, select NEW ENVIRONMENT. Then select:

                                • A default (cluster) scope.
                                • Use the environment name inference-server.
                                • The image runai.jfrog.io/demo/example-triton-server.
                                • Under type of workload select inference.
• Under endpoint, set the container port to 8000, which is the port the Triton server uses.
                                "},{"location":"Researcher/Walkthroughs/quickstart-inference/#run-an-inference-workload","title":"Run an Inference Workload","text":"CLI V1CLI V2User InterfaceAPI

                                Not available right now.

                                Not available right now.

                                • In the Run:ai UI select Workloads
                                • Select New Workload and then Inference
                                • You should already have Cluster and Project selected. Enter inference-server-1 as the name and press CONTINUE.
                                • Under Environment, select inference-server.
                                • Under Compute Resource, select half-gpu.
• Under Replica autoscaling, select a minimum of 1 and a maximum of 2.
• Under conditions for a new replica, select Concurrency and set the value to 3.
• Set the scale to zero option to 5 minutes.
                                • Select CREATE INFERENCE.

                                Note

                                For more information on submitting Workloads and creating Assets via the user interface, see Workload documentation.

                                curl -L 'https://<COMPANY-URL>/api/v1/workloads/inferences' \\ # (1)\n-H 'Content-Type: application/json' \\\n-H 'Authorization: Bearer <TOKEN>' \\ # (2)\n-d '{ \n    \"name\": \"inference-server-1\", \n    \"projectId\": \"<PROJECT-ID>\", '\\ # (3)\n    \"clusterId\": \"<CLUSTER-UUID>\", \\ # (4)\n    \"spec\": {\n        \"image\": \"runai.jfrog.io/demo/example-triton-server\",\n        \"servingPort\": {\n            \"protocol\": \"http\",\n            \"container\": 8000\n        },\n        \"autoscaling\": {\n            \"minReplicas\": 1,\n            \"maxReplicas\": 2,\n            \"metric\": \"concurrency\",\n            \"metricThreshold\": 3,\n            \"scaleToZeroRetentionSeconds\": 300\n        },\n        \"compute\": {\n            \"cpuCoreRequest\": 0.1,\n            \"gpuRequestType\": \"portion\",\n            \"cpuMemoryRequest\": \"100M\",\n            \"gpuDevicesRequest\": 1,\n            \"gpuPortionRequest\": 0.5\n        }\n    }\n}'\n
                                1. <COMPANY-URL> is the link to the Run:ai user interface. For example acme.run.ai
2. <TOKEN> is an API access token. See above on how to obtain a valid token.
3. <PROJECT-ID> is the ID of the team-a Project. You can get the Project ID via the Get Projects API
                                4. <CLUSTER-UUID> is the unique identifier of the Cluster. You can get the Cluster UUID by adding the \"Cluster ID\" column to the Clusters view.

                                Note

• The above API snippet will only work with Run:ai clusters of version 2.18 and above. For older clusters, use the now-deprecated Cluster API.
                                • For more information on the Inference Submit API see API Documentation

This would start a Triton inference server with a maximum of 2 replicas, each consuming half a GPU.

                                "},{"location":"Researcher/Walkthroughs/quickstart-inference/#query-the-inference-server","title":"Query the Inference Server","text":"

You can use the Run:ai Triton demo client to send requests to the server.

                                "},{"location":"Researcher/Walkthroughs/quickstart-inference/#find-the-inference-server-endpoint","title":"Find the Inference Server Endpoint","text":"
                                • Under Workloads, select Columns on the top right. Add the column Connections.
                                • See the connections of the inference-server-1 workload:
                                • Copy the inference endpoint URL.
                                CLI V1CLI V2User Interface

                                Open a terminal and run:

                                runai config project team-a   \nrunai submit inference-client-1  -i runai.jfrog.io/demo/example-triton-client \\\n-- perf_analyzer -m inception_graphdef  -p 3600000 -u  <INFERENCE-ENDPOINT>    \n

                                Open a terminal and run:

                                runai project set team-a\nrunai training submit inference-client-1  -i runai.jfrog.io/demo/example-triton-client \\\n-- perf_analyzer -m inception_graphdef  -p 3600000 -u  <INFERENCE-ENDPOINT>    \n
                                • In the Run:ai UI select Workloads
                                • Select New Workload and then Training
                                • You should already have Cluster, Project and a start from scratch Template selected. Enter inference-client-1 as the name and press CONTINUE.
                                • Select NEW ENVIRONMENT. Enter inference-client as the name and runai.jfrog.io/demo/example-triton-client as the image. Select CREATE ENVIRONMENT.
                                • When the previous screen comes up, select cpu-only under the Compute resource.
• Under runtime settings, enter perf_analyzer as the command and -m inception_graphdef -p 3600000 -u <INFERENCE-ENDPOINT> as the arguments (replace <INFERENCE-ENDPOINT> with the endpoint URL copied above).
                                • Select CREATE TRAINING.

                                In the user interface, under inference-server-1, go to the Metrics tab and watch as the various GPU and inference metrics graphs rise.

                                "},{"location":"Researcher/Walkthroughs/quickstart-inference/#stop-workload","title":"Stop Workload","text":"

                                Run the following:

                                CLI V1CLI V2User Interface

                                Not available right now

                                Not available right now

                                Select the two workloads and press DELETE.

                                "},{"location":"Researcher/Walkthroughs/quickstart-overview/","title":"Run:ai Quickstart Guides","text":"

                                Below is a set of Quickstart documents. The purpose of these documents is to get you acquainted with an aspect of Run:ai in the simplest possible form.

                                Note

                                The Quickstart documents are based solely on the command-line interface. The same functionality can be achieved by using the Workloads User interface which allows for Workload submission and log viewing.

                                Follow the Quickstart documents below to learn more:

                                • Training Quickstart documents:
                                  • Standard training sessions
                                  • Distributed Training
                                • Build Quickstart documents:
                                  • Basic Interactive build sessions
  • Interactive build session with connected ports
                                  • Jupyter Notebook
                                  • Visual Studio Web
                                • Inference
                                • GPU Allocation documents:
                                  • Using GPU Fractions
                                • Scheduling documents:
                                  • Over-Quota, Basic Fairness & Bin Packing
                                  • Fairness

Most quickstarts rely on an image called runai.jfrog.io/demo/quickstart. The image is based on TensorFlow Release 20-08. This TensorFlow image has minimum requirements for CUDA and NVIDIA Compute Capability.

                                If your GPUs do not meet these requirements, use runai.jfrog.io/demo/quickstart:legacy instead.

                                "},{"location":"Researcher/Walkthroughs/quickstart-vscode/","title":"Quickstart: Launch Workspace with a Visual Studio Code for Web","text":""},{"location":"Researcher/Walkthroughs/quickstart-vscode/#introduction","title":"Introduction","text":"

The purpose of this article is to provide a quick ramp-up to running a Workspace that runs Visual Studio Code (Web edition). Workspaces are containers that keep running until deleted by the user.

                                There are various ways to submit a Workspace:

                                • Run:ai command-line interface (CLI)
                                • Run:ai user interface
                                • Run:ai API
                                "},{"location":"Researcher/Walkthroughs/quickstart-vscode/#prerequisites","title":"Prerequisites","text":"

                                To complete this Quickstart, the Infrastructure Administrator will need to configure a wildcard certificate to Run:ai as described here.

                                To complete this Quickstart, the Platform Administrator will need to provide you with:

• Researcher access to a Project in Run:ai named "team-a"
                                • The project should be assigned a quota of at least 1 GPU.
                                • A URL of the Run:ai Console. E.g. https://acme.run.ai.

                                To complete this Quickstart via the CLI, you will need to have the Run:ai CLI installed on your machine. There are two available CLI variants:

                                • The older V1 CLI. See installation here
                                • A newer V2 CLI, supported with clusters of version 2.18 and up. See installation here
                                "},{"location":"Researcher/Walkthroughs/quickstart-vscode/#step-by-step-walkthrough","title":"Step by Step Walkthrough","text":""},{"location":"Researcher/Walkthroughs/quickstart-vscode/#login","title":"Login","text":"CLI V1CLI V2User InterfaceAPI

                                Run runai login and enter your credentials.

                                Run runai login and enter your credentials.

                                Browse to the provided Run:ai user interface and log in with your credentials.

                                To use the API, you will need to obtain a token. Please follow the api authentication article.

                                "},{"location":"Researcher/Walkthroughs/quickstart-vscode/#create-a-visual-studio-environment","title":"Create a Visual Studio Environment","text":"

                                To complete this Quickstart via the UI, you will need to create a new Visual Studio Environment asset.

                                This is a one-time step for all VSCode Workloads.

Under Environments, select NEW ENVIRONMENT. Then select:

                                • A scope (where you want your environment to live).
                                • Use the environment name vscode.
                                • The image quay.io/opendatahub-contrib/workbench-images:vscode-datascience-c9s-py311_2023c_latest.
                                • Under Tools, add Visual Studio Code and change the port to 8787.
                                "},{"location":"Researcher/Walkthroughs/quickstart-vscode/#run-workload","title":"Run Workload","text":"CLI V1CLI V2User InterfaceAPI

                                Open a terminal and run:

                                runai config project team-a   \nrunai submit vs1 --jupyter -g 1\n

                                Note

                                For more information on the workload submit command, see cli documentation.

                                Open a terminal and run:

                                runai project set team-a\nrunai workspace submit vs1  --image quay.io/opendatahub-contrib/workbench-images:vscode-datascience-c9s-py311_2023c_latest \\\n    --gpu-devices-request 1  --external-url container=8787  \n

                                Note

                                For more information on the workspace submit command, see cli documentation.

                                • In the Run:ai UI select Workloads
                                • Select New Workload and then Workspace
                                • You should already have Cluster, Project and a start from scratch Template selected. Enter vs1 as the name and press CONTINUE.
• Under Environment, select the previously created vscode environment.
                                • Under Compute Resource, select one-gpu.
                                • Select CREATE WORKSPACE.

                                Note

                                For more information on submitting Workloads and creating Assets via the user interface, see Workload documentation.

                                curl -L 'https://<COMPANY-URL>/api/v1/workloads/workspaces' \\ # (1)\n-H 'Content-Type: application/json' \\\n-H 'Authorization: Bearer <TOKEN>' \\ # (2)\n-d '{ \n    \"name\": \"vs1\", \n    \"projectId\": \"<PROJECT-ID>\", '\\ # (3)\n    \"clusterId\": \"<CLUSTER-UUID>\", \\ # (4)\n    \"spec\": {\n        \"image\": \"quay.io/opendatahub-contrib/workbench-images:vscode-datascience-c9s-py311_2023c_latest\",\n        \"compute\": {\n            \"gpuDevicesRequest\": 1\n        },\n        \"exposedUrls\" : [\n            { \n                \"container\" : 8787,\n                \"toolType\": \"visual-studio-code\", \\ # (5)\n                \"toolName\": \"Visual Studio\" \\ # (6)\n            }\n        ]\n    }\n}'\n
                                1. <COMPANY-URL> is the link to the Run:ai user interface. For example acme.run.ai
2. <TOKEN> is an API access token. See above on how to obtain a valid token.
3. <PROJECT-ID> is the ID of the team-a Project. You can get the Project ID via the Get Projects API
                                4. <CLUSTER-UUID> is the unique identifier of the Cluster. You can get the Cluster UUID by adding the \"Cluster ID\" column to the Clusters view.
                                5. toolType will show the Visual Studio icon when connecting to the Visual Studio tool via the user interface.
                                6. toolName text will show when connecting to the Visual Studio tool via the user interface.

                                Note

• The above API snippet will only work with Run:ai clusters of version 2.18 and above. For older clusters, use the now-deprecated Cluster API.
• For more information on the Workspace Submit API see API Documentation

                                This would start a Workspace with a pre-configured Visual Studio Code image with an allocation of a single GPU.

                                "},{"location":"Researcher/Walkthroughs/quickstart-vscode/#accessing-visual-studio-web","title":"Accessing Visual Studio Web","text":"

                                Via the Run:ai user interface, go to Workloads, select the vs1 Workspace and press Connect.

                                "},{"location":"Researcher/Walkthroughs/quickstart-vscode/#stop-workload","title":"Stop Workload","text":"

                                Run the following:

                                CLI V1CLI V2User Interface
                                runai delete job vs1\n
                                runai workspace delete vs1\n

                                Select the Workspace and press DELETE.

                                "},{"location":"Researcher/Walkthroughs/walkthrough-build-ports/","title":"Quickstart: Launch Interactive Build Workloads with Connected Ports","text":""},{"location":"Researcher/Walkthroughs/walkthrough-build-ports/#introduction","title":"Introduction","text":"

                                This Quickstart is an extension of the Quickstart document: Start and Use Interactive Build Workloads

When starting a container with the Run:ai Command-Line Interface (CLI), it is sometimes necessary to expose internal ports to the user, for example to access a Jupyter Notebook or to use the container from a development environment such as PyCharm.

                                "},{"location":"Researcher/Walkthroughs/walkthrough-build-ports/#exposing-a-container-port","title":"Exposing a Container Port","text":"

There are three ways to expose ports in Kubernetes: Port Forwarding, NodePort, and LoadBalancer. The first two will always work. The other requires a special setup by your administrator. These methods are explained here.

                                The document below provides an example based on Port Forwarding.

                                "},{"location":"Researcher/Walkthroughs/walkthrough-build-ports/#port-forwarding-step-by-step-walkthrough","title":"Port Forwarding, Step by Step Walkthrough","text":""},{"location":"Researcher/Walkthroughs/walkthrough-build-ports/#setup","title":"Setup","text":"
                                • Login to the Projects area of the Run:ai user interface.
                                • Add a Project named team-a.
                                "},{"location":"Researcher/Walkthroughs/walkthrough-build-ports/#run-workload","title":"Run Workload","text":"
                                • At the command-line run:
                                runai config project team-a\nrunai submit nginx-test -i zembutsu/docker-sample-nginx --interactive\nrunai port-forward nginx-test --port 8080:80\n
                                • The Job is based on a sample NGINX webserver docker image zembutsu/docker-sample-nginx. Once accessed via a browser, the page shows the container name.
• Note the --interactive flag, which means the Job will not end on its own; it is the Researcher's responsibility to close the Job.
• In this example, we have chosen the simplest scheme to expose ports, which is port forwarding. Port 8080 is temporarily exposed on localhost for as long as the runai port-forward command is running.
                                • It is possible to forward traffic from multiple IP addresses by using the \"--address\" parameter. Check the CLI reference for further details.

                                The result will be:

                                The job 'nginx-test-0' has been submitted successfully\nYou can run `runai describe job nginx-test-0 -p team-a` to check the job status\n\nForwarding from 127.0.0.1:8080 -> 80\nForwarding from [::1]:8080 -> 80\n
                                "},{"location":"Researcher/Walkthroughs/walkthrough-build-ports/#access-the-webserver","title":"Access the Webserver","text":"

Open a browser at http://localhost:8080.

                                You should see a web page with the name of the container.

                                "},{"location":"Researcher/Walkthroughs/walkthrough-build-ports/#stop-workload","title":"Stop Workload","text":"

                                Press Ctrl-C in the shell to stop port forwarding. Then delete the Job by running runai delete job nginx-test

                                "},{"location":"Researcher/Walkthroughs/walkthrough-build-ports/#see-also","title":"See Also","text":"
                                • Develop on Run:ai using Visual Studio Code
                                • Develop on Run:ai using PyCharm
• Use a Jupyter notebook with Run:ai.
                                "},{"location":"Researcher/Walkthroughs/walkthrough-build/","title":"Quickstart: Launch Interactive Build Workloads","text":""},{"location":"Researcher/Walkthroughs/walkthrough-build/#introduction","title":"Introduction","text":"

The purpose of this article is to provide a quick ramp-up to running an interactive Workspace for building data science programs. Data scientists typically use various tools such as Jupyter Notebook, PyCharm, or Visual Studio Code. However, in this quickstart, we will start by launching a bare-bones Workspace without such tools.

                                With this Quickstart you will learn how to:

                                • Start a workspace.
• Open a shell session to the workspace.
                                • Stop the workspace.
                                "},{"location":"Researcher/Walkthroughs/walkthrough-build/#prerequisites","title":"Prerequisites","text":"

                                To complete this Quickstart, the Platform Administrator will need to provide you with:

• Researcher access to a Project in Run:ai named "team-a"
                                • The project should be assigned a quota of at least 1 GPU.
                                • A URL of the Run:ai Console. E.g. https://acme.run.ai.

                                To complete this Quickstart via the CLI, you will need to have the Run:ai CLI installed on your machine. There are two available CLI variants:

                                • The older V1 CLI. See installation here
                                • A newer V2 CLI, supported with clusters of version 2.18 and up. See installation here
                                "},{"location":"Researcher/Walkthroughs/walkthrough-build/#step-by-step-quickstart","title":"Step by Step Quickstart","text":""},{"location":"Researcher/Walkthroughs/walkthrough-build/#login","title":"Login","text":"CLI V1CLI V2User InterfaceAPI

                                Run runai login and enter your credentials.

                                Run runai login and enter your credentials.

                                Browse to the provided Run:ai user interface and log in with your credentials.

                                To use the API, you will need to obtain a token. Please follow the api authentication article.

                                "},{"location":"Researcher/Walkthroughs/walkthrough-build/#create-a-workspace","title":"Create a Workspace","text":"CLI V1CLI V2User InterfaceAPI

                                Open a terminal and run:

                                runai config project team-a   \nrunai submit build1 -i ubuntu -g 1 --interactive -- sleep infinity\n

                                Note

                                For more information on the workload submit command, see cli documentation.

                                Open a terminal and run:

                                runai project set team-a\nrunai workspace submit build1 -i ubuntu -g 1 --command -- sleep infinity\n

                                Note

                                For more information on the workspace submit command, see cli documentation.

                                • In the Run:ai UI select Workloads
                                • Select New Workload and then Workspace
                                • You should already have Cluster, Project and a start from scratch Template selected. Enter build1 as the name and press CONTINUE.
                                • Select NEW ENVIRONMENT. Enter ubuntu as the name and ubuntu as the image. Then select CREATE ENVIRONMENT.
                                • When the previous screen comes up, select one-gpu under the Compute resource.
                                • Select CREATE WORKSPACE.

                                Note

                                For more information on submitting Workloads and creating Assets via the user interface, see Workload documentation.

curl -L 'https://<COMPANY-URL>/api/v1/workloads/workspaces' \\ # (1)\n-H 'Content-Type: application/json' \\\n-H 'Authorization: Bearer <TOKEN>' \\ # (2)\n-d '{ \n    \"name\": \"build1\", \n    \"projectId\": \"<PROJECT-ID>\", '\\ # (3)\n    \"clusterId\": \"<CLUSTER-UUID>\", \\ # (4)\n    \"spec\": {\n        \"command\" : \"sleep\",\n        \"args\" : \"infinity\",\n        \"image\": \"ubuntu\",\n        \"compute\": {\n        \"gpuDevicesRequest\": 1\n        }\n    }\n}'\n
                                1. <COMPANY-URL> is the link to the Run:ai user interface. For example acme.run.ai
2. <TOKEN> is an API access token. See above on how to obtain a valid token.
3. <PROJECT-ID> is the ID of the team-a Project. You can get the Project ID via the Get Projects API
                                4. <CLUSTER-UUID> is the unique identifier of the Cluster. You can get the Cluster UUID by adding the \"Cluster ID\" column to the Clusters view.

                                Note

• The above API snippet will only work with Run:ai clusters of version 2.18 and above. For older clusters, use the now-deprecated Cluster API.
                                • For more information on the Workspace Submit API see API Documentation
                                • This would start a workload of type Workspace for team-a with an allocation of a single GPU.
                                • We named the Workload build1.
                                • Note that, unlike a Training workload, a Workspace workload will not end automatically. It is the Researcher's responsibility to stop the Workload.
                                • The command provided is sleep infinity. You must provide a command or the container will start and then exit immediately. Alternatively, when using the command line, replace these flags with --attach to attach immediately to a session.
                                "},{"location":"Researcher/Walkthroughs/walkthrough-build/#list-workloads","title":"List Workloads","text":"

                                Follow up on the Workload's progress by running:

                                CLI V1CLI V2User Interface

                                runai list jobs\n
                                The result:

                                runai workspace list\n

                                The result:

                                Workload     Type        Status      Project     Preemptible      Running/Requested Pods     GPU Allocation\n\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\nvs1          Workspace   Running     team-a      No               1/1                        1.00\n
                                • Open the Run:ai user interface.
                                • Under \"Workloads\" you can view the new Workspace:

                                Select the Workloads and press Show Details to see the Workload details

                                Typical statuses you may see:

• ContainerCreating - The Docker image is being downloaded from the repository
                                • Pending - the job is waiting to be scheduled
                                • Running - the job is running

                                A full list of Job statuses can be found here

                                "},{"location":"Researcher/Walkthroughs/walkthrough-build/#describe-workload","title":"Describe Workload","text":"

                                To get additional status on your Workload run:

                                CLI V1CLI V2User Interface
                                runai describe job build1\n
                                runai workspace describe build1\n

                                Workload parameters can be viewed by adding more columns to the Workload list and by reviewing the Event History tab for the specific Workload.

                                "},{"location":"Researcher/Walkthroughs/walkthrough-build/#get-a-shell-to-the-container","title":"Get a Shell to the container","text":"CLI V1CLI V2

                                Run:

                                runai bash build1\n

                                runai workspace bash build1\n

This should provide a direct shell into the container.

                                "},{"location":"Researcher/Walkthroughs/walkthrough-build/#stop-workload","title":"Stop Workload","text":"

                                Run the following:

                                CLI V1CLI V2User Interface
                                runai delete job build1\n
                                runai workspace delete build1\n

                                Select the Workspace and press DELETE.

                                This would stop the workspace. You can verify this by running the list command again.

                                "},{"location":"Researcher/Walkthroughs/walkthrough-fractions/","title":"Quickstart: Launch Workloads with GPU Fractions","text":""},{"location":"Researcher/Walkthroughs/walkthrough-fractions/#introduction","title":"Introduction","text":"

                                Run:ai provides a Fractional GPU sharing system for containerized workloads on Kubernetes. The system supports workloads running CUDA programs and is especially suited for lightweight AI tasks such as inference and model building. The fractional GPU system transparently gives data science and AI engineering teams the ability to run multiple workloads simultaneously on a single GPU, enabling companies to run more workloads such as computer vision, voice recognition and natural language processing on the same hardware, lowering costs.

                                Run:ai\u2019s fractional GPU system effectively creates logical GPUs, with their own memory and computing space that containers can use and access as if they were self-contained processors. This enables several workloads to run in containers side-by-side on the same GPU without interfering with each other. The solution is transparent, simple, and portable; it requires no changes to the containers themselves.

In a typical use case, several Workloads run on the same GPU, multiplying the amount of work done with the same hardware.

                                The purpose of this article is to provide a quick ramp-up to running a training Workload with fractions of a GPU.

                                There are various ways to submit a Workload:

                                • Run:ai command-line interface (CLI)
                                • Run:ai user interface
                                • Run:ai API
                                "},{"location":"Researcher/Walkthroughs/walkthrough-fractions/#prerequisites","title":"Prerequisites","text":"

                                To complete this Quickstart, the Platform Administrator will need to provide you with:

• Researcher access to Run:ai
• Access to a Project named "team-a"
• At least 1 GPU assigned to the project.
                                • A link to the Run:ai Console. E.g. https://acme.run.ai.
                                • To complete this Quickstart via the CLI, you will need to have the Run:ai CLI installed on your machine. There are two available CLI variants:
                                  • The older V1 CLI. See installation here
                                  • A newer V2 CLI, supported with clusters of version 2.18 and up. See installation here
                                "},{"location":"Researcher/Walkthroughs/walkthrough-fractions/#step-by-step-walkthrough","title":"Step by Step Walkthrough","text":""},{"location":"Researcher/Walkthroughs/walkthrough-fractions/#login","title":"Login","text":"CLI V1CLI V2User InterfaceAPI

                                Run runai login and enter your credentials.

                                Run runai login and enter your credentials.

                                Browse to the provided Run:ai user interface and log in with your credentials.

                                To use the API, you will need to obtain a token. Please follow the api authentication article.

                                "},{"location":"Researcher/Walkthroughs/walkthrough-fractions/#run-workload","title":"Run Workload","text":"

                                Open a terminal and run:

                                CLI V1CLI V2User InterfaceAPI
                                runai config project team-a   \nrunai submit frac05 -i runai.jfrog.io/demo/quickstart -g 0.5\nrunai submit frac05-2 -i runai.jfrog.io/demo/quickstart -g 0.5 \n
                                runai project set team-a\nrunai training submit frac05 -i runai.jfrog.io/demo/quickstart --gpu-portion-request 0.5\nrunai training submit frac05-2 -i runai.jfrog.io/demo/quickstart --gpu-portion-request 0.5\n
                                • In the Run:ai UI select Workloads
                                • Select New Workload and then Training
                                • You should already have Cluster, Project and a start from scratch Template selected. Enter frac05 as the name and press CONTINUE.
                                • Select NEW ENVIRONMENT. Enter quickstart as the name and runai.jfrog.io/demo/quickstart as the image. Then select CREATE ENVIRONMENT.
                                • When the previous screen comes up, select half-gpu under the Compute resource.
                                • Select CREATE TRAINING.
                                • Follow the process again to submit a second workload called frac05-2.

                                Note

                                For more information on submitting Workloads and creating Assets via the user interface, see Workload documentation.

                                curl -L 'https://<COMPANY-URL>/api/v1/workloads/trainings' \\ # (1)\n-H 'Content-Type: application/json' \\\n-H 'Authorization: Bearer <TOKEN>' \\ # (2)\n-d '{ \n    \"name\": \"frac05\", \n    \"projectId\": \"<PROJECT-ID>\", '\\ # (3)\n    \"clusterId\": \"<CLUSTER-UUID>\", \\ # (4)\n    \"spec\": {\n        \"image\": \"runai.jfrog.io/demo/quickstart\",\n        \"compute\": {\n        \"gpuRequestType\": \"portion\",\n        \"gpuPortionRequest\" : 0.5\n        }\n    }\n}'\n
                                1. <COMPANY-URL> is the link to the Run:ai user interface. For example acme.run.ai
2. <TOKEN> is an API access token. See above on how to obtain a valid token.
3. <PROJECT-ID> is the ID of the team-a Project. You can get the Project ID via the Get Projects API
                                4. <CLUSTER-UUID> is the unique identifier of the Cluster. You can get the Cluster UUID by adding the \"Cluster ID\" column to the Clusters view.

                                Note

• The above API snippet will only work with Run:ai clusters of version 2.18 and above. For older clusters, use the now-deprecated Cluster API.
                                • For more information on the Training Submit API see API Documentation
• The Workloads are based on a sample Docker image, runai.jfrog.io/demo/quickstart. The image contains a startup script that runs a deep learning TensorFlow-based workload.
                                • We named the Workloads frac05 and frac05-2 respectively.
• The Workloads are assigned to team-a with an allocation of half a GPU each.
                                "},{"location":"Researcher/Walkthroughs/walkthrough-fractions/#list-workloads","title":"List Workloads","text":"

                                Follow up on the Workload's progress by running:

                                CLI V1CLI V2User Interface

                                runai list jobs\n
                                The result:

                                Showing jobs for project team-a\nNAME      STATUS   AGE  NODE                  IMAGE                          TYPE   PROJECT  USER   GPUs Allocated (Requested)  PODs Running (Pending)  SERVICE URL(S)\nfrac05    Running  9s   runai-cluster-worker  runai.jfrog.io/demo/quickstart  Train  team-a   yaron  0.50 (0.50)                 1 (0)\nfrac05-2  Running  8s   runai-cluster-worker  runai.jfrog.io/demo/quickstart  Train  team-a   yaron  0.50 (0.50)                 1 (0)\n
                                runai training list\n

                                The result:

                                Workload               Type        Status      Project     Preemptible      Running/Requested Pods     GPU Allocation\n\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\nfrac05      Training    Running  team-a      Yes              0/1                        0.00\nfrac05-2    Training    Running  team-a      Yes              0/1                        0.00    \n
                                • Open the Run:ai user interface.
                                • Under Workloads you can view the two new Training Workloads
                                "},{"location":"Researcher/Walkthroughs/walkthrough-fractions/#view-partial-gpu-memory","title":"View Partial GPU memory","text":"

To verify that the Workload sees only part of the GPU memory, run:

                                CLI V1CLI V2
                                runai exec frac05 nvidia-smi\n
                                runai training exec frac05 nvidia-smi\n

                                The result:

                                Notes:

• The total memory is circled in red. It should be 50% of the GPU's memory size. In the picture above, we see 8GB, which is half of the 16GB of a Tesla V100 GPU.
• The script running on the container is limited to 8GB. In this case TensorFlow, which tends to allocate almost all of the GPU memory, has allocated 7.7GB (and not close to 16GB). Allocating beyond 8GB will lead to an out-of-memory exception.
                                "},{"location":"Researcher/Walkthroughs/walkthrough-fractions/#use-exact-gpu-memory","title":"Use Exact GPU Memory","text":"

Instead of requesting a fraction of the GPU, you can request a specific amount of GPU memory. For example:

                                CLI V1CLI V2User Interface
                                runai submit  -i runai.jfrog.io/demo/quickstart --gpu-memory 5G\n
                                runai training submit -i runai.jfrog.io/demo/quickstart --gpu-memory-request 5G\n

As part of the Workload submission, create a new Compute Resource with 1 GPU device and 5GB of GPU memory per device. See the picture below:

This will provide 5GB of GPU memory.

                                "},{"location":"Researcher/Walkthroughs/walkthrough-overquota/","title":"Quickstart: Over-Quota and Bin Packing","text":""},{"location":"Researcher/Walkthroughs/walkthrough-overquota/#goals","title":"Goals","text":"

                                The goal of this Quickstart is to explain the concepts of over-quota and bin-packing (consolidation) and how they help in maximizing cluster utilization:

                                • Show the simplicity of resource provisioning, and how resources are abstracted from users.
                                • Show how the system eliminates compute bottlenecks by allowing teams/users to go over their resource quota if there are free GPUs in the cluster.
                                "},{"location":"Researcher/Walkthroughs/walkthrough-overquota/#setup-and-configuration","title":"Setup and configuration:","text":"

                                To complete this Quickstart, the Platform Administrator will need to provide you with:

                                 • A cluster with 4 GPUs on 2 machines (2 GPUs each).
                                • Researcher access to two Projects named \"team-a\" and \"team-b\"
                                • Each project should be assigned an exact quota of 2 GPUs.
                                • A URL of the Run:ai Console. E.g. https://acme.run.ai.
                                • Run:ai CLI installed on your machine. There are two available CLI variants:

                                  • The older V1 CLI. See installation here
                                  • A newer V2 CLI, supported with clusters of version 2.18 and up. See installation here
                                "},{"location":"Researcher/Walkthroughs/walkthrough-overquota/#login","title":"Login","text":"

                                Run runai login and enter your credentials.

                                "},{"location":"Researcher/Walkthroughs/walkthrough-overquota/#part-i-over-quota","title":"Part I: Over-quota","text":"

                                Open a terminal and run the following command:

                                CLI V1CLI V2
                                runai submit a2 -i runai.jfrog.io/demo/quickstart -g 2 -p team-a\nrunai submit a1 -i runai.jfrog.io/demo/quickstart -g 1 -p team-a\nrunai submit b1 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\n
                                runai training submit a2 -i runai.jfrog.io/demo/quickstart -g 2 -p team-a\nrunai training submit a1 -i runai.jfrog.io/demo/quickstart -g 1 -p team-a\nrunai training submit b1 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\n

                                System status after run:

                                Discussion

                                 • team-a has 3 GPUs allocated, which is 1 GPU over its quota.
                                 • The system allows this over-quota allocation as long as there are available resources.
                                • The system is at full capacity with all GPUs utilized.
                                "},{"location":"Researcher/Walkthroughs/walkthrough-overquota/#part-2-basic-fairness-via-preemption","title":"Part 2: Basic Fairness via Preemption","text":"

                                Run the following command:

                                CLI V1CLI V2
                                runai submit b2 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\n
                                runai training submit b2 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\n

                                System status after run:

                                Discussion

                                 • team-a can no longer remain in over-quota. Thus, one Job must be preempted and moved out to allow team-b to grow.
                                 • The Run:ai scheduler chooses to preempt Job a1.
                                 • It is important that unattended Jobs save checkpoints. This ensures that whenever Job a1 resumes, it does so from where it left off.
                                "},{"location":"Researcher/Walkthroughs/walkthrough-overquota/#part-3-bin-packing","title":"Part 3: Bin Packing","text":"

                                Run the following command:

                                CLI V1CLI V2

                                 runai delete job a2 -p team-a\n

                                runai training delete a2\n

                                a1 is now going to start running again.

                                Run:

                                CLI V1CLI V2
                                runai list jobs -A\n
                                runai training list -A\n

                                 You have two Jobs running on the first node and one Job running alone on the second node.

                                Choose one of the two Jobs from the full node and delete it:

                                CLI V1CLI V2
                                runai delete job <job-name> -p <project>\n
                                runai training delete <job-name> -p <project>\n

                                The status now is:

                                Now, run a 2 GPU Job:

                                CLI V1CLI V2
                                runai submit a2 -i runai.jfrog.io/demo/quickstart -g 2 -p team-a\n
                                runai training submit a2 -i runai.jfrog.io/demo/quickstart -g 2 -p team-a\n

                                 The status now is:

                                Discussion

                                 Note that Job a1 has been preempted and then restarted on the second node to clear space for the new a2 Job. This is bin packing, or consolidation.

                                "},{"location":"Researcher/Walkthroughs/walkthrough-queue-fairness/","title":"Quickstart: Queue Fairness","text":""},{"location":"Researcher/Walkthroughs/walkthrough-queue-fairness/#goal","title":"Goal","text":"

                                The goal of this Quickstart is to explain fairness. The over-quota Quickstart shows basic fairness where allocated GPUs per Project are adhered to such that if a Project is in over-quota, its Job will be preempted once another Project requires its resources.

                                 This Quickstart is about queue fairness. It shows that Jobs will be scheduled fairly regardless of when they were submitted. As such, if a person in Project A submits 50 Jobs and, soon after, a person in Project B submits 25 Jobs, the Jobs in the queue will be processed fairly.

                                "},{"location":"Researcher/Walkthroughs/walkthrough-queue-fairness/#setup-and-configuration","title":"Setup and configuration:","text":"

                                To complete this Quickstart, the Platform Administrator will need to provide you with:

                                 • A cluster with 4 GPUs on 2 machines (2 GPUs each).
                                • Researcher access to two Projects named \"team-a\" and \"team-b\"
                                • Each project should be assigned an exact quota of 1 GPU.
                                • A URL of the Run:ai Console. E.g. https://acme.run.ai.
                                • Run:ai CLI installed on your machine. There are two available CLI variants:

                                  • The older V1 CLI. See installation here
                                  • A newer V2 CLI, supported with clusters of version 2.18 and up. See installation here
                                "},{"location":"Researcher/Walkthroughs/walkthrough-queue-fairness/#login","title":"Login","text":"

                                Run runai login and enter your credentials.

                                "},{"location":"Researcher/Walkthroughs/walkthrough-queue-fairness/#part-i-immediate-displacement-of-over-quota","title":"Part I: Immediate Displacement of Over-Quota","text":"

                                Run the following commands:

                                CLI V1CLI V2
                                runai submit a1 -i runai.jfrog.io/demo/quickstart -g 1 -p team-a\nrunai submit a2 -i runai.jfrog.io/demo/quickstart -g 1 -p team-a\nrunai submit a3 -i runai.jfrog.io/demo/quickstart -g 1 -p team-a\nrunai submit a4 -i runai.jfrog.io/demo/quickstart -g 1 -p team-a\n
                                runai training submit a1 -i runai.jfrog.io/demo/quickstart -g 1 -p team-a\nrunai training submit a2 -i runai.jfrog.io/demo/quickstart -g 1 -p team-a\nrunai training submit a3 -i runai.jfrog.io/demo/quickstart -g 1 -p team-a\nrunai training submit a4 -i runai.jfrog.io/demo/quickstart -g 1 -p team-a\n

                                System status after run:

                                Discussion

                                team-a, even though it has a single GPU as quota, is now using all 4 GPUs.

                                Run the following commands:

                                CLI V1CLI V2
                                runai submit b1 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\nrunai submit b2 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\nrunai submit b3 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\nrunai submit b4 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\n
                                runai training submit b1 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\nrunai training submit b2 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\nrunai training submit b3 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\nrunai training submit b4 -i runai.jfrog.io/demo/quickstart -g 1 -p team-b\n

                                System status after run:

                                Discussion

                                 • Two team-b Jobs have immediately displaced team-a Jobs.
                                • team-a and team-b each have a quota of 1 GPU, thus the remaining over-quota (2 GPUs) is distributed equally between the Projects.
                                "},{"location":"Researcher/Walkthroughs/walkthrough-queue-fairness/#part-2-queue-fairness","title":"Part 2: Queue Fairness","text":"

                                 Now let's start deleting Jobs. Alternatively, you can wait for Jobs to complete.

                                CLI V1CLI V2
                                runai delete job b2 -p team-b\n
                                runai training delete b2 -p team-b\n

                                Discussion

                                 As the quotas are equal (1 GPU for each Project), the remaining pending Jobs will be scheduled one by one, alternating between Projects, regardless of when they were submitted.

                                "},{"location":"Researcher/best-practices/bare-metal-to-docker-images/","title":"Best Practice: From Bare Metal to Docker Images","text":""},{"location":"Researcher/best-practices/bare-metal-to-docker-images/#introduction","title":"Introduction","text":"

                                 Some Researchers do data science on bare metal. The term bare-metal refers to connecting to a server and working directly on its operating system and disks.

                                This is the fastest way to start working, but it introduces problems when the data science organization scales:

                                • More Researchers mean that the machine resources need to be efficiently shared
                                • Researchers need to collaborate and share data, code, and results

                                 To overcome this, people working on bare metal typically write scripts to gather data, code, and code dependencies. This soon becomes an overwhelming task.

                                "},{"location":"Researcher/best-practices/bare-metal-to-docker-images/#why-use-docker-images","title":"Why Use Docker Images?","text":"

                                 Docker images and containerization in general provide a level of abstraction which, by and large, frees developers and Researchers from the mundane tasks of setting up an environment. The image is an operating system by itself and thus the 'environment' is, by and large, part of the image.

                                When a docker image is instantiated, it creates a container. A container is the running manifestation of a docker image.

                                "},{"location":"Researcher/best-practices/bare-metal-to-docker-images/#moving-a-data-science-environment-to-docker","title":"Moving a Data Science Environment to Docker","text":"

                                A data science environment typically includes:

                              • Training data
                              • Machine Learning (ML) code and inputs
                              • Libraries: Code dependencies that must be installed before the ML code can be run
                                 "},{"location":"Researcher/best-practices/bare-metal-to-docker-images/#training-data","title":"Training data","text":"

                                 Training data is usually very large (from several gigabytes to petabytes) and is read-only in nature. Thus, training data is typically left outside of the docker image. Instead, the data is mounted onto the image when it is instantiated. Mounting a volume allows the code within the container to access the data as though it were in a directory on the local file system.

                                The best practice is to store the training data on a shared file system. This allows the data to be accessed uniformly on whichever machine the Researcher is currently using, allowing the Researcher to easily migrate between machines.

                                Organizations without a shared file system typically write scripts to copy data from machine to machine.
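                                 For illustration, here is a minimal sketch of this pattern. The NFS path, image name, and container directory below are placeholders rather than values taken from this document:

                                 docker run -it --rm -v /mnt/nfs_share/datasets:/data:ro my-training-image bash\n

                                 Inside the container, the code can then read the dataset under /data as if it were a local directory.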

                                "},{"location":"Researcher/best-practices/bare-metal-to-docker-images/#machine-learning-code-and-inputs","title":"Machine Learning Code and Inputs","text":"

                                As a rule, code needs to be saved and versioned in a code repository.

                                There are two alternative practices:

                                 • The code resides in the image and is periodically pulled from the repository. This practice requires building a new container image each time a change is introduced to the code.
                                 • When a shared file system exists, the code can reside outside the image on a shared disk and be mounted onto the container via a volume.

                                Both practices are valid.

                                Inputs to machine learning models and artifacts of training sessions, like model checkpoints, are also better stored in and loaded from a shared file system.

                                "},{"location":"Researcher/best-practices/bare-metal-to-docker-images/#code-dependencies","title":"Code Dependencies","text":"

                                 Any code has dependencies. These libraries must be installed for the code to run. As the code changes, so do the dependencies.

                                 ML code is typically Python, and Python dependencies are typically declared in a single requirements.txt file that is saved together with the code.

                                The best practice is to have your docker startup script (see below) run this file using pip install -r requirements.txt. This allows the flexibility of adding and removing code dependencies dynamically.

                                "},{"location":"Researcher/best-practices/bare-metal-to-docker-images/#ml-lifecycle-build-and-train","title":"ML Lifecycle: Build and Train","text":"

                                Deep learning workloads can be divided into two generic types:

                              • Interactive \"build\" sessions. With these types of workloads, the data scientist opens an interactive session, via bash, Jupyter Notebook, remote PyCharm, or similar and accesses GPU resources directly. Build workloads are typically meant for debugging and development sessions.
                              • Unattended \"training\" sessions. Training is characterized by a machine learning run that has a start and a finish. With these types of workloads, the data scientist prepares a self-running workload and sends it for execution. During the execution, the data scientist can examine the results. A Training session can take from a few minutes to a couple of days. It can be interrupted in the middle and later restored (though the data scientist should save checkpoints for that purpose). Training workloads typically utilize large percentages of the GPU and at the end of the run automatically frees the resources.
                              • Getting your docker ready is also a matter of which type of workload you are currently running.

                                "},{"location":"Researcher/best-practices/bare-metal-to-docker-images/#build-workloads","title":"Build Workloads","text":"

                                With \"build\" you are actually coding and debugging small experiments. You are interactive. In that mode, you can typically take a well known standard image (e.g. https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) and use it directly.

                                Start a docker container by running:

                                 docker run -it .... -v /where/my/code/resides:/where/my/code/resides \"the well known image\" bash 

                                 You get a shell prompt into a container with a volume mounted where your code resides. You can then install your prerequisites and run your code.
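                                 As a concrete, but purely illustrative, variant of the command above (assuming the NVIDIA Container Toolkit is installed): the NGC image tag and the host path are assumptions, so substitute a current tag from the NGC catalog and your own code directory:

                                 docker run -it --gpus all -v /home/john/code:/code -w /code nvcr.io/nvidia/tensorflow:23.03-tf2-py3 bash\n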

                                You can also access the container remotely from tools such as PyCharm, Jupyter Notebook, and more. In this case, the docker image needs to be customized to install the \"server software\" (e.g. a Jupyter Notebook service).

                                "},{"location":"Researcher/best-practices/bare-metal-to-docker-images/#training-workloads","title":"Training Workloads","text":"

                                 For training workloads, you can use a well-known image (e.g. the TensorFlow image from the link above), but more often than not, you want to create your own docker image. The best practice is to use the well-known image (e.g. TensorFlow from above) as a base image and add your own customizations on top of it. To achieve that, you create a Dockerfile. A Dockerfile is a declarative way to build a docker image and is built in layers. For example:

                                1. Base image is nvidia-tensorflow
                                2. Install popular software
                                3. (Optional) Run a script

                                The script can be part of the image or can be provided as part of the command line to run the docker. It will typically include additional dependencies to install as well as a reference to the ML code to be run.
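                                 For illustration, a minimal sketch of such a layered build. The base image tag, the extra packages, and the image name are placeholders rather than recommendations, and start.sh stands for your own startup script:

                                 cat > Dockerfile <<'EOF'\nFROM nvcr.io/nvidia/tensorflow:23.03-tf2-py3\nRUN pip install --no-cache-dir pandas scikit-learn\nCOPY start.sh /workspace/start.sh\nCMD [\"bash\", \"/workspace/start.sh\"]\nEOF\ndocker build -t my-training-image .\n

                                 The FROM line is layer 1 (the base image), the RUN line is layer 2 (popular software), and the COPY/CMD lines add the optional script, mirroring the list above.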

                                The best practice for running training workloads is to test the container image in a \"build\" session and then send it for execution as a training Job. For further information on how to set up and parameterize a training workload via docker or Run:ai see Converting your Workload to use Unattended Training Execution.

                                "},{"location":"Researcher/best-practices/convert-to-unattended/","title":"Best Practice: Convert your Workload to Run Unattended","text":""},{"location":"Researcher/best-practices/convert-to-unattended/#motivation","title":"Motivation","text":"

                                 Run:ai allows non-interactive training workloads to extend beyond guaranteed quotas and into over-quota as long as computing resources are available. To achieve this kind of flexibility, the system needs to be able to safely stop a workload and restart it again later. This requires Researchers to switch workloads from running interactively to running unattended, thus allowing Run:ai to pause/resume the run.

                                Unattended workloads are a good fit for long-duration runs, or sets of smaller hyperparameter optimization runs.

                                "},{"location":"Researcher/best-practices/convert-to-unattended/#best-practices","title":"Best Practices","text":""},{"location":"Researcher/best-practices/convert-to-unattended/#docker-image","title":"Docker Image","text":"

                                 A docker container is based on a docker image. Some Researchers use generic images such as those provided by Nvidia, for example: NVIDIA NGC TensorFlow. Others use generic images as the base for a more customized image, built using Dockerfiles.

                                 Realizing that Researchers are not always proficient in building Dockerfiles, as a best practice you will want to:

                                • Use the same docker image both for interactive and unattended jobs. In this way, you can keep the difference between both methods of invocation to a minimum. This can be a stock image from Nvidia or a custom image.
                                • Leave some degree of flexibility, which allows the Researcher to add/remove python dependencies without re-creating images.
                                "},{"location":"Researcher/best-practices/convert-to-unattended/#code-location","title":"Code Location","text":"

                                You will want to minimize the cycle of code change-and-run. There are a couple of best practices which you can choose from:

                                1. Code resides on the network file storage. This way you can change the code and immediately run the Job. The Job picks up the new files from the network.
                                2. Use the runai submit flag --git-sync. The flag allows the Researcher to provide details of a Git repository. The repository will be automatically cloned into a specified directory when the container starts.
                                3. The code can be embedded within the image. In this case, you will want to create an automatic CI/CD process, which packages the code into a modified image.

                                The document below assumes option #1.

                                "},{"location":"Researcher/best-practices/convert-to-unattended/#create-a-startup-script","title":"Create a Startup Script","text":"

                                 Gather the commands you ran inside the interactive Job into a single script. The script will be provided on the command line at the start of the unattended execution (see the section Running the Job below). This script should be kept next to your code, on a shared network drive (e.g. /nfs/john).

                                An example of a common startup script start.sh:

                                pip install -r requirements.txt\n...\npython training.py\n

                                 The first line of this script makes sure that all required Python libraries are installed before the training script executes. It also allows the Researcher to add or remove libraries without needing changes to the image itself.

                                "},{"location":"Researcher/best-practices/convert-to-unattended/#support-variance-between-different-runs","title":"Support Variance Between Different Runs","text":"

                                Your training script must be flexible enough to support variance in execution without changing the code. For example, you will want to change the number of epochs to run, apply a different set of hyperparameters, etc. There are two ways to handle this in your script. You can use one or both methods:

                                1. Your script can read arguments passed to the script:

                                  python training.py --number-of-epochs=30

                                In which case, change your start.sh script to:

                                pip install -r requirements.txt\n...\npython training.py $@
                                 2. Your script can read from environment variables during script execution. If you use environment variables, they are passed to the training script automatically and no special action is required (a combined sketch of both methods follows this list).
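                                 A minimal sketch of start.sh that supports both methods at once. The EPOCHS variable and the --number-of-epochs argument are the examples used elsewhere on this page; the default value of 30 is illustrative:

                                 pip install -r requirements.txt\npython training.py --number-of-epochs=\"${EPOCHS:-30}\" \"$@\"\n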
                                "},{"location":"Researcher/best-practices/convert-to-unattended/#checkpoints","title":"Checkpoints","text":"

                                Run:ai can pause unattended executions, giving your GPU resources to another workload. When the time comes, Run:ai will give you back the resources and restore your workload. Thus, it is a good practice to save your weights at various checkpoints and start a workload from the latest checkpoint (typically between epochs).

                                TensorFlow, PyTorch, and others have mechanisms to help save checkpoints (e.g. https://www.tensorflow.org/guide/checkpoint for TensorFlow and https://pytorch.org/tutorials/recipes/recipes/saving_and_loading_a_general_checkpoint.html for PyTorch).

                                 It is important to save the checkpoints to network storage and not to the machine itself. When your workload resumes, it can, in all probability, be allocated to a different node (machine) than the original node.

                                For more information on best practices for saving checkpoints, see Saving Deep Learning Checkpoints.

                                "},{"location":"Researcher/best-practices/convert-to-unattended/#running-the-job","title":"Running the Job","text":"

                                 Using runai submit, drop the --interactive flag. To submit a Job using the script created above, use the -- [COMMAND] syntax to specify the command and pass its arguments, and pass environment variables using the -e (--environment) flag.

                                Example with Environment variables:

                                runai submit train1 -i tensorflow/tensorflow:1.14.0-gpu-py3  \n    -v /nfs/john:/mydir -g 1  --working-dir /mydir/  \n    -e 'EPOCHS=30'  -e 'LEARNING_RATE=0.02'  \n    -- ./startup.sh  \n

                                Example with Command-line arguments:

                                runai submit train1 -i tensorflow/tensorflow:1.14.0-gpu-py3  \n    -v /nfs/john:/mydir -g 1  --working-dir /mydir/  \n    -- ./startup.sh batch-size=64 number-of-epochs=3\n

                                Please refer to Command-Line Interface, runai submit for a list of all arguments accepted by the Run:ai CLI.

                                "},{"location":"Researcher/best-practices/convert-to-unattended/#use-cli-policies","title":"Use CLI Policies","text":"

                                 Different run configurations may vary significantly and can be tedious to write each time on the command line. To make life easier, our CLI offers a way to set administrator policies for these configurations and to use a pre-configured configuration when submitting a Workload. Please refer to Configure Policies.

                                "},{"location":"Researcher/best-practices/convert-to-unattended/#attached-files","title":"Attached Files","text":"

                                 The 3 relevant files mentioned in this document can be downloaded from GitHub.

                                "},{"location":"Researcher/best-practices/convert-to-unattended/#see-also","title":"See Also","text":"

                                See the unattended training Quickstart: Launch Unattended Training Workloads

                                "},{"location":"Researcher/best-practices/env-variables/","title":"Environment Variables inside a Run:ai Workload","text":""},{"location":"Researcher/best-practices/env-variables/#identifying-a-job","title":"Identifying a Job","text":"

                                There may be use cases where your container may need to uniquely identify the Job it is currently running in. A typical use case is for saving Job artifacts under a unique name. Run:ai provides pre-defined environment variables you can use. These variables are guaranteed to be unique even if the Job is preempted or evicted and then runs again.

                                Run:ai provides the following environment variables:

                                • JOB_NAME - the name of the Job.
                                • JOB_UUID - a unique identifier for the Job.

                                Note that the Job can be deleted and then recreated with the same name. A Job UUID will be different even if the Job names are the same.

                                "},{"location":"Researcher/best-practices/env-variables/#gpu-allocation","title":"GPU Allocation","text":"

                                Run:ai provides an environment variable, visible inside the container, to help identify the number of GPUs allocated for the container. Use RUNAI_NUM_OF_GPUS

                                "},{"location":"Researcher/best-practices/env-variables/#node-name","title":"Node Name","text":"

                                There may be use cases where your container may need to identify the node it is currently running on. Run:ai provides an environment variable, visible inside the container, to help identify the name of the node on which the pod was scheduled. Use NODE_NAME

                                "},{"location":"Researcher/best-practices/env-variables/#usage-example-in-python","title":"Usage Example in Python","text":"
                                import os\n\njobName = os.environ['JOB_NAME']\njobUUID = os.environ['JOB_UUID']\n
                                "},{"location":"Researcher/best-practices/researcher-notifications/","title":"Researcher Email Notifications","text":""},{"location":"Researcher/best-practices/researcher-notifications/#importance-of-email-notifications-for-data-scientists","title":"Importance of Email Notifications for Data Scientists","text":"

                                Managing numerous data science workloads requires monitoring various stages, including submission, scheduling, initialization, execution, and completion. Additionally, handling suspensions and failures is crucial for ensuring timely workload completion. Email Notifications address this need by sending alerts for critical workload life cycle changes. This empowers data scientists to take necessary actions and prevent delays.

                                 Once the system administrator configures the email notifications, users will receive notifications when their jobs transition from one status to another. In addition, the user will get warning notifications before workload termination due to project-defined timeouts. Details included in the email are:

                                • Workload type
                                • Project and cluster information
                                • Event timestamp

                                To configure the types of email notifications you can receive:

                                1. The user must log in to their account.
                                2. Press the user icon, then select settings.
                                 3. In the Email notifications section, under Send me an email about my workloads when, select the relevant workload statuses.
                                4. When complete, press Save.
                                "},{"location":"Researcher/best-practices/save-dl-checkpoints/","title":"Best Practice: Save Deep-Learning Checkpoints","text":""},{"location":"Researcher/best-practices/save-dl-checkpoints/#introduction","title":"Introduction","text":"

                                Run:ai can pause unattended executions, giving your GPU resources to another workload. When the time comes, Run:ai will give you back the resources and restore your workload. Thus, it is a good practice to save the state of your run at various checkpoints and start a workload from the latest checkpoint (typically between epochs).

                                "},{"location":"Researcher/best-practices/save-dl-checkpoints/#how-to-save-checkpoints","title":"How to Save Checkpoints","text":"

                                TensorFlow, PyTorch, and others have mechanisms to help save checkpoints (e.g. https://www.tensorflow.org/guide/checkpoint for TensorFlow and https://pytorch.org/tutorials/recipes/recipes/saving_and_loading_a_general_checkpoint.html for PyTorch).

                                This document uses Keras as an example. The code itself can be found here

                                "},{"location":"Researcher/best-practices/save-dl-checkpoints/#where-to-save-checkpoints","title":"Where to Save Checkpoints","text":"

                                It is important to save the checkpoints to network storage and not the machine itself. When your workload resumes, it can, in all probability, be allocated to a different node (machine) than the original node. Example:

                                runai submit train-with-checkpoints -i tensorflow/tensorflow:1.14.0-gpu-py3 \\\n  -v /mnt/nfs_share/john:/mydir -g 1  --working-dir /mydir --command -- ./startup.sh\n

                                The command saves the checkpoints in an NFS checkpoints folder /mnt/nfs_share/john

                                "},{"location":"Researcher/best-practices/save-dl-checkpoints/#when-to-save-checkpoints","title":"When to Save Checkpoints","text":""},{"location":"Researcher/best-practices/save-dl-checkpoints/#save-periodically","title":"Save Periodically","text":"

                                It is a best practice to save checkpoints at intervals. For example, every epoch as the Keras code below shows:

                                checkpoints_file = \"weights.best.hdf5\"\ncheckpoint = ModelCheckpoint(checkpoints_file, monitor='val_acc', verbose=1, \n    save_best_only=True, mode='max')\n
                                "},{"location":"Researcher/best-practices/save-dl-checkpoints/#save-on-exit-signal","title":"Save on Exit Signal","text":"

                                If periodic checkpoints are not enough, you can use a signal-hook provided by Run:ai (via Kubernetes). The hook is python code that is called before your Job is suspended and allows you to save your checkpoints as well as other state data you may wish to store.

                                import signal\nimport time\n\ndef graceful_exit_handler(signum, frame):\n    # save your checkpoints to shared storage\n\n    # exit with status \"1\" is important for the Job to return later.  \n    exit(1)\n\nsignal.signal(signal.SIGTERM, graceful_exit_handler)\n

                                By default, you will have 30 seconds to save your checkpoints.

                                Important

                                For the signal to be captured, it must be propagated from the startup script to the python child process. See code here

                                "},{"location":"Researcher/best-practices/save-dl-checkpoints/#resuming-using-saved-checkpoints","title":"Resuming using Saved Checkpoints","text":"

                                 A Run:ai unattended workload that is resumed will run the same startup script as on the first run. It is the responsibility of the script developer to add code that:

                                • Checks if saved checkpoints exist (see above)
                                • If saved checkpoints exist, load them and start the run using these checkpoints
                                 import os\n\ncheckpoints_file = \"weights.best.hdf5\"\nif os.path.isfile(checkpoints_file):\n    print(\"loading checkpoint file: \" + checkpoints_file)\n    model.load_weights(checkpoints_file)\n
                                 "},{"location":"Researcher/best-practices/secrets-as-env-var-in-cli/","title":"Propagating secrets as environment variables to workloads via the CLI","text":"

                                 The following is a specific knowledge article for Run:ai command-line interface users who wish to propagate a Kubernetes secret as an environment variable.

                                "},{"location":"Researcher/best-practices/secrets-as-env-var-in-cli/#kubernetes-secrets","title":"Kubernetes Secrets","text":"

                                 Sometimes you want to use sensitive information within your code, for example passwords, OAuth tokens, or SSH keys. The best practice for saving such information in Kubernetes is via Kubernetes Secrets. Kubernetes Secrets let you store and manage sensitive information. Access to secrets is limited via configuration.

                                 A Kubernetes secret may hold multiple key-value pairs.

                                "},{"location":"Researcher/best-practices/secrets-as-env-var-in-cli/#using-secrets-in-runai-workloads","title":"Using Secrets in Run:ai Workloads","text":"

                                Our goal is to provide Run:ai Workloads with secrets as input in a secure way. Using the Run:ai command line, you will be able to pass a reference to a secret that already exists in Kubernetes.

                                "},{"location":"Researcher/best-practices/secrets-as-env-var-in-cli/#creating-a-secret","title":"Creating a secret","text":"

                                For details on how to create a Kubernetes secret see: https://kubernetes.io/docs/concepts/configuration/secret/. Here is an example:

                                apiVersion: v1\nkind: Secret\nmetadata:\n  name: my-secret\n  namespace: runai-<project-name>\ndata:\n  username: am9obgo=\n  password: bXktcGFzc3dvcmQK\n

                                Then run:

                                kubectl apply -f <file-name>\n

                                Notes

                                • Secrets are base64 encoded
                                • Secrets are stored in the scope of a namespace and will not be accessible from other namespaces. Hence the reference to the Run:ai Project name above. Run:ai provides the ability to propagate secrets throughout all Run:ai Projects. See below.
                                "},{"location":"Researcher/best-practices/secrets-as-env-var-in-cli/#attaching-a-secret-to-a-workload-on-submit-via-cli","title":"Attaching a secret to a Workload on Submit via CLI","text":"

                                When you submit a new Workload, you will want to connect the secret to the new Workload. To do that, run:

                                runai submit -e <ENV-VARIABLE>=SECRET:<secret-name>,<secret-key> ....\n

                                For example:

                                runai submit -i ubuntu -e MYUSERNAME=SECRET:my-secret,username\n
                                "},{"location":"Researcher/cli-reference/Introduction/","title":"Introduction","text":"

                                The Run:ai Command-line Interface (CLI) is one of the ways for a Researcher to send deep learning workloads, acquire GPU-based containers, list jobs, etc.

                                To install and configure the Run:ai CLI see Researcher Setup - Start Here

                                "},{"location":"Researcher/cli-reference/runai-attach/","title":"runai attach","text":""},{"location":"Researcher/cli-reference/runai-attach/#description","title":"Description","text":"

                                Attach to a running Job.

                                 The command attaches to the standard input, output, and error streams of a running Job. If the Job has multiple pods, the command attaches to the first pod unless otherwise set with the --pod flag.

                                "},{"location":"Researcher/cli-reference/runai-attach/#synopsis","title":"Synopsis","text":"
                                 runai attach <job-name>\n    [--no-stdin]\n    [--no-tty]\n    [--pod string]\n\n    [--loglevel value] \n    [--help | -h]\n
                                "},{"location":"Researcher/cli-reference/runai-attach/#options","title":"Options","text":"

                                <job-name> - The name of the Job to run the command with. Mandatory.

                                "},{"location":"Researcher/cli-reference/runai-attach/#-no-stdin","title":"--no-stdin","text":"

                                Do not attach STDIN.

                                "},{"location":"Researcher/cli-reference/runai-attach/#-no-tty","title":"--no-tty","text":"

                                Do not allocate a pseudo-TTY

                                "},{"location":"Researcher/cli-reference/runai-attach/#-pod-string","title":"--pod string","text":"

                                Attach to a specific pod within the Job. To find the list of pods run runai describe job <job-name> and then use the pod name with the --pod flag.
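                                 For example (train1 is an illustrative Job name):

                                 runai attach train1\nrunai attach train1 --no-stdin --no-tty\n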

                                "},{"location":"Researcher/cli-reference/runai-attach/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-attach/#-loglevel-string","title":"--loglevel (string)","text":"

                                Set the logging level. One of: debug | info | warn | error (default \"info\").

                                "},{"location":"Researcher/cli-reference/runai-attach/#-help-h","title":"--help | -h","text":"

                                Show help text.

                                "},{"location":"Researcher/cli-reference/runai-attach/#output","title":"Output","text":"

                                None

                                "},{"location":"Researcher/cli-reference/runai-bash/","title":"runai bash","text":""},{"location":"Researcher/cli-reference/runai-bash/#description","title":"Description","text":"

                                Get a bash session inside a running Job

                                This command is a shortcut to runai exec (runai exec -it job-name bash). See runai exec for full documentation of the exec command.

                                "},{"location":"Researcher/cli-reference/runai-bash/#synopsis","title":"Synopsis","text":"
                                runai bash <job-name> \n    [--pod string]\n\n    [--loglevel value] \n    [--project string | -p string] \n    [--help | -h]\n
                                "},{"location":"Researcher/cli-reference/runai-bash/#options","title":"Options","text":"

                                <job-name> - The name of the Job to run the command with. Mandatory.

                                "},{"location":"Researcher/cli-reference/runai-bash/#-pod-string","title":"--pod string","text":"

                                 Specify a pod of a running Job. To get a list of the pods of a specific Job, run the runai describe job <job-name> command.
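                                 For example (the Job and Project names are illustrative):

                                 runai bash train1 -p team-a\n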

                                "},{"location":"Researcher/cli-reference/runai-bash/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-bash/#-loglevel-string","title":"--loglevel (string)","text":"

                                Set the logging level. One of: debug | info | warn | error (default \"info\")

                                "},{"location":"Researcher/cli-reference/runai-bash/#-project-p-string","title":"--project | -p (string)","text":"

                                Specify the Project to which the command applies. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

                                "},{"location":"Researcher/cli-reference/runai-bash/#-help-h","title":"--help | -h","text":"

                                Show help text

                                "},{"location":"Researcher/cli-reference/runai-bash/#output","title":"Output","text":"

                                 The command will access the container, which should currently be running in the cluster, and attempt to create a command-line shell based on bash.

                                 The command will return an error if the container does not exist or is not yet in a running state.

                                "},{"location":"Researcher/cli-reference/runai-bash/#see-also","title":"See also","text":"

                                Build Workloads. See Quickstart document: Launch Interactive Build Workloads.

                                "},{"location":"Researcher/cli-reference/runai-config/","title":"runai config","text":""},{"location":"Researcher/cli-reference/runai-config/#description","title":"Description","text":"

                                Set a default Project or Cluster

                                "},{"location":"Researcher/cli-reference/runai-config/#synopsis","title":"Synopsis","text":"
                                runai  config project <project-name>\n    [--loglevel value] \n    [--help | -h]\n\nrunai  config cluster <cluster-name>\n    [--loglevel value] \n    [--help | -h]\n
                                "},{"location":"Researcher/cli-reference/runai-config/#options","title":"Options","text":"

                                <project-name> - The name of the Project you want to set as default. Mandatory.

                                <cluster-name> - The name of the cluster you want to set as the current cluster. Mandatory.
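                                 For example (the cluster name is illustrative):

                                 runai config project team-a\nrunai config cluster my-cluster\n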

                                "},{"location":"Researcher/cli-reference/runai-config/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-config/#-loglevel-string","title":"--loglevel (string)","text":"

                                Set the logging level. One of: debug | info | warn | error (default \"info\").

                                "},{"location":"Researcher/cli-reference/runai-config/#-help-h","title":"--help | -h","text":"

                                Show help text.

                                "},{"location":"Researcher/cli-reference/runai-config/#output","title":"Output","text":"

                                None

                                "},{"location":"Researcher/cli-reference/runai-delete/","title":"runai delete","text":""},{"location":"Researcher/cli-reference/runai-delete/#description","title":"Description","text":"

                                Delete a Workload and its associated Pods.

                                 Note that once you delete a Workload, all of its data will be gone:

                                • You will no longer be able to enter it via bash.
                                • You will no longer be able to access logs.
                                • Any data saved on the container and not stored in a shared location will be lost.
                                "},{"location":"Researcher/cli-reference/runai-delete/#synopsis","title":"Synopsis","text":"
                                runai delete job <job-name> \n    [--all | -A]\n\n    [--loglevel value] \n    [--project string | -p string] \n    [--help | -h]\n
                                "},{"location":"Researcher/cli-reference/runai-delete/#options","title":"Options","text":"

                                <job-name> - The name of the Workload to run the command with. Mandatory.

                                "},{"location":"Researcher/cli-reference/runai-delete/#-all-a","title":"--all | -A","text":"

                                Delete all Workloads.
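                                 For example (the Job and Project names are illustrative):

                                 runai delete job train1 -p team-a\n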

                                "},{"location":"Researcher/cli-reference/runai-delete/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-delete/#-loglevel-string","title":"--loglevel (string)","text":"

                                Set the logging level. One of: debug | info | warn | error (default \"info\").

                                "},{"location":"Researcher/cli-reference/runai-delete/#-project-p-string","title":"--project | -p (string)","text":"

                                Specify the Project to which the command applies. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

                                "},{"location":"Researcher/cli-reference/runai-delete/#-help-h","title":"--help | -h","text":"

                                Show help text.

                                "},{"location":"Researcher/cli-reference/runai-delete/#output","title":"Output","text":"
                                • The Workload will be deleted and not available via the command runai list jobs.

                                • The Workloads will show as deleted from the Run:ai user interface Job list.

                                "},{"location":"Researcher/cli-reference/runai-delete/#see-also","title":"See Also","text":"
                                • Build Workloads. See Quickstart document: Launch Interactive Build Workloads.

                                • Training Workloads. See Quickstart document: Launch Unattended Training Workloads.

                                "},{"location":"Researcher/cli-reference/runai-describe/","title":"runai describe","text":""},{"location":"Researcher/cli-reference/runai-describe/#description","title":"Description","text":"

                                Display details of a Workload or Node.

                                "},{"location":"Researcher/cli-reference/runai-describe/#synopsis","title":"Synopsis","text":"
                                 runai describe job <job-name> \n    [--output value | -o value]  \n\n    [--loglevel value] \n    [--project string | -p string] \n    [--help | -h]\n\n\nrunai describe node [node-name] \n\n    [--loglevel value] \n    [--help | -h]\n
                                "},{"location":"Researcher/cli-reference/runai-describe/#options","title":"Options","text":"
                                • <job-name> - The name of the Workload to run the command with. Mandatory.
                                • <node-name> - The name of the Node to run the command with. If a Node name is not specified, a description of all Nodes is shown.

                                -o | --output

                                Output format. One of: json|yaml|wide. Default is 'wide'
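                                 For example (the Job and Project names are illustrative):

                                 runai describe job train1 -p team-a -o json\nrunai describe node\n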

                                "},{"location":"Researcher/cli-reference/runai-describe/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-describe/#-loglevel-string","title":"--loglevel (string)","text":"

                                Set the logging level. One of: debug | info | warn | error (default \"info\").

                                "},{"location":"Researcher/cli-reference/runai-describe/#-project-p-string","title":"--project | -p (string)","text":"

                                Specify the Project to which the command applies. By default, commands apply to the default Project. To change the default Project, use: runai config project <project-name>.

                                "},{"location":"Researcher/cli-reference/runai-describe/#-help-h","title":"--help | -h","text":"

                                Show help text

                                "},{"location":"Researcher/cli-reference/runai-describe/#output","title":"Output","text":"
                                • The runai describe job command will show Workload properties and status as well as lifecycle events and the list of related resources and pods.
                                • The runai describe node command will show Node properties.
                                "},{"location":"Researcher/cli-reference/runai-exec/","title":"runai exec","text":""},{"location":"Researcher/cli-reference/runai-exec/#description","title":"Description","text":"

                                Execute a command inside a running Job

                                 Note: to get a bash session inside the Job, you can also use the shorthand runai bash

                                "},{"location":"Researcher/cli-reference/runai-exec/#synopsis","title":"Synopsis","text":"
                                runai exec <job-name> <command> \n    [--stdin | -i] \n    [--tty | -t]\n\n    [--loglevel value] \n    [--project string | -p string] \n    [--help | -h]\n
                                "},{"location":"Researcher/cli-reference/runai-exec/#options","title":"Options","text":"

                                <job-name> - The name of the Job to run the command with. Mandatory.

                                 <command> - The command to execute inside the Job (e.g. bash).

                                "},{"location":"Researcher/cli-reference/runai-exec/#-stdin-i","title":"--stdin | -i","text":"

                                Keep STDIN open even if not attached.

                                "},{"location":"Researcher/cli-reference/runai-exec/#-tty-t","title":"--tty | -t","text":"

                                Allocate a pseudo-TTY.
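                                 For example, mirroring the Quickstarts on this site (frac05 is an example Job name):

                                 runai exec frac05 nvidia-smi\nrunai exec -it frac05 bash\n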

                                "},{"location":"Researcher/cli-reference/runai-exec/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-exec/#-loglevel-string","title":"--loglevel (string)","text":"

                                Set the logging level. One of: debug | info | warn | error (default \"info\").

                                "},{"location":"Researcher/cli-reference/runai-exec/#-project-p-string","title":"--project | -p (string)","text":"

                                Specify the Project to which the command applies. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

                                "},{"location":"Researcher/cli-reference/runai-exec/#-help-h","title":"--help | -h","text":"

                                Show help text.

                                "},{"location":"Researcher/cli-reference/runai-exec/#output","title":"Output","text":"

                                The command will run in the context of the container.

                                "},{"location":"Researcher/cli-reference/runai-exec/#see-also","title":"See Also","text":""},{"location":"Researcher/cli-reference/runai-list/","title":"runai list","text":""},{"location":"Researcher/cli-reference/runai-list/#description","title":"Description","text":"

                                Show lists of Workloads, Projects, Clusters or Nodes.

                                "},{"location":"Researcher/cli-reference/runai-list/#synopsis","title":"Synopsis","text":"
                                runai list jobs \n    [--all-projects | -A]  \n\n    [--loglevel value] \n    [--project string | -p string] \n    [--help | -h]\n\nrunai list projects \n    [--loglevel value] \n    [--help | -h]\n\nrunai list clusters  \n    [--loglevel value] \n    [--help | -h]\n\nrunai list nodes [node-name]\n    [--loglevel value] \n    [--help | -h]\n
                                "},{"location":"Researcher/cli-reference/runai-list/#options","title":"Options","text":"

                                node-name - Name of a specific node to list (optional).

                                "},{"location":"Researcher/cli-reference/runai-list/#-all-projects-a","title":"--all-projects | -A","text":"

                                Show Workloads from all Projects.
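                                 For example, to list Workloads across all Projects, then Projects and Nodes:

                                 runai list jobs -A\nrunai list projects\nrunai list nodes\n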

                                "},{"location":"Researcher/cli-reference/runai-list/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-list/#-loglevel-string","title":"--loglevel (string)","text":"

                                Set the logging level. One of: debug | info | warn | error (default \"info\").

                                "},{"location":"Researcher/cli-reference/runai-list/#-project-p-string","title":"--project | -p (string)","text":"

                                Specify the Project to which the command applies. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

                                "},{"location":"Researcher/cli-reference/runai-list/#-help-h","title":"--help | -h","text":"

                                Show help text.

                                "},{"location":"Researcher/cli-reference/runai-list/#output","title":"Output","text":"
                                • A list of Workloads, Nodes, Projects, or Clusters.
                                • To filter 'runai list nodes' for a specific Node, add the Node name.
                                "},{"location":"Researcher/cli-reference/runai-list/#see-also","title":"See Also","text":"

                                To show details for a specific Workload or Node see runai describe.

                                "},{"location":"Researcher/cli-reference/runai-login/","title":"runai login","text":""},{"location":"Researcher/cli-reference/runai-login/#description","title":"Description","text":"

                                Login to Run:ai

                                 When Researcher Authentication is enabled, you will need to log in to Run:ai using your username and password before accessing resources.

                                "},{"location":"Researcher/cli-reference/runai-login/#synopsis","title":"Synopsis","text":"
                                runai login \n    [--loglevel value]\n    [--help | -h]\n
                                "},{"location":"Researcher/cli-reference/runai-login/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-login/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-login/#-loglevel-string","title":"--loglevel (string)","text":"

                                Set the logging level. One of: debug | info | warn | error (default \"info\").

                                "},{"location":"Researcher/cli-reference/runai-login/#-help-h","title":"--help | -h","text":"

                                Show help text.

                                "},{"location":"Researcher/cli-reference/runai-login/#output","title":"Output","text":"

You will be prompted for a username and password.

                                "},{"location":"Researcher/cli-reference/runai-login/#see-also","title":"See Also","text":"
                                • runai logout.
                                "},{"location":"Researcher/cli-reference/runai-logout/","title":"runai logout","text":""},{"location":"Researcher/cli-reference/runai-logout/#description","title":"Description","text":"

                                Log out from Run:ai

                                "},{"location":"Researcher/cli-reference/runai-logout/#synopsis","title":"Synopsis","text":"
                                runai logout \n    [--loglevel value]\n    [--help | -h]\n
                                "},{"location":"Researcher/cli-reference/runai-logout/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-logout/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-logout/#-loglevel-string","title":"--loglevel (string)","text":"

                                Set the logging level. One of: debug | info | warn | error (default \"info\").

                                "},{"location":"Researcher/cli-reference/runai-logout/#-help-h","title":"--help | -h","text":"

                                Show help text.

                                "},{"location":"Researcher/cli-reference/runai-logout/#output","title":"Output","text":"

                                You will be logged out from Run:ai

                                "},{"location":"Researcher/cli-reference/runai-logout/#see-also","title":"See Also","text":"
                                • runai login.
                                "},{"location":"Researcher/cli-reference/runai-logs/","title":"runai logs","text":""},{"location":"Researcher/cli-reference/runai-logs/#description","title":"Description","text":"

                                Show the logs of a Job.

                                "},{"location":"Researcher/cli-reference/runai-logs/#synopsis","title":"Synopsis","text":"
                                runai logs <job-name> \n    [--follow | -f] \n    [--pod string | -p string] \n    [--since duration] \n    [--since-time date-time] \n    [--tail int | -t int] \n    [--timestamps]  \n\n    [--loglevel value] \n    [--project string | -p string] \n    [--help | -h]\n
                                "},{"location":"Researcher/cli-reference/runai-logs/#options","title":"Options","text":"

                                <job-name> - The name of the Job to run the command with. Mandatory.

                                "},{"location":"Researcher/cli-reference/runai-logs/#-follow-f","title":"--follow | -f","text":"

                                Stream the logs.

                                "},{"location":"Researcher/cli-reference/runai-logs/#-pod-p","title":"--pod | -p","text":"

                                Specify a specific pod name. When a Job fails, it may start a couple of times in an attempt to succeed. The flag allows you to see the logs of a specific instance (called 'pod'). Get the name of the pod by running runai describe job <job-name>.

                                "},{"location":"Researcher/cli-reference/runai-logs/#-instance-string-i-string","title":"--instance (string) | -i (string)","text":"

                                Show logs for a specific instance in cases where a Job contains multiple pods.

                                "},{"location":"Researcher/cli-reference/runai-logs/#-since-duration","title":"--since (duration)","text":"

Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs. The flags --since and --since-time cannot be used together.

                                "},{"location":"Researcher/cli-reference/runai-logs/#-since-time-date-time","title":"--since-time (date-time)","text":"

                                Return logs after specified date. Date format should be RFC3339, example: 2020-01-26T15:00:00Z.

                                "},{"location":"Researcher/cli-reference/runai-logs/#-tail-int-t-int","title":"--tail (int) | -t (int)","text":"

The number of recent log lines to display.

                                "},{"location":"Researcher/cli-reference/runai-logs/#-timestamps","title":"--timestamps","text":"

                                Include timestamps on each line in the log output.
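As an illustration (the Job name my-train is hypothetical), the flags above can be combined to show the last lines of a Job's log with timestamps:

runai logs my-train --tail 100 --since 1h --timestamps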

                                "},{"location":"Researcher/cli-reference/runai-logs/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-logs/#-loglevel-string","title":"--loglevel (string)","text":"

                                Set the logging level. One of: debug | info | warn | error (default \"info\").

                                "},{"location":"Researcher/cli-reference/runai-logs/#-project-p-string","title":"--project | -p (string)","text":"

                                Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use runai config project <project-name>.

                                "},{"location":"Researcher/cli-reference/runai-logs/#-help-h","title":"--help | -h","text":"

                                Show help text.

                                "},{"location":"Researcher/cli-reference/runai-logs/#output","title":"Output","text":"

                                The command will show the logs of the first process in the container. For training Jobs, this would be the command run at startup. For interactive Jobs, the command may not show anything.

                                "},{"location":"Researcher/cli-reference/runai-logs/#see-also","title":"See Also","text":"
                                • Training Workloads. See Quickstart document: Launch Unattended Training Workloads.
                                "},{"location":"Researcher/cli-reference/runai-port-forwarding/","title":"runai port-forward","text":""},{"location":"Researcher/cli-reference/runai-port-forwarding/#description","title":"Description","text":"

                                Forward one or more local ports to the selected job or a pod within the job. The forwarding session ends when the selected job terminates or the terminal is interrupted.

                                "},{"location":"Researcher/cli-reference/runai-port-forwarding/#examples","title":"Examples","text":"
1. Port forward connections from localhost:8080 (localhost is the default) to the job on port 8090.

                                  runai port-forward <job-name> --port 8080:8090

2. Port forward connections from 192.168.1.23:8080 to the job on port 8080.

                                  runai port-forward <job-name> --port 8080 --address 192.168.1.23

3. Port forward multiple connections, from localhost:8080 to the job on port 8090 and from localhost:6443 to the job on port 443.

                                  runai port-forward <job-name> --port 8080:8090 --port 6443:443

                                4. Port forward into a specific pod in a multi-pod job.

                                  runai port-forward <job-name> --port 8080:8090 --pod <pod-name>

                                5. "},{"location":"Researcher/cli-reference/runai-port-forwarding/#global-flags","title":"Global flags","text":"

--loglevel <string>\u2014Set the logging level. One of: debug | info | warn | error (default \"info\").

                                  -p | --project <string>\u2014Specify the project name. To change the default project use runai config project <project name>.

                                  "},{"location":"Researcher/cli-reference/runai-port-forwarding/#flags","title":"Flags","text":"

--address <string> | [local-interface-ip\\host] | localhost | 0.0.0.0 [privileged]\u2014The listening address of your local machine (default \"localhost\").

                                  -h | --help\u2014Help for the command.

                                  --port\u2014forward ports based on one of the following arguments:

                                  • <stringArray>\u2014a list of port forwarding combinations.

                                  • [local-port]:[remote-port]\u2014different local and remote ports.

                                  • [local-port=remote-port]\u2014the same port is used for both local and remote.

--pod\u2014Specify a pod of a running job. To get a list of the pods of a specific job, run the command runai describe job <job-name>.

                                  --pod-running-timeout\u2014The length of time (like 5s, 2m, or 3h, higher than zero) to wait until the pod is running. Default is 10 minutes.

Filter-based flags

                                  --mpi\u2014search only for mpi jobs.

                                  --interactive\u2014search only for interactive jobs.

                                  --pytorch\u2014search only for pytorch jobs.

                                  --tf\u2014search only for tensorflow jobs.

                                  --train\u2014search only for training jobs.
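For example, a sketch that forwards a local port to an interactive job, waiting up to one minute for its pod to start (the job name my-notebook and port 8888 are illustrative):

runai port-forward my-notebook --port 8888 --interactive --pod-running-timeout 1m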

                                  "},{"location":"Researcher/cli-reference/runai-resume/","title":"runai resume","text":""},{"location":"Researcher/cli-reference/runai-resume/#description","title":"Description","text":"

                                  Resume a suspended Job

                                  Resuming a previously suspended Job will return it to the queue for scheduling. The Job may or may not start immediately, depending on available resources.

                                  Suspend and resume do not work with mpi Jobs.

                                  "},{"location":"Researcher/cli-reference/runai-resume/#synopsis","title":"Synopsis","text":"
                                  runai resume <job-name>\n    [--all | -A]\n\n    [--loglevel value]\n    [--project string | -p string]\n    [--help | -h]\n
                                  "},{"location":"Researcher/cli-reference/runai-resume/#options","title":"Options","text":"

                                  <job-name> - The name of the Job to run the command with. Mandatory.

                                  "},{"location":"Researcher/cli-reference/runai-resume/#-all-a","title":"--all | -A","text":"

                                  Resume all suspended Jobs in the current Project.
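For instance (the Job and Project names are illustrative), resume a single Job or all suspended Jobs in a given Project:

runai resume my-job
runai resume --all -p team-a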

                                  "},{"location":"Researcher/cli-reference/runai-resume/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-resume/#-loglevel-string","title":"--loglevel (string)","text":"

                                  Set the logging level. One of: debug | info | warn | error (default \"info\").

                                  "},{"location":"Researcher/cli-reference/runai-resume/#-project-p-string","title":"--project | -p (string)","text":"

                                  Specify the Project to which the command applies. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

                                  "},{"location":"Researcher/cli-reference/runai-resume/#-help-h","title":"--help | -h","text":"

                                  Show help text.

                                  "},{"location":"Researcher/cli-reference/runai-resume/#output","title":"Output","text":"
• The Job will be resumed. When running runai list jobs, the Job status will no longer be Suspended.
                                  "},{"location":"Researcher/cli-reference/runai-resume/#see-also","title":"See Also","text":"
                                  • Suspending Jobs: Suspend.
                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/","title":"runai submit-dist tf","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#description","title":"Description","text":"

Submit a distributed TensorFlow training Run:ai Job to run.

                                  Note

                                  To use distributed training you need to have installed the TensorFlow operator as specified in Distributed training.

                                  Syntax notes:

                                  • Options with a value type of stringArray mean that you can add multiple values. You can either separate values with a comma or add the flag twice.
                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#examples","title":"Examples","text":"
runai submit-dist tf --name distributed-job --workers=2 -g 1 \\\n    -i <image_name>\n
                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#distributed","title":"Distributed","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-clean-pod-policy-string","title":"--clean-pod-policy < string >","text":"

                                  The\u00a0CleanPodPolicy\u00a0controls deletion of pods when a job terminates. The policy can be one of the following values:

                                  • Running\u2014only pods still running when a job completes (for example, parameter servers) will be deleted immediately. Completed pods will not be deleted so that the logs will be preserved. (Default)
                                  • All\u2014all (including completed) pods will be deleted immediately when the job finishes.
                                  • None\u2014no pods will be deleted when the job completes.
                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-workers-int","title":"--workers < int>","text":"

Number of worker pods to launch for the distributed training Job.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#naming-and-shortcuts","title":"Naming and Shortcuts","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-job-name-prefix-string","title":"--job-name-prefix <string>","text":"

                                  The prefix to use to automatically generate a Job name with an incremental index. When a Job name is omitted Run:ai will generate a Job name. The optional --job-name-prefix flag creates Job names with the provided prefix.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-name-string","title":"--name <string>","text":"

                                  The name of the Job.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-template-string","title":"--template <string>","text":"

                                  Load default values from a workload.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#container-definition","title":"Container Definition","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-add-capability-stringarray","title":"--add-capability <stringArray>","text":"

Add Linux capabilities to the container.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-a-annotation-stringarray","title":"-a | --annotation <stringArray>","text":"

Set annotations in the container.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-attach","title":"--attach","text":"

                                  Default is false. If set to true, wait for the Pod to start running. When the pod starts running, attach to the Pod. The flag is equivalent to the command runai attach.

                                  The --attach flag also sets --tty and --stdin to true.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-command","title":"--command","text":"

Overrides the image's entry point with the command supplied after '--'. When not using the --command flag, the entry point will not be overridden and the string after -- will be appended as arguments to the entry point command.

                                  Example:

--command -- run.sh 1 54 will start the container and run run.sh 1 54

-- script.py 10000 will append script.py 10000 as arguments to the entry point command (e.g. python)
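Putting the flags above together, a hedged sketch of a full submission that overrides the image entry point (the image name and training script are placeholders, not taken from this reference):

runai submit-dist tf --name tf-dist --workers=2 -g 1 -i <image_name> --command -- python train.py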

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-create-home-dir","title":"--create-home-dir","text":"

                                  Create a temporary home directory for the user in the container. Data saved in this directory will not be saved when the container exits. For more information see non root containers.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-e-stringarray-environment","title":"-e <stringArray> | --environment`

                                  Define environment variables to be set in the container. To set multiple values add the flag multiple times (-e BATCH_SIZE=50 -e LEARNING_RATE=0.2).

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-image-string-i-string","title":"--image <string> | -i <string>

                                  Image to use when creating the container for this Job

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-image-pull-policy-string","title":"--image-pull-policy <string>

                                  Pulling policy of the image when starting a container. Options are:

                                  • Always (default): force image pulling to check whether local image already exists. If the image already exists locally and has the same digest, then the image will not be downloaded.
                                  • IfNotPresent: the image is pulled only if it is not already present locally.
                                  • Never: the image is assumed to exist locally. No attempt is made to pull the image.

                                  For more information see Kubernetes documentation.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-l-label-stringarray","title":"-l | --label <stringArray>

Set labels in the container.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-master-args-string-string","title":"--master-args string <string>

                                  Arguments to pass to the master pod container command. If used together with --command, overrides the image's entrypoint of the master pod container with the given command.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-master-environment-stringarray","title":"--master-environment <stringArray>

Set environment variables in the master pod container. To prevent a worker environment variable from being set in the master, use the format: name=-.
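For instance (the variable name LEARNING_RATE is illustrative), to keep a worker-level environment variable from being set in the master pod:

--master-environment LEARNING_RATE=-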

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-master-extended-resource-stringarray","title":"--master-extended-resource <stringArray>

                                  Request access to an extended resource in the master pod. Use the format: resource_name=quantity.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-master-gpu-float","title":"--master-gpu <float>

                                  GPU units to allocate for the master pod.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-master-no-pvcs","title":"--master-no-pvcs

                                  Do not mount any persistent volumes in the master pod.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-no-master","title":"--no-master

                                  Do not create a separate pod for the master.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-preferred-pod-topology-key-string","title":"--preferred-pod-topology-key <string>

                                  If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-required-pod-topology-key-string","title":"--required-pod-topology-key <string>

                                  Enforce scheduling pods of this job onto nodes that have a label with this key and identical values.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-stdin","title":"--stdin

Keep stdin open for the container(s) in the pod, even if nothing is attached.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-t-tty","title":"-t | --tty

                                  Allocate a pseudo-TTY.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-working-dir-string","title":"--working-dir <string>

                                  Starts the container with the specified directory as the current directory.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#resource-allocation","title":"Resource Allocation","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-cpu-double","title":"--cpu <double>

CPU units to allocate for the Job (0.5, 1, etc.). The Job will receive at least this amount of CPU. Note that the Job will not be scheduled unless the system can guarantee this amount of CPUs to the Job.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-cpu-limit-double","title":"--cpu-limit <double>

                                  Limitations on the number of CPUs consumed by the Job (for example 0.5, 1). The system guarantees that this Job will not be able to consume more than this amount of CPUs.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-extended-resource","title":"--extended-resource `

Request access to an extended resource. Syntax: <resource-name>=<resource_quantity>.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-g-gpu-float","title":"-g | --gpu <float>

                                  GPU units to allocate for the Job (0.5, 1).

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-gpu-memory","title":"--gpu-memory

GPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of GPU memory to the Job.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-memory-string","title":"--memory <string>

CPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive at least this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of memory to the Job.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-memory-limit","title":"--memory-limit `

CPU memory to allocate for this Job (1G, 20M, etc.). The system guarantees that this Job will not be able to consume more than this amount of memory. The Job will receive an error when trying to allocate more memory than this limit.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-mig-profile-string-deprecated","title":"--mig-profile <string> (Deprecated)

                                  MIG profile to allocate for the job (1g.5gb, 2g.10gb, 3g.20gb, 4g.20gb, 7g.40gb)

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#job-lifecycle","title":"Job Lifecycle","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-backoff-limit-int","title":"--backoff-limit <int>

                                  The number of times the Job will be retried before failing. The default is 6.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-ttl-after-finish-duration","title":"--ttl-after-finish < duration >

The duration after which a finished Job is automatically deleted (e.g. 5s, 2m, 3h).

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#storage","title":"Storage","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-git-sync-stringarray","title":"--git-sync <stringArray>

                                  Clone a git repository into the container running the Job. The parameter should follow the syntax: source=REPOSITORY,branch=BRANCH_NAME,rev=REVISION,username=USERNAME,password=PASSWORD,target=TARGET_DIRECTORY_TO_CLONE.
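A minimal sketch of this syntax, assuming a public repository so the credential fields can be left out (the URL, branch, and target directory are illustrative):

--git-sync source=https://github.com/org/repo.git,branch=main,target=/workspace/repo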

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-large-shm","title":"--large-shm

                                  Mount a large /dev/shm device.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-mount-propagation","title":"--mount-propagation

                                  Enable HostToContainer mount propagation for all container volumes

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-nfs-server-string","title":"--nfs-server <string>

                                  Use this flag to specify a default NFS host for --volume flag. Alternatively, you can specify NFS host for each volume individually (see --volume for details).

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-pvc-storage_class_namesizecontainer_mount_pathro","title":"--pvc [Storage_Class_Name]:Size:Container_Mount_Path:[ro]","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-pvc-pvc_namecontainer_mount_pathro","title":"--pvc Pvc_Name:Container_Mount_Path:[ro]

                                  Mount a persistent volume claim into a container.

                                  Note

This option is deprecated from version 2.10 and above. To mount an existing or newly created Persistent Volume Claim (PVC), use the parameters --pvc-exists and --pvc-new.

                                  The 2 syntax types of this command are mutually exclusive. You can either use the first or second form, but not a mixture of both.

                                  Storage_Class_Name is a storage class name that can be obtained by running kubectl get storageclasses.storage.k8s.io. This parameter may be omitted if there is a single storage class in the system, or you are using the default storage class.

                                  Size is the volume size you want to allocate. See Kubernetes documentation for how to specify volume sizes

                                  Container_Mount_Path. A path internal to the container where the storage will be mounted

                                  Pvc_Name. The name of a pre-existing Persistent Volume Claim to mount into the container

                                  Examples:

                                  --pvc :3Gi:/tmp/john:ro - Allocate 3GB from the default Storage class. Mount it to /tmp/john as read-only

                                  --pvc my-storage:3Gi:/tmp/john:ro - Allocate 3GB from the my-storage storage class. Mount it to /tmp/john as read-only

                                  --pvc :3Gi:/tmp/john - Allocate 3GB from the default storage class. Mount it to /tmp/john as read-write

                                  --pvc my-pvc:/tmp/john - Use a Persistent Volume Claim named my-pvc. Mount it to /tmp/john as read-write

                                  --pvc my-pvc-2:/tmp/john:ro - Use a Persistent Volume Claim named my-pvc-2. Mount it to /tmp/john as read-only

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-pvc-exists-string","title":"--pvc-exists <string>

                                  Mount a persistent volume. You must include a claimname and path.

• claim name\u2014The name of the persistent volume claim. Can be obtained by running

kubectl get pvc

                                  • path\u2014the path internal to the container where the storage will be mounted

                                  Use the format:

                                  claimname=<CLAIM_NAME>,path=<PATH>
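For instance (the claim name and mount path are illustrative):

--pvc-exists claimname=my-claim,path=/data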

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-pvc-new-string","title":"--pvc-new <string>

                                  Mount a persistent volume claim (PVC). If the PVC does not exist, it will be created based on the parameters entered. If a PVC exists, it will be used with its defined attributes and the parameters in the command will be ignored.

• claim name\u2014The name of the persistent volume claim.
                                  • storage class\u2014A storage class name that can be obtained by running

                                  kubectl get storageclasses.storage.k8s.io.

                                  storageclass may be omitted if there is a single storage class in the system, or you are using the default storage class.

                                  • size\u2014The volume size you want to allocate for the PVC when creating it. See Kubernetes documentation to specify volume sizes.
• accessmode\u2014The description of the desired volume capabilities for the PVC.
                                  • ro\u2014Mount the PVC with read-only access.
                                  • ephemeral\u2014The PVC will be created as volatile temporary storage which is only present during the running lifetime of the job.

                                  Use the format:

storageclass=<storageclass>,size=<size>,path=<path>,ro,accessmode=rwm
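A hedged example following the format above (the storage class name, size, and path are placeholders):

--pvc-new storageclass=my-storage-class,size=3Gi,path=/data,ro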

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-s3-string","title":"--s3 <string>

                                  Mount an S3 compatible storage into the container running the job. The parameter should follow the syntax:

                                  bucket=BUCKET,key=KEY,secret=SECRET,url=URL,target=TARGET_PATH

                                  All the fields, except url=URL, are mandatory. Default for url is

                                  url=https://s3.amazon.com
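An illustrative sketch (the bucket, credentials, and target path are placeholders; url is omitted to use the default shown above):

--s3 bucket=my-bucket,key=<ACCESS_KEY>,secret=<SECRET_KEY>,target=/data/s3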

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-v-volume-sourcecontainer_mount_pathronfs-host","title":"-v | --volume 'Source:Container_Mount_Path:[ro]:[nfs-host]'

                                  Volumes to mount into the container.

                                  Examples:

                                  -v /raid/public/john/data:/root/data:ro

Mount the local path /raid/public/john/data to /root/data in the container for read-only access.

                                  -v /public/data:/root/data::nfs.example.com

Mount the NFS path /public/data on NFS server nfs.example.com to /root/data in the container for read-write access.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-configmap-volume-namepath","title":"--configmap-volume name=,path= ...'

                                  Mount a ConfigMap object for use as a data volume.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#network","title":"Network","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-address-string","title":"--address <string>

                                  Comma separated list of IP addresses to listen to when running with --service-type portforward (default: localhost)

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-host-ipc","title":"--host-ipc

                                  Use the host's ipc namespace. Controls whether the pod containers can share the host IPC namespace. IPC (POSIX/SysV IPC) namespace provides separation of named shared memory segments, semaphores, and message queues. Shared memory segments are used to accelerate inter-process communication at memory speed, rather than through pipes or the network stack.

                                  For further information see docker run reference documentation.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-host-network","title":"--host-network

Use the host's network stack inside the container. For further information see the docker run reference documentation.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-port-stringarray","title":"--port <stringArray>

                                  Expose ports from the Job container.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-s-service-type-string","title":"-s | --service-type <string>

                                  External access type to interactive jobs. Options are: portforward, loadbalancer, nodeport, ingress.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#access-control","title":"Access Control","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-allow-privilege-escalation","title":"--allow-privilege-escalation

                                  Allow the job to gain additional privileges after start.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-run-as-user","title":"--run-as-user

                                  Run in the context of the current user running the Run:ai command rather than the root user. While the default container user is root (same as in Docker), this command allows you to submit a Job running under your Linux user. This would manifest itself in access to operating system resources, in the owner of new folders created under shared directories, etc. Alternatively, if your cluster is connected to Run:ai via SAML, you can map the container to use the Linux UID/GID which is stored in the organization's directory. For more information see non root containers.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#scheduling","title":"Scheduling","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-node-pools-string","title":"--node-pools <string>

Instructs the scheduler to run this workload using a specific set of nodes which are part of a Node Pool. You can specify one or more node pools to form a prioritized list of node pools that the scheduler will use to find a node pool that can satisfy the workload's specification. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group or use existing node labels, then create a node-pool and assign the label to the node-pool. This flag can be used in conjunction with node-type and Project-based affinity. In this case, the flag is used to refine the list of allowable node groups set from a node-pool. For more information see: Working with Projects.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-node-type-string","title":"--node-type <string>

                                  Allows defining specific Nodes (machines) or a group of Nodes on which the workload will run. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-toleration-string","title":"--toleration <string>

                                  Specify one or more toleration criteria, to ensure that the workload is not scheduled onto an inappropriate node. This is done by matching the workload tolerations to the taints defined for each node. For further details see Kubernetes Taints and Tolerations Guide.

                                  The format of the string:

                                  operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n
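For example, assuming nodes tainted with a hypothetical key gpu-type (the key, value, and effect are illustrative):

--toleration operator=Equal,key=gpu-type,value=a100,effect=NoSchedule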
                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-loglevel-string","title":"--loglevel (string)

                                  Set the logging level. One of: debug | info | warn | error (default \"info\")

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-project-p-string","title":"--project | -p (string)

                                  Specify the Project to which the command applies. Run:ai Projects are used by the scheduler to calculate resource eligibility. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#-help-h","title":"--help | -h

                                  Show help text.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#output","title":"Output","text":"

The command will attempt to submit a distributed TensorFlow Job. You can follow up on the Job by running runai list jobs or runai describe job <job-name>.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-TF/#see-also","title":"See Also","text":"
                                  • See Quickstart document Running Distributed Training.
                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/","title":"runai submit-dist mpi","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#description","title":"Description","text":"

                                  Submit a Distributed Training (MPI) Run:ai Job to run.

                                  Note

                                  To use distributed training you need to have installed the Kubeflow MPI Operator as specified in Distributed training.

                                  Syntax notes:

                                  • Options with a value type of stringArray mean that you can add multiple values. You can either separate values with a comma or add the flag twice.
                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#examples","title":"Examples","text":"

You can start an unattended mpi training Job named dist1, based on Project team-a, using a quickstart-distributed image:

                                  runai submit-dist mpi --name dist1 --workers=2 -g 1 \\\n    -i runai.jfrog.io/demo/quickstart-distributed:v0.3.0 -e RUNAI_SLEEP_SECS=60\n

                                  (see: distributed training Quickstart).

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#distributed","title":"Distributed","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-clean-pod-policy-string","title":"--clean-pod-policy < string >","text":"

                                  The\u00a0CleanPodPolicy\u00a0controls deletion of pods when a job terminates. The policy can be one of the following values:

                                  • Running\u2014only pods still running when a job completes (for example, parameter servers) will be deleted immediately. Completed pods will not be deleted so that the logs will be preserved. (Default)
                                  • All\u2014all (including completed) pods will be deleted immediately when the job finishes.
                                  • None\u2014no pods will be deleted when the job completes.
                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-workers-int","title":"--workers < int >","text":"

Number of worker pods to launch for the distributed training Job.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-slots-per-worker-int","title":"--slots-per-worker < int >","text":"

                                  Number of slots to allocate for each worker.
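Building on the MPI example above, a sketch that also sets the number of slots per worker (the image name is a placeholder):

runai submit-dist mpi --name dist1 --workers=2 --slots-per-worker=2 -g 1 -i <image_name>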

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#naming-and-shortcuts","title":"Naming and Shortcuts","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-job-name-prefix-string","title":"--job-name-prefix <string>","text":"

                                  The prefix to use to automatically generate a Job name with an incremental index. When a Job name is omitted Run:ai will generate a Job name. The optional --job-name-prefix flag creates Job names with the provided prefix.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-name-string","title":"--name <string>","text":"

                                  The name of the Job.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-template-string","title":"--template <string>","text":"

                                  Load default values from a workload.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#container-definition","title":"Container Definition","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-add-capability-stringarray","title":"--add-capability <stringArray>","text":"

Add Linux capabilities to the container.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-a-annotation-stringarray","title":"-a | --annotation <stringArray>","text":"

Set annotations in the container.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-attach","title":"--attach","text":"

                                  Default is false. If set to true, wait for the Pod to start running. When the pod starts running, attach to the Pod. The flag is equivalent to the command runai attach.

                                  The --attach flag also sets --tty and --stdin to true.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-command","title":"--command","text":"

Overrides the image's entry point with the command supplied after '--'. When not using the --command flag, the entry point will not be overridden and the string after -- will be appended as arguments to the entry point command.

                                  Example:

--command -- run.sh 1 54 will start the container and run run.sh 1 54

-- script.py 10000 will append script.py 10000 as arguments to the entry point command (e.g. python)

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-create-home-dir","title":"--create-home-dir","text":"

                                  Create a temporary home directory for the user in the container. Data saved in this directory will not be saved when the container exits. For more information see non root containers.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-e-stringarray-environment","title":"-e <stringArray> | --environment`

                                  Define environment variables to be set in the container. To set multiple values add the flag multiple times (-e BATCH_SIZE=50 -e LEARNING_RATE=0.2).

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-image-string-i-string","title":"--image <string> | -i <string>

                                  Image to use when creating the container for this Job

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-image-pull-policy-string","title":"--image-pull-policy <string>

                                  Pulling policy of the image when starting a container. Options are:

                                  • Always (default): force image pulling to check whether local image already exists. If the image already exists locally and has the same digest, then the image will not be downloaded.
                                  • IfNotPresent: the image is pulled only if it is not already present locally.
                                  • Never: the image is assumed to exist locally. No attempt is made to pull the image.

                                  For more information see Kubernetes documentation.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-l-label-stringarray","title":"-l | --label <stringArray>

Set labels in the container.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-master-args-string-string","title":"--master-args string <string>

                                  Arguments to pass to the master pod container command. If used together with --command, overrides the image's entrypoint of the master pod container with the given command.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-master-environment-stringarray","title":"--master-environment <stringArray>

Set environment variables in the master pod container. To prevent a worker environment variable from being set in the master, use the format: name=-.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-master-extended-resource-stringarray","title":"--master-extended-resource <stringArray>

                                  Request access to an extended resource in the master pod. Use the format: resource_name=quantity.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-master-gpu-float","title":"--master-gpu <float>

                                  GPU units to allocate for the master pod.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-master-no-pvcs","title":"--master-no-pvcs

                                  Do not mount any persistent volumes in the master pod.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-preferred-pod-topology-key-string","title":"--preferred-pod-topology-key <string>

                                  If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-required-pod-topology-key-string","title":"--required-pod-topology-key <string>

                                  Enforce scheduling pods of this job onto nodes that have a label with this key and identical values.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-stdin","title":"--stdin

Keep stdin open for the container(s) in the pod, even if nothing is attached.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-t-tty","title":"-t | --tty

                                  Allocate a pseudo-TTY.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-working-dir-string","title":"--working-dir <string>

                                  Starts the container with the specified directory as the current directory.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#resource-allocation","title":"Resource Allocation","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-cpu-double","title":"--cpu <double>

CPU units to allocate for the Job (0.5, 1, etc.). The Job will receive at least this amount of CPU. Note that the Job will not be scheduled unless the system can guarantee this amount of CPUs to the Job.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-cpu-limit-double","title":"--cpu-limit <double>

                                  Limitations on the number of CPUs consumed by the Job (for example 0.5, 1). The system guarantees that this Job will not be able to consume more than this amount of CPUs.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-extended-resource","title":"--extended-resource `

Request access to an extended resource. Syntax: <resource-name>=<resource_quantity>.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-g-gpu-float","title":"-g | --gpu <float>

                                  GPU units to allocate for the Job (0.5, 1).

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-gpu-memory","title":"--gpu-memory

GPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of GPU memory to the Job.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-memory-string","title":"--memory <string>

CPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive at least this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of memory to the Job.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-memory-limit","title":"--memory-limit `

CPU memory to allocate for this Job (1G, 20M, etc.). The system guarantees that this Job will not be able to consume more than this amount of memory. The Job will receive an error when trying to allocate more memory than this limit.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-mig-profile-string-deprecated","title":"--mig-profile <string> (Deprecated)

                                  MIG profile to allocate for the job (1g.5gb, 2g.10gb, 3g.20gb, 4g.20gb, 7g.40gb)

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#job-lifecycle","title":"Job Lifecycle","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-backoff-limit-int","title":"--backoff-limit <int>

                                  The number of times the Job will be retried before failing. The default is 6.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-ttl-after-finish-duration","title":"--ttl-after-finish < duration >

The duration after which a finished Job is automatically deleted (e.g. 5s, 2m, 3h).

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#storage","title":"Storage","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-git-sync-stringarray","title":"--git-sync <stringArray>

                                  Clone a git repository into the container running the Job. The parameter should follow the syntax: source=REPOSITORY,branch=BRANCH_NAME,rev=REVISION,username=USERNAME,password=PASSWORD,target=TARGET_DIRECTORY_TO_CLONE.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-large-shm","title":"--large-shm

                                  Mount a large /dev/shm device.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-mount-propagation","title":"--mount-propagation

                                  Enable HostToContainer mount propagation for all container volumes

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-nfs-server-string","title":"--nfs-server <string>

                                  Use this flag to specify a default NFS host for --volume flag. Alternatively, you can specify NFS host for each volume individually (see --volume for details).

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-pvc-storage_class_namesizecontainer_mount_pathro","title":"--pvc [Storage_Class_Name]:Size:Container_Mount_Path:[ro]","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-pvc-pvc_namecontainer_mount_pathro","title":"--pvc Pvc_Name:Container_Mount_Path:[ro]

                                  Mount a persistent volume claim into a container.

                                  Note

This option is deprecated from version 2.10 and above. To mount an existing or newly created Persistent Volume Claim (PVC), use the parameters --pvc-exists and --pvc-new.

                                  The 2 syntax types of this command are mutually exclusive. You can either use the first or second form, but not a mixture of both.

                                  Storage_Class_Name is a storage class name that can be obtained by running kubectl get storageclasses.storage.k8s.io. This parameter may be omitted if there is a single storage class in the system, or you are using the default storage class.

                                  Size is the volume size you want to allocate. See Kubernetes documentation for how to specify volume sizes

                                  Container_Mount_Path. A path internal to the container where the storage will be mounted

                                  Pvc_Name. The name of a pre-existing Persistent Volume Claim to mount into the container

                                  Examples:

                                  --pvc :3Gi:/tmp/john:ro - Allocate 3GB from the default Storage class. Mount it to /tmp/john as read-only

                                  --pvc my-storage:3Gi:/tmp/john:ro - Allocate 3GB from the my-storage storage class. Mount it to /tmp/john as read-only

                                  --pvc :3Gi:/tmp/john - Allocate 3GB from the default storage class. Mount it to /tmp/john as read-write

                                  --pvc my-pvc:/tmp/john - Use a Persistent Volume Claim named my-pvc. Mount it to /tmp/john as read-write

                                  --pvc my-pvc-2:/tmp/john:ro - Use a Persistent Volume Claim named my-pvc-2. Mount it to /tmp/john as read-only

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-pvc-exists-string","title":"--pvc-exists <string>

                                  Mount a persistent volume. You must include a claimname and path.

                                  • claim name\u2014The name of the persistent volume claim. It can be obtained by running

                                  kubectl get pvc

                                  • path\u2014The path internal to the container where the storage will be mounted

                                  Use the format:

                                  claimname=<CLAIM_NAME>,path=<PATH>
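
                                  For example (the claim name is a placeholder for a PVC that already exists in the Project's namespace):

                                  --pvc-exists claimname=my-claim,path=/mnt/data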

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-pvc-new-string","title":"--pvc-new <string>

                                  Mount a persistent volume claim (PVC). If the PVC does not exist, it will be created based on the parameters entered. If a PVC exists, it will be used with its defined attributes and the parameters in the command will be ignored.

                                  • claim name\u2014The name of the persistent volume claim.
                                  • storage class\u2014A storage class name that can be obtained by running

                                  kubectl get storageclasses.storage.k8s.io.

                                  storageclass may be omitted if there is a single storage class in the system, or you are using the default storage class.

                                  • size\u2014The volume size you want to allocate for the PVC when creating it. See Kubernetes documentation to specify volume sizes.
                                  • accessmode\u2014The description of the desired volume capabilities for the PVC.
                                  • ro\u2014Mount the PVC with read-only access.
                                  • ephemeral\u2014The PVC will be created as volatile temporary storage which is only present during the running lifetime of the job.

                                  Use the format:

                                  storageclass=<storageclass>,size=<size>,path=<path>,ro,accessmode-rwm
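
                                  An illustrative sketch, assuming a storage class named standard exists in the cluster; the size and path are placeholder values:

                                  --pvc-new storageclass=standard,size=5Gi,path=/mnt/scratch,ro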

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-s3-string","title":"--s3 <string>

                                  Mount an S3 compatible storage into the container running the job. The parameter should follow the syntax:

                                  bucket=BUCKET,key=KEY,secret=SECRET,url=URL,target=TARGET_PATH

                                  All the fields, except url, are mandatory. The default for url is

                                  url=https://s3.amazon.com
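
                                  For example (the bucket name, credentials, and target path are placeholders; url is omitted so the default endpoint is used):

                                  --s3 bucket=my-bucket,key=<ACCESS_KEY>,secret=<SECRET_KEY>,target=/data/s3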

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-v-volume-sourcecontainer_mount_pathronfs-host","title":"-v | --volume 'Source:Container_Mount_Path:[ro]:[nfs-host]'

                                  Volumes to mount into the container.

                                  Examples:

                                  -v /raid/public/john/data:/root/data:ro

                                  Mount the local path /raid/public/john/data to /root/data inside the container, with read-only access.

                                  -v /public/data:/root/data::nfs.example.com

                                  Mount the NFS path /public/data from server nfs.example.com to /root/data inside the container, with read-write access.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-configmap-volume-namepath","title":"--configmap-volume name=,path= ...'

                                  Mount a ConfigMap object for use as a data volume.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#network","title":"Network","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-address-string","title":"--address <string>

                                  Comma separated list of IP addresses to listen to when running with --service-type portforward (default: localhost)

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-host-ipc","title":"--host-ipc

                                  Use the host's ipc namespace. Controls whether the pod containers can share the host IPC namespace. IPC (POSIX/SysV IPC) namespace provides separation of named shared memory segments, semaphores, and message queues. Shared memory segments are used to accelerate inter-process communication at memory speed, rather than through pipes or the network stack.

                                  For further information see docker run reference documentation.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-host-network","title":"--host-network

                                  Use the host's network stack inside the container. For further information see docker run reference documentation.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-port-stringarray","title":"--port <stringArray>

                                  Expose ports from the Job container.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-s-service-type-string","title":"-s | --service-type <string>

                                  External access type to interactive jobs. Options are: portforward, loadbalancer, nodeport, ingress.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#access-control","title":"Access Control","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-allow-privilege-escalation","title":"--allow-privilege-escalation

                                  Allow the job to gain additional privileges after start.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-run-as-user","title":"--run-as-user

                                  Run in the context of the current user running the Run:ai command rather than the root user. While the default container user is root (same as in Docker), this command allows you to submit a Job running under your Linux user. This would manifest itself in access to operating system resources, in the owner of new folders created under shared directories, etc. Alternatively, if your cluster is connected to Run:ai via SAML, you can map the container to use the Linux UID/GID which is stored in the organization's directory. For more information see non root containers.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#scheduling","title":"Scheduling","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-node-pools-string","title":"--node-pools <string>

                                  Instructs the scheduler to run this workload using a specific set of nodes which are part of a Node Pool. You can specify one or more node pools to form a prioritized list that the scheduler will use to find a node pool that can satisfy the workload's specification. To use this feature, your Administrator will need to label nodes as explained in Limit a Workload to a Specific Node Group, or use existing node labels, then create a node-pool and assign the label to the node-pool. This flag can be used in conjunction with node-type and Project-based affinity; in that case, the flag refines the list of allowable node groups set from a node-pool. For more information see Working with Projects.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-node-type-string","title":"--node-type <string>

                                  Allows defining specific Nodes (machines) or a group of Nodes on which the workload will run. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-toleration-string","title":"--toleration <string>

                                  Specify one or more toleration criteria, to ensure that the workload is not scheduled onto an inappropriate node. This is done by matching the workload tolerations to the taints defined for each node. For further details see Kubernetes Taints and Tolerations Guide.

                                  The format of the string:

                                  operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n
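
                                  For example, to tolerate a hypothetical dedicated=gpu:NoSchedule taint:

                                  --toleration operator=Equal,key=dedicated,value=gpu,effect=NoSchedule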
                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-loglevel-string","title":"--loglevel (string)

                                  Set the logging level. One of: debug | info | warn | error (default \"info\")

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-project-p-string","title":"--project | -p (string)

                                  Specify the Project to which the command applies. Run:ai Projects are used by the scheduler to calculate resource eligibility. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#-help-h","title":"--help | -h

                                  Show help text.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#output","title":"Output","text":"

                                  The command will attempt to submit an MPI Job. You can follow up on the Job by running runai list jobs or runai describe job <job-name>.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-mpi/#see-also","title":"See Also","text":"
                                  • See Quickstart document Running Distributed Training.
                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/","title":"runai submit-dist pytorch","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#description","title":"Description","text":"

                                  Submit a distributed PyTorch training Job to Run:ai.

                                  Note

                                  To use distributed training, the PyTorch operator must be installed, as specified in Distributed training.

                                  Syntax notes:

                                  • Options with a value type of stringArray mean that you can add multiple values. You can either separate values with a comma or add the flag twice.
                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#examples","title":"Examples","text":"
                                  runai submit-dist pytorch --name distributed-job --workers=2 -g 1 \\\n    -i <image_name>\n
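
                                  A further sketch for an elastic job (the image name is a placeholder), bounding the replica range with the --min-replicas and --max-replicas flags described below:

                                  runai submit-dist pytorch --name elastic-job --workers=2 --min-replicas=2 \\\n    --max-replicas=4 -g 1 -i <image_name>\n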
                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#distributed","title":"Distributed","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-clean-pod-policy-string","title":"--clean-pod-policy < string >","text":"

                                  The\u00a0CleanPodPolicy\u00a0controls deletion of pods when a job terminates. The policy can be one of the following values:

                                  • Running\u2014only pods still running when a job completes (for example, parameter servers) will be deleted immediately. Completed pods will not be deleted so that the logs will be preserved. (Default)
                                  • All\u2014all (including completed) pods will be deleted immediately when the job finishes.
                                  • None\u2014no pods will be deleted when the job completes.
                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-max-replicas-int","title":"--max-replicas < int >","text":"

                                  Maximum number of replicas for an elastic PyTorch job.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-min-replicas-int","title":"--min-replicas < int >","text":"

                                  Minimum number of replicas for an elastic PyTorch job.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-workers-int","title":"--workers < int>","text":"

                                  Number of worker replicas to launch for the distributed training Job.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#naming-and-shortcuts","title":"Naming and Shortcuts","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-job-name-prefix-string","title":"--job-name-prefix <string>","text":"

                                  The prefix to use to automatically generate a Job name with an incremental index. When a Job name is omitted Run:ai will generate a Job name. The optional --job-name-prefix flag creates Job names with the provided prefix.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-name-string","title":"--name <string>","text":"

                                  The name of the Job.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-template-string","title":"--template <string>","text":"

                                  Load default values from a workload.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#container-definition","title":"Container Definition","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-add-capability-stringarray","title":"--add-capability <stringArray>","text":"

                                  Add linux capabilities to the container.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-a-annotation-stringarray","title":"-a | --annotation <stringArray>","text":"

                                  Set annotation variables in the container.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-attach","title":"--attach","text":"

                                  Default is false. If set to true, wait for the Pod to start running. When the pod starts running, attach to the Pod. The flag is equivalent to the command runai attach.

                                  The --attach flag also sets --tty and --stdin to true.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-command","title":"--command","text":"

                                  Overrides the image's entry point with the command supplied after '--'. When not using the --command flag, the entry point will not be overridden and the string after -- will be appended as arguments to the entry point command.

                                  Example:

                                  --command -- run.sh 1 54 will start the container and run run.sh 1 54

                                  -- script.py 10000 will append script.py 10000 to the entry point command (e.g. python)
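
                                  A fuller hedged sketch showing where the -- separator sits in a complete submission (the image name and training script are placeholders):

                                  runai submit-dist pytorch --name dist-train --workers=2 -g 1 -i <image_name> \\\n    --command -- python train.py\n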

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-create-home-dir","title":"--create-home-dir","text":"

                                  Create a temporary home directory for the user in the container. Data saved in this directory will not be saved when the container exits. For more information see non root containers.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-e-stringarray-environment","title":"-e <stringArray> | --environment`

                                  Define environment variables to be set in the container. To set multiple values add the flag multiple times (-e BATCH_SIZE=50 -e LEARNING_RATE=0.2).

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-image-string-i-string","title":"--image <string> | -i <string>

                                  Image to use when creating the container for this Job.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-image-pull-policy-string","title":"--image-pull-policy <string>

                                  Pulling policy of the image when starting a container. Options are:

                                  • Always (default): force image pulling to check whether local image already exists. If the image already exists locally and has the same digest, then the image will not be downloaded.
                                  • IfNotPresent: the image is pulled only if it is not already present locally.
                                  • Never: the image is assumed to exist locally. No attempt is made to pull the image.

                                  For more information see Kubernetes documentation.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-l-label-stringarray","title":"-l | --label <stringArray>

                                  Set label variables in the container.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-master-args-string-string","title":"--master-args string <string>

                                  Arguments to pass to the master pod container command. If used together with --command, overrides the image's entrypoint of the master pod container with the given command.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-master-environment-stringarray","title":"--master-environment <stringArray>

                                  Set environment variables in the master pod container. To prevent a worker environment variable from being set in the master, use the format: name=-.
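
                                  For example (the variable names are illustrative): set a master-only variable while suppressing a variable that was set for the workers with -e:

                                  -e SHARED_VAR=1 --master-environment MASTER_ONLY=1 --master-environment SHARED_VAR=-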

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-master-extended-resource-stringarray","title":"--master-extended-resource <stringArray>

                                  Request access to an extended resource in the master pod. Use the format: resource_name=quantity.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-master-gpu-float","title":"--master-gpu <float>

                                  GPU units to allocate for the master pod.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-master-no-pvcs","title":"--master-no-pvcs

                                  Do not mount any persistent volumes in the master pod.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-no-master","title":"--no-master

                                  Do not create a separate pod for the master.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-preferred-pod-topology-key-string","title":"--preferred-pod-topology-key <string>

                                  If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-required-pod-topology-key-string","title":"--required-pod-topology-key <string>

                                  Enforce scheduling pods of this job onto nodes that have a label with this key and identical values.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-stdin","title":"--stdin

                                  Keep stdin open for the container(s) in the pod, even if nothing is attached.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-t-tty","title":"-t | --tty

                                  Allocate a pseudo-TTY.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-working-dir-string","title":"--working-dir <string>

                                  Starts the container with the specified directory as the current directory.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#resource-allocation","title":"Resource Allocation","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-cpu-double","title":"--cpu <double>

                                  CPU units to allocate for the Job (0.5, 1, etc.). The Job will receive at least this amount of CPU. Note that the Job will not be scheduled unless the system can guarantee this amount of CPU to the Job.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-cpu-limit-double","title":"--cpu-limit <double>

                                  Limitations on the number of CPUs consumed by the Job (for example 0.5, 1). The system guarantees that this Job will not be able to consume more than this amount of CPUs.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-extended-resource","title":"--extended-resource `

                                  Request access to an extended resource. Syntax: <resource_name>=<resource_quantity>.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-g-gpu-float","title":"-g | --gpu <float>

                                  GPU units to allocate for the Job (0.5, 1).

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-gpu-memory","title":"--gpu-memory

                                  GPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of GPU memory to the Job.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-memory-string","title":"--memory <string>

                                  CPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive at least this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of memory to the Job.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-memory-limit","title":"--memory-limit `

                                  CPU memory to allocate for this Job (1G, 20M, etc.). The system guarantees that this Job will not be able to consume more than this amount of memory. The Job will receive an error when trying to allocate more memory than this limit.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-mig-profile-string-deprecated","title":"--mig-profile <string> (Deprecated)

                                  MIG profile to allocate for the job (1g.5gb, 2g.10gb, 3g.20gb, 4g.20gb, 7g.40gb)

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#job-lifecycle","title":"Job Lifecycle","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-backoff-limit-int","title":"--backoff-limit <int>

                                  The number of times the Job will be retried before failing. The default is 6.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-ttl-after-finish-duration","title":"--ttl-after-finish < duration >

                                  The duration after which a finished job is automatically deleted (e.g. 5s, 2m, 3h).

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#storage","title":"Storage","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-git-sync-stringarray","title":"--git-sync <stringArray>

                                  Clone a git repository into the container running the Job. The parameter should follow the syntax: source=REPOSITORY,branch=BRANCH_NAME,rev=REVISION,username=USERNAME,password=PASSWORD,target=TARGET_DIRECTORY_TO_CLONE.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-large-shm","title":"--large-shm

                                  Mount a large /dev/shm device.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-mount-propagation","title":"--mount-propagation

                                  Enable HostToContainer mount propagation for all container volumes

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-nfs-server-string","title":"--nfs-server <string>

                                  Use this flag to specify a default NFS host for --volume flag. Alternatively, you can specify NFS host for each volume individually (see --volume for details).

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-pvc-storage_class_namesizecontainer_mount_pathro","title":"--pvc [Storage_Class_Name]:Size:Container_Mount_Path:[ro]","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-pvc-pvc_namecontainer_mount_pathro","title":"--pvc Pvc_Name:Container_Mount_Path:[ro]

                                  Mount a persistent volume claim into a container.

                                  Note

                                  This option is deprecated from version 2.10 and above. To mount an existing or newly created Persistent Volume Claim (PVC), use the parameters --pvc-exists and --pvc-new.

                                  The 2 syntax types of this command are mutually exclusive. You can either use the first or second form, but not a mixture of both.

                                  Storage_Class_Name is a storage class name that can be obtained by running kubectl get storageclasses.storage.k8s.io. This parameter may be omitted if there is a single storage class in the system, or you are using the default storage class.

                                  Size is the volume size you want to allocate. See Kubernetes documentation for how to specify volume sizes

                                  Container_Mount_Path. A path internal to the container where the storage will be mounted

                                  Pvc_Name. The name of a pre-existing Persistent Volume Claim to mount into the container

                                  Examples:

                                  --pvc :3Gi:/tmp/john:ro - Allocate 3GB from the default Storage class. Mount it to /tmp/john as read-only

                                  --pvc my-storage:3Gi:/tmp/john:ro - Allocate 3GB from the my-storage storage class. Mount it to /tmp/john as read-only

                                  --pvc :3Gi:/tmp/john - Allocate 3GB from the default storage class. Mount it to /tmp/john as read-write

                                  --pvc my-pvc:/tmp/john - Use a Persistent Volume Claim named my-pvc. Mount it to /tmp/john as read-write

                                  --pvc my-pvc-2:/tmp/john:ro - Use a Persistent Volume Claim named my-pvc-2. Mount it to /tmp/john as read-only

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-pvc-exists-string","title":"--pvc-exists <string>

                                  Mount a persistent volume. You must include a claimname and path.

                                  • claim name\u2014The name of the persistent volume claim. It can be obtained by running

                                  kubectl get pvc

                                  • path\u2014The path internal to the container where the storage will be mounted

                                  Use the format:

                                  claimname=<CLAIM_NAME>,path=<PATH>

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-pvc-new-string","title":"--pvc-new <string>

                                  Mount a persistent volume claim (PVC). If the PVC does not exist, it will be created based on the parameters entered. If a PVC exists, it will be used with its defined attributes and the parameters in the command will be ignored.

                                  • claim name\u2014The name of the persistent volume claim.
                                  • storage class\u2014A storage class name that can be obtained by running

                                  kubectl get storageclasses.storage.k8s.io.

                                  storageclass may be omitted if there is a single storage class in the system, or you are using the default storage class.

                                  • size\u2014The volume size you want to allocate for the PVC when creating it. See Kubernetes documentation to specify volume sizes.
                                  • accessmode\u2014The description of the desired volume capabilities for the PVC.
                                  • ro\u2014Mount the PVC with read-only access.
                                  • ephemeral\u2014The PVC will be created as volatile temporary storage which is only present during the running lifetime of the job.

                                  Use the format:

                                  storageclass=<storageclass>,size=<size>,path=<path>,ro,accessmode-rwm

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-s3-string","title":"--s3 <string>

                                  Mount an S3 compatible storage into the container running the job. The parameter should follow the syntax:

                                  bucket=BUCKET,key=KEY,secret=SECRET,url=URL,target=TARGET_PATH

                                  All the fields, except url, are mandatory. The default for url is

                                  url=https://s3.amazon.com

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-v-volume-sourcecontainer_mount_pathronfs-host","title":"-v | --volume 'Source:Container_Mount_Path:[ro]:[nfs-host]'

                                  Volumes to mount into the container.

                                  Examples:

                                  -v /raid/public/john/data:/root/data:ro

                                  Mount the local path /raid/public/john/data to /root/data inside the container, with read-only access.

                                  -v /public/data:/root/data::nfs.example.com

                                  Mount the NFS path /public/data from server nfs.example.com to /root/data inside the container, with read-write access.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-configmap-volume-namepath","title":"--configmap-volume name=,path= ...'

                                  Mount a ConfigMap object for use as a data volume.
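
                                  For example (the ConfigMap name and mount path are placeholders, assuming the name=...,path=... format shown in the heading):

                                  --configmap-volume name=my-config,path=/etc/my-config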

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#network","title":"Network","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-address-string","title":"--address <string>

                                  Comma separated list of IP addresses to listen to when running with --service-type portforward (default: localhost)

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-host-ipc","title":"--host-ipc

                                  Use the host's ipc namespace. Controls whether the pod containers can share the host IPC namespace. IPC (POSIX/SysV IPC) namespace provides separation of named shared memory segments, semaphores, and message queues. Shared memory segments are used to accelerate inter-process communication at memory speed, rather than through pipes or the network stack.

                                  For further information see docker run reference documentation.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-host-network","title":"--host-network

                                  Use the host's network stack inside the container. For further information see docker run reference documentation.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-port-stringarray","title":"--port <stringArray>

                                  Expose ports from the Job container.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-s-service-type-string","title":"-s | --service-type <string>

                                  External access type to interactive jobs. Options are: portforward, loadbalancer, nodeport, ingress.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#access-control","title":"Access Control","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-allow-privilege-escalation","title":"--allow-privilege-escalation

                                  Allow the job to gain additional privileges after start.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-run-as-user","title":"--run-as-user

                                  Run in the context of the current user running the Run:ai command rather than the root user. While the default container user is root (same as in Docker), this command allows you to submit a Job running under your Linux user. This would manifest itself in access to operating system resources, in the owner of new folders created under shared directories, etc. Alternatively, if your cluster is connected to Run:ai via SAML, you can map the container to use the Linux UID/GID which is stored in the organization's directory. For more information see non root containers.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#scheduling","title":"Scheduling","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-node-pools-string","title":"--node-pools <string>

                                  Instructs the scheduler to run this workload using a specific set of nodes which are part of a Node Pool. You can specify one or more node pools to form a prioritized list that the scheduler will use to find a node pool that can satisfy the workload's specification. To use this feature, your Administrator will need to label nodes as explained in Limit a Workload to a Specific Node Group, or use existing node labels, then create a node-pool and assign the label to the node-pool. This flag can be used in conjunction with node-type and Project-based affinity; in that case, the flag refines the list of allowable node groups set from a node-pool. For more information see Working with Projects.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-node-type-string","title":"--node-type <string>

                                  Allows defining specific Nodes (machines) or a group of Nodes on which the workload will run. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-toleration-string","title":"--toleration <string>

                                  Specify one or more toleration criteria, to ensure that the workload is not scheduled onto an inappropriate node. This is done by matching the workload tolerations to the taints defined for each node. For further details see Kubernetes Taints and Tolerations Guide.

                                  The format of the string:

                                  operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n
                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-loglevel-string","title":"--loglevel (string)

                                  Set the logging level. One of: debug | info | warn | error (default \"info\")

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-project-p-string","title":"--project | -p (string)

                                  Specify the Project to which the command applies. Run:ai Projects are used by the scheduler to calculate resource eligibility. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#-help-h","title":"--help | -h

                                  Show help text.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-pytorch/#output","title":"Output","text":"

                                  The command will attempt to submit a distributed PyTorch workload. You can follow up on the workload by running runai list jobs or runai describe job <job-name>.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/","title":"runai submit-dist xgboost","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#description","title":"Description","text":"

                                  Submit a distributed XGBoost training Job to Run:ai.

                                  Syntax notes:

                                  • Options with a value type of stringArray mean that you can add multiple values. You can either separate values with a comma or add the flag twice.
                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#examples","title":"Examples","text":"
                                  runai submit-dist xgboost --name distributed-job --workers=2 -g 1 \\\n    -i <image_name>\n
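
                                  A slightly fuller sketch (the image name is a placeholder) that also sets the pod cleanup policy and a CPU memory request using the flags described below:

                                  runai submit-dist xgboost --name distributed-xgb --workers=2 -g 1 \\\n    -i <image_name> --clean-pod-policy Running --memory 4G\n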
                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#distributed","title":"Distributed","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-clean-pod-policy-string","title":"--clean-pod-policy < string >","text":"

                                  The\u00a0CleanPodPolicy\u00a0controls deletion of pods when a job terminates. The policy can be one of the following values:

                                  • Running\u2014only pods still running when a job completes (for example, parameter servers) will be deleted immediately. Completed pods will not be deleted so that the logs will be preserved. (Default)
                                  • All\u2014all (including completed) pods will be deleted immediately when the job finishes.
                                  • None\u2014no pods will be deleted when the job completes.
                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-workers-int","title":"--workers < int>","text":"

                                  Number of worker replicas to launch for the distributed training Job.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#naming-and-shortcuts","title":"Naming and Shortcuts","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-job-name-prefix-string","title":"--job-name-prefix <string>","text":"

                                  The prefix to use to automatically generate a Job name with an incremental index. When a Job name is omitted Run:ai will generate a Job name. The optional --job-name-prefix flag creates Job names with the provided prefix.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-name-string","title":"--name <string>","text":"

                                  The name of the Job.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-template-string","title":"--template <string>","text":"

                                  Load default values from a workload.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#container-definition","title":"Container Definition","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-add-capability-stringarray","title":"--add-capability <stringArray>","text":"

                                  Add linux capabilities to the container.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-a-annotation-stringarray","title":"-a | --annotation <stringArray>","text":"

                                  Set annotation variables in the container.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-attach","title":"--attach","text":"

                                  Default is false. If set to true, wait for the Pod to start running. When the pod starts running, attach to the Pod. The flag is equivalent to the command runai attach.

                                  The --attach flag also sets --tty and --stdin to true.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-command","title":"--command","text":"

                                  Overrides the image's entry point with the command supplied after '--'. When not using the --command flag, the entry point will not be overridden and the string after -- will be appended as arguments to the entry point command.

                                  Example:

                                  --command -- run.sh 1 54 will start the container and run run.sh 1 54

                                  -- script.py 10000 will append script.py 10000 to the entry point command (e.g. python)

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-create-home-dir","title":"--create-home-dir","text":"

                                  Create a temporary home directory for the user in the container. Data saved in this directory will not be saved when the container exits. For more information see non root containers.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-e-stringarray-environment","title":"-e <stringArray> | --environment`

                                  Define environment variables to be set in the container. To set multiple values add the flag multiple times (-e BATCH_SIZE=50 -e LEARNING_RATE=0.2).

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-image-string-i-string","title":"--image <string> | -i <string>

                                  Image to use when creating the container for this Job

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-image-pull-policy-string","title":"--image-pull-policy <string>

                                  Pulling policy of the image when starting a container. Options are:

                                  • Always (default): force image pulling to check whether local image already exists. If the image already exists locally and has the same digest, then the image will not be downloaded.
                                  • IfNotPresent: the image is pulled only if it is not already present locally.
                                  • Never: the image is assumed to exist locally. No attempt is made to pull the image.

                                  For more information see Kubernetes documentation.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-l-label-stringarray","title":"-l | --label <stringArray>

                                  Set label variables in the container.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-master-args-string-string","title":"--master-args string <string>

                                  Arguments to pass to the master pod container command. If used together with --command, overrides the image's entrypoint of the master pod container with the given command.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-master-environment-stringarray","title":"--master-environment <stringArray>

                                  Set environment variables in the master pod container. To prevent a worker environment variable from being set in the master, use the format: name=-.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-master-extended-resource-stringarray","title":"--master-extended-resource <stringArray>

                                  Request access to an extended resource in the master pod. Use the format: resource_name=quantity.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-master-gpu-float","title":"--master-gpu <float>

                                  GPU units to allocate for the master pod.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-master-no-pvcs","title":"--master-no-pvcs

                                  Do not mount any persistent volumes in the master pod.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-preferred-pod-topology-key-string","title":"--preferred-pod-topology-key <string>

                                  If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-required-pod-topology-key-string","title":"--required-pod-topology-key <string>

                                  Enforce scheduling pods of this job onto nodes that have a label with this key and identical values.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-stdin","title":"--stdin

                                  Keep stdin open for the container(s) in the pod, even if nothing is attached.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-t-tty","title":"-t | --tty

                                  Allocate a pseudo-TTY.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-working-dir-string","title":"--working-dir <string>

                                  Starts the container with the specified directory as the current directory.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#resource-allocation","title":"Resource Allocation","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-cpu-double","title":"--cpu <double>

                                  CPU units to allocate for the Job (0.5, 1, etc.). The Job will receive at least this amount of CPU. Note that the Job will not be scheduled unless the system can guarantee this amount of CPU to the Job.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-cpu-limit-double","title":"--cpu-limit <double>

                                  Limitations on the number of CPUs consumed by the Job (for example 0.5, 1). The system guarantees that this Job will not be able to consume more than this amount of CPUs.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-extended-resource","title":"--extended-resource `

                                  Request access to an extended resource. Syntax: <resource_name>=<resource_quantity>.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-g-gpu-float","title":"-g | --gpu <float>

                                  GPU units to allocate for the Job (0.5, 1).

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-gpu-memory","title":"--gpu-memory

                                  GPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of GPU memory to the Job.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-memory-string","title":"--memory <string>

                                  CPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive at least this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of memory to the Job.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-memory-limit","title":"--memory-limit `

                                  CPU memory to allocate for this Job (1G, 20M, etc.). The system guarantees that this Job will not be able to consume more than this amount of memory. The Job will receive an error when trying to allocate more memory than this limit.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-mig-profile-string-deprecated","title":"--mig-profile <string> (Deprecated)

                                  MIG profile to allocate for the job (1g.5gb, 2g.10gb, 3g.20gb, 4g.20gb, 7g.40gb)

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#job-lifecycle","title":"Job Lifecycle","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-backoff-limit-int","title":"--backoff-limit <int>

                                  The number of times the Job will be retried before failing. The default is 6.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-ttl-after-finish-duration","title":"--ttl-after-finish < duration >

                                  The duration after which a finished job is automatically deleted (e.g. 5s, 2m, 3h).

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#storage","title":"Storage","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-git-sync-stringarray","title":"--git-sync <stringArray>

                                  Clone a git repository into the container running the Job. The parameter should follow the syntax: source=REPOSITORY,branch=BRANCH_NAME,rev=REVISION,username=USERNAME,password=PASSWORD,target=TARGET_DIRECTORY_TO_CLONE.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-large-shm","title":"--large-shm

                                  Mount a large /dev/shm device.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-mount-propagation","title":"--mount-propagation

                                  Enable HostToContainer mount propagation for all container volumes

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-nfs-server-string","title":"--nfs-server <string>

                                  Use this flag to specify a default NFS host for --volume flag. Alternatively, you can specify NFS host for each volume individually (see --volume for details).

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-pvc-storage_class_namesizecontainer_mount_pathro","title":"--pvc [Storage_Class_Name]:Size:Container_Mount_Path:[ro]","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-pvc-pvc_namecontainer_mount_pathro","title":"--pvc Pvc_Name:Container_Mount_Path:[ro]

                                  Mount a persistent volume claim into a container.

                                  Note

                                  This option is deprecated from version 2.10 and above. To mount an existing or newly created Persistent Volume Claim (PVC), use the parameters --pvc-exists and --pvc-new.

                                  The 2 syntax types of this command are mutually exclusive. You can either use the first or second form, but not a mixture of both.

                                  Storage_Class_Name is a storage class name that can be obtained by running kubectl get storageclasses.storage.k8s.io. This parameter may be omitted if there is a single storage class in the system, or you are using the default storage class.

                                  Size is the volume size you want to allocate. See Kubernetes documentation for how to specify volume sizes

                                  Container_Mount_Path. A path internal to the container where the storage will be mounted

                                  Pvc_Name. The name of a pre-existing Persistent Volume Claim to mount into the container

                                  Examples:

                                  --pvc :3Gi:/tmp/john:ro - Allocate 3GB from the default Storage class. Mount it to /tmp/john as read-only

                                  --pvc my-storage:3Gi:/tmp/john:ro - Allocate 3GB from the my-storage storage class. Mount it to /tmp/john as read-only

                                  --pvc :3Gi:/tmp/john - Allocate 3GB from the default storage class. Mount it to /tmp/john as read-write

                                  --pvc my-pvc:/tmp/john - Use a Persistent Volume Claim named my-pvc. Mount it to /tmp/john as read-write

                                  --pvc my-pvc-2:/tmp/john:ro - Use a Persistent Volume Claim named my-pvc-2. Mount it to /tmp/john as read-only

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-pvc-exists-string","title":"--pvc-exists <string>

                                  Mount a persistent volume. You must include a claimname and path.

                                  • claim name\u2014The name of the persistent volume claim. Can be obtained by running

                                  kubectl get pvc

                                  • path\u2014The path internal to the container where the storage will be mounted

                                  Use the format:

                                  claimname=<CLAIM_NAME>,path=<PATH>
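
                                  For example (the claim name and mount path are placeholders):

                                  --pvc-exists claimname=my-data-pvc,path=/mnt/data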

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-pvc-new-string","title":"--pvc-new <string>

                                  Mount a persistent volume claim (PVC). If the PVC does not exist, it will be created based on the parameters entered. If a PVC exists, it will be used with its defined attributes and the parameters in the command will be ignored.

                                  • claim name\u2014The name of the persistent volume claim.
                                  • storage class\u2014A storage class name that can be obtained by running

                                  kubectl get storageclasses.storage.k8s.io.

                                  storageclass may be omitted if there is a single storage class in the system, or you are using the default storage class.

                                  • size\u2014The volume size you want to allocate for the PVC when creating it. See Kubernetes documentation to specify volume sizes.
                                  • accessmode\u2014The description of the desired volume capabilities for the PVC.
                                  • ro\u2014Mount the PVC with read-only access.
                                  • ephemeral\u2014The PVC will be created as volatile temporary storage which is only present during the running lifetime of the job.

                                  Use the format:

                                  storageclass=<storageclass>,size=<size>,path=<path>,ro,accessmode=rwm
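
                                  For example, a hedged sketch (the storage class name, size, and path are placeholders):

                                  --pvc-new storageclass=standard,size=5Gi,path=/scratch,ro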

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-s3-string","title":"--s3 <string>

                                  Mount an S3 compatible storage into the container running the job. The parameter should follow the syntax:

                                  bucket=BUCKET,key=KEY,secret=SECRET,url=URL,target=TARGET_PATH

                                  All the fields, except url=URL, are mandatory. Default for url is

                                  url=https://s3.amazon.com
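
                                  For example (the bucket name, credentials, and target path are placeholders):

                                  --s3 bucket=my-bucket,key=MY_ACCESS_KEY,secret=MY_SECRET_KEY,target=/data/s3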

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-v-volume-sourcecontainer_mount_pathronfs-host","title":"-v | --volume 'Source:Container_Mount_Path:[ro]:[nfs-host]'

                                  Volumes to mount into the container.

                                  Examples:

                                  -v /raid/public/john/data:/root/data:ro

                                  Mount the local path /raid/public/john/data to /root/data in the container, with read-only access.

                                  -v /public/data:/root/data::nfs.example.com

                                  Mount the NFS path /public/data from server nfs.example.com to /root/data in the container, with read-write access.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-configmap-volume-namepath","title":"--configmap-volume name=,path= ...'

                                  Mount a ConfigMap object for use as a data volume.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#network","title":"Network","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-address-string","title":"--address <string>

                                  Comma-separated list of IP addresses to listen to when running with --service-type portforward (default: localhost).

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-host-ipc","title":"--host-ipc

                                  Use the host's ipc namespace. Controls whether the pod containers can share the host IPC namespace. IPC (POSIX/SysV IPC) namespace provides separation of named shared memory segments, semaphores, and message queues. Shared memory segments are used to accelerate inter-process communication at memory speed, rather than through pipes or the network stack.

                                  For further information see docker run reference documentation.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-host-network","title":"--host-network

                                  Use the host's network stack inside the container. For further information see the docker run reference documentation.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-port-stringarray","title":"--port <stringArray>

                                  Expose ports from the Job container.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-s-service-type-string","title":"-s | --service-type <string>

                                  External access type to interactive jobs. Options are: portforward, loadbalancer, nodeport, ingress.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#access-control","title":"Access Control","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-allow-privilege-escalation","title":"--allow-privilege-escalation

                                  Allow the job to gain additional privileges after start.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-run-as-user","title":"--run-as-user

                                  Run in the context of the current user running the Run:ai command rather than the root user. While the default container user is root (same as in Docker), this command allows you to submit a Job running under your Linux user. This would manifest itself in access to operating system resources, in the owner of new folders created under shared directories, etc. Alternatively, if your cluster is connected to Run:ai via SAML, you can map the container to use the Linux UID/GID which is stored in the organization's directory. For more information see non root containers.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#scheduling","title":"Scheduling","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-node-pools-string","title":"--node-pools <string>

                                  Instructs the scheduler to run this workload using a specific set of nodes that are part of a Node Pool. You can specify one or more node pools to form a prioritized list of node pools that the scheduler will use to find a node pool that can satisfy the workload's specification. To use this feature, your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group, or use existing node labels, then create a node-pool and assign the label to the node-pool. This flag can be used in conjunction with node-type and Project-based affinity. In this case, the flag is used to refine the list of allowable node groups set from a node-pool. For more information see: Working with Projects.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-node-type-string","title":"--node-type <string>

                                  Allows defining specific Nodes (machines) or a group of Nodes on which the workload will run. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-toleration-string","title":"--toleration <string>

                                  Specify one or more toleration criteria, to ensure that the workload is not scheduled onto an inappropriate node. This is done by matching the workload tolerations to the taints defined for each node. For further details see Kubernetes Taints and Tolerations Guide.

                                  The format of the string:

                                  operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n
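
                                  For example, to tolerate a hypothetical dedicated-GPU taint (the key is a placeholder):

                                  --toleration operator=Exists,key=example.com/dedicated-gpu,effect=NoSchedule
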
                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-loglevel-string","title":"--loglevel (string)

                                  Set the logging level. One of: debug | info | warn | error (default \"info\")

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-project-p-string","title":"--project | -p (string)

                                  Specify the Project to which the command applies. Run:ai Projects are used by the scheduler to calculate resource eligibility. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#-help-h","title":"--help | -h

                                  Show help text.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#output","title":"Output","text":"

                                  The command will attempt to submit an XGBoost Job. You can follow up on the Job by running runai list jobs or runai describe job <job-name>.

                                  "},{"location":"Researcher/cli-reference/runai-submit-dist-xgboost/#see-also","title":"See Also","text":"
                                  • See Quickstart document Running Distributed Training.
                                  "},{"location":"Researcher/cli-reference/runai-submit/","title":"Description","text":"

                                  Submit a Run:ai Job for execution.

                                  Syntax notes:

                                  • Flags of type stringArray mean that you can add multiple values. You can either separate values with a comma or add the flag twice.
                                  "},{"location":"Researcher/cli-reference/runai-submit/#examples","title":"Examples","text":"

                                  All examples assume a Run:ai Project has been set up using runai config project <project-name>.

                                  Start an interactive Job:

                                  runai submit -i ubuntu --interactive --attach -g 1\n

                                  Or

                                  runai submit --name build1 -i ubuntu -g 1 --interactive -- sleep infinity \n

                                  (see: build Quickstart).

                                  Externalize ports:

                                  runai submit --name build-remote -i rastasheep/ubuntu-sshd:14.04 --interactive \\\n   --service-type=nodeport --port 30022:22 \\\n   -- /usr/sbin/sshd -D\n

                                  (see: build with ports Quickstart).

                                  Start a Training Job

                                  runai submit --name train1 -i runai.jfrog.io/demo/quickstart -g 1 \n

                                  (see: training Quickstart).

                                  Use GPU Fractions

                                  runai submit --name frac05 -i runai.jfrog.io/demo/quickstart -g 0.5\n

                                  (see: GPU fractions Quickstart).

                                  Submit a Job without a name (automatically generates a name)

                                  runai submit -i runai.jfrog.io/demo/quickstart -g 1 \n

                                  Submit a job using the system autogenerated name and expose it via an external URL:

                                  runai submit -i ubuntu --interactive --attach -g 1 --service-type=external-url --port 3745 --custom-url=<destination_url>\n

                                  Submit a job without a name, with a system-generated URL:

                                  runai submit -i ubuntu --interactive --attach -g 1 --service-type=external-url --port 3745\n

                                  Submit a Job without a name, using a pre-defined prefix and an incremental index suffix:

                                  runai submit --job-name-prefix <prefix> -i runai.jfrog.io/demo/quickstart -g 1 \n
                                  "},{"location":"Researcher/cli-reference/runai-submit/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-submit/#job-type","title":"Job Type","text":""},{"location":"Researcher/cli-reference/runai-submit/#-interactive","title":"--interactive","text":"

                                  Mark this Job as interactive.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#job-lifecycle","title":"Job Lifecycle","text":""},{"location":"Researcher/cli-reference/runai-submit/#-completions-int","title":"--completions < int >","text":"

                                  Number of successful pods required for this job to be completed. Used with HPO.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-parallelism-int","title":"--parallelism < int >","text":"

                                  Number of pods to run in parallel at any given time. Used with HPO.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-preemptible","title":"--preemptible","text":"

                                  Interactive preemptible jobs can be scheduled above guaranteed quota but may be reclaimed at any time.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-auto-deletion-time-after-completion","title":"--auto-deletion-time-after-completion","text":"

                                  The timeframe after which a completed or failed job is automatically deleted. Configured in seconds, minutes, or hours (for example 5s, 2m, or 3h). If set to 0, the job will be deleted immediately after completing or failing.
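
                                  For example, a hedged sketch that deletes the Job two minutes after it ends (the name is a placeholder; the image reuses the quickstart image from the examples above):

                                  runai submit --name cleanup-demo -i runai.jfrog.io/demo/quickstart -g 1 --auto-deletion-time-after-completion 2m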

                                  "},{"location":"Researcher/cli-reference/runai-submit/#naming-and-shortcuts","title":"Naming and Shortcuts","text":""},{"location":"Researcher/cli-reference/runai-submit/#-job-name-prefix-string","title":"--job-name-prefix <string>","text":"

                                  The prefix to use to automatically generate a Job name with an incremental index. When a Job name is omitted Run:ai will generate a Job name. The optional --job-name-prefix flag creates Job names with the provided prefix.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-name-string","title":"--name <string>","text":"

                                  The name of the Job.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-template-string","title":"--template <string>","text":"

                                  Load default values from a workload.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#container-definition","title":"Container Definition","text":""},{"location":"Researcher/cli-reference/runai-submit/#-add-capability-stringarray","title":"--add-capability <stringArray>","text":"

                                  Add linux capabilities to the container.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-a-annotation-stringarray","title":"-a | --annotation <stringArray>","text":"

                                  Set annotations in the container.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-attach","title":"--attach","text":"

                                  Default is false. If set to true, wait for the Pod to start running. When the pod starts running, attach to the Pod. The flag is equivalent to the command runai attach.

                                  The --attach flag also sets --tty and --stdin to true.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-command","title":"--command","text":"

                                  Overrides the image's entry point with the command supplied after '--'. When the --command flag is not used, the entry point is not overridden and the string after -- is appended as arguments to the entry point command.

                                  Example:

                                  --command -- run.sh 1 54 will start the container and run run.sh 1 54

                                  -- script.py 10000 will append script.py 10000 as arguments to the entry point command (for example, python)
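
                                  Putting it together, a hedged full command that overrides the ubuntu image's entry point with sleep (the name is a placeholder):

                                  runai submit --name cmd-demo -i ubuntu --command -- sleep 60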

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-create-home-dir","title":"--create-home-dir","text":"

                                  Create a temporary home directory for the user in the container. Data saved in this directory will not be saved when the container exits. For more information see non root containers.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-e-stringarray-environment-stringarray","title":"-e <stringArray> | --environment <stringArray>","text":"

                                  Define environment variables to be set in the container. To set multiple values add the flag multiple times (-e BATCH_SIZE=50 -e LEARNING_RATE=0.2).

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-image-string-i-string","title":"--image <string> | -i <string>","text":"

                                  Image to use when creating the container for this Job

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-image-pull-policy-string","title":"--image-pull-policy <string>","text":"

                                  Pulling policy of the image when starting a container. Options are:

                                  • Always (default): force image pulling to check whether local image already exists. If the image already exists locally and has the same digest, then the image will not be downloaded.
                                  • IfNotPresent: the image is pulled only if it is not already present locally.
                                  • Never: the image is assumed to exist locally. No attempt is made to pull the image.

                                  For more information see Kubernetes documentation.
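
                                  For example (reusing the quickstart image from the examples above; the name is a placeholder):

                                  runai submit --name train-pull -i runai.jfrog.io/demo/quickstart -g 1 --image-pull-policy IfNotPresent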

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-l-label-stringarray","title":"-l | --label <stringArray>","text":"

                                  Set labels in the container.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-preferred-pod-topology-key-string","title":"--preferred-pod-topology-key <string>","text":"

                                  If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-required-pod-topology-key-string","title":"--required-pod-topology-key <string>","text":"

                                  Enforce scheduling pods of this job onto nodes that have a label with this key and identical values.
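
                                  For example, to keep all pods of the job in the same zone, assuming the nodes carry the standard topology.kubernetes.io/zone label (the name is a placeholder):

                                  runai submit --name topo-demo -i ubuntu -g 1 --required-pod-topology-key topology.kubernetes.io/zone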

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-stdin","title":"--stdin","text":"

                                  Keep stdin open for the container(s) in the pod, even if nothing is attached.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-t-tty","title":"-t | --tty","text":"

                                  Allocate a pseudo-TTY.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-working-dir-string","title":"--working-dir <string>","text":"

                                  Starts the container with the specified directory as the current directory.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#resource-allocation","title":"Resource Allocation","text":""},{"location":"Researcher/cli-reference/runai-submit/#-cpu-double","title":"--cpu <double>","text":"

                                  CPU units to allocate for the Job (0.5, 1, etc.). The Job will receive at least this amount of CPU. Note that the Job will not be scheduled unless the system can guarantee this amount of CPU to the Job.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-cpu-limit-double","title":"--cpu-limit <double>","text":"

                                  Limitations on the number of CPUs consumed by the Job (for example 0.5, 1). The system guarantees that this Job will not be able to consume more than this amount of CPUs.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-extended-resource-stringarray","title":"--extended-resource <stringArray>","text":"

                                  Request access to an extended resource. Syntax: <resource-name>=<resource_quantity>.
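
                                  For example, requesting one unit of a hypothetical extended resource exposed by a device plugin (the resource name and workload name are placeholders):

                                  runai submit --name ext-demo -i ubuntu --extended-resource example.com/dongle=1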

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-g-gpu-float","title":"-g | --gpu <float>","text":"

                                  GPU units to allocate for the Job (0.5, 1).

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-gpu-memory","title":"--gpu-memory","text":"

                                  GPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of GPU memory to the Job.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-memory-string","title":"--memory <string>","text":"

                                  CPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive at least this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of memory to the Job.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-memory-limit-string","title":"--memory-limit <string>","text":"

                                  CPU memory to allocate for this Job (1G, 20M, etc.). The system guarantees that this Job will not be able to consume more than this amount of memory. The Job will receive an error when trying to allocate more memory than this limit.
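
                                  Combining the request and limit flags, a hedged sketch (the name is a placeholder and all resource values are arbitrary):

                                  runai submit --name resources-demo -i runai.jfrog.io/demo/quickstart -g 0.5 --cpu 2 --cpu-limit 4 --memory 4G --memory-limit 8G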

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-mig-profile-string-deprecated","title":"--mig-profile <string> (Deprecated)","text":"

                                  MIG profile to allocate for the job (1g.5gb, 2g.10gb, 3g.20gb, 4g.20gb, 7g.40gb)

                                  "},{"location":"Researcher/cli-reference/runai-submit/#job-lifecycle_1","title":"Job Lifecycle","text":""},{"location":"Researcher/cli-reference/runai-submit/#-backoff-limit-int","title":"--backoff-limit <int>","text":"

                                  The number of times the Job will be retried before failing. The default is 6. This flag will only work with training workloads (when the --interactive flag is not specified).

                                  "},{"location":"Researcher/cli-reference/runai-submit/#storage","title":"Storage","text":""},{"location":"Researcher/cli-reference/runai-submit/#-git-sync-stringarray","title":"--git-sync <stringArray>","text":"

                                  Clone a git repository into the container running the Job. The parameter should follow the syntax: source=REPOSITORY,branch=BRANCH_NAME,rev=REVISION,username=USERNAME,password=PASSWORD,target=TARGET_DIRECTORY_TO_CLONE.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-large-shm","title":"--large-shm","text":"

                                  Mount a large /dev/shm device.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-mount-propagation","title":"--mount-propagation","text":"

                                  Enable HostToContainer mount propagation for all container volumes.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-nfs-server-string","title":"--nfs-server <string>","text":"

                                  Use this flag to specify a default NFS host for the --volume flag. Alternatively, you can specify an NFS host for each volume individually (see --volume for details).

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-pvc-storage_class_namesizecontainer_mount_pathro","title":"--pvc [Storage_Class_Name]:Size:Container_Mount_Path:[ro]","text":""},{"location":"Researcher/cli-reference/runai-submit/#-pvc-pvc_namecontainer_mount_pathro","title":"--pvc Pvc_Name:Container_Mount_Path:[ro]","text":"

                                  Mount a persistent volume claim into a container.

                                  Note

                                  This option is deprecated from version 2.10 and above. To mount an existing or newly created Persistent Volume Claim (PVC), use the --pvc-exists and --new-pvc parameters.

                                  The two syntax forms of this flag are mutually exclusive. You can use either the first or the second form, but not a mixture of both.

                                  Storage_Class_Name is a storage class name that can be obtained by running kubectl get storageclasses.storage.k8s.io. This parameter may be omitted if there is a single storage class in the system, or you are using the default storage class.

                                  Size is the volume size you want to allocate. See the Kubernetes documentation for how to specify volume sizes.

                                  Container_Mount_Path. A path internal to the container where the storage will be mounted

                                  Pvc_Name. The name of a pre-existing Persistent Volume Claim to mount into the container

                                  Examples:

                                  --pvc :3Gi:/tmp/john:ro - Allocate 3GB from the default Storage class. Mount it to /tmp/john as read-only

                                  --pvc my-storage:3Gi:/tmp/john:ro - Allocate 3GB from the my-storage storage class. Mount it to /tmp/john as read-only

                                  --pvc :3Gi:/tmp/john - Allocate 3GB from the default storage class. Mount it to /tmp/john as read-write

                                  --pvc my-pvc:/tmp/john - Use a Persistent Volume Claim named my-pvc. Mount it to /tmp/john as read-write

                                  --pvc my-pvc-2:/tmp/john:ro - Use a Persistent Volume Claim named my-pvc-2. Mount it to /tmp/john as read-only

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-pvc-exists-string","title":"--pvc-exists <string>","text":"

                                  Mount a persistent volume. You must include a claimname and path.

                                  • claim name\u2014The name of the persistent volume claim. Can be obtained by running

                                  kubectl get pvc

                                  • path\u2014The path internal to the container where the storage will be mounted

                                  Use the format:

                                  claimname=<CLAIM_NAME>,path=<PATH>

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-new-pvc-stringarray","title":"--new-pvc <stringArray>","text":"

                                  Mount a persistent volume claim (PVC). If the PVC does not exist, it will be created based on the parameters entered. If a PVC exists, it will be used with its defined attributes and the parameters in the command will be ignored.

                                  • claim name\u2014The name of the persistent volume claim.
                                  • storage class\u2014A storage class name that can be obtained by running

                                  kubectl get storageclasses.storage.k8s.io.

                                  storageclass may be omitted if there is a single storage class in the system, or you are using the default storage class.

                                  • size\u2014The volume size you want to allocate for the PVC when creating it. See Kubernetes documentation to specify volume sizes.
                                  • accessmode\u2014The description of the desired volume capabilities for the PVC.
                                  • ro\u2014Mount the PVC with read-only access.
                                  • ephemeral\u2014The PVC will be created as volatile temporary storage which is only present during the running lifetime of the job.

                                  Use the format:

                                  storageclass=<storageclass>,size=<size>,path=<path>,ro,accessmode=rwm

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-s3-string","title":"--s3 <string>","text":"

                                  Mount an S3 compatible storage into the container running the job. The parameter should follow the syntax:

                                  bucket=BUCKET,key=KEY,secret=SECRET,url=URL,target=TARGET_PATH

                                  All the fields, except url=URL, are mandatory. Default for url is

                                  url=https://s3.amazon.com

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-v-volume-sourcecontainer_mount_pathronfs-host","title":"-v | --volume 'Source:Container_Mount_Path:[ro]:[nfs-host]'","text":"

                                  Volumes to mount into the container.

                                  Examples:

                                  -v /raid/public/john/data:/root/data:ro

                                  Mount the local path /raid/public/john/data to /root/data in the container, with read-only access.

                                  -v /public/data:/root/data::nfs.example.com

                                  Mount the NFS path /public/data from server nfs.example.com to /root/data in the container, with read-write access.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#-configmap-volume-namepath","title":"--configmap-volume name=,path= ...'

                                  Mount a ConfigMap object for use as a data volume.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit/#network","title":"Network","text":""},{"location":"Researcher/cli-reference/runai-submit/#-host-ipc","title":"--host-ipc

                                  Use the host's ipc namespace. Controls whether the pod containers can share the host IPC namespace. IPC (POSIX/SysV IPC) namespace provides separation of named shared memory segments, semaphores, and message queues. Shared memory segments are used to accelerate inter-process communication at memory speed, rather than through pipes or the network stack.

                                  For further information see docker run reference documentation.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit/#-host-network","title":"--host-network

                                  Use the host's network stack inside the container. For further information see the docker run reference documentation.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit/#-s-service-type-string","title":"-s | --service-type <string>

                                  External access type to jobs. Options are:

                                  • nodeport - add one or more ports using --port.
                                  • external-url - add one port and an optional custom URL using --custom-url.

                                  For example:

                                  runai submit test-jup -p team-a -i runai.jfrog.io/demo/jupyter-tensorboard --service-type external-url --port 8888

                                  runai submit test-np -p team-a -i ubuntu --service-type nodeport --port 30000:7070

                                  This flag can be specified more than once to support multiple service types. Multiple service types are provided in CSV style, using multiple instances of the option with commas separating the values.

                                  For example:

                                  runai submit test-np -p team-a -i ubuntu --service-type nodeport,port=30000:7070 --service-type external-url,port=30001

                                  runai submit test-np -p team-a -i ubuntu --service-type nodeport,port=30000:7070,port=9090 --service-type external-url,port=8080,custom-url=https://my.domain.com/url

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit/#-port-stringarray","title":"--port <stringArray>

                                  Expose ports from the Job container. You can use a port number (for example, 9090) or the form hostport:containerport (for example, 30000:7070).

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit/#-custom-url-string","title":"--custom-url <string>

                                  An optional argument that specifies a custom URL when using the external-url service type. If not provided, the system will generate a URL automatically.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit/#access-control","title":"Access Control","text":""},{"location":"Researcher/cli-reference/runai-submit/#-allow-privilege-escalation","title":"--allow-privilege-escalation

                                  Allow the job to gain additional privileges after start.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit/#-run-as-user","title":"--run-as-user

                                  Run in the context of the current user running the Run:ai command rather than the root user. While the default container user is root (same as in Docker), this command allows you to submit a Job running under your Linux user. This would manifest itself in access to operating system resources, in the owner of new folders created under shared directories, etc. Alternatively, if your cluster is connected to Run:ai via SAML, you can map the container to use the Linux UID/GID which is stored in the organization's directory. For more information see non root containers.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit/#scheduling","title":"Scheduling","text":""},{"location":"Researcher/cli-reference/runai-submit/#-node-pools-string","title":"--node-pools <string>

                                  Instructs the scheduler to run this workload using a specific set of nodes that are part of a Node Pool. You can specify one or more node pools to form a prioritized list of node pools that the scheduler will use to find a node pool that can satisfy the workload's specification. To use this feature, your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group, or use existing node labels, then create a node-pool and assign the label to the node-pool. This flag can be used in conjunction with node-type and Project-based affinity. In this case, the flag is used to refine the list of allowable node groups set from a node-pool. For more information see: Working with Projects.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit/#-node-type-string","title":"--node-type <string>

                                  Allows defining specific Nodes (machines) or a group of Nodes on which the workload will run. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit/#-toleration-string","title":"--toleration <string>

                                  Specify one or more toleration criteria, to ensure that the workload is not scheduled onto an inappropriate node. This is done by matching the workload tolerations to the taints defined for each node. For further details see Kubernetes Taints and Tolerations Guide.

                                  The format of the string:

                                  operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n
                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-submit/#-loglevel-string","title":"--loglevel (string)

                                  Set the logging level. One of: debug | info | warn | error (default \"info\")

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit/#-project-p-string","title":"--project | -p (string)

                                  Specify the Project to which the command applies. Run:ai Projects are used by the scheduler to calculate resource eligibility. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit/#-help-h","title":"--help | -h

                                  Show help text.

                                  ","text":""},{"location":"Researcher/cli-reference/runai-submit/#output","title":"Output","text":"

                                  The command will attempt to submit a Job. You can follow up on the Job by running runai list jobs or runai describe job <job-name>.

                                  Note that the submit call may use a policy to provide defaults to any of the above flags.

                                  "},{"location":"Researcher/cli-reference/runai-submit/#see-also","title":"See Also","text":"
                                  • See any of the Quickstart documents here.
                                  • See policy configuration for a description on how policies work.
                                  "},{"location":"Researcher/cli-reference/runai-suspend/","title":"runai suspend","text":""},{"location":"Researcher/cli-reference/runai-suspend/#description","title":"Description","text":"

                                  Suspend a Job

                                  Suspending a Running Job will stop the Job and prevent it from being scheduled until it is resumed using runai resume. This means that:

                                  • You will no longer be able to enter it via runai bash.
                                  • The Job logs will be deleted.
                                  • Any data saved on the container and not stored in a shared location will be lost.

                                  Technically, the command deletes the Kubernetes pods associated with the Job and marks the Job as suspended until it is manually released.

                                  Suspend and resume do not work with MPI and Inference jobs.

                                  "},{"location":"Researcher/cli-reference/runai-suspend/#synopsis","title":"Synopsis","text":"
                                  runai suspend <job-name>\n    [--all | -A]\n\n    [--loglevel value]\n    [--project string | -p string]\n    [--help | -h]\n
                                  "},{"location":"Researcher/cli-reference/runai-suspend/#options","title":"Options","text":"

                                  <job-name> - The name of the Job to run the command with. Mandatory.
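
                                  For example (the job name and project are placeholders):

                                  runai suspend train1 -p team-a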

                                  "},{"location":"Researcher/cli-reference/runai-suspend/#-all-a","title":"--all | -A","text":"

                                  Suspend all Jobs in the current Project.

                                  "},{"location":"Researcher/cli-reference/runai-suspend/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-suspend/#-loglevel-string","title":"--loglevel (string)","text":"

                                  Set the logging level. One of: debug | info | warn | error (default \"info\").

                                  "},{"location":"Researcher/cli-reference/runai-suspend/#-project-p-string","title":"--project | -p (string)","text":"

                                  Specify the Project to which the command applies. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

                                  "},{"location":"Researcher/cli-reference/runai-suspend/#-help-h","title":"--help | -h","text":"

                                  Show help text.

                                  "},{"location":"Researcher/cli-reference/runai-suspend/#output","title":"Output","text":"
                                  • The Job will be suspended. When running runai list jobs the Job will be marked as Suspended.
                                  "},{"location":"Researcher/cli-reference/runai-suspend/#see-also","title":"See Also","text":"
                                  • Resuming Jobs: Resume.
                                  "},{"location":"Researcher/cli-reference/runai-top-node/","title":"runai top node","text":""},{"location":"Researcher/cli-reference/runai-top-node/#description","title":"Description","text":"

                                  Show a list of Nodes (machines), their capacity, and their utilization.

                                  "},{"location":"Researcher/cli-reference/runai-top-node/#synopsis","title":"Synopsis","text":"
                                  runai top node \n    [--help | -h]\n    [--details | -d]\n
                                  "},{"location":"Researcher/cli-reference/runai-top-node/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-top-node/#global-flags","title":"Global Flags","text":""},{"location":"Researcher/cli-reference/runai-top-node/#-loglevel-string","title":"--loglevel (string)","text":"

                                  Set the logging level. One of: debug | info | warn | error (default \"info\").

                                  "},{"location":"Researcher/cli-reference/runai-top-node/#-help-h","title":"--help | -h","text":"

                                  Show help text.

                                  "},{"location":"Researcher/cli-reference/runai-top-node/#-details-d","title":"--details | -d","text":"

                                  Show additional details.

                                  "},{"location":"Researcher/cli-reference/runai-top-node/#output","title":"Output","text":"

                                  Shows a list of Nodes, their capacity, and their utilization.

                                  "},{"location":"Researcher/cli-reference/runai-top-node/#see-also","title":"See Also","text":""},{"location":"Researcher/cli-reference/runai-update/","title":"runai update","text":""},{"location":"Researcher/cli-reference/runai-update/#description","title":"Description","text":"

                                  Find and install the latest version of the runai command-line utility. The command must be run with sudo permissions.

                                  sudo runai update\n
                                  "},{"location":"Researcher/cli-reference/runai-update/#synopsis","title":"Synopsis","text":"
                                  runai update \n    [--loglevel value] \n    [--help | -h]\n
                                  "},{"location":"Researcher/cli-reference/runai-update/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-update/#-loglevel-string","title":"--loglevel (string)","text":"

                                  Set the logging level. One of: debug | info | warn | error (default \"info\").

                                  "},{"location":"Researcher/cli-reference/runai-update/#-help-h","title":"--help | -h","text":"

                                  Show help text.

                                  "},{"location":"Researcher/cli-reference/runai-update/#output","title":"Output","text":"

                                  Update of the Run:ai command-line interface.

                                  "},{"location":"Researcher/cli-reference/runai-update/#see-also","title":"See Also","text":""},{"location":"Researcher/cli-reference/runai-version/","title":"runai version","text":""},{"location":"Researcher/cli-reference/runai-version/#description","title":"Description","text":"

                                  Show the version of this utility.

                                  "},{"location":"Researcher/cli-reference/runai-version/#synopsis","title":"Synopsis","text":"
                                  runai version \n    [--loglevel value] \n    [--help | -h]\n
                                  "},{"location":"Researcher/cli-reference/runai-version/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-version/#-loglevel-string","title":"--loglevel (string)","text":"

                                  Set the logging level. One of: debug | info | warn | error (default \"info\").

                                  "},{"location":"Researcher/cli-reference/runai-version/#-help-h","title":"--help | -h","text":"

                                  Show help text.

                                  "},{"location":"Researcher/cli-reference/runai-version/#output","title":"Output","text":"

                                  The version of the Run:ai command-line interface.

                                  "},{"location":"Researcher/cli-reference/runai-version/#see-also","title":"See Also","text":""},{"location":"Researcher/cli-reference/runai-whoami/","title":"runai whoami","text":""},{"location":"Researcher/cli-reference/runai-whoami/#description","title":"Description","text":"

                                  Show the name of the user currently logged in.

                                  "},{"location":"Researcher/cli-reference/runai-whoami/#synopsis","title":"Synopsis","text":"
                                  runai whoami \n    [--loglevel value] \n    [--help | -h]\n
                                  "},{"location":"Researcher/cli-reference/runai-whoami/#options","title":"Options","text":""},{"location":"Researcher/cli-reference/runai-whoami/#-loglevel-string","title":"--loglevel (string)","text":"

                                  Set the logging level. One of: debug | info | warn | error (default \"info\").

                                  "},{"location":"Researcher/cli-reference/runai-whoami/#-help-h","title":"--help | -h","text":"

                                  Show help text.

                                  "},{"location":"Researcher/cli-reference/runai-whoami/#output","title":"Output","text":"

                                  The name of the User currently logged in with the Run:ai command-line interface.

                                  "},{"location":"Researcher/cli-reference/runai-whoami/#see-also","title":"See Also","text":""},{"location":"Researcher/cli-reference/new-cli/cli-examples/","title":"CLI Examples","text":"

                                  This article provides examples of popular use cases illustrating how to use the Command Line Interface (CLI).

                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#logging-in","title":"Logging in","text":""},{"location":"Researcher/cli-reference/new-cli/cli-examples/#logging-in-via-runai-sign-in-page-web","title":"Logging in via run:ai sign in page (web)","text":"

                                  You can log in from the UI if you are using SSO or credentials.

                                  runai login\n

                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#logging-in-via-terminal-credentials","title":"Logging in via terminal (credentials)","text":"
                                  runai login user -u john@acme.com -p \"password\"\n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#configuration","title":"Configuration","text":""},{"location":"Researcher/cli-reference/new-cli/cli-examples/#setting-a-default-project","title":"Setting a default project","text":"
                                  runai project set \"project-name\"\n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#submitting-a-workload","title":"Submitting a workload","text":""},{"location":"Researcher/cli-reference/new-cli/cli-examples/#naming-a-workload","title":"Naming a workload","text":"

                                  Use the commands below to provide a name for a workload.

                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#setting-a-the-workload-name-my_workload_name","title":"Setting a the workload name ( my_workload_name)","text":"
                                  runai workspace submit my-workload-name -p test -i ubuntu \n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#setting-a-random-name-with-prefix-prefixworkload-type","title":"Setting a random name with prefix (prefix=workload type)","text":"
                                      runai workspace submit -p test -i ubuntu \n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#setting-a-random-name-with-specific-prefix-prefix-determined-by-flag","title":"Setting a random name with specific prefix (prefix determined by flag)","text":"
                                  runai workspace submit --prefix-name my-prefix-workload-name -p test -i ubuntu \n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#labels-and-annotations","title":"Labels and annotations","text":""},{"location":"Researcher/cli-reference/new-cli/cli-examples/#labels","title":"Labels","text":"
                                  runai workspace submit -p test -i ubuntu --label name=value --label name2=value2\n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#annotations","title":"Annotations","text":"
                                  runai workspace submit -p test -i ubuntu --annotation name=value --annotation name2=value2\n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#containers-environment-variables","title":"Container's environment variables","text":"
                                  runai workspace submit -p test -i ubuntu -e name=value -e name2=value2\n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#requests-and-limits","title":"Requests and limits","text":"
                                  runai workspace submit  -p \"project-name\" -i runai.jfrog.io/demo/quickstart-demo   --cpu-core-request 0.3 --cpu-core-limit 1 --cpu-memory-request 50M --cpu-memory-limit 1G  --gpu-devices-request 1 --gpu-memory-request 1G\n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#submitting-and-attaching-to-process","title":"Submitting and attaching to process","text":"
                                  runai workspace submit  -p \"project-name\" -i python  --attach -- python3\n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#submitting-a-jupyter-notebook","title":"Submitting a jupyter notebook","text":"
                                  runai workspace submit --image jupyter/scipy-notebook -p \"project-name\" --gpu-devices-request 1 --external-url container=8888 --name-prefix jupyter --command -- start-notebook.sh --NotebookApp.base_url='/${RUNAI_PROJECT}/${RUNAI_JOB_NAME}' --NotebookApp.token=''\n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#submitting-distributed-training-workload-with-tensorflow","title":"Submitting distributed training workload with TensorFlow","text":"
                                  runai distributed submit -f TF --workers=5 --no-master -g 1 -i kubeflow/tf-mnist-with-summaries:latest -p \"project-name\" --command -- python /var/tf_mnist/mnist_with_summaries.py --max_steps 1000000\n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#submitting-a-multi-pod-workload","title":"Submitting a multi-pod workload","text":"
                                  runai training submit  -i alpine -p test --parallelism 2 --completions 2  -- sleep 100000\n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#submit-and-bash","title":"Submit and bash","text":""},{"location":"Researcher/cli-reference/new-cli/cli-examples/#submitting-a-workload-with-bash-command","title":"Submitting a workload with bash command","text":"
                                  runai training pytorch submit  -p \"project-name\" -i nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04 -g 1 --workers 3 --command -- bash -c 'trap : TERM INT; sleep infinity & wait'\n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#bashing-into-the-workload","title":"Bashing into the workload","text":"
                                  runai training pytorch bash pytorch-06027b585626 -p \"project-name\"\n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#submitting-distributed-training-workload-with-mpi","title":"Submitting distributed training workload with MPI","text":"
                                  runai  mpi submit dist1 --workers=2 -g 1 \\\n    -i runai.jfrog.io/demo/quickstart-distributed:v0.3.0 -e RUNAI_SLEEP_SECS=60 -p \"project-name\"\n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#submitting-with-pvc","title":"Submitting with PVC","text":""},{"location":"Researcher/cli-reference/new-cli/cli-examples/#new-pvc-bounded-to-the-workspace","title":"New PVC bounded to the workspace","text":"

                                  New PVCs will be deleted when the workload is deleted

                                  runai workspace submit -i ubuntu --new-pvc claimname=yuval-3,size=10M,path=/tmp/test\n

                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#new-ephemeral-pvc","title":"New ephemeral PVC","text":"

                                  New ephemeral PVCs will be deleted when the workload is deleted or paused

                                  runai workspace submit -i ubuntu --new-pvc claimname=yuval2,size=10M,path=/tmp/test,ephemeral\n

                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#existing-pvc","title":"Existing PVC","text":"

                                  Existing PVCs will not be deleted when the workload is deleted

                                  runai workspace submit -i ubuntu --existing-pvc claimname=test-pvc-2-project-mn2xs,path=/home/test\n

                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#masterworker-configuration","title":"Master/Worker configuration","text":"

                                  The --command flag and -- set the command and arguments for both the leader (master) and the workers

                                  The --master-args flag sets the leader (master) arguments

                                  The --master-command flag sets the leader (master) command with its arguments

                                  The --master-args and --master-command flags can be set together

                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#overriding-both-the-leader-master-and-worker-images-arguments","title":"Overriding both the leader (master) and worker image's arguments","text":"
                                  runai pytorch submit -i ubuntu -- -a argument_a -b argument_b -c\n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#overriding-both-the-leader-master-and-worker-images-commands-with-arguments","title":"Overriding both the leader (master) and worker image's commands with arguments","text":"
                                  runai pytorch submit -i ubuntu --command -- python -m pip install\n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#overriding-arguments-of-the-leader-master-and-worker-images-arguments-with-different-values","title":"Overriding arguments of the leader (master) and worker image's arguments with different values","text":"
                                  runai pytorch submit -i ubuntu --master-args \"-a master_arg_a -b master_arg_b\" -- '-a worker_arg_a'\n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#overriding-command-with-arguments-of-the-leader-master-and-worker-images-arguments","title":"Overriding command with arguments of the leader (master) and worker image's arguments","text":"
                                  runai pytorch submit -i ubuntu --master-command \"python_master -m pip install\" --command -- 'python_worker -m pip install'\n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#listing-objects","title":"Listing objects","text":""},{"location":"Researcher/cli-reference/new-cli/cli-examples/#listing-all-workloads-in-the-users-scope","title":"Listing all workloads in the user's scope","text":"
                                  runai workload list -A\n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#listing-projects-in-a-yaml-format","title":"Listing projects in a YAML format","text":"
                                  runai project list --yaml\n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#listing-nodes-in-a-json-format","title":"Listing nodes in a JSON format","text":"
                                  runai node list --json\n
                                  "},{"location":"Researcher/cli-reference/new-cli/cli-examples/#cli-reference","title":"CLI reference","text":"

                                  For the full CLI syntax guide, see the CLI reference

                                  "},{"location":"Researcher/cli-reference/new-cli/overview/","title":"Run:ai V2 Command-line Interface","text":"

                                  The Run:ai Command-line Interface (CLI) is a tool for Researchers to submit deep learning workloads, acquire GPU-based containers, list jobs, and access other features of the Run:ai platform.
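
                                  As an illustration, a typical session might look like the sketch below; it assumes an existing project named project-name and uses only commands shown elsewhere in this documentation.

                                  # Authenticate against the control plane\nrunai login\n\n# Submit a workspace and attach to its main process\nrunai workspace submit -p \"project-name\" -i python --attach -- python3\n\n# List all workloads visible to the current user\nrunai workload list -A\n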

                                  "},{"location":"Researcher/cli-reference/new-cli/overview/#the-new-v2-command-line-interface","title":"The new V2 Command-line interface","text":"

                                  This command-line interface is a complete revamp of the previous (V1) command-line interface. A few highlights:

                                  • The CLI internally uses the Control-plane API. This provides a single point of view on Workloads, removing dissimilarities between the user interface, the programming interface, and the command-line interface.
                                  • As such, it also removes the need to configure the Kubernetes API server for authentication.
                                  • The CLI is only available for Run:ai cluster version 2.18 and up.
                                  • The new V2 CLI is backward compatible with the older V1 CLI.
                                  "},{"location":"Researcher/cli-reference/new-cli/overview/#installing-the-improved-command-line-interface","title":"Installing the Improved Command Line Interface","text":"

                                  See installation instructions here.

                                  "},{"location":"Researcher/cli-reference/new-cli/overview/#reference","title":"Reference","text":"

                                  A list of all commands can be found here

                                  "},{"location":"Researcher/cli-reference/new-cli/runai/","title":"CLI Reference","text":""},{"location":"Researcher/cli-reference/new-cli/runai/#runai","title":"runai","text":"

                                  Run:ai Command-line Interface

                                  "},{"location":"Researcher/cli-reference/new-cli/runai/#synopsis","title":"Synopsis","text":"

                                  runai - The Run:ai Researcher Command Line Interface

                                  Description: A tool for managing Run:ai workloads and monitoring available resources. It provides researchers with comprehensive control over their AI development environment.

                                  runai [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai/#options","title":"Options","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -h, --help                 help for runai\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
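
                                  For example, the configuration location can be overridden for a single invocation through the environment variables listed above; the directory and file names below are placeholders.

                                  # Point the CLI at an alternative configuration directory and file\nRUNAI_CLI_CONFIG_PATH=~/.runai-staging RUNAI_CLI_CONFIG_FILE=staging.json runai project list\n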
                                  "},{"location":"Researcher/cli-reference/new-cli/runai/#see-also","title":"SEE ALSO","text":"
                                  • runai cluster - cluster management
                                  • runai config - configuration management
                                  • runai kubeconfig - kubeconfig management
                                  • runai login - login to the control plane
                                  • runai logout - logout from control plane
                                  • runai mpi - alias for mpi management
                                  • runai node - node management
                                  • runai nodepool - node pool management
                                  • runai project - project management
                                  • runai pytorch - alias for pytorch management
                                  • runai report - [Experimental] report management
                                  • runai tensorflow - alias for tensorflow management
                                  • runai training - training management
                                  • runai upgrade - upgrades the CLI to the latest version
                                  • runai version - show the current version of the CLI
                                  • runai whoami - show the current logged in user
                                  • runai workload - workload management
                                  • runai workspace - workspace management
                                  • runai xgboost - alias for xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_attach/","title":"Runai attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_attach/#runai-attach","title":"runai attach","text":"

                                  [Deprecated] attach

                                  runai attach WORKLOAD_NAME [flags]\n
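
                                  Although deprecated, the command can still be used as in the following sketch, which attaches to a running workspace; the workload name my-workspace and the project name are placeholders.

                                  runai attach my-workspace --type workspace --tty --stdin -p \"project-name\"\n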
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_attach/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --type string                    The type of the workload (training, workspace)\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_attach/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_cluster/","title":"Runai cluster","text":""},{"location":"Researcher/cli-reference/new-cli/runai_cluster/#runai-cluster","title":"runai cluster","text":"

                                  cluster management

                                  runai cluster [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_cluster/#options","title":"Options","text":"
                                    -h, --help                 help for cluster\n      --interactive enable   set interactive mode (enabled|disabled)\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_cluster/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_cluster/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  • runai cluster list - cluster list command
                                  • runai cluster set - set cluster context
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_cluster_list/","title":"Runai cluster list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_cluster_list/#runai-cluster-list","title":"runai cluster list","text":"

                                  cluster list command

                                  runai cluster list [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_cluster_list/#options","title":"Options","text":"
                                    -h, --help         help for list\n      --json         Output structure JSON\n      --no-headers   Output structure table without headers\n      --table        Output structure table\n      --yaml         Output structure YAML\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_cluster_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_cluster_list/#see-also","title":"SEE ALSO","text":"
                                  • runai cluster - cluster management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_cluster_set/","title":"Runai cluster set","text":""},{"location":"Researcher/cli-reference/new-cli/runai_cluster_set/#runai-cluster-set","title":"runai cluster set","text":"

                                  set cluster context

                                  runai cluster set [CLUSTER_NAME] [flags]\n
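
                                  For example (the cluster name and cluster ID below are placeholders):

                                  # Set the cluster context by name\nrunai cluster set my-cluster\n\n# Set the cluster context by ID\nrunai cluster set --id <cluster-id>\n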
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_cluster_set/#options","title":"Options","text":"
                                    -h, --help        help for set\n      --id string   set by cluster ID\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_cluster_set/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_cluster_set/#see-also","title":"SEE ALSO","text":"
                                  • runai cluster - cluster management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_config/","title":"Runai config","text":""},{"location":"Researcher/cli-reference/new-cli/runai_config/#runai-config","title":"runai config","text":"

                                  configuration management

                                  runai config [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_config/#options","title":"Options","text":"
                                    -h, --help                 help for config\n      --interactive enable   set interactive mode (enabled|disabled)\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_config/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_config/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  • runai config generate - generate config file
                                  • runai config set - Set configuration values
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_config_generate/","title":"Runai config generate","text":""},{"location":"Researcher/cli-reference/new-cli/runai_config_generate/#runai-config-generate","title":"runai config generate","text":"

                                  generate config file

                                  runai config generate [flags]\n
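
                                  For example (the file name below is a placeholder):

                                  # Print the generated configuration as YAML\nrunai config generate --yaml\n\n# Write the generated configuration to a file\nrunai config generate --file config.json\n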
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_config_generate/#options","title":"Options","text":"
                                        --file string   Output structure to file\n  -h, --help          help for generate\n      --json          Output structure JSON\n      --yaml          Output structure YAML\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_config_generate/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_config_generate/#see-also","title":"SEE ALSO","text":"
                                  • runai config - configuration management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_config_project/","title":"Runai config project","text":""},{"location":"Researcher/cli-reference/new-cli/runai_config_project/#runai-config-project","title":"runai config project","text":"

                                  [Deprecated] Configure a default project

                                  runai config project PROJECT_NAME [flags]\n
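
                                  Although deprecated, the command takes the project name as a positional argument; for example (project-name is a placeholder):

                                  runai config project \"project-name\"\n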
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_config_project/#options","title":"Options","text":"
                                    -h, --help   help for project\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_config_project/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_config_project/#see-also","title":"SEE ALSO","text":"
                                  • runai config - configuration management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_config_set/","title":"Runai config set","text":""},{"location":"Researcher/cli-reference/new-cli/runai_config_set/#runai-config-set","title":"runai config set","text":"

                                  Set configuration values

                                  runai config set [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_config_set/#examples","title":"Examples","text":"
                                  runai config set --status-timeout-duration 5s\nrunai config set --status-timeout-duration 300ms\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_config_set/#options","title":"Options","text":"
                                        --auth-url string                  set the authorization URL; most likely the same as the control plane URL\n      --cp-url string                    set the control plane URL\n  -h, --help                             help for set\n      --interactive enable               set interactive mode (enabled|disabled)\n      --output string                    set the default output type\n      --status-timeout-duration string   set cluster status call timeout duration value, the default is 3 second (\"3s\")\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_config_set/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_config_set/#see-also","title":"SEE ALSO","text":"
                                  • runai config - configuration management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_describe/","title":"Runai describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_describe/#runai-describe","title":"runai describe","text":"

                                  [Deprecated] Display detailed information about resources

                                  "},{"location":"Researcher/cli-reference/new-cli/runai_describe/#options","title":"Options","text":"
                                    -h, --help   help for describe\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_describe/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  • runai describe job - [Deprecated] Display details of a job
                                  • runai describe node - [Deprecated] Display detailed information about nodes in the cluster
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_describe_job/","title":"Runai describe job","text":""},{"location":"Researcher/cli-reference/new-cli/runai_describe_job/#runai-describe-job","title":"runai describe job","text":"

                                  [Deprecated] Display details of a job

                                  runai describe job JOB_NAME [flags]\n
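
                                  For example, the following sketch describes a training workload; the job and project names are placeholders.

                                  runai describe job job-01 -p \"project-name\" --type training\n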
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_describe_job/#options","title":"Options","text":"
                                    -h, --help             help for job\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --type string      The type of the workload (training, workspace)\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_describe_job/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_describe_job/#see-also","title":"SEE ALSO","text":"
                                  • runai describe - [Deprecated] Display detailed information about resources
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_describe_node/","title":"Runai describe node","text":""},{"location":"Researcher/cli-reference/new-cli/runai_describe_node/#runai-describe-node","title":"runai describe node","text":"

                                  [Deprecated] Display detailed information about nodes in the cluster

                                  runai describe node [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_describe_node/#options","title":"Options","text":"
                                    -h, --help   help for node\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_describe_node/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_describe_node/#see-also","title":"SEE ALSO","text":"
                                  • runai describe - [Deprecated] Display detailed information about resources
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_exec/","title":"Runai exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_exec/#runai-exec","title":"runai exec","text":"

                                  [Deprecated] exec

                                  runai exec WORKLOAD_NAME [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_exec/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --type string                    The type of the workload (training, workspace)\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_exec/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_kubeconfig/","title":"Runai kubeconfig","text":""},{"location":"Researcher/cli-reference/new-cli/runai_kubeconfig/#runai-kubeconfig","title":"runai kubeconfig","text":"

                                  kubeconfig management

                                  "},{"location":"Researcher/cli-reference/new-cli/runai_kubeconfig/#options","title":"Options","text":"
                                    -h, --help   help for kubeconfig\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_kubeconfig/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_kubeconfig/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  • runai kubeconfig set - kubeconfig set login token
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_kubeconfig_set/","title":"Runai kubeconfig set","text":""},{"location":"Researcher/cli-reference/new-cli/runai_kubeconfig_set/#runai-kubeconfig-set","title":"runai kubeconfig set","text":"

                                  kubeconfig set login token

                                  runai kubeconfig set [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_kubeconfig_set/#options","title":"Options","text":"
                                    -h, --help   help for set\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_kubeconfig_set/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_kubeconfig_set/#see-also","title":"SEE ALSO","text":"
                                  • runai kubeconfig - kubeconfig management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_list/","title":"Runai list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_list/#runai-list","title":"runai list","text":"

                                  [Deprecated] display resource list. By default displays the job list

                                  runai list [flags]\n
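
                                  For example (the project name below is a placeholder):

                                  # List jobs in the default project\nrunai list\n\n# List jobs across all projects\nrunai list -A\n\n# List jobs in a specific project\nrunai list -p \"project-name\"\n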
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_list/#options","title":"Options","text":"
                                    -A, --all-projects     list workloads from all projects\n  -h, --help             help for list\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_list/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  • runai list clusters - [Deprecated] list all available clusters
                                  • runai list jobs - [Deprecated] list all jobs
                                  • runai list nodes - [Deprecated] list all nodes
                                  • runai list projects - [Deprecated] list all available projects
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_list_clusters/","title":"Runai list clusters","text":""},{"location":"Researcher/cli-reference/new-cli/runai_list_clusters/#runai-list-clusters","title":"runai list clusters","text":"

                                  [Deprecated] list all available clusters

                                  runai list clusters [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_list_clusters/#options","title":"Options","text":"
                                    -h, --help         help for clusters\n      --json         Output structure JSON\n      --no-headers   Output structure table without headers\n      --table        Output structure table\n      --yaml         Output structure YAML\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_list_clusters/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_list_clusters/#see-also","title":"SEE ALSO","text":"
                                  • runai list - [Deprecated] display resource list. By default displays the job list
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_list_jobs/","title":"Runai list jobs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_list_jobs/#runai-list-jobs","title":"runai list jobs","text":"

                                  [Deprecated] list all jobs

                                  runai list jobs [flags]\n
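
                                  For example, to list the jobs of a specific project in JSON format (the project name is a placeholder):

                                  runai list jobs -p \"project-name\" --json\n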
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_list_jobs/#options","title":"Options","text":"
                                    -A, --all-projects     list workloads from all projects\n  -h, --help             help for jobs\n      --json             Output structure JSON\n      --no-headers       Output structure table without headers\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --table            Output structure table\n      --yaml             Output structure YAML\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_list_jobs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_list_jobs/#see-also","title":"SEE ALSO","text":"
                                  • runai list - [Deprecated] display resource list. By default displays the job list
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_list_nodes/","title":"Runai list nodes","text":""},{"location":"Researcher/cli-reference/new-cli/runai_list_nodes/#runai-list-nodes","title":"runai list nodes","text":"

                                  [Deprecated] list all nodes

                                  runai list nodes [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_list_nodes/#options","title":"Options","text":"
                                    -h, --help         help for nodes\n      --json         Output structure JSON\n      --no-headers   Output structure table without headers\n      --table        Output structure table\n      --yaml         Output structure YAML\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_list_nodes/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_list_nodes/#see-also","title":"SEE ALSO","text":"
                                  • runai list - [Deprecated] display resource list. By default displays the job list
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_list_projects/","title":"Runai list projects","text":""},{"location":"Researcher/cli-reference/new-cli/runai_list_projects/#runai-list-projects","title":"runai list projects","text":"

                                  [Deprecated] list all available projects

                                  runai list projects [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_list_projects/#options","title":"Options","text":"
                                    -h, --help         help for projects\n      --json         Output structure JSON\n      --no-headers   Output structure table without headers\n      --table        Output structure table\n      --yaml         Output structure YAML\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_list_projects/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_list_projects/#see-also","title":"SEE ALSO","text":"
                                  • runai list - [Deprecated] display resource list. By default displays the job list
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_login/","title":"Runai login","text":""},{"location":"Researcher/cli-reference/new-cli/runai_login/#runai-login","title":"runai login","text":"

                                  login to the control plane

                                  runai login [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_login/#examples","title":"Examples","text":"
                                    # Login using browser\n  runai login\n\n  # Login using SSO with remote browser\n  runai login sso\n  runai login remote-browser\n\n  # Login using username and password without browser\n  runai login user -u <username> \n\n  # Login using browser with specific port and host\n  runai login --listen-port=43121 --listen-host=localhost\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_login/#options","title":"Options","text":"
                                    -h, --help                 help for login\n      --listen-host string   the host to listen on for the authentication callback (for browser mode only) (default \"localhost\")\n      --listen-port int      the port to listen on for the authentication callback (for browser mode only) (default 43121)\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_login/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_login/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  • runai login application - login as an application
                                  • runai login sso - login using sso without browser
                                  • runai login user - login for local user without browser
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_login_application/","title":"Runai login application","text":""},{"location":"Researcher/cli-reference/new-cli/runai_login_application/#runai-login-application","title":"runai login application","text":"

                                  login as an application

                                  runai login application [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_login_application/#examples","title":"Examples","text":"
                                    # Login interactive using application credentials\n  runai login app\n\n  # Login using application credentials\n  login app --name=<app_name> --secret=<app_secret> --interactive=disabled\n\n  # Login and Save application credentials\n  login app --name=<app_name> --secret=<app_secret> --interactive=disabled --save\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_login_application/#options","title":"Options","text":"
                                    -h, --help                 help for application\n      --interactive enable   set interactive mode (enabled|disabled)\n      --name string          application name\n      --save                 save application credentials in config file\n      --secret string        application secret\n      --secret-file string   use application secret from file\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_login_application/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_login_application/#see-also","title":"SEE ALSO","text":"
                                  • runai login - login to the control plane
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_login_sso/","title":"Runai login sso","text":""},{"location":"Researcher/cli-reference/new-cli/runai_login_sso/#runai-login-sso","title":"runai login sso","text":"

                                  login using sso without browser

                                  runai login sso [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_login_sso/#options","title":"Options","text":"
                                    -h, --help   help for sso\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_login_sso/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_login_sso/#see-also","title":"SEE ALSO","text":"
                                  • runai login - login to the control plane
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_login_user/","title":"Runai login user","text":""},{"location":"Researcher/cli-reference/new-cli/runai_login_user/#runai-login-user","title":"runai login user","text":"

                                  login for local user without browser

                                  "},{"location":"Researcher/cli-reference/new-cli/runai_login_user/#synopsis","title":"Synopsis","text":"

                                  Login to the control plane using a local user without browser

                                  runai login user [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_login_user/#examples","title":"Examples","text":"
                                  # Login with a username. the password will be prompted via stdin afterward (recommended)\nrunai login user -u <username>\n\n# Login with a username and plain password (not recommended for security reasons)\nrunai login user --user=user --password=pass\n\n# Login with a username and password (not recommended for security reasons)\nrunai login user -u=user -p=pass\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_login_user/#options","title":"Options","text":"
                                    -h, --help              help for user\n  -p, --password string   plaintext password of the given username. not recommended for security reasons\n  -u, --user string       the username to login with\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_login_user/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_login_user/#see-also","title":"SEE ALSO","text":"
                                  • runai login - login to the control plane
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_logout/","title":"Runai logout","text":""},{"location":"Researcher/cli-reference/new-cli/runai_logout/#runai-logout","title":"runai logout","text":"

                                  logout from control plane

                                  runai logout [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_logout/#options","title":"Options","text":"
                                    -h, --help   help for logout\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_logout/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_logout/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_logs/","title":"Runai logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_logs/#runai-logs","title":"runai logs","text":"

                                  [Deprecated] logs

                                  runai logs WORKLOAD_NAME [flags]\n
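
                                  For example, the following sketch streams logs from a workload; the workload and project names are placeholders.

                                  # Stream the last 100 log lines and follow new output\nrunai logs my-workload -p \"project-name\" --tail 100 --follow\n\n# Show logs from the last 30 minutes with timestamps\nrunai logs my-workload -p \"project-name\" --since 30m --timestamps\n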
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_logs/#options","title":"Options","text":"
                                    -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --type string             The type of the workload (training, workspace)\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_logs/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi/","title":"Runai mpi","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi/#runai-mpi","title":"runai mpi","text":"

                                  alias for mpi management

                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi/#options","title":"Options","text":"
                                    -h, --help   help for mpi\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  • runai mpi attach - attach to a running container in a mpi training job
                                  • runai mpi bash - open a bash shell in a mpi training job
                                  • runai mpi delete - delete mpi training workload
                                  • runai mpi describe - describe mpi training
                                  • runai mpi exec - execute a command in a mpi training job
                                  • runai mpi list - list mpi training
                                  • runai mpi logs - view logs of a mpi training job
                                  • runai mpi port-forward - forward one or more local ports to a mpi training job
                                  • runai mpi resume - resume mpi training
                                  • runai mpi submit - submit mpi training
                                  • runai mpi suspend - suspend mpi training
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_attach/","title":"Runai mpi attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_attach/#runai-mpi-attach","title":"runai mpi attach","text":"

                                  attach to a running container in a mpi training job

                                  runai mpi attach [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_attach/#examples","title":"Examples","text":"
                                  # Attaching to the main worker of a mpi training\nrunai training mpi attach mpi-01 --tty --stdin\n\n# Attaching to a specific pod of a mpi training\nrunai training mpi attach mpi-01 --pod mpi-01-worker-1 --tty --stdin\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_attach/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_attach/#see-also","title":"SEE ALSO","text":"
                                  • runai mpi - alias for mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_bash/","title":"Runai mpi bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_bash/#runai-mpi-bash","title":"runai mpi bash","text":"

                                  open a bash shell in a mpi training job

                                  runai mpi bash [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_bash/#examples","title":"Examples","text":"
                                  # Open a bash shell in the mpi training's main worker\nrunai training mpi bash mpi-01\n\n# Open a bash shell in a specific mpi training worker\nrunai training mpi bash mpi-01 --pod mpi-01-worker-1\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_bash/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_bash/#see-also","title":"SEE ALSO","text":"
                                  • runai mpi - alias for mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_delete/","title":"Runai mpi delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_delete/#runai-mpi-delete","title":"runai mpi delete","text":"

                                  delete mpi training workload

                                  runai mpi delete [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_delete/#examples","title":"Examples","text":"
                                  # Delete a mpi training workload with a default project\nrunai training mpi delete <mpi-name>\n\n# Delete a mpi training workload with a specific project\nrunai training mpi delete <mpi-name> -p <project_name>\n\n# Delete a mpi training workload by UUID\nrunai training mpi delete --uuid=<mpi_uuid> -p <project_name>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_delete/#options","title":"Options","text":"
                                    -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_delete/#see-also","title":"SEE ALSO","text":"
                                  • runai mpi - alias for mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_describe/","title":"Runai mpi describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_describe/#runai-mpi-describe","title":"runai mpi describe","text":"

                                  describe mpi training

                                  runai mpi describe [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_describe/#examples","title":"Examples","text":"
                                  # Describe a mpi training workload with a default project\nrunai training mpi describe <mpi-name>\n\n# Describe a mpi training workload in a specific project\nrunai training mpi describe <mpi-name> -p <project_name>\n\n# Describe a mpi training workload by UUID\nrunai training mpi describe --uuid=<mpi_uuid>\n\n# Describe a mpi training workload with specific output format\nrunai training mpi describe <mpi-name> -o json\n\n# Describe a mpi training workload with specific sections\nrunai training mpi describe <mpi-name> --general --compute --pods --events --networks\n\n# Describe a mpi training workload with container details and custom limits\nrunai training mpi describe <mpi-name> --containers --pod-limit 20 --event-limit 100\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_describe/#options","title":"Options","text":"
                                        --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_describe/#see-also","title":"SEE ALSO","text":"
                                  • runai mpi - alias for mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_exec/","title":"Runai mpi exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_exec/#runai-mpi-exec","title":"runai mpi exec","text":"

                                  execute a command in a mpi training job

                                  runai mpi exec [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_exec/#examples","title":"Examples","text":"
                                  # Execute bash in the mpi training's main worker\nrunai training mpi exec mpi-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the mpi training's main worker\nrunai training mpi exec mpi-01 -- ls\n\n# Execute a command in a specific mpi training worker\nrunai training mpi exec mpi-01 --pod mpi-01-worker-1 -- nvidia-smi\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_exec/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_exec/#see-also","title":"SEE ALSO","text":"
                                  • runai mpi - alias for mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_list/","title":"Runai mpi list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_list/#runai-mpi-list","title":"runai mpi list","text":"

                                  list mpi training

                                  runai mpi list [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_list/#examples","title":"Examples","text":"
                                  # List all mpi training workloads\nrunai training mpi list -A\n\n# List mpi training workloads with default project\nrunai training mpi list\n\n# List mpi training workloads in a specific project\nrunai training mpi list -p <project_name>\n\n# List all mpi training workloads with a specific output format\nrunai training mpi list -o wide\n\n# List mpi training workloads with pagination\nrunai training mpi list --limit 20 --offset 40\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_list/#options","title":"Options","text":"
  -A, --all              list workloads from all projects\n  -h, --help             help for list\n      --json             Output structure JSON\n      --limit int32      number of workloads in list (default 50)\n      --no-headers       Output structure table without headers\n      --offset int32     offset number of limit, default 0 (first offset)\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string    filter by workload state\n      --table            Output structure table\n      --yaml             Output structure YAML\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_list/#see-also","title":"SEE ALSO","text":"
                                  • runai mpi - alias for mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_logs/","title":"Runai mpi logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_logs/#runai-mpi-logs","title":"runai mpi logs","text":"

                                  view logs of a mpi training job

                                  runai mpi logs [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_logs/#examples","title":"Examples","text":"
                                  # Get logs for a mpi training\nrunai training mpi logs mpi-01\n\n# Get logs for a specific pod in a mpi training\nrunai training mpi logs mpi-01 --pod=mpi-01-worker-0\n\n# Get logs for a specific container in a mpi training\nrunai training mpi logs mpi-01 --container=mpi-worker\n\n# Get the last 100 lines of logs\nrunai training mpi logs mpi-01 --tail=100\n\n# Get logs with timestamps\nrunai training mpi logs mpi-01 --timestamps\n\n# Follow the logs\nrunai training mpi logs mpi-01 --follow\n\n# Get logs for the previous instance of the mpi training\nrunai training mpi logs mpi-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai training mpi logs mpi-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai training mpi logs mpi-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai training mpi logs mpi-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for mpi training to be ready for logs\nrunai training mpi logs mpi-01 --wait-timeout=30s\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_logs/#options","title":"Options","text":"
  -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_logs/#see-also","title":"SEE ALSO","text":"
                                  • runai mpi - alias for mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_port-forward/","title":"Runai mpi port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_port-forward/#runai-mpi-port-forward","title":"runai mpi port-forward","text":"

                                  forward one or more local ports to a mpi training job

                                  runai mpi port-forward [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_port-forward/#examples","title":"Examples","text":"
                                  # Forward connections from localhost:8080 to mpi training on port 8090:\nrunai training mpi port-forward mpi-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to mpi training on port 8080:\nrunai training mpi port-forward mpi-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to mpi training on port 8090 and from localhost:6443 to mpi training on port 443:\nrunai training mpi port-forward mpi-01 --port 8080:8090 --port 6443:443 --address localhost\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_port-forward/#options","title":"Options","text":"
                                        --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_port-forward/#see-also","title":"SEE ALSO","text":"
                                  • runai mpi - alias for mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_resume/","title":"Runai mpi resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_resume/#runai-mpi-resume","title":"runai mpi resume","text":"

                                  resume mpi training

                                  runai mpi resume [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_resume/#examples","title":"Examples","text":"
                                  # Resume a mpi training workload\nrunai training mpi resume <mpi-name>\n\n# Resume a mpi training workload in a specific project\nrunai training mpi resume <mpi-name> -p <project_name>\n\n# Resume a mpi training workload by UUID\nrunai training mpi resume --uuid=<mpi_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_resume/#options","title":"Options","text":"
                                    -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_resume/#see-also","title":"SEE ALSO","text":"
                                  • runai mpi - alias for mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_submit/","title":"Runai mpi submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_submit/#runai-mpi-submit","title":"runai mpi submit","text":"

                                  submit mpi training

                                  runai mpi submit [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_submit/#examples","title":"Examples","text":"
# Submit a mpi training workload\nrunai training mpi submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a mpi training workload with arguments\nrunai training mpi submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a mpi training workload with a custom command\nrunai training mpi submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a mpi training with master args and worker args\nrunai training mpi submit <name> -p <project_name> -i ubuntu --master-args \"-a master_arg_a -b master_arg_b\" -- '-a worker_arg_a'\n\n# Submit a mpi training with a master command and worker args\nrunai training mpi submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" -- '-a worker_arg_a'\n\n# Submit a mpi training with a master command and a worker command\nrunai training mpi submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" --command -- echo -e 'worker command'\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_submit/#options","title":"Options","text":"
      --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n      --clean-pod-policy string                        Specifies which pods will be deleted when the workload reaches a terminal state (completed/failed)\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the format: name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. 
(Default: false)\n      --host-network                                   Whether to enable host networking. (Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --master-args string                             Specifies the arguments to pass to the master pod container command\n      --master-command string                          Specifies the command to run in the master pod container, overriding the image's default entrypoint. The command can include arguments following it.\n      --master-environment stringArray                 Set master environment variables in the container\n      --master-extended-resource stringArray           Request access to an extended resource. Use the format: resource_name=quantity\n      --master-no-pvcs                                 Do not mount any persistent volumes in the master pod\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    takes the uid, gid, and supplementary groups fields from the token, if all the fields do not exist, uses the local running terminal user credentials. 
If any of the fields exist, take only the existing fields\n      --s3 stringArray                                 S3 storage details. Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --slots-per-worker int32                         Number of slots to allocate for each worker\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma-separated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group IDs of the container, options: fromTheImage|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --workers int32                                  The number of workers that will be allocated for running the workload\n      --working-dir string                             Set the container's working directory\n
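The options above document several structured flag formats. The following is a minimal sketch combining a few of them; the workload name, claim name, environment variable, and training script are hypothetical placeholders, not values taken from this reference.
# Hypothetical sketch: submit a mpi training with 2 workers, 1 GPU each, an existing PVC mount and an environment variable (placeholder names)\nrunai training mpi submit my-mpi -p <project_name> -i ubuntu --workers 2 -g 1 --existing-pvc claimname=my-claim,path=/data -e MY_VAR=value --command -- python train.py\n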
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_submit/#see-also","title":"SEE ALSO","text":"
                                  • runai mpi - alias for mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_suspend/","title":"Runai mpi suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_mpi_suspend/#runai-mpi-suspend","title":"runai mpi suspend","text":"

                                  suspend mpi training

                                  runai mpi suspend [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_suspend/#examples","title":"Examples","text":"
                                  # Suspend a mpi training workload\nrunai training mpi suspend <mpi-name>\n\n# Suspend a mpi training workload in a specific project\nrunai training mpi suspend <mpi-name> -p <project_name>\n\n# Suspend a mpi training workload by UUID\nrunai training mpi suspend --uuid=<mpi_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_suspend/#options","title":"Options","text":"
                                    -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_mpi_suspend/#see-also","title":"SEE ALSO","text":"
                                  • runai mpi - alias for mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_node/","title":"Runai node","text":""},{"location":"Researcher/cli-reference/new-cli/runai_node/#runai-node","title":"runai node","text":"

                                  node management

                                  "},{"location":"Researcher/cli-reference/new-cli/runai_node/#options","title":"Options","text":"
                                    -h, --help   help for node\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_node/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_node/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  • runai node list - List node
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_node_list/","title":"Runai node list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_node_list/#runai-node-list","title":"runai node list","text":"

                                  List node

                                  runai node list [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_node_list/#options","title":"Options","text":"
                                    -h, --help         help for list\n      --json         Output structure JSON\n      --no-headers   Output structure table without headers\n      --table        Output structure table\n      --yaml         Output structure YAML\n
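A brief usage sketch based only on the flags listed above; the command takes no additional arguments.
# List nodes in the default table format\nrunai node list\n\n# List nodes as JSON\nrunai node list --json\n\n# List nodes in a table without headers\nrunai node list --no-headers\n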
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_node_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_node_list/#see-also","title":"SEE ALSO","text":"
                                  • runai node - node management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_nodepool/","title":"Runai nodepool","text":""},{"location":"Researcher/cli-reference/new-cli/runai_nodepool/#runai-nodepool","title":"runai nodepool","text":"

                                  node pool management

                                  runai nodepool [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_nodepool/#options","title":"Options","text":"
                                    -h, --help   help for nodepool\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_nodepool/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_nodepool/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  • runai nodepool list - List node pool
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_nodepool_list/","title":"Runai nodepool list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_nodepool_list/#runai-nodepool-list","title":"runai nodepool list","text":"

                                  List node pool

                                  runai nodepool list [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_nodepool_list/#options","title":"Options","text":"
                                    -h, --help         help for list\n      --json         Output structure JSON\n      --no-headers   Output structure table without headers\n      --table        Output structure table\n      --yaml         Output structure YAML\n
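A brief usage sketch based only on the flags listed above.
# List node pools in the default table format\nrunai nodepool list\n\n# List node pools as YAML\nrunai nodepool list --yaml\n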
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_nodepool_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_nodepool_list/#see-also","title":"SEE ALSO","text":"
                                  • runai nodepool - node pool management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_port-forward/","title":"Runai port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_port-forward/#runai-port-forward","title":"runai port-forward","text":"

                                  [Deprecated] port forward

                                  runai port-forward WORKLOAD_NAME [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_port-forward/#options","title":"Options","text":"
                                        --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --type string                    The type of the workload (training, workspace)\n
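A hedged sketch of the deprecated form, assuming the same local:remote port notation shown for the per-framework port-forward commands; the workload name is a hypothetical placeholder.
# Forward connections from localhost:8080 to port 80 of a workspace workload (placeholder name)\nrunai port-forward my-workspace --type workspace --port 8080:80 --address localhost\n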
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_port-forward/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_project/","title":"Runai project","text":""},{"location":"Researcher/cli-reference/new-cli/runai_project/#runai-project","title":"runai project","text":"

                                  project management

                                  runai project [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_project/#options","title":"Options","text":"
                                    -h, --help                 help for project\n      --interactive enable   set interactive mode (enabled|disabled)\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_project/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_project/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  • runai project list - list available project
                                  • runai project set - set default project name
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_project_list/","title":"Runai project list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_project_list/#runai-project-list","title":"runai project list","text":"

                                  list available project

                                  runai project list [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_project_list/#options","title":"Options","text":"
                                    -h, --help         help for list\n      --json         Output structure JSON\n      --no-headers   Output structure table without headers\n      --table        Output structure table\n      --yaml         Output structure YAML\n
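A brief usage sketch based only on the flags listed above.
# List the projects available to the current user\nrunai project list\n\n# List projects as JSON\nrunai project list --json\n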
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_project_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_project_list/#see-also","title":"SEE ALSO","text":"
                                  • runai project - project management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_project_set/","title":"Runai project set","text":""},{"location":"Researcher/cli-reference/new-cli/runai_project_set/#runai-project-set","title":"runai project set","text":"

                                  set default project name

                                  runai project set PROJECT_NAME [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_project_set/#options","title":"Options","text":"
                                    -h, --help   help for set\n
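A brief sketch; team-a is a hypothetical project name.
# Set the default project for subsequent commands (hypothetical project name)\nrunai project set team-a\n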
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_project_set/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_project_set/#see-also","title":"SEE ALSO","text":"
                                  • runai project - project management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch/","title":"Runai pytorch","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch/#runai-pytorch","title":"runai pytorch","text":"

                                  alias for pytorch management

                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch/#options","title":"Options","text":"
                                    -h, --help   help for pytorch\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  • runai pytorch attach - attach to a running container in a pytorch training job
                                  • runai pytorch bash - open a bash shell in a pytorch training job
                                  • runai pytorch delete - delete pytorch training workload
                                  • runai pytorch describe - describe pytorch training
                                  • runai pytorch exec - execute a command in a pytorch training job
                                  • runai pytorch list - list pytorch training
                                  • runai pytorch logs - view logs of a pytorch training job
                                  • runai pytorch port-forward - forward one or more local ports to a pytorch training job
                                  • runai pytorch resume - resume pytorch training
                                  • runai pytorch submit - submit pytorch training
                                  • runai pytorch suspend - suspend pytorch training
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_attach/","title":"Runai pytorch attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_attach/#runai-pytorch-attach","title":"runai pytorch attach","text":"

                                  attach to a running container in a pytorch training job

                                  runai pytorch attach [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_attach/#examples","title":"Examples","text":"
                                  # Attaching to the main worker of a pytorch training\nrunai training pytorch attach pytorch-01 --tty --stdin\n\n# Attaching to a specific pod of a pytorch training\nrunai training pytorch attach pytorch-01 --pod pytorch-01-worker-1 --tty --stdin\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_attach/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_attach/#see-also","title":"SEE ALSO","text":"
                                  • runai pytorch - alias for pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_bash/","title":"Runai pytorch bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_bash/#runai-pytorch-bash","title":"runai pytorch bash","text":"

                                  open a bash shell in a pytorch training job

                                  runai pytorch bash [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_bash/#examples","title":"Examples","text":"
                                  # Open a bash shell in the pytorch training's main worker\nrunai training pytorch bash pytorch-01\n\n# Open a bash shell in a specific pytorch training worker\nrunai training pytorch bash pytorch-01 --pod pytorch-01-worker-1\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_bash/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_bash/#see-also","title":"SEE ALSO","text":"
                                  • runai pytorch - alias for pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_delete/","title":"Runai pytorch delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_delete/#runai-pytorch-delete","title":"runai pytorch delete","text":"

                                  delete pytorch training workload

                                  runai pytorch delete [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_delete/#examples","title":"Examples","text":"
                                  # Delete a pytorch training workload with a default project\nrunai training pytorch delete <pytorch-name>\n\n# Delete a pytorch training workload with a specific project\nrunai training pytorch delete <pytorch-name> -p <project_name>\n\n# Delete a pytorch training workload by UUID\nrunai training pytorch delete --uuid=<pytorch_uuid> -p <project_name>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_delete/#options","title":"Options","text":"
                                    -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_delete/#see-also","title":"SEE ALSO","text":"
                                  • runai pytorch - alias for pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_describe/","title":"Runai pytorch describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_describe/#runai-pytorch-describe","title":"runai pytorch describe","text":"

                                  describe pytorch training

                                  runai pytorch describe [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_describe/#examples","title":"Examples","text":"
                                  # Describe a pytorch training workload with a default project\nrunai training pytorch describe <pytorch-name>\n\n# Describe a pytorch training workload in a specific project\nrunai training pytorch describe <pytorch-name> -p <project_name>\n\n# Describe a pytorch training workload by UUID\nrunai training pytorch describe --uuid=<pytorch_uuid>\n\n# Describe a pytorch training workload with specific output format\nrunai training pytorch describe <pytorch-name> -o json\n\n# Describe a pytorch training workload with specific sections\nrunai training pytorch describe <pytorch-name> --general --compute --pods --events --networks\n\n# Describe a pytorch training workload with container details and custom limits\nrunai training pytorch describe <pytorch-name> --containers --pod-limit 20 --event-limit 100\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_describe/#options","title":"Options","text":"
                                        --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_describe/#see-also","title":"SEE ALSO","text":"
                                  • runai pytorch - alias for pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_exec/","title":"Runai pytorch exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_exec/#runai-pytorch-exec","title":"runai pytorch exec","text":"

                                  execute a command in a pytorch training job

                                  runai pytorch exec [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_exec/#examples","title":"Examples","text":"
                                  # Execute bash in the pytorch training's main worker\nrunai training pytorch exec pytorch-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the pytorch training's main worker\nrunai training pytorch exec pytorch-01 -- ls\n\n# Execute a command in a specific pytorch training worker\nrunai training pytorch exec pytorch-01 --pod pytorch-01-worker-1 -- nvidia-smi\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_exec/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_exec/#see-also","title":"SEE ALSO","text":"
                                  • runai pytorch - alias for pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_list/","title":"Runai pytorch list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_list/#runai-pytorch-list","title":"runai pytorch list","text":"

                                  list pytorch training

                                  runai pytorch list [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_list/#examples","title":"Examples","text":"
                                  # List all pytorch training workloads\nrunai training pytorch list -A\n\n# List pytorch training workloads with default project\nrunai training pytorch list\n\n# List pytorch training workloads in a specific project\nrunai training pytorch list -p <project_name>\n\n# List all pytorch training workloads with a specific output format\nrunai training pytorch list -o wide\n\n# List pytorch training workloads with pagination\nrunai training pytorch list --limit 20 --offset 40\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_list/#options","title":"Options","text":"
                                    -A, --all              list workloads from all projects\n  -h, --help             help for list\n      --json             Output structure JSON\n      --limit int32      number of workload in list (default 50)\n      --no-headers       Output structure table without headers\n      --offset int32     offset number of limit, default 0 (first offset)\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string    filter by workload state\n      --table            Output structure table\n      --yaml             Output structure YAML\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_list/#see-also","title":"SEE ALSO","text":"
                                  • runai pytorch - alias for pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_logs/","title":"Runai pytorch logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_logs/#runai-pytorch-logs","title":"runai pytorch logs","text":"

                                  view logs of a pytorch training job

                                  runai pytorch logs [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_logs/#examples","title":"Examples","text":"
                                  # Get logs for a pytorch training\nrunai training pytorch logs pytorch-01\n\n# Get logs for a specific pod in a pytorch training\nrunai training pytorch logs pytorch-01 --pod=pytorch-01-worker-0\n\n# Get logs for a specific container in a pytorch training\nrunai training pytorch logs pytorch-01 --container=pytorch-worker\n\n# Get the last 100 lines of logs\nrunai training pytorch logs pytorch-01 --tail=100\n\n# Get logs with timestamps\nrunai training pytorch logs pytorch-01 --timestamps\n\n# Follow the logs\nrunai training pytorch logs pytorch-01 --follow\n\n# Get logs for the previous instance of the pytorch training\nrunai training pytorch logs pytorch-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai training pytorch logs pytorch-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai training pytorch logs pytorch-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai training pytorch logs pytorch-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for pytorch training to be ready for logs\nrunai training pytorch logs pytorch-01 --wait-timeout=30s\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_logs/#options","title":"Options","text":"
                                     -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_logs/#see-also","title":"SEE ALSO","text":"
                                  • runai pytorch - alias for pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_port-forward/","title":"Runai pytorch port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_port-forward/#runai-pytorch-port-forward","title":"runai pytorch port-forward","text":"

                                  forward one or more local ports to a pytorch training job

                                  runai pytorch port-forward [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_port-forward/#examples","title":"Examples","text":"
                                  # Forward connections from localhost:8080 to pytorch training on port 8090:\nrunai training pytorch port-forward pytorch-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to pytorch training on port 8080:\nrunai training pytorch port-forward pytorch-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to pytorch training on port 8090 and from localhost:6443 to pytorch training on port 443:\nrunai training pytorch port-forward pytorch-01 --port 8080:8090 --port 6443:443 --address localhost\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_port-forward/#options","title":"Options","text":"
                                        --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_port-forward/#see-also","title":"SEE ALSO","text":"
                                  • runai pytorch - alias for pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_resume/","title":"Runai pytorch resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_resume/#runai-pytorch-resume","title":"runai pytorch resume","text":"

                                  resume pytorch training

                                  runai pytorch resume [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_resume/#examples","title":"Examples","text":"
                                  # Resume a pytorch training workload\nrunai training pytorch resume <pytorch-name>\n\n# Resume a pytorch training workload in a specific project\nrunai training pytorch resume <pytorch-name> -p <project_name>\n\n# Resume a pytorch training workload by UUID\nrunai training pytorch resume --uuid=<pytorch_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_resume/#options","title":"Options","text":"
                                    -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_resume/#see-also","title":"SEE ALSO","text":"
                                  • runai pytorch - alias for pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_submit/","title":"Runai pytorch submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_submit/#runai-pytorch-submit","title":"runai pytorch submit","text":"

                                  submit pytorch training

                                  runai pytorch submit [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_submit/#examples","title":"Examples","text":"
                                   # Submit a pytorch training workload\nrunai training pytorch submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a pytorch training workload with arguments\nrunai training pytorch submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a pytorch training workload with a custom command\nrunai training pytorch submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a pytorch training with master arguments and worker arguments\nrunai training pytorch submit <name> -p <project_name> -i ubuntu --master-args \"-a master_arg_a -b master_arg_b\" -- '-a worker_arg_a'\n\n# Submit a pytorch training with a master command and worker arguments\nrunai training pytorch submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" -- '-a worker_arg_a'\n\n# Submit a pytorch training with a master command and a worker command\nrunai training pytorch submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" --command -- echo -e 'worker command'\n
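                                   The following additional examples are editorial sketches rather than generated CLI help; they only combine the distributed-training flags documented in the Options below, and the name, project and image values are placeholders.
                                   # Illustrative: submit a distributed pytorch training with 3 workers and 1 GPU per pod\nrunai training pytorch submit <name> -p <project_name> -i <image> --workers 3 -g 1\n\n# Illustrative: submit an elastic pytorch training that may scale between 2 and 4 replicas\nrunai training pytorch submit <name> -p <project_name> -i <image> --min-replicas 2 --max-replicas 4 -g 1\n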
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_submit/#options","title":"Options","text":"
                                        --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n      --clean-pod-policy string                        Specifies which pods will be deleted when the workload reaches a terminal state (completed/failed)\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the fhe format name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. 
(Default: false)\n      --host-network                                   Whether to enable host networking. (Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --master-args string                             Specifies the arguments to pass to the master pod container command\n      --master-command string                          Specifies the command to run in the master pod container, overriding the image's default entrypoint. The command can include arguments following it.\n      --master-environment stringArray                 Set master environment variables in the container\n      --master-extended-resource stringArray           Request access to an extended resource. Use the format: resource_name=quantity\n      --master-gpu-devices-request int32               GPU units to allocate for the job (e.g. 1, 2)\n      --master-gpu-portion-limit float                 GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-gpu-portion-request float               GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-no-pvcs                                 Do not mount any persistent volumes in the master pod\n      --max-replicas int32                             Maximum number of replicas for an elastic PyTorch job\n      --min-replicas int32                             Minimum number of replicas for an elastic PyTorch job\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --no-master                                      Do not create a separate pod for the master\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. 
To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    takes the uid, gid, and supplementary groups fields from the token, if all the fields do not exist, uses the local running terminal user credentials. if any of the fields exist take only the existing fields\n      --s3 stringArray                                 s3 storage details. Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma seperated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group ids of the container, options: fromTheImage|fromIdpToken|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --workers int32                                  the number of workers that will be allocated for running the workload\n      --working-dir string                             Set the container's working directory\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_submit/#see-also","title":"SEE ALSO","text":"
                                  • runai pytorch - alias for pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_suspend/","title":"Runai pytorch suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_suspend/#runai-pytorch-suspend","title":"runai pytorch suspend","text":"

                                  suspend pytorch training

                                  runai pytorch suspend [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_suspend/#examples","title":"Examples","text":"
                                  # Suspend a pytorch training workload\nrunai training pytorch suspend <pytorch-name>\n\n# Suspend a pytorch training workload in a specific project\nrunai training pytorch suspend <pytorch-name> -p <project_name>\n\n# Suspend a pytorch training workload by UUID\nrunai training pytorch suspend --uuid=<pytorch_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_suspend/#options","title":"Options","text":"
                                    -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_pytorch_suspend/#see-also","title":"SEE ALSO","text":"
                                  • runai pytorch - alias for pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_report/","title":"Runai report","text":""},{"location":"Researcher/cli-reference/new-cli/runai_report/#runai-report","title":"runai report","text":"

                                  [Experimental] report management

                                  "},{"location":"Researcher/cli-reference/new-cli/runai_report/#options","title":"Options","text":"
                                    -h, --help   help for report\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_report/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_report/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  • runai report metrics - [Experimental] metrics management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics/","title":"Runai report metrics","text":""},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics/#runai-report-metrics","title":"runai report metrics","text":"

                                  [Experimental] metrics management

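                                   As a rough illustrative workflow (an editorial sketch, not generated CLI help; the flag values are placeholders drawn from the subcommands documented below):
                                   # Enable local metrics collection\nrunai report metrics config --metrics enabled --type local\n\n# Print the most recent collected metrics\nrunai report metrics output --tail 100\n\n# Delete the collected metrics logs\nrunai report metrics clear\n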
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics/#options","title":"Options","text":"
                                    -h, --help   help for metrics\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics/#see-also","title":"SEE ALSO","text":"
                                  • runai report - [Experimental] report management
                                  • runai report metrics clear - metrics logs deletion
                                  • runai report metrics config - metrics configuration
                                  • runai report metrics output - metrics logs output
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_clear/","title":"Runai report metrics clear","text":""},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_clear/#runai-report-metrics-clear","title":"runai report metrics clear","text":"

                                  metrics logs deletion

                                  runai report metrics clear [flags]\n
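                                   Illustrative usage (editorial sketch, not generated CLI help): disable collection before clearing the stored logs.
                                   # Stop collecting metrics, then delete the stored metrics logs\nrunai report metrics config --metrics disabled\nrunai report metrics clear\n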
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_clear/#options","title":"Options","text":"
                                    -h, --help   help for clear\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_clear/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_clear/#see-also","title":"SEE ALSO","text":"
                                  • runai report metrics - [Experimental] metrics management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_config/","title":"Runai report metrics config","text":""},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_config/#runai-report-metrics-config","title":"runai report metrics config","text":"

                                  metrics configuration

                                  runai report metrics config [flags]\n
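                                   Illustrative usage (editorial sketch, not generated CLI help; the values are placeholders for the retention flags documented below):
                                   # Enable local metrics reporting and adjust the file size, count and age retention limits\nrunai report metrics config --metrics enabled --type local --size 20 --files 10 --age 7\n\n# Disable metrics reporting\nrunai report metrics config --metrics disabled\n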
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_config/#options","title":"Options","text":"
                                        --age int          metrics max file age (default 14)\n      --files int        metrics max file number (default 30)\n  -h, --help             help for config\n      --metrics enable   metrics enable flag (enabled|disabled)\n      --size int         metrics max file size (default 10)\n      --type reporter    report generated type (none|logger|local)\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_config/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_config/#see-also","title":"SEE ALSO","text":"
                                  • runai report metrics - [Experimental] metrics management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_output/","title":"Runai report metrics output","text":""},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_output/#runai-report-metrics-output","title":"runai report metrics output","text":"

                                  metrics logs output

                                  runai report metrics output [flags]\n
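                                   Illustrative usage (editorial sketch, not generated CLI help):
                                   # Print the last 50 collected metrics entries\nrunai report metrics output --tail 50\n\n# Print the default number of entries (100)\nrunai report metrics output\n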
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_output/#options","title":"Options","text":"
                                    -h, --help       help for output\n      --tail int   number of tail metrics (default 100)\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_output/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_report_metrics_output/#see-also","title":"SEE ALSO","text":"
                                  • runai report metrics - [Experimental] metrics management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_submit/","title":"Runai submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_submit/#runai-submit","title":"runai submit","text":"

                                  [Deprecated] Submit a new workload

                                  runai submit [flags]\n
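                                   Illustrative usage of this deprecated command (editorial sketch, not generated CLI help; project and image names are placeholders, and only flags listed in the Options below are used):
                                   # Submit a training workload with 1 GPU that runs a custom command\nrunai submit -p <project_name> -i ubuntu -g 1 --command -- sleep infinity\n\n# Submit an interactive, preemptible workload\nrunai submit -p <project_name> -i ubuntu --interactive --preemptible\n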
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_submit/#options","title":"Options","text":"
                                        --add-capability stringArray                     The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n      --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --completions int32                              Number of successful pods required for this job to be completed. Used with HPO\n      --configmap-volume stringArray                   Mount ConfigMap as a volume. Use the fhe format name=CONFIGMAP_NAME,path=PATH\n      --cpu float                                      CPU core request (e.g. 0.5, 1)\n      --cpu-limit float                                CPU core limit (e.g. 0.5, 1)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu float                                      GPU units to allocate for the job (e.g. 0.5, 1)\n      --gpu-memory string                              GPU memory to allocate for the job (e.g. 1G, 500M)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. (Default: false)\n      --host-network                                   Whether to enable host networking. (Default: false)\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. 
Defaults to Always (default \"Always\")\n      --interactive                                    Mark this job as interactive\n      --job-name-prefix string                         Set defined prefix for the workload name and add index as a suffix\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --memory string                                  CPU memory to allocate for the job (e.g. 1G, 500M)\n      --memory-limit string                            CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --mig-profile string                             [Deprecated] MIG profile to allocate for the job (1g.5gb, 2g.10gb, 3g.20gb, 4g.20gb, 7g.40gb)\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --parallelism int32                              Number of pods to run in parallel at any given time. Used with HPO\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preemptible                                    Workspace preemptible workloads can be scheduled above guaranteed quota but may be reclaimed at any time\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    takes the uid, gid, and supplementary groups fields from the token, if all the fields do not exist, uses the local running terminal user credentials. if any of the fields exist take only the existing fields\n      --s3 stringArray                                 s3 storage details. 
Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma seperated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group ids of the container, options: fromTheImage|fromIdpToken|fromIdpToken\n  -v, --volume stringArray                             Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --working-dir string                             Set the container's working directory\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_submit/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow/","title":"Runai tensorflow","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow/#runai-tensorflow","title":"runai tensorflow","text":"

                                  alias for tensorflow management

                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow/#options","title":"Options","text":"
                                    -h, --help   help for tensorflow\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  • runai tensorflow attach - attach to a running container in a tf training job
                                  • runai tensorflow bash - open a bash shell in a tf training job
                                  • runai tensorflow delete - delete tf training workload
                                  • runai tensorflow describe - describe tf training
                                  • runai tensorflow exec - execute a command in a tf training job
                                  • runai tensorflow list - list tf training
                                  • runai tensorflow logs - view logs of a tf training job
                                  • runai tensorflow port-forward - forward one or more local ports to a tf training job
                                  • runai tensorflow resume - resume tf training
                                  • runai tensorflow submit - submit tf training
                                  • runai tensorflow suspend - suspend tf training
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_attach/","title":"Runai tensorflow attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_attach/#runai-tensorflow-attach","title":"runai tensorflow attach","text":"

                                  attach to a running container in a tf training job

                                  runai tensorflow attach [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_attach/#examples","title":"Examples","text":"
                                  # Attaching to the main worker of a tf training\nrunai training tf attach tf-01 --tty --stdin\n\n# Attaching to a specific pod of a tf training\nrunai training tf attach tf-01 --pod tf-01-worker-1 --tty --stdin\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_attach/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_attach/#see-also","title":"SEE ALSO","text":"
                                  • runai tensorflow - alias for tensorflow management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_bash/","title":"Runai tensorflow bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_bash/#runai-tensorflow-bash","title":"runai tensorflow bash","text":"

                                  open a bash shell in a tf training job

                                  runai tensorflow bash [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_bash/#examples","title":"Examples","text":"
                                  # Open a bash shell in the tf training's main worker\nrunai training tf bash tf-01\n\n# Open a bash shell in a specific tf training worker\nrunai training tf bash tf-01 --pod tf-01-worker-1\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_bash/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_bash/#see-also","title":"SEE ALSO","text":"
                                  • runai tensorflow - alias for tensorflow management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_delete/","title":"Runai tensorflow delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_delete/#runai-tensorflow-delete","title":"runai tensorflow delete","text":"

                                  delete tf training workload

                                  runai tensorflow delete [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_delete/#examples","title":"Examples","text":"
                                  # Delete a tf training workload with a default project\nrunai training tf delete <tf-name>\n\n# Delete a tf training workload with a specific project\nrunai training tf delete <tf-name> -p <project_name>\n\n# Delete a tf training workload by UUID\nrunai training tf delete --uuid=<tf_uuid> -p <project_name>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_delete/#options","title":"Options","text":"
                                    -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_delete/#see-also","title":"SEE ALSO","text":"
                                  • runai tensorflow - alias for tensorflow management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_describe/","title":"Runai tensorflow describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_describe/#runai-tensorflow-describe","title":"runai tensorflow describe","text":"

                                  describe tf training

                                  runai tensorflow describe [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_describe/#examples","title":"Examples","text":"
                                  # Describe a tf training workload with a default project\nrunai training tf describe <tf-name>\n\n# Describe a tf training workload in a specific project\nrunai training tf describe <tf-name> -p <project_name>\n\n# Describe a tf training workload by UUID\nrunai training tf describe --uuid=<tf_uuid>\n\n# Describe a tf training workload with specific output format\nrunai training tf describe <tf-name> -o json\n\n# Describe a tf training workload with specific sections\nrunai training tf describe <tf-name> --general --compute --pods --events --networks\n\n# Describe a tf training workload with container details and custom limits\nrunai training tf describe <tf-name> --containers --pod-limit 20 --event-limit 100\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_describe/#options","title":"Options","text":"
                                        --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_describe/#see-also","title":"SEE ALSO","text":"
                                  • runai tensorflow - alias for tensorflow management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_exec/","title":"Runai tensorflow exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_exec/#runai-tensorflow-exec","title":"runai tensorflow exec","text":"

                                  execute a command in a tf training job

                                  runai tensorflow exec [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_exec/#examples","title":"Examples","text":"
                                  # Execute bash in the tf training's main worker\nrunai training tf exec tf-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the tf training's main worker\nrunai training tf exec tf-01 -- ls\n\n# Execute a command in a specific tf training worker\nrunai training tf exec tf-01 --pod tf-01-worker-1 -- nvidia-smi\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_exec/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_exec/#see-also","title":"SEE ALSO","text":"
                                  • runai tensorflow - alias for tensorflow management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_list/","title":"Runai tensorflow list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_list/#runai-tensorflow-list","title":"runai tensorflow list","text":"

                                  list tf training

                                  runai tensorflow list [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_list/#examples","title":"Examples","text":"
                                  # List all tf training workloads\nrunai training tf list -A\n\n# List tf training workloads with default project\nrunai training tf list\n\n# List tf training workloads in a specific project\nrunai training tf list -p <project_name>\n\n# List all tf training workloads with a specific output format\nrunai training tf list -o wide\n\n# List tf training workloads with pagination\nrunai training tf list --limit 20 --offset 40\n
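                                  Two additional list invocations, sketched from the flags documented below (the status value is illustrative):
                                  # List tf training workloads filtered by status\nrunai training tf list --status=Running\n\n# List tf training workloads as a table without headers (useful for scripting)\nrunai training tf list --no-headers\n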
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_list/#options","title":"Options","text":"
                                    -A, --all              list workloads from all projects\n  -h, --help             help for list\n      --json             Output structure JSON\n      --limit int32      number of workload in list (default 50)\n      --no-headers       Output structure table without headers\n      --offset int32     offset number of limit, default 0 (first offset)\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string    filter by workload state\n      --table            Output structure table\n      --yaml             Output structure YAML\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_list/#see-also","title":"SEE ALSO","text":"
                                  • runai tensorflow - alias for tensorflow management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_logs/","title":"Runai tensorflow logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_logs/#runai-tensorflow-logs","title":"runai tensorflow logs","text":"

                                  view logs of a tf training job

                                  runai tensorflow logs [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_logs/#examples","title":"Examples","text":"
                                  # Get logs for a tf training\nrunai training tf logs tf-01\n\n# Get logs for a specific pod in a tf training\nrunai training tf logs tf-01 --pod=tf-01-worker-0\n\n# Get logs for a specific container in a tf training\nrunai training tf logs tf-01 --container=tf-worker\n\n# Get the last 100 lines of logs\nrunai training tf logs tf-01 --tail=100\n\n# Get logs with timestamps\nrunai training tf logs tf-01 --timestamps\n\n# Follow the logs\nrunai training tf logs tf-01 --follow\n\n# Get logs for the previous instance of the tf training\nrunai training tf logs tf-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai training tf logs tf-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai training tf logs tf-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai training tf logs tf-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for tf training to be ready for logs\nrunai training tf logs tf-01 --wait-timeout=30s\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_logs/#options","title":"Options","text":"
                                    -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_logs/#see-also","title":"SEE ALSO","text":"
                                  • runai tensorflow - alias for tensorflow management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_port-forward/","title":"Runai tensorflow port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_port-forward/#runai-tensorflow-port-forward","title":"runai tensorflow port-forward","text":"

                                  forward one or more local ports to a tf training job

                                  runai tensorflow port-forward [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_port-forward/#examples","title":"Examples","text":"
                                  # Forward connections from localhost:8080 to tf training on port 8090:\nrunai training tf port-forward tf-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to tf training on port 8080:\nrunai training tf port-forward tf-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to tf training on port 8090 and from localhost:6443 to tf training on port 443:\nrunai training tf port-forward tf-01 --port 8080:8090 --port 6443:443 --address localhost\n
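                                  A further sketch based on the --pod flag documented below, forwarding to a specific worker pod (the pod name is illustrative):
                                  # Forward connections from localhost:8080 to port 8090 of a specific tf training pod\nrunai training tf port-forward tf-01 --port 8080:8090 --pod tf-01-worker-1 --address localhost\n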
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_port-forward/#options","title":"Options","text":"
                                        --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_port-forward/#see-also","title":"SEE ALSO","text":"
                                  • runai tensorflow - alias for tensorflow management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_resume/","title":"Runai tensorflow resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_resume/#runai-tensorflow-resume","title":"runai tensorflow resume","text":"

                                  resume tf training

                                  runai tensorflow resume [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_resume/#examples","title":"Examples","text":"
                                  # Resume a tf training workload\nrunai training tf resume <tf-name>\n\n# Resume a tf training workload in a specific project\nrunai training tf resume <tf-name> -p <project_name>\n\n# Resume a tf training workload by UUID\nrunai training tf resume --uuid=<tf_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_resume/#options","title":"Options","text":"
                                    -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_resume/#see-also","title":"SEE ALSO","text":"
                                  • runai tensorflow - alias for tensorflow management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_submit/","title":"Runai tensorflow submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_submit/#runai-tensorflow-submit","title":"runai tensorflow submit","text":"

                                  submit tf training

                                  runai tensorflow submit [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_submit/#examples","title":"Examples","text":"
                                  # Submit a tf training workload\nrunai training tf submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a tf training workload with arguments\nrunai training tf submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a tf training workload with a custom command\nrunai training tf submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a tf training with master args and worker args\nrunai training tf submit <name> -p <project_name> -i ubuntu --master-args \"-a master_arg_a -b master_arg_b\" -- '-a worker_arg_a'\n\n# Submit a tf training with a master command and worker args\nrunai training tf submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" -- '-a worker_arg_a'\n\n# Submit a tf training with a master command and a worker command\nrunai training tf submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" --command -- echo -e 'worker command'\n
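                                  A further sketch combining the resource flags documented below (project, image, and resource values are illustrative):
                                  # Submit a tf training with two workers, requesting one GPU device, two CPU cores and 8G of memory\nrunai training tf submit <name> -p <project_name> -i tensorflow/tensorflow:latest-gpu --workers 2 -g 1 --cpu-core-request 2 --cpu-memory-request 8G\n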
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_submit/#options","title":"Options","text":"
                                        --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n      --clean-pod-policy string                        Specifies which pods will be deleted when the workload reaches a terminal state (completed/failed)\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the format: name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. (Default: false)\n
                                        --host-network                                   Whether to enable host networking. (Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --master-args string                             Specifies the arguments to pass to the master pod container command\n      --master-command string                          Specifies the command to run in the master pod container, overriding the image's default entrypoint. The command can include arguments following it.\n      --master-environment stringArray                 Set master environment variables in the container\n      --master-extended-resource stringArray           Request access to an extended resource. Use the format: resource_name=quantity\n      --master-gpu-devices-request int32               GPU units to allocate for the job (e.g. 1, 2)\n      --master-gpu-portion-limit float                 GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-gpu-portion-request float               GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-no-pvcs                                 Do not mount any persistent volumes in the master pod\n      --max-replicas int32                             Maximum number of replicas for an elastic PyTorch job\n      --min-replicas int32                             Minimum number of replicas for an elastic PyTorch job\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --no-master                                      Do not create a separate pod for the master\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                        --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    Takes the uid, gid, and supplementary groups fields from the token; if none of the fields exist, uses the local terminal user's credentials, and if only some of the fields exist, takes only the existing fields\n      --s3 stringArray                                 s3 storage details. Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma separated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicates the way to determine the user and group ids of the container, options: fromTheImage|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --workers int32                                  The number of workers that will be allocated for running the workload\n      --working-dir string                             Set the container's working directory\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_submit/#see-also","title":"SEE ALSO","text":"
                                  • runai tensorflow - alias for tensorflow management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_suspend/","title":"Runai tensorflow suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_suspend/#runai-tensorflow-suspend","title":"runai tensorflow suspend","text":"

                                  suspend tf training

                                  runai tensorflow suspend [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_suspend/#examples","title":"Examples","text":"
                                  # Suspend a tf training workload\nrunai training tf suspend <tf-name>\n\n# Suspend a tf training workload in a specific project\nrunai training tf suspend <tf-name> -p <project_name>\n\n# Suspend a tf training workload by UUID\nrunai training tf suspend --uuid=<tf_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_suspend/#options","title":"Options","text":"
                                    -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_tensorflow_suspend/#see-also","title":"SEE ALSO","text":"
                                  • runai tensorflow - alias for tensorflow management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training/","title":"Runai training","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training/#runai-training","title":"runai training","text":"

                                  training management

                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training/#options","title":"Options","text":"
                                    -h, --help   help for training\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  • runai training attach - attach to a running container in a standard training job
                                  • runai training bash - open a bash shell in a standard training job
                                  • runai training delete - delete standard training workload
                                  • runai training describe - describe standard training
                                  • runai training exec - execute a command in a standard training job
                                  • runai training list - list all training frameworks
                                  • runai training logs - view logs of a standard training job
                                  • runai training mpi - mpi management
                                  • runai training port-forward - forward one or more local ports to a standard training job
                                  • runai training pytorch - pytorch management
                                  • runai training resume - resume standard training
                                  • runai training standard - standard training management
                                  • runai training submit - submit standard training
                                  • runai training suspend - suspend standard training
                                  • runai training tensorflow - tensorflow management
                                  • runai training xgboost - xgboost management
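                                  The framework-specific groups above nest under runai training; assuming each framework exposes the same verbs (see the pages that follow), typical usage looks like:
                                  # List workloads across all training frameworks and all projects\nrunai training list -A\n\n# List only tensorflow, then only pytorch, training workloads\nrunai training tensorflow list\nrunai training pytorch list\n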
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_attach/","title":"Runai training attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_attach/#runai-training-attach","title":"runai training attach","text":"

                                  attach to a running container in a standard training job

                                  runai training attach [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_attach/#examples","title":"Examples","text":"
                                  # Attaching to the main worker of a standard training\nrunai training standard attach standard-01 --tty --stdin\n\n# Attaching to a specific pod of a standard training\nrunai training standard attach standard-01 --pod standard-01-worker-1 --tty --stdin\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_attach/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_attach/#see-also","title":"SEE ALSO","text":"
                                  • runai training - training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_bash/","title":"Runai training bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_bash/#runai-training-bash","title":"runai training bash","text":"

                                  open a bash shell in a standard training job

                                  runai training bash [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_bash/#examples","title":"Examples","text":"
                                  # Open a bash shell in the standard training's main worker\nrunai training standard bash standard-01\n\n# Open a bash shell in a specific standard training worker\nrunai training standard bash standard-01 --pod standard-01-worker-1\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_bash/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_bash/#see-also","title":"SEE ALSO","text":"
                                  • runai training - training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_delete/","title":"Runai training delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_delete/#runai-training-delete","title":"runai training delete","text":"

                                  delete standard training workload

                                  runai training delete [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_delete/#examples","title":"Examples","text":"
                                  # Delete a standard training workload with a default project\nrunai training standard delete <standard-name>\n\n# Delete a standard training workload with a specific project\nrunai training standard delete <standard-name> -p <project_name>\n\n# Delete a standard training workload by UUID\nrunai training standard delete --uuid=<standard_uuid> -p <project_name>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_delete/#options","title":"Options","text":"
                                    -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_delete/#see-also","title":"SEE ALSO","text":"
                                  • runai training - training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_describe/","title":"Runai training describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_describe/#runai-training-describe","title":"runai training describe","text":"

                                  describe standard training

                                  runai training describe [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_describe/#examples","title":"Examples","text":"
                                  # Describe a standard training workload with a default project\nrunai training standard describe <standard-name>\n\n# Describe a standard training workload in a specific project\nrunai training standard describe <standard-name> -p <project_name>\n\n# Describe a standard training workload by UUID\nrunai training standard describe --uuid=<standard_uuid>\n\n# Describe a standard training workload with specific output format\nrunai training standard describe <standard-name> -o json\n\n# Describe a standard training workload with specific sections\nrunai training standard describe <standard-name> --general --compute --pods --events --networks\n\n# Describe a standard training workload with container details and custom limits\nrunai training standard describe <standard-name> --containers --pod-limit 20 --event-limit 100\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_describe/#options","title":"Options","text":"
                                        --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_describe/#see-also","title":"SEE ALSO","text":"
                                  • runai training - training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_exec/","title":"Runai training exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_exec/#runai-training-exec","title":"runai training exec","text":"

                                  execute a command in a standard training job

                                  runai training exec [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_exec/#examples","title":"Examples","text":"
                                  # Execute bash in the standard training's main worker\nrunai training standard exec standard-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the standard training's main worker\nrunai training standard exec standard-01 -- ls\n\n# Execute a command in a specific standard training worker\nrunai training standard exec standard-01 --pod standard-01-worker-1 -- nvidia-smi\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_exec/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_exec/#see-also","title":"SEE ALSO","text":"
                                  • runai training - training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_list/","title":"Runai training list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_list/#runai-training-list","title":"runai training list","text":"

                                  list all training frameworks

                                  runai training list [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_list/#examples","title":"Examples","text":"
                                  # List all training workloads from all projects\nrunai training list -A\n\n# List training workloads filtered by status, limited to 20 results\nrunai training list --status=<training_status> --limit=20\n
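                                  An additional sketch using the --framework filter documented below (the framework value is illustrative):
                                  # List only workloads of a given framework in a specific project\nrunai training list --framework=tensorflow -p <project_name>\n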
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_list/#options","title":"Options","text":"
                                    -A, --all                list workloads from all projects\n      --framework string   filter by workload framework\n  -h, --help               help for list\n      --json               Output structure JSON\n      --limit int32        number of workload in list (default 50)\n      --no-headers         Output structure table without headers\n      --offset int32       offset number of limit, default 0 (first offset)\n  -p, --project string     Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string      filter by workload state\n      --table              Output structure table\n      --yaml               Output structure YAML\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_list/#see-also","title":"SEE ALSO","text":"
                                  • runai training - training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_logs/","title":"Runai training logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_logs/#runai-training-logs","title":"runai training logs","text":"

                                  view logs of a standard training job

                                  runai training logs [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_logs/#examples","title":"Examples","text":"
                                  # Get logs for a standard training\nrunai training standard logs standard-01\n\n# Get logs for a specific pod in a standard training\nrunai training standard logs standard-01 --pod=standard-01-worker-0\n\n# Get logs for a specific container in a standard training\nrunai training standard logs standard-01 --container=standard-worker\n\n# Get the last 100 lines of logs\nrunai training standard logs standard-01 --tail=100\n\n# Get logs with timestamps\nrunai training standard logs standard-01 --timestamps\n\n# Follow the logs\nrunai training standard logs standard-01 --follow\n\n# Get logs for the previous instance of the standard training\nrunai training standard logs standard-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai training standard logs standard-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai training standard logs standard-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai training standard logs standard-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for standard training to be ready for logs\nrunai training standard logs standard-01 --wait-timeout=30s\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_logs/#options","title":"Options","text":"
                                     -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_logs/#see-also","title":"SEE ALSO","text":"
                                  • runai training - training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi/","title":"Runai training mpi","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi/#runai-training-mpi","title":"runai training mpi","text":"

                                  mpi management

                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi/#options","title":"Options","text":"
                                    -h, --help   help for mpi\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi/#see-also","title":"SEE ALSO","text":"
                                  • runai training - training management
                                   • runai training mpi attach - attach to a running container in an MPI training job
                                   • runai training mpi bash - open a bash shell in an MPI training job
                                  • runai training mpi delete - delete mpi training workload
                                  • runai training mpi describe - describe mpi training
                                   • runai training mpi exec - execute a command in an MPI training job
                                  • runai training mpi list - list mpi training
                                   • runai training mpi logs - view logs of an MPI training job
                                   • runai training mpi port-forward - forward one or more local ports to an MPI training job
                                  • runai training mpi resume - resume mpi training
                                  • runai training mpi submit - submit mpi training
                                  • runai training mpi suspend - suspend mpi training
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_attach/","title":"Runai training mpi attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_attach/#runai-training-mpi-attach","title":"runai training mpi attach","text":"

                                   attach to a running container in an MPI training job

                                  runai training mpi attach [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_attach/#examples","title":"Examples","text":"
                                  # Attaching to the main worker of a mpi training\nrunai training mpi attach mpi-01 --tty --stdin\n\n# Attaching to a specific pod of a mpi training\nrunai training mpi attach mpi-01 --pod mpi-01-worker-1 --tty --stdin\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_attach/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_attach/#see-also","title":"SEE ALSO","text":"
                                  • runai training mpi - mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_bash/","title":"Runai training mpi bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_bash/#runai-training-mpi-bash","title":"runai training mpi bash","text":"

                                   open a bash shell in an MPI training job

                                  runai training mpi bash [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_bash/#examples","title":"Examples","text":"
                                  # Open a bash shell in the mpi training's main worker\nrunai training mpi bash mpi-01\n\n# Open a bash shell in a specific mpi training worker\nrunai training mpi bash mpi-01 --pod mpi-01-worker-1\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_bash/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
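                                   If the selected pod runs more than one container, the -c flag selects the target container for the shell; the container name below is hypothetical:
                                   # Open a shell in a named container of a specific worker pod
                                   runai training mpi bash mpi-01 --pod mpi-01-worker-1 -c <container_name>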
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_bash/#see-also","title":"SEE ALSO","text":"
                                  • runai training mpi - mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_delete/","title":"Runai training mpi delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_delete/#runai-training-mpi-delete","title":"runai training mpi delete","text":"

                                  delete mpi training workload

                                  runai training mpi delete [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_delete/#examples","title":"Examples","text":"
                                  # Delete a mpi training workload with a default project\nrunai training mpi delete <mpi-name>\n\n# Delete a mpi training workload with a specific project\nrunai training mpi delete <mpi-name> -p <project_name>\n\n# Delete a mpi training workload by UUID\nrunai training mpi delete --uuid=<mpi_uuid> -p <project_name>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_delete/#options","title":"Options","text":"
                                    -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_delete/#see-also","title":"SEE ALSO","text":"
                                  • runai training mpi - mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_describe/","title":"Runai training mpi describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_describe/#runai-training-mpi-describe","title":"runai training mpi describe","text":"

                                  describe mpi training

                                  runai training mpi describe [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_describe/#examples","title":"Examples","text":"
                                  # Describe a mpi training workload with a default project\nrunai training mpi describe <mpi-name>\n\n# Describe a mpi training workload in a specific project\nrunai training mpi describe <mpi-name> -p <project_name>\n\n# Describe a mpi training workload by UUID\nrunai training mpi describe --uuid=<mpi_uuid>\n\n# Describe a mpi training workload with specific output format\nrunai training mpi describe <mpi-name> -o json\n\n# Describe a mpi training workload with specific sections\nrunai training mpi describe <mpi-name> --general --compute --pods --events --networks\n\n# Describe a mpi training workload with container details and custom limits\nrunai training mpi describe <mpi-name> --containers --pod-limit 20 --event-limit 100\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_describe/#options","title":"Options","text":"
                                        --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_describe/#see-also","title":"SEE ALSO","text":"
                                  • runai training mpi - mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_exec/","title":"Runai training mpi exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_exec/#runai-training-mpi-exec","title":"runai training mpi exec","text":"

                                   execute a command in an MPI training job

                                  runai training mpi exec [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_exec/#examples","title":"Examples","text":"
                                  # Execute bash in the mpi training's main worker\nrunai training mpi exec mpi-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the mpi training's main worker\nrunai training mpi exec mpi-01 -- ls\n\n# Execute a command in a specific mpi training worker\nrunai training mpi exec mpi-01 --pod mpi-01-worker-1 -- nvidia-smi\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_exec/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_exec/#see-also","title":"SEE ALSO","text":"
                                  • runai training mpi - mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_list/","title":"Runai training mpi list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_list/#runai-training-mpi-list","title":"runai training mpi list","text":"

                                  list mpi training

                                  runai training mpi list [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_list/#examples","title":"Examples","text":"
                                  # List all mpi training workloads\nrunai training mpi list -A\n\n# List mpi training workloads with default project\nrunai training mpi list\n\n# List mpi training workloads in a specific project\nrunai training mpi list -p <project_name>\n\n# List all mpi training workloads with a specific output format\nrunai training mpi list -o wide\n\n# List mpi training workloads with pagination\nrunai training mpi list --limit 20 --offset 40\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_list/#options","title":"Options","text":"
                                    -A, --all              list workloads from all projects\n  -h, --help             help for list\n      --json             Output structure JSON\n      --limit int32      number of workload in list (default 50)\n      --no-headers       Output structure table without headers\n      --offset int32     offset number of limit, default 0 (first offset)\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string    filter by workload state\n      --table            Output structure table\n      --yaml             Output structure YAML\n
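                                   A common pattern is to list workloads as JSON in order to pick out the workload UUID needed by the --uuid flag of delete, suspend and resume. The project name below is illustrative, and the exact JSON layout may vary by version:
                                   # List MPI trainings in project team-a as JSON and note the workload UUID
                                   runai training mpi list -p team-a --json
                                   # Then act on the workload by UUID
                                   runai training mpi suspend --uuid=<mpi_uuid> -p team-a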
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_list/#see-also","title":"SEE ALSO","text":"
                                  • runai training mpi - mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_logs/","title":"Runai training mpi logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_logs/#runai-training-mpi-logs","title":"runai training mpi logs","text":"

                                   view logs of an MPI training job

                                  runai training mpi logs [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_logs/#examples","title":"Examples","text":"
                                  # Get logs for a mpi training\nrunai training mpi logs mpi-01\n\n# Get logs for a specific pod in a mpi training\nrunai training mpi logs mpi-01 --pod=mpi-01-worker-0\n\n# Get logs for a specific container in a mpi training\nrunai training mpi logs mpi-01 --container=mpi-worker\n\n# Get the last 100 lines of logs\nrunai training mpi logs mpi-01 --tail=100\n\n# Get logs with timestamps\nrunai training mpi logs mpi-01 --timestamps\n\n# Follow the logs\nrunai training mpi logs mpi-01 --follow\n\n# Get logs for the previous instance of the mpi training\nrunai training mpi logs mpi-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai training mpi logs mpi-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai training mpi logs mpi-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai training mpi logs mpi-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for mpi training to be ready for logs\nrunai training mpi logs mpi-01 --wait-timeout=30s\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_logs/#options","title":"Options","text":"
                                     -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_logs/#see-also","title":"SEE ALSO","text":"
                                  • runai training mpi - mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_port-forward/","title":"Runai training mpi port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_port-forward/#runai-training-mpi-port-forward","title":"runai training mpi port-forward","text":"

                                   forward one or more local ports to an MPI training job

                                  runai training mpi port-forward [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_port-forward/#examples","title":"Examples","text":"
                                  # Forward connections from localhost:8080 to mpi training on port 8090:\nrunai training mpi port-forward mpi-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to mpi training on port 8080:\nrunai training mpi port-forward mpi-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to mpi training on port 8090 and from localhost:6443 to mpi training on port 443:\nrunai training mpi port-forward mpi-01 --port 8080:8090 --port 6443:443 --address localhost\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_port-forward/#options","title":"Options","text":"
                                        --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
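                                   Once the tunnel is established, anything served by the workload on the forwarded port is reachable locally; the port number below is hypothetical and depends on what the container actually exposes:
                                   # Forward localhost:8888 to port 8888 in the mpi training's master pod
                                   runai training mpi port-forward mpi-01 --port 8888 --address localhost
                                   # In a second terminal, connect through the tunnel
                                   curl http://localhost:8888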
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_port-forward/#see-also","title":"SEE ALSO","text":"
                                  • runai training mpi - mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_resume/","title":"Runai training mpi resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_resume/#runai-training-mpi-resume","title":"runai training mpi resume","text":"

                                  resume mpi training

                                  runai training mpi resume [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_resume/#examples","title":"Examples","text":"
                                  # Resume a mpi training workload\nrunai training mpi resume <mpi-name>\n\n# Resume a mpi training workload in a specific project\nrunai training mpi resume <mpi-name> -p <project_name>\n\n# Resume a mpi training workload by UUID\nrunai training mpi resume --uuid=<mpi_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_resume/#options","title":"Options","text":"
                                    -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_resume/#see-also","title":"SEE ALSO","text":"
                                  • runai training mpi - mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_submit/","title":"Runai training mpi submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_submit/#runai-training-mpi-submit","title":"runai training mpi submit","text":"

                                  submit mpi training

                                  runai training mpi submit [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_submit/#examples","title":"Examples","text":"
                                   # Submit a mpi training workload\nrunai training mpi submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a mpi training workload with arguments\nrunai training mpi submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a mpi training workload with a custom command\nrunai training mpi submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a mpi training with master args and worker args\nrunai training mpi submit <name> -p <project_name> -i ubuntu --master-args \"-a master_arg_a -b master_arg_b\" -- '-a worker_arg_a'\n\n# Submit a mpi training with a master command and worker args\nrunai training mpi submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" -- '-a worker_arg_a'\n\n# Submit a mpi training with a master command and a worker command\nrunai training mpi submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" --command -- echo -e 'worker command'\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_submit/#options","title":"Options","text":"
                                        --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n      --clean-pod-policy string                        Specifies which pods will be deleted when the workload reaches a terminal state (completed/failed)\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the fhe format name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. 
(Default: false)\n      --host-network                                   Whether to enable host networking. (Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --master-args string                             Specifies the arguments to pass to the master pod container command\n      --master-command string                          Specifies the command to run in the master pod container, overriding the image's default entrypoint. The command can include arguments following it.\n      --master-environment stringArray                 Set master environment variables in the container\n      --master-extended-resource stringArray           Request access to an extended resource. Use the format: resource_name=quantity\n      --master-no-pvcs                                 Do not mount any persistent volumes in the master pod\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    takes the uid, gid, and supplementary groups fields from the token, if all the fields do not exist, uses the local running terminal user credentials. 
If any of the fields exist, only the existing fields are taken\n      --s3 stringArray                                 s3 storage details. Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --slots-per-worker int32                         Number of slots to allocate for each worker\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma-separated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group ids of the container, options: fromTheImage|fromIdpToken|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --workers int32                                  The number of workers that will be allocated for running the workload\n      --working-dir string                             Set the container's working directory\n
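                                   As a sketch of how the resource and storage flags above combine (the workload, project, image and claim names are illustrative, not prescriptive), a small distributed submission might look like:
                                   # Submit an mpi training with 4 workers, requesting 1 GPU and 8G of CPU memory,
                                   # and mounting an existing PVC into the containers
                                   runai training mpi submit dist-train -p team-a -i <image_name> \
                                       --workers 4 --slots-per-worker 1 \
                                       -g 1 --cpu-memory-request 8G \
                                       --existing-pvc claimname=training-data,path=/data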
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_submit/#see-also","title":"SEE ALSO","text":"
                                  • runai training mpi - mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_suspend/","title":"Runai training mpi suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_suspend/#runai-training-mpi-suspend","title":"runai training mpi suspend","text":"

                                  suspend mpi training

                                  runai training mpi suspend [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_suspend/#examples","title":"Examples","text":"
                                  # Suspend a mpi training workload\nrunai training mpi suspend <mpi-name>\n\n# Suspend a mpi training workload in a specific project\nrunai training mpi suspend <mpi-name> -p <project_name>\n\n# Suspend a mpi training workload by UUID\nrunai training mpi suspend --uuid=<mpi_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_suspend/#options","title":"Options","text":"
                                    -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
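                                   Suspension is reversible: a suspended workload can later be brought back with runai training mpi resume. The workload and project names below are illustrative:
                                   # Suspend a running mpi training, then resume it later
                                   runai training mpi suspend mpi-01 -p team-a
                                   runai training mpi resume mpi-01 -p team-a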
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_mpi_suspend/#see-also","title":"SEE ALSO","text":"
                                  • runai training mpi - mpi management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_port-forward/","title":"Runai training port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_port-forward/#runai-training-port-forward","title":"runai training port-forward","text":"

                                  forward one or more local ports to a standard training job

                                  runai training port-forward [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_port-forward/#examples","title":"Examples","text":"
                                  # Forward connections from localhost:8080 to standard training on port 8090:\nrunai training standard port-forward standard-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to standard training on port 8080:\nrunai training standard port-forward standard-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to standard training on port 8090 and from localhost:6443 to standard training on port 443:\nrunai training standard port-forward standard-01 --port 8080:8090 --port 6443:443 --address localhost\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_port-forward/#options","title":"Options","text":"
                                        --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_port-forward/#see-also","title":"SEE ALSO","text":"
                                  • runai training - training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch/","title":"Runai training pytorch","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch/#runai-training-pytorch","title":"runai training pytorch","text":"

                                  pytorch management

                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch/#options","title":"Options","text":"
                                    -h, --help   help for pytorch\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch/#see-also","title":"SEE ALSO","text":"
                                  • runai training - training management
                                  • runai training pytorch attach - attach to a running container in a pytorch training job
                                  • runai training pytorch bash - open a bash shell in a pytorch training job
                                  • runai training pytorch delete - delete pytorch training workload
                                  • runai training pytorch describe - describe pytorch training
                                  • runai training pytorch exec - execute a command in a pytorch training job
                                  • runai training pytorch list - list pytorch training
                                  • runai training pytorch logs - view logs of a pytorch training job
                                  • runai training pytorch port-forward - forward one or more local ports to a pytorch training job
                                  • runai training pytorch resume - resume pytorch training
                                  • runai training pytorch submit - submit pytorch training
                                  • runai training pytorch suspend - suspend pytorch training
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_attach/","title":"Runai training pytorch attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_attach/#runai-training-pytorch-attach","title":"runai training pytorch attach","text":"

                                  attach to a running container in a pytorch training job

                                  runai training pytorch attach [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_attach/#examples","title":"Examples","text":"
                                  # Attaching to the main worker of a pytorch training\nrunai training pytorch attach pytorch-01 --tty --stdin\n\n# Attaching to a specific pod of a pytorch training\nrunai training pytorch attach pytorch-01 --pod pytorch-01-worker-1 --tty --stdin\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_attach/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_attach/#see-also","title":"SEE ALSO","text":"
                                  • runai training pytorch - pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_bash/","title":"Runai training pytorch bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_bash/#runai-training-pytorch-bash","title":"runai training pytorch bash","text":"

                                  open a bash shell in a pytorch training job

                                  runai training pytorch bash [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_bash/#examples","title":"Examples","text":"
                                  # Open a bash shell in the pytorch training's main worker\nrunai training pytorch bash pytorch-01\n\n# Open a bash shell in a specific pytorch training worker\nrunai training pytorch bash pytorch-01 --pod pytorch-01-worker-1\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_bash/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_bash/#see-also","title":"SEE ALSO","text":"
                                  • runai training pytorch - pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_delete/","title":"Runai training pytorch delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_delete/#runai-training-pytorch-delete","title":"runai training pytorch delete","text":"

                                  delete pytorch training workload

                                  runai training pytorch delete [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_delete/#examples","title":"Examples","text":"
                                  # Delete a pytorch training workload with a default project\nrunai training pytorch delete <pytorch-name>\n\n# Delete a pytorch training workload with a specific project\nrunai training pytorch delete <pytorch-name> -p <project_name>\n\n# Delete a pytorch training workload by UUID\nrunai training pytorch delete --uuid=<pytorch_uuid> -p <project_name>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_delete/#options","title":"Options","text":"
                                    -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_delete/#see-also","title":"SEE ALSO","text":"
                                  • runai training pytorch - pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_describe/","title":"Runai training pytorch describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_describe/#runai-training-pytorch-describe","title":"runai training pytorch describe","text":"

                                  describe pytorch training

                                  runai training pytorch describe [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_describe/#examples","title":"Examples","text":"
                                  # Describe a pytorch training workload with a default project\nrunai training pytorch describe <pytorch-name>\n\n# Describe a pytorch training workload in a specific project\nrunai training pytorch describe <pytorch-name> -p <project_name>\n\n# Describe a pytorch training workload by UUID\nrunai training pytorch describe --uuid=<pytorch_uuid>\n\n# Describe a pytorch training workload with specific output format\nrunai training pytorch describe <pytorch-name> -o json\n\n# Describe a pytorch training workload with specific sections\nrunai training pytorch describe <pytorch-name> --general --compute --pods --events --networks\n\n# Describe a pytorch training workload with container details and custom limits\nrunai training pytorch describe <pytorch-name> --containers --pod-limit 20 --event-limit 100\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_describe/#options","title":"Options","text":"
                                        --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_describe/#see-also","title":"SEE ALSO","text":"
                                  • runai training pytorch - pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_exec/","title":"Runai training pytorch exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_exec/#runai-training-pytorch-exec","title":"runai training pytorch exec","text":"

                                  execute a command in a pytorch training job

                                  runai training pytorch exec [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_exec/#examples","title":"Examples","text":"
                                  # Execute bash in the pytorch training's main worker\nrunai training pytorch exec pytorch-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the pytorch training's main worker\nrunai training pytorch exec pytorch-01 -- ls\n\n# Execute a command in a specific pytorch training worker\nrunai training pytorch exec pytorch-01 --pod pytorch-01-worker-1 -- nvidia-smi\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_exec/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_exec/#see-also","title":"SEE ALSO","text":"
                                  • runai training pytorch - pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_list/","title":"Runai training pytorch list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_list/#runai-training-pytorch-list","title":"runai training pytorch list","text":"

                                  list pytorch training

                                  runai training pytorch list [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_list/#examples","title":"Examples","text":"
                                  # List all pytorch training workloads\nrunai training pytorch list -A\n\n# List pytorch training workloads with default project\nrunai training pytorch list\n\n# List pytorch training workloads in a specific project\nrunai training pytorch list -p <project_name>\n\n# List all pytorch training workloads with a specific output format\nrunai training pytorch list -o wide\n\n# List pytorch training workloads with pagination\nrunai training pytorch list --limit 20 --offset 40\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_list/#options","title":"Options","text":"
                                    -A, --all              list workloads from all projects\n  -h, --help             help for list\n      --json             Output structure JSON\n      --limit int32      number of workload in list (default 50)\n      --no-headers       Output structure table without headers\n      --offset int32     offset number of limit, default 0 (first offset)\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string    filter by workload state\n      --table            Output structure table\n      --yaml             Output structure YAML\n
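The examples above do not exercise the filtering and structured-output flags listed here. A minimal sketch, assuming Running is one of the state names your cluster reports (the value is a placeholder, not taken from this reference):

# List only running pytorch training workloads, as JSON
runai training pytorch list --status Running --json

# List workloads from all projects without table headers
runai training pytorch list -A --no-headers

Both commands use only flags from the list above.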
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_list/#see-also","title":"SEE ALSO","text":"
                                  • runai training pytorch - pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_logs/","title":"Runai training pytorch logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_logs/#runai-training-pytorch-logs","title":"runai training pytorch logs","text":"

                                  view logs of a pytorch training job

                                  runai training pytorch logs [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_logs/#examples","title":"Examples","text":"
                                  # Get logs for a pytorch training\nrunai training pytorch logs pytorch-01\n\n# Get logs for a specific pod in a pytorch training\nrunai training pytorch logs pytorch-01 --pod=pytorch-01-worker-0\n\n# Get logs for a specific container in a pytorch training\nrunai training pytorch logs pytorch-01 --container=pytorch-worker\n\n# Get the last 100 lines of logs\nrunai training pytorch logs pytorch-01 --tail=100\n\n# Get logs with timestamps\nrunai training pytorch logs pytorch-01 --timestamps\n\n# Follow the logs\nrunai training pytorch logs pytorch-01 --follow\n\n# Get logs for the previous instance of the pytorch training\nrunai training pytorch logs pytorch-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai training pytorch logs pytorch-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai training pytorch logs pytorch-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai training pytorch logs pytorch-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for pytorch training to be ready for logs\nrunai training pytorch logs pytorch-01 --wait-timeout=30s\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_logs/#options","title":"Options","text":"
  -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_logs/#see-also","title":"SEE ALSO","text":"
                                  • runai training pytorch - pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_port-forward/","title":"Runai training pytorch port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_port-forward/#runai-training-pytorch-port-forward","title":"runai training pytorch port-forward","text":"

                                  forward one or more local ports to a pytorch training job

                                  runai training pytorch port-forward [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_port-forward/#examples","title":"Examples","text":"
                                  # Forward connections from localhost:8080 to pytorch training on port 8090:\nrunai training pytorch port-forward pytorch-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to pytorch training on port 8080:\nrunai training pytorch port-forward pytorch-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to pytorch training on port 8090 and from localhost:6443 to pytorch training on port 443:\nrunai training pytorch port-forward pytorch-01 --port 8080:8090 --port 6443:443 --address localhost\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_port-forward/#options","title":"Options","text":"
                                        --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_port-forward/#see-also","title":"SEE ALSO","text":"
                                  • runai training pytorch - pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_resume/","title":"Runai training pytorch resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_resume/#runai-training-pytorch-resume","title":"runai training pytorch resume","text":"

                                  resume pytorch training

                                  runai training pytorch resume [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_resume/#examples","title":"Examples","text":"
                                  # Resume a pytorch training workload\nrunai training pytorch resume <pytorch-name>\n\n# Resume a pytorch training workload in a specific project\nrunai training pytorch resume <pytorch-name> -p <project_name>\n\n# Resume a pytorch training workload by UUID\nrunai training pytorch resume --uuid=<pytorch_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_resume/#options","title":"Options","text":"
                                    -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_resume/#see-also","title":"SEE ALSO","text":"
                                  • runai training pytorch - pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_submit/","title":"Runai training pytorch submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_submit/#runai-training-pytorch-submit","title":"runai training pytorch submit","text":"

                                  submit pytorch training

                                  runai training pytorch submit [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_submit/#examples","title":"Examples","text":"
# Submit a pytorch training workload\nrunai training pytorch submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a pytorch training workload with arguments\nrunai training pytorch submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a pytorch training workload with a custom command\nrunai training pytorch submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a pytorch training workload with master args and worker args\nrunai training pytorch submit <name> -p <project_name> -i ubuntu --master-args \"-a master_arg_a -b master_arg_b\" -- '-a worker_arg_a'\n\n# Submit a pytorch training workload with a master command and worker args\nrunai training pytorch submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" -- '-a worker_arg_a'\n\n# Submit a pytorch training workload with a master command and worker command\nrunai training pytorch submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" --command -- echo -e 'worker command'\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_submit/#options","title":"Options","text":"
                                        --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n      --clean-pod-policy string                        Specifies which pods will be deleted when the workload reaches a terminal state (completed/failed)\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the fhe format name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. 
(Default: false)\n      --host-network                                   Whether to enable host networking. (Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --master-args string                             Specifies the arguments to pass to the master pod container command\n      --master-command string                          Specifies the command to run in the master pod container, overriding the image's default entrypoint. The command can include arguments following it.\n      --master-environment stringArray                 Set master environment variables in the container\n      --master-extended-resource stringArray           Request access to an extended resource. Use the format: resource_name=quantity\n      --master-gpu-devices-request int32               GPU units to allocate for the job (e.g. 1, 2)\n      --master-gpu-portion-limit float                 GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-gpu-portion-request float               GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-no-pvcs                                 Do not mount any persistent volumes in the master pod\n      --max-replicas int32                             Maximum number of replicas for an elastic PyTorch job\n      --min-replicas int32                             Minimum number of replicas for an elastic PyTorch job\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --no-master                                      Do not create a separate pod for the master\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. 
To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    Takes the uid, gid, and supplementary groups fields from the token. If none of these fields exist, the local running terminal user credentials are used; if only some exist, only the existing fields are taken\n      --s3 stringArray                                 s3 storage details. Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma-separated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group IDs of the container, options: fromTheImage|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --workers int32                                  The number of workers that will be allocated for running the workload\n      --working-dir string                             Set the container's working directory\n
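For a sense of how these flags combine in practice, here is a hedged, illustrative submission; the workload, project, image, claim, and script names are placeholders rather than values taken from this reference:

# Submit a pytorch training with two workers, GPU and CPU requests, and an existing PVC mounted at /data
runai training pytorch submit dist-train -p <project_name> -i <image_name> -g 1 --workers 2 --cpu-core-request 4 --cpu-memory-request 16G --existing-pvc claimname=<claim_name>,path=/data --large-shm --command -- python /data/train.py

Every flag used here appears in the options above; adjust the resource requests and mounts to your own workload.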
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_submit/#see-also","title":"SEE ALSO","text":"
                                  • runai training pytorch - pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_suspend/","title":"Runai training pytorch suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_suspend/#runai-training-pytorch-suspend","title":"runai training pytorch suspend","text":"

                                  suspend pytorch training

                                  runai training pytorch suspend [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_suspend/#examples","title":"Examples","text":"
                                  # Suspend a pytorch training workload\nrunai training pytorch suspend <pytorch-name>\n\n# Suspend a pytorch training workload in a specific project\nrunai training pytorch suspend <pytorch-name> -p <project_name>\n\n# Suspend a pytorch training workload by UUID\nrunai training pytorch suspend --uuid=<pytorch_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_suspend/#options","title":"Options","text":"
                                    -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_pytorch_suspend/#see-also","title":"SEE ALSO","text":"
                                  • runai training pytorch - pytorch management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_resume/","title":"Runai training resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_resume/#runai-training-resume","title":"runai training resume","text":"

                                  resume standard training

                                  runai training resume [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_resume/#examples","title":"Examples","text":"
# Resume a standard training workload\nrunai training resume <standard-name>\n\n# Resume a standard training workload in a specific project\nrunai training resume <standard-name> -p <project_name>\n\n# Resume a standard training workload by UUID\nrunai training resume --uuid=<standard_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_resume/#options","title":"Options","text":"
                                    -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_resume/#see-also","title":"SEE ALSO","text":"
                                  • runai training - training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard/","title":"Runai training standard","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard/#runai-training-standard","title":"runai training standard","text":"

                                  standard training management

                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard/#options","title":"Options","text":"
                                    -h, --help   help for standard\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard/#see-also","title":"SEE ALSO","text":"
                                  • runai training - training management
                                  • runai training standard attach - attach to a running container in a standard training job
                                  • runai training standard bash - open a bash shell in a standard training job
                                  • runai training standard delete - delete standard training workload
                                  • runai training standard describe - describe standard training
                                  • runai training standard exec - execute a command in a standard training job
                                  • runai training standard list - list standard training
                                  • runai training standard logs - view logs of a standard training job
                                  • runai training standard port-forward - forward one or more local ports to a standard training job
                                  • runai training standard resume - resume standard training
                                  • runai training standard submit - submit standard training
                                  • runai training standard suspend - suspend standard training
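As a rough end-to-end sketch of how these subcommands fit together (workload and project names are placeholders; each command is documented on its own page in this reference):

# Inspect, follow, and clean up a standard training workload
runai training standard list -p <project_name>
runai training standard describe <standard-name> -p <project_name>
runai training standard logs <standard-name> -f
runai training standard delete <standard-name> -p <project_name>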
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_attach/","title":"Runai training standard attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_attach/#runai-training-standard-attach","title":"runai training standard attach","text":"

                                  attach to a running container in a standard training job

                                  runai training standard attach [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_attach/#examples","title":"Examples","text":"
                                  # Attaching to the main worker of a standard training\nrunai training standard attach standard-01 --tty --stdin\n\n# Attaching to a specific pod of a standard training\nrunai training standard attach standard-01 --pod standard-01-worker-1 --tty --stdin\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_attach/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_attach/#see-also","title":"SEE ALSO","text":"
                                  • runai training standard - standard training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_bash/","title":"Runai training standard bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_bash/#runai-training-standard-bash","title":"runai training standard bash","text":"

                                  open a bash shell in a standard training job

                                  runai training standard bash [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_bash/#examples","title":"Examples","text":"
                                  # Open a bash shell in the standard training's main worker\nrunai training standard bash standard-01\n\n# Open a bash shell in a specific standard training worker\nrunai training standard bash standard-01 --pod standard-01-worker-1\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_bash/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_bash/#see-also","title":"SEE ALSO","text":"
                                  • runai training standard - standard training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_delete/","title":"Runai training standard delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_delete/#runai-training-standard-delete","title":"runai training standard delete","text":"

                                  delete standard training workload

                                  runai training standard delete [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_delete/#examples","title":"Examples","text":"
                                  # Delete a standard training workload with a default project\nrunai training standard delete <standard-name>\n\n# Delete a standard training workload with a specific project\nrunai training standard delete <standard-name> -p <project_name>\n\n# Delete a standard training workload by UUID\nrunai training standard delete --uuid=<standard_uuid> -p <project_name>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_delete/#options","title":"Options","text":"
                                    -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_delete/#see-also","title":"SEE ALSO","text":"
                                  • runai training standard - standard training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_describe/","title":"Runai training standard describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_describe/#runai-training-standard-describe","title":"runai training standard describe","text":"

                                  describe standard training

                                  runai training standard describe [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_describe/#examples","title":"Examples","text":"
                                  # Describe a standard training workload with a default project\nrunai training standard describe <standard-name>\n\n# Describe a standard training workload in a specific project\nrunai training standard describe <standard-name> -p <project_name>\n\n# Describe a standard training workload by UUID\nrunai training standard describe --uuid=<standard_uuid>\n\n# Describe a standard training workload with specific output format\nrunai training standard describe <standard-name> -o json\n\n# Describe a standard training workload with specific sections\nrunai training standard describe <standard-name> --general --compute --pods --events --networks\n\n# Describe a standard training workload with container details and custom limits\nrunai training standard describe <standard-name> --containers --pod-limit 20 --event-limit 100\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_describe/#options","title":"Options","text":"
                                        --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_describe/#see-also","title":"SEE ALSO","text":"
                                  • runai training standard - standard training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_exec/","title":"Runai training standard exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_exec/#runai-training-standard-exec","title":"runai training standard exec","text":"

                                  execute a command in a standard training job

                                  runai training standard exec [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_exec/#examples","title":"Examples","text":"
                                  # Execute bash in the standard training's main worker\nrunai training standard exec standard-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the standard training's main worker\nrunai training standard exec standard-01 -- ls\n\n# Execute a command in a specific standard training worker\nrunai training standard exec standard-01 --pod standard-01-worker-1 -- nvidia-smi\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_exec/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_exec/#see-also","title":"SEE ALSO","text":"
                                  • runai training standard - standard training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_list/","title":"Runai training standard list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_list/#runai-training-standard-list","title":"runai training standard list","text":"

                                  list standard training

                                  runai training standard list [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_list/#examples","title":"Examples","text":"
                                  # List all standard training workloads\nrunai training standard list -A\n\n# List standard training workloads with default project\nrunai training standard list\n\n# List standard training workloads in a specific project\nrunai training standard list -p <project_name>\n\n# List all standard training workloads with a specific output format\nrunai training standard list -o wide\n\n# List standard training workloads with pagination\nrunai training standard list --limit 20 --offset 40\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_list/#options","title":"Options","text":"
                                    -A, --all              list workloads from all projects\n  -h, --help             help for list\n      --json             Output structure JSON\n      --limit int32      number of workload in list (default 50)\n      --no-headers       Output structure table without headers\n      --offset int32     offset number of limit, default 0 (first offset)\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string    filter by workload state\n      --table            Output structure table\n      --yaml             Output structure YAML\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_list/#see-also","title":"SEE ALSO","text":"
                                  • runai training standard - standard training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_logs/","title":"Runai training standard logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_logs/#runai-training-standard-logs","title":"runai training standard logs","text":"

                                  view logs of a standard training job

                                  runai training standard logs [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_logs/#examples","title":"Examples","text":"
                                  # Get logs for a standard training\nrunai training standard logs standard-01\n\n# Get logs for a specific pod in a standard training\nrunai training standard logs standard-01 --pod=standard-01-worker-0\n\n# Get logs for a specific container in a standard training\nrunai training standard logs standard-01 --container=standard-worker\n\n# Get the last 100 lines of logs\nrunai training standard logs standard-01 --tail=100\n\n# Get logs with timestamps\nrunai training standard logs standard-01 --timestamps\n\n# Follow the logs\nrunai training standard logs standard-01 --follow\n\n# Get logs for the previous instance of the standard training\nrunai training standard logs standard-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai training standard logs standard-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai training standard logs standard-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai training standard logs standard-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for standard training to be ready for logs\nrunai training standard logs standard-01 --wait-timeout=30s\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_logs/#options","title":"Options","text":"
                                     -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
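                                   These flags can also be combined; a minimal sketch for live-tailing recent log lines with timestamps:
                                   # Follow logs from the last 10 minutes, with timestamps\nrunai training standard logs standard-01 --since=10m --follow --timestamps\n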
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_logs/#see-also","title":"SEE ALSO","text":"
                                  • runai training standard - standard training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_port-forward/","title":"Runai training standard port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_port-forward/#runai-training-standard-port-forward","title":"runai training standard port-forward","text":"

                                  forward one or more local ports to a standard training job

                                  runai training standard port-forward [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_port-forward/#examples","title":"Examples","text":"
                                  # Forward connections from localhost:8080 to standard training on port 8090:\nrunai training standard port-forward standard-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to standard training on port 8080:\nrunai training standard port-forward standard-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to standard training on port 8090 and from localhost:6443 to standard training on port 443:\nrunai training standard port-forward standard-01 --port 8080:8090 --port 6443:443 --address localhost\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_port-forward/#options","title":"Options","text":"
                                        --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
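                                   The --pod flag selects a specific pod of the workload for forwarding; as an illustrative sketch (the pod name is an example):
                                   # Forward connections from localhost:8080 to port 8090 of a specific pod of the standard training\nrunai training standard port-forward standard-01 --port 8080:8090 --pod standard-01-worker-0 --address localhost\n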
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_port-forward/#see-also","title":"SEE ALSO","text":"
                                  • runai training standard - standard training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_resume/","title":"Runai training standard resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_resume/#runai-training-standard-resume","title":"runai training standard resume","text":"

                                  resume standard training

                                  runai training standard resume [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_resume/#examples","title":"Examples","text":"
                                  # Resume a standard training workload\nrunai training standard resume <standard-name>\n\n# Resume a standard training workload in a specific project\nrunai training standard resume <standard-name> -p <project_name>\n\n# Resume a standard training workload by UUID\nrunai training standard resume --uuid=<standard_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_resume/#options","title":"Options","text":"
                                    -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_resume/#see-also","title":"SEE ALSO","text":"
                                  • runai training standard - standard training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_submit/","title":"Runai training standard submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_submit/#runai-training-standard-submit","title":"runai training standard submit","text":"

                                  submit standard training

                                  runai training standard submit [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_submit/#examples","title":"Examples","text":"
                                   # Submit a standard training workload\nrunai training standard submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a standard training workload with arguments\nrunai training standard submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a standard training workload with a custom command\nrunai training standard submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a standard training Jupyter notebook\nrunai training standard submit <name> -p <project_name> -i jupyter/scipy-notebook --gpu-devices-request 1 --external-url container=8888 --name-prefix jupyter --command -- start-notebook.sh --NotebookApp.base_url='/${RUNAI_PROJECT}/${RUNAI_JOB_NAME}' --NotebookApp.token='\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_submit/#options","title":"Options","text":"
                                         --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the format name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. (Default: false)\n      --host-network                                   Whether to enable host networking. 
(Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --parallelism int32                              Specifies the maximum number of pods that should run in parallel at any given time\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    takes the uid, gid, and supplementary groups fields from the token, if all the fields do not exist, uses the local running terminal user credentials. if any of the fields exist take only the existing fields\n      --runs int32                                     Number of successful runs required for this workload to be considered completed\n      --s3 stringArray                                 s3 storage details. 
Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma separated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group ids of the container, options: fromTheImage|fromIdpToken|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --working-dir string                             Set the container's working directory\n
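                                   As an illustrative sketch of combining the storage and GPU fraction flags above (the claim name, mount path, and fraction are example values):
                                   # Submit a standard training with an existing PVC mounted and half a GPU requested\nrunai training standard submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo --existing-pvc claimname=my-claim,path=/data --gpu-portion-request 0.5\n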
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_submit/#see-also","title":"SEE ALSO","text":"
                                  • runai training standard - standard training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_suspend/","title":"Runai training standard suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_suspend/#runai-training-standard-suspend","title":"runai training standard suspend","text":"

                                  suspend standard training

                                  runai training standard suspend [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_suspend/#examples","title":"Examples","text":"
                                  # Suspend a standard training workload\nrunai training standard suspend <standard-name>\n\n# Suspend a standard training workload in a specific project\nrunai training standard suspend <standard-name> -p <project_name>\n\n# Suspend a standard training workload by UUID\nrunai training standard suspend --uuid=<standard_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_suspend/#options","title":"Options","text":"
                                    -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_standard_suspend/#see-also","title":"SEE ALSO","text":"
                                  • runai training standard - standard training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_submit/","title":"Runai training submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_submit/#runai-training-submit","title":"runai training submit","text":"

                                  submit standard training

                                  runai training submit [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_submit/#examples","title":"Examples","text":"
                                   # Submit a standard training workload\nrunai training standard submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a standard training workload with arguments\nrunai training standard submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a standard training workload with a custom command\nrunai training standard submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a standard training Jupyter notebook\nrunai training standard submit <name> -p <project_name> -i jupyter/scipy-notebook --gpu-devices-request 1 --external-url container=8888 --name-prefix jupyter --command -- start-notebook.sh --NotebookApp.base_url='/${RUNAI_PROJECT}/${RUNAI_JOB_NAME}' --NotebookApp.token='\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_submit/#options","title":"Options","text":"
                                         --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the format name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. (Default: false)\n      --host-network                                   Whether to enable host networking. 
(Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --parallelism int32                              Specifies the maximum number of pods that should run in parallel at any given time\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    takes the uid, gid, and supplementary groups fields from the token, if all the fields do not exist, uses the local running terminal user credentials. if any of the fields exist take only the existing fields\n      --runs int32                                     Number of successful runs required for this workload to be considered completed\n      --s3 stringArray                                 s3 storage details. 
Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma separated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group ids of the container, options: fromTheImage|fromIdpToken|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --working-dir string                             Set the container's working directory\n
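                                   As an illustrative sketch of the --git-sync flag documented above (all values are placeholders following the documented format):
                                   # Submit a training workload with a git repository synced into the container\nrunai training submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo --git-sync name=code,repository=<repo_url>,path=/workspace/code,secret=<secret_name>,rev=main\n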
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_submit/#see-also","title":"SEE ALSO","text":"
                                  • runai training - training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_suspend/","title":"Runai training suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_suspend/#runai-training-suspend","title":"runai training suspend","text":"

                                  suspend standard training

                                  runai training suspend [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_suspend/#examples","title":"Examples","text":"
                                  # Suspend a standard training workload\nrunai training standard suspend <standard-name>\n\n# Suspend a standard training workload in a specific project\nrunai training standard suspend <standard-name> -p <project_name>\n\n# Suspend a standard training workload by UUID\nrunai training standard suspend --uuid=<standard_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_suspend/#options","title":"Options","text":"
                                    -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_suspend/#see-also","title":"SEE ALSO","text":"
                                  • runai training - training management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow/","title":"Runai training tensorflow","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow/#runai-training-tensorflow","title":"runai training tensorflow","text":"

                                  tensorflow management

                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow/#options","title":"Options","text":"
                                    -h, --help   help for tensorflow\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow/#see-also","title":"SEE ALSO","text":"
                                  • runai training - training management
                                  • runai training tensorflow attach - attach to a running container in a tf training job
                                  • runai training tensorflow bash - open a bash shell in a tf training job
                                  • runai training tensorflow delete - delete tf training workload
                                  • runai training tensorflow describe - describe tf training
                                  • runai training tensorflow exec - execute a command in a tf training job
                                  • runai training tensorflow list - list tf training
                                  • runai training tensorflow logs - view logs of a tf training job
                                  • runai training tensorflow port-forward - forward one or more local ports to a tf training job
                                  • runai training tensorflow resume - resume tf training
                                  • runai training tensorflow submit - submit tf training
                                  • runai training tensorflow suspend - suspend tf training
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_attach/","title":"Runai training tensorflow attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_attach/#runai-training-tensorflow-attach","title":"runai training tensorflow attach","text":"

                                  attach to a running container in a tf training job

                                  runai training tensorflow attach [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_attach/#examples","title":"Examples","text":"
                                  # Attaching to the main worker of a tf training\nrunai training tf attach tf-01 --tty --stdin\n\n# Attaching to a specific pod of a tf training\nrunai training tf attach tf-01 --pod tf-01-worker-1 --tty --stdin\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_attach/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
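                                   When a pod runs more than one container, the --container flag selects the attach target; as an illustrative sketch (the container name is an example):
                                   # Attaching to a named container in the main worker of a tf training\nrunai training tf attach tf-01 --container tf-worker --tty --stdin\n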
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_attach/#see-also","title":"SEE ALSO","text":"
                                  • runai training tensorflow - tensorflow management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_bash/","title":"Runai training tensorflow bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_bash/#runai-training-tensorflow-bash","title":"runai training tensorflow bash","text":"

                                  open a bash shell in a tf training job

                                  runai training tensorflow bash [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_bash/#examples","title":"Examples","text":"
                                  # Open a bash shell in the tf training's main worker\nrunai training tf bash tf-01\n\n# Open a bash shell in a specific tf training worker\nrunai training tf bash tf-01 --pod tf-01-worker-1\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_bash/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_bash/#see-also","title":"SEE ALSO","text":"
                                  • runai training tensorflow - tensorflow management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_delete/","title":"Runai training tensorflow delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_delete/#runai-training-tensorflow-delete","title":"runai training tensorflow delete","text":"

                                  delete tf training workload

                                  runai training tensorflow delete [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_delete/#examples","title":"Examples","text":"
                                  # Delete a tf training workload with a default project\nrunai training tf delete <tf-name>\n\n# Delete a tf training workload with a specific project\nrunai training tf delete <tf-name> -p <project_name>\n\n# Delete a tf training workload by UUID\nrunai training tf delete --uuid=<tf_uuid> -p <project_name>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_delete/#options","title":"Options","text":"
                                    -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_delete/#see-also","title":"SEE ALSO","text":"
                                  • runai training tensorflow - tensorflow management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_describe/","title":"Runai training tensorflow describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_describe/#runai-training-tensorflow-describe","title":"runai training tensorflow describe","text":"

                                  describe tf training

                                  runai training tensorflow describe [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_describe/#examples","title":"Examples","text":"
                                  # Describe a tf training workload with a default project\nrunai training tf describe <tf-name>\n\n# Describe a tf training workload in a specific project\nrunai training tf describe <tf-name> -p <project_name>\n\n# Describe a tf training workload by UUID\nrunai training tf describe --uuid=<tf_uuid>\n\n# Describe a tf training workload with specific output format\nrunai training tf describe <tf-name> -o json\n\n# Describe a tf training workload with specific sections\nrunai training tf describe <tf-name> --general --compute --pods --events --networks\n\n# Describe a tf training workload with container details and custom limits\nrunai training tf describe <tf-name> --containers --pod-limit 20 --event-limit 100\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_describe/#options","title":"Options","text":"
                                        --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_describe/#see-also","title":"SEE ALSO","text":"
                                  • runai training tensorflow - tensorflow management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_exec/","title":"Runai training tensorflow exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_exec/#runai-training-tensorflow-exec","title":"runai training tensorflow exec","text":"

                                  execute a command in a tf training job

                                  runai training tensorflow exec [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_exec/#examples","title":"Examples","text":"
                                  # Execute bash in the tf training's main worker\nrunai training tf exec tf-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the tf training's main worker\nrunai training tf exec tf-01 -- ls\n\n# Execute a command in a specific tf training worker\nrunai training tf exec tf-01 --pod tf-01-worker-1 -- nvidia-smi\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_exec/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_exec/#see-also","title":"SEE ALSO","text":"
                                  • runai training tensorflow - tensorflow management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_list/","title":"Runai training tensorflow list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_list/#runai-training-tensorflow-list","title":"runai training tensorflow list","text":"

                                  list tf training

                                  runai training tensorflow list [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_list/#examples","title":"Examples","text":"
                                  # List all tf training workloads\nrunai training tf list -A\n\n# List tf training workloads with default project\nrunai training tf list\n\n# List tf training workloads in a specific project\nrunai training tf list -p <project_name>\n\n# List all tf training workloads with a specific output format\nrunai training tf list -o wide\n\n# List tf training workloads with pagination\nrunai training tf list --limit 20 --offset 40\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_list/#options","title":"Options","text":"
                                    -A, --all              list workloads from all projects\n  -h, --help             help for list\n      --json             Output structure JSON\n      --limit int32      number of workload in list (default 50)\n      --no-headers       Output structure table without headers\n      --offset int32     offset number of limit, default 0 (first offset)\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string    filter by workload state\n      --table            Output structure table\n      --yaml             Output structure YAML\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_list/#see-also","title":"SEE ALSO","text":"
                                  • runai training tensorflow - tensorflow management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_logs/","title":"Runai training tensorflow logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_logs/#runai-training-tensorflow-logs","title":"runai training tensorflow logs","text":"

                                  view logs of a tf training job

                                  runai training tensorflow logs [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_logs/#examples","title":"Examples","text":"
                                  # Get logs for a tf training\nrunai training tf logs tf-01\n\n# Get logs for a specific pod in a tf training\nrunai training tf logs tf-01 --pod=tf-01-worker-0\n\n# Get logs for a specific container in a tf training\nrunai training tf logs tf-01 --container=tf-worker\n\n# Get the last 100 lines of logs\nrunai training tf logs tf-01 --tail=100\n\n# Get logs with timestamps\nrunai training tf logs tf-01 --timestamps\n\n# Follow the logs\nrunai training tf logs tf-01 --follow\n\n# Get logs for the previous instance of the tf training\nrunai training tf logs tf-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai training tf logs tf-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai training tf logs tf-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai training tf logs tf-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for tf training to be ready for logs\nrunai training tf logs tf-01 --wait-timeout=30s\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_logs/#options","title":"Options","text":"
                                    -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_logs/#see-also","title":"SEE ALSO","text":"
                                  • runai training tensorflow - tensorflow management
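                                  The log flags above can be combined in a single call. A minimal sketch, assuming a worker pod named tf-01-worker-1 exists (the pod name follows the pattern used in the examples above and is a placeholder):
                                  # Follow a specific worker's logs with timestamps, starting from the last 10 minutes
                                  runai training tf logs tf-01 --pod=tf-01-worker-1 --follow --timestamps --since=10m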
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_port-forward/","title":"Runai training tensorflow port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_port-forward/#runai-training-tensorflow-port-forward","title":"runai training tensorflow port-forward","text":"

                                  forward one or more local ports to a tf training job

                                  runai training tensorflow port-forward [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_port-forward/#examples","title":"Examples","text":"
                                  # Forward connections from localhost:8080 to tf training on port 8090:\nrunai training tf port-forward tf-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to tf training on port 8080:\nrunai training tf port-forward tf-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to tf training on port 8090 and from localhost:6443 to tf training on port 443:\nrunai training tf port-forward tf-01 --port 8080:8090 --port 6443:443 --address localhost\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_port-forward/#options","title":"Options","text":"
                                        --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_port-forward/#see-also","title":"SEE ALSO","text":"
                                  • runai training tensorflow - tensorflow management
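                                  A common use of port-forward is reaching a service running inside the training container, such as TensorBoard. A minimal sketch, assuming the container serves TensorBoard on port 6006 (an assumption, not part of the reference above):
                                  # Forward localhost:6006 to port 6006 inside the tf training, then browse http://localhost:6006
                                  runai training tensorflow port-forward tf-01 --port 6006:6006 --address localhost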
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_resume/","title":"Runai training tensorflow resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_resume/#runai-training-tensorflow-resume","title":"runai training tensorflow resume","text":"

                                  resume tf training

                                  runai training tensorflow resume [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_resume/#examples","title":"Examples","text":"
                                  # Resume a tf training workload\nrunai training tf resume <tf-name>\n\n# Resume a tf training workload in a specific project\nrunai training tf resume <tf-name> -p <project_name>\n\n# Resume a tf training workload by UUID\nrunai training tf resume --uuid=<tf_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_resume/#options","title":"Options","text":"
                                    -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_resume/#see-also","title":"SEE ALSO","text":"
                                  • runai training tensorflow - tensorflow management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_submit/","title":"Runai training tensorflow submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_submit/#runai-training-tensorflow-submit","title":"runai training tensorflow submit","text":"

                                  submit tf training

                                  runai training tensorflow submit [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_submit/#examples","title":"Examples","text":"
                                  # Submit a tf training workload\nrunai training tf submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a tf training workload with arguments\nrunai training tf submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a tf training workload with a custom command\nrunai training tf submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a tf training master args with worker args\nrunai training tf submit <name> -p <project_name> -i ubuntu --master-args \"-a master_arg_a -b master_arg_b\" -- '-a worker_arg_a'\n\n# Submit a tf training master command with worker args\nrunai training tf submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" -- '-a worker_arg_a'\n\n# Submit a tf training master command with worker command\nrunai training tf submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" --command -- echo -e 'worker command'\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_submit/#options","title":"Options","text":"
                                        --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n      --clean-pod-policy string                        Specifies which pods will be deleted when the workload reaches a terminal state (completed/failed)\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the format: name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. 
(Default: false)\n      --host-network                                   Whether to enable host networking. (Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --master-args string                             Specifies the arguments to pass to the master pod container command\n      --master-command string                          Specifies the command to run in the master pod container, overriding the image's default entrypoint. The command can include arguments following it.\n      --master-environment stringArray                 Set master environment variables in the container\n      --master-extended-resource stringArray           Request access to an extended resource. Use the format: resource_name=quantity\n      --master-gpu-devices-request int32               GPU units to allocate for the job (e.g. 1, 2)\n      --master-gpu-portion-limit float                 GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-gpu-portion-request float               GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-no-pvcs                                 Do not mount any persistent volumes in the master pod\n      --max-replicas int32                             Maximum number of replicas for an elastic PyTorch job\n      --min-replicas int32                             Minimum number of replicas for an elastic PyTorch job\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --no-master                                      Do not create a separate pod for the master\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. 
To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    Takes the uid, gid, and supplementary groups fields from the token. If none of these fields exist, uses the local running terminal user credentials; if any of the fields exist, takes only the existing fields\n      --s3 stringArray                                 s3 storage details. Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma-separated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group ids of the container, options: fromTheImage|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --workers int32                                  The number of workers that will be allocated for running the workload\n      --working-dir string                             Set the container's working directory\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_submit/#see-also","title":"SEE ALSO","text":"
                                  • runai training tensorflow - tensorflow management
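                                  A fuller submission can combine the resource, environment, and storage flags listed above. A minimal sketch in which the workload name, project, PVC claim, and environment variable are placeholders (the image is the quickstart image used in the examples above):
                                  # Two workers with one requested GPU, CPU resources, an environment variable, and an existing PVC mounted at /data
                                  runai training tensorflow submit tf-train-01 -p team-a -i runai.jfrog.io/demo/quickstart-demo --workers 2 -g 1 --cpu-core-request 2 --cpu-memory-request 4G -e EPOCHS=10 --existing-pvc claimname=team-a-data,path=/data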
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_suspend/","title":"Runai training tensorflow suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_suspend/#runai-training-tensorflow-suspend","title":"runai training tensorflow suspend","text":"

                                  suspend tf training

                                  runai training tensorflow suspend [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_suspend/#examples","title":"Examples","text":"
                                  # Suspend a tf training workload\nrunai training tf suspend <tf-name>\n\n# Suspend a tf training workload in a specific project\nrunai training tf suspend <tf-name> -p <project_name>\n\n# Suspend a tf training workload by UUID\nrunai training tf suspend --uuid=<tf_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_suspend/#options","title":"Options","text":"
                                    -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_tensorflow_suspend/#see-also","title":"SEE ALSO","text":"
                                  • runai training tensorflow - tensorflow management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost/","title":"Runai training xgboost","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost/#runai-training-xgboost","title":"runai training xgboost","text":"

                                  xgboost management

                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost/#options","title":"Options","text":"
                                    -h, --help   help for xgboost\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost/#see-also","title":"SEE ALSO","text":"
                                  • runai training - training management
                                  • runai training xgboost attach - attach to a running container in a xgboost training job
                                  • runai training xgboost bash - open a bash shell in a xgboost training job
                                  • runai training xgboost delete - delete xgboost training workload
                                  • runai training xgboost describe - describe xgboost training
                                  • runai training xgboost exec - execute a command in a xgboost training job
                                  • runai training xgboost list - list xgboost training
                                  • runai training xgboost logs - view logs of a xgboost training job
                                  • runai training xgboost port-forward - forward one or more local ports to a xgboost training job
                                  • runai training xgboost resume - resume xgboost training
                                  • runai training xgboost submit - submit xgboost training
                                  • runai training xgboost suspend - suspend xgboost training
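                                  As a quick orientation, the subcommands above compose into a typical workload lifecycle. A minimal sketch in which the workload name, project, and worker count are placeholders (the image is the quickstart image used elsewhere in this reference):
                                  # Submit an xgboost training, watch it, inspect it, and clean it up
                                  runai training xgboost submit xgb-01 -p team-a -i runai.jfrog.io/demo/quickstart-demo --workers 2
                                  runai training xgboost list -p team-a
                                  runai training xgboost logs xgb-01 -p team-a --follow
                                  runai training xgboost describe xgb-01 -p team-a
                                  runai training xgboost delete xgb-01 -p team-a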
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_attach/","title":"Runai training xgboost attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_attach/#runai-training-xgboost-attach","title":"runai training xgboost attach","text":"

                                  attach to a running container in a xgboost training job

                                  runai training xgboost attach [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_attach/#examples","title":"Examples","text":"
                                  # Attaching to the main worker of a xgboost training\nrunai training xgboost attach xgboost-01 --tty --stdin\n\n# Attaching to a specific pod of a xgboost training\nrunai training xgboost attach xgboost-01 --pod xgboost-01-worker-1 --tty --stdin\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_attach/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_attach/#see-also","title":"SEE ALSO","text":"
                                  • runai training xgboost - xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_bash/","title":"Runai training xgboost bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_bash/#runai-training-xgboost-bash","title":"runai training xgboost bash","text":"

                                  open a bash shell in a xgboost training job

                                  runai training xgboost bash [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_bash/#examples","title":"Examples","text":"
                                  # Open a bash shell in the xgboost training's main worker\nrunai training xgboost bash xgboost-01\n\n# Open a bash shell in a specific xgboost training worker\nrunai training xgboost bash xgboost-01 --pod xgboost-01-worker-1\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_bash/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_bash/#see-also","title":"SEE ALSO","text":"
                                  • runai training xgboost - xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_delete/","title":"Runai training xgboost delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_delete/#runai-training-xgboost-delete","title":"runai training xgboost delete","text":"

                                  delete xgboost training workload

                                  runai training xgboost delete [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_delete/#examples","title":"Examples","text":"
                                  # Delete a xgboost training workload with a default project\nrunai training xgboost delete <xgboost-name>\n\n# Delete a xgboost training workload with a specific project\nrunai training xgboost delete <xgboost-name> -p <project_name>\n\n# Delete a xgboost training workload by UUID\nrunai training xgboost delete --uuid=<xgboost_uuid> -p <project_name>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_delete/#options","title":"Options","text":"
                                    -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_delete/#see-also","title":"SEE ALSO","text":"
                                  • runai training xgboost - xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_describe/","title":"Runai training xgboost describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_describe/#runai-training-xgboost-describe","title":"runai training xgboost describe","text":"

                                  describe xgboost training

                                  runai training xgboost describe [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_describe/#examples","title":"Examples","text":"
                                  # Describe a xgboost training workload with a default project\nrunai training xgboost describe <xgboost-name>\n\n# Describe a xgboost training workload in a specific project\nrunai training xgboost describe <xgboost-name> -p <project_name>\n\n# Describe a xgboost training workload by UUID\nrunai training xgboost describe --uuid=<xgboost_uuid>\n\n# Describe a xgboost training workload with specific output format\nrunai training xgboost describe <xgboost-name> -o json\n\n# Describe a xgboost training workload with specific sections\nrunai training xgboost describe <xgboost-name> --general --compute --pods --events --networks\n\n# Describe a xgboost training workload with container details and custom limits\nrunai training xgboost describe <xgboost-name> --containers --pod-limit 20 --event-limit 100\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_describe/#options","title":"Options","text":"
                                        --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_describe/#see-also","title":"SEE ALSO","text":"
                                  • runai training xgboost - xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_exec/","title":"Runai training xgboost exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_exec/#runai-training-xgboost-exec","title":"runai training xgboost exec","text":"

                                  execute a command in a xgboost training job

                                  runai training xgboost exec [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_exec/#examples","title":"Examples","text":"
                                  # Execute bash in the xgboost training's main worker\nrunai training xgboost exec xgboost-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the xgboost training's main worker\nrunai training xgboost exec xgboost-01 -- ls\n\n# Execute a command in a specific xgboost training worker\nrunai training xgboost exec xgboost-01 --pod xgboost-01-worker-1 -- nvidia-smi\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_exec/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_exec/#see-also","title":"SEE ALSO","text":"
                                  • runai training xgboost - xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_list/","title":"Runai training xgboost list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_list/#runai-training-xgboost-list","title":"runai training xgboost list","text":"

                                  list xgboost training

                                  runai training xgboost list [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_list/#examples","title":"Examples","text":"
                                  # List all xgboost training workloads\nrunai training xgboost list -A\n\n# List xgboost training workloads with default project\nrunai training xgboost list\n\n# List xgboost training workloads in a specific project\nrunai training xgboost list -p <project_name>\n\n# List all xgboost training workloads with a specific output format\nrunai training xgboost list -o wide\n\n# List xgboost training workloads with pagination\nrunai training xgboost list --limit 20 --offset 40\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_list/#options","title":"Options","text":"
                                    -A, --all              list workloads from all projects\n  -h, --help             help for list\n      --json             Output structure JSON\n      --limit int32      number of workloads in list (default 50)\n      --no-headers       Output structure table without headers\n      --offset int32     offset number of limit, default 0 (first offset)\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string    filter by workload state\n      --table            Output structure table\n      --yaml             Output structure YAML\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_list/#see-also","title":"SEE ALSO","text":"
                                  • runai training xgboost - xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_logs/","title":"Runai training xgboost logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_logs/#runai-training-xgboost-logs","title":"runai training xgboost logs","text":"

                                  view logs of a xgboost training job

                                  runai training xgboost logs [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_logs/#examples","title":"Examples","text":"
                                  # Get logs for a xgboost training\nrunai training xgboost logs xgboost-01\n\n# Get logs for a specific pod in a xgboost training\nrunai training xgboost logs xgboost-01 --pod=xgboost-01-worker-0\n\n# Get logs for a specific container in a xgboost training\nrunai training xgboost logs xgboost-01 --container=xgboost-worker\n\n# Get the last 100 lines of logs\nrunai training xgboost logs xgboost-01 --tail=100\n\n# Get logs with timestamps\nrunai training xgboost logs xgboost-01 --timestamps\n\n# Follow the logs\nrunai training xgboost logs xgboost-01 --follow\n\n# Get logs for the previous instance of the xgboost training\nrunai training xgboost logs xgboost-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai training xgboost logs xgboost-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai training xgboost logs xgboost-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai training xgboost logs xgboost-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for xgboost training to be ready for logs\nrunai training xgboost logs xgboost-01 --wait-timeout=30s\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_logs/#options","title":"Options","text":"
                                    -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_logs/#see-also","title":"SEE ALSO","text":"
                                  • runai training xgboost - xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_port-forward/","title":"Runai training xgboost port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_port-forward/#runai-training-xgboost-port-forward","title":"runai training xgboost port-forward","text":"

                                  forward one or more local ports to a xgboost training job

                                  runai training xgboost port-forward [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_port-forward/#examples","title":"Examples","text":"
                                  # Forward connections from localhost:8080 to xgboost training on port 8090:\nrunai training xgboost port-forward xgboost-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to xgboost training on port 8080:\nrunai training xgboost port-forward xgboost-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to xgboost training on port 8090 and from localhost:6443 to xgboost training on port 443:\nrunai training xgboost port-forward xgboost-01 --port 8080:8090 --port 6443:443 --address localhost\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_port-forward/#options","title":"Options","text":"
                                        --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_port-forward/#see-also","title":"SEE ALSO","text":"
                                  • runai training xgboost - xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_resume/","title":"Runai training xgboost resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_resume/#runai-training-xgboost-resume","title":"runai training xgboost resume","text":"

                                  resume xgboost training

                                  runai training xgboost resume [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_resume/#examples","title":"Examples","text":"
                                  # Resume a xgboost training workload\nrunai training xgboost resume <xgboost-name>\n\n# Resume a xgboost training workload in a specific project\nrunai training xgboost resume <xgboost-name> -p <project_name>\n\n# Resume a xgboost training workload by UUID\nrunai training xgboost resume --uuid=<xgboost_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_resume/#options","title":"Options","text":"
                                    -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_resume/#see-also","title":"SEE ALSO","text":"
                                  • runai training xgboost - xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_submit/","title":"Runai training xgboost submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_submit/#runai-training-xgboost-submit","title":"runai training xgboost submit","text":"

                                  submit xgboost training

                                  runai training xgboost submit [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_submit/#examples","title":"Examples","text":"
                                  # Submit a xgboost training workload\nrunai training xgboost submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a xgboost training workload with arguments\nrunai training xgboost submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a xgboost training workload with a custom command\nrunai training xgboost submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a xgboost training master args with worker args\nrunai training xgboost submit <name> -p <project_name> -i ubuntu --master-args \"-a master_arg_a -b master_arg_b\" -- '-a worker_arg_a'\n\n# Submit a xgboost training master command with worker args\nrunai training xgboost submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" -- '-a worker_arg_a'\n\n# Submit a xgboost training master command with worker command\nrunai training xgboost submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" --command -- echo -e 'worker command'\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_submit/#options","title":"Options","text":"
                                        --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n      --clean-pod-policy string                        Specifies which pods will be deleted when the workload reaches a terminal state (completed/failed)\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the format: name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. 
(Default: false)\n      --host-network                                   Whether to enable host networking. (Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --master-args string                             Specifies the arguments to pass to the master pod container command\n      --master-command string                          Specifies the command to run in the master pod container, overriding the image's default entrypoint. The command can include arguments following it.\n      --master-environment stringArray                 Set master environment variables in the container\n      --master-extended-resource stringArray           Request access to an extended resource. Use the format: resource_name=quantity\n      --master-gpu-devices-request int32               GPU units to allocate for the job (e.g. 1, 2)\n      --master-gpu-portion-limit float                 GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-gpu-portion-request float               GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-no-pvcs                                 Do not mount any persistent volumes in the master pod\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. 
To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    Take the uid, gid, and supplementary groups fields from the token. If none of these fields exist, the credentials of the local terminal user are used; if only some of the fields exist, only the existing fields are taken\n      --s3 stringArray                                 S3 storage details. Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma-separated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group IDs of the container, options: fromTheImage|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --workers int32                                  The number of workers that will be allocated for running the workload\n      --working-dir string                             Set the container's working directory\n
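                                   The storage and scheduling flags above compose directly into a submit call. The following invocation is an illustrative sketch only, not part of the generated reference; the image, PVC claim name, storage class, size, and taint key are placeholder values.
                                   # Illustrative only: submit an xgboost training workload with a new PVC and a GPU toleration
                                   runai training xgboost submit <name> -p <project_name> -i <image> --workers 2 --gpu-devices-request 1 --new-pvc claimname=<claim_name>,storageclass=<storage_class>,size=10G,path=/data --toleration operator=Exists,key=<taint_key>,effect=NoSchedule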
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_submit/#see-also","title":"SEE ALSO","text":"
                                  • runai training xgboost - xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_suspend/","title":"Runai training xgboost suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_suspend/#runai-training-xgboost-suspend","title":"runai training xgboost suspend","text":"

                                  suspend xgboost training

                                  runai training xgboost suspend [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_suspend/#examples","title":"Examples","text":"
                                   # Suspend an xgboost training workload\nrunai training xgboost suspend <xgboost-name>\n\n# Suspend an xgboost training workload in a specific project\nrunai training xgboost suspend <xgboost-name> -p <project_name>\n\n# Suspend an xgboost training workload by UUID\nrunai training xgboost suspend --uuid=<xgboost_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_suspend/#options","title":"Options","text":"
                                    -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_training_xgboost_suspend/#see-also","title":"SEE ALSO","text":"
                                  • runai training xgboost - xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_upgrade/","title":"Runai upgrade","text":""},{"location":"Researcher/cli-reference/new-cli/runai_upgrade/#runai-upgrade","title":"runai upgrade","text":"

                                  upgrades the CLI to the latest version

                                  runai upgrade [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_upgrade/#options","title":"Options","text":"
                                         --force   upgrade CLI without checking for a new version\n  -h, --help    help for upgrade\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_upgrade/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_upgrade/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
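                                   No examples are generated for this command; the following sketch is illustrative only and uses just the flags documented above.
                                   # Upgrade the CLI to the latest available version
                                   runai upgrade
                                   
                                   # Re-run the upgrade without checking for a new version first
                                   runai upgrade --force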
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_version/","title":"Runai version","text":""},{"location":"Researcher/cli-reference/new-cli/runai_version/#runai-version","title":"runai version","text":"

                                  show the current version of the CLI

                                  runai version [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_version/#options","title":"Options","text":"
                                    -h, --help   help for version\n      --wide   print full version details\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_version/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_version/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
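                                   No examples are generated for this command; a minimal illustrative sketch:
                                   # Print the CLI version
                                   runai version
                                   
                                   # Print full version details
                                   runai version --wide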
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_whoami/","title":"Runai whoami","text":""},{"location":"Researcher/cli-reference/new-cli/runai_whoami/#runai-whoami","title":"runai whoami","text":"

                                   show the currently logged-in user

                                  runai whoami [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_whoami/#options","title":"Options","text":"
                                    -h, --help   help for whoami\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_whoami/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_whoami/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
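                                   No examples are generated for this command; a minimal illustrative sketch, assuming you have already authenticated with runai login:
                                   # Show the currently logged-in user
                                   runai whoami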
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload/","title":"Runai workload","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workload/#runai-workload","title":"runai workload","text":"

                                  workload management

                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload/#options","title":"Options","text":"
                                    -h, --help                 help for workload\n      --interactive enable   set interactive mode (enabled|disabled)\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  • runai workload describe - Describe a workload
                                  • runai workload list - List workloads
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_attach/","title":"Runai workload attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workload_attach/#runai-workload-attach","title":"runai workload attach","text":"

                                  Attach to a process that is already running inside an existing container.

                                  runai workload attach WORKLOAD_NAME [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_attach/#examples","title":"Examples","text":"
                                  # Attaching to ubuntu workspace \nrunai workload attach ubuntu-wl --type workspace --tty --stdin\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_attach/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --type string                    The type of the workload (training, workspace)\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_attach/#see-also","title":"SEE ALSO","text":"
                                  • runai workload - workload management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_describe/","title":"Runai workload describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workload_describe/#runai-workload-describe","title":"runai workload describe","text":"

                                  Describe a workload

                                  runai workload describe WORKLOAD_NAME [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_describe/#options","title":"Options","text":"
                                        --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --framework string    filter by workload framework\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --type string         The type of the workload (training, workspace)\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_describe/#see-also","title":"SEE ALSO","text":"
                                  • runai workload - workload management
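                                   No examples are generated for this command; the following sketch is illustrative only, with <workload-name> and <project_name> as placeholders and flags taken from the options above.
                                   # Describe a training workload in the default project
                                   runai workload describe <workload-name> --type training
                                   
                                   # Describe a workspace workload in a specific project, as JSON
                                   runai workload describe <workload-name> --type workspace -p <project_name> -o json
                                   
                                   # Include container details and raise the event limit
                                   runai workload describe <workload-name> --type training --containers --event-limit 100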
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_exec/","title":"Runai workload exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workload_exec/#runai-workload-exec","title":"runai workload exec","text":"

                                  exec management

                                  runai workload exec WORKLOAD_NAME [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_exec/#examples","title":"Examples","text":"
                                   # Execute bash in a workspace \nrunai workload exec jup --type workspace --tty --stdin -- /bin/bash \n\n# Execute ls in a workload\nrunai workload exec jup --type workspace -- ls\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_exec/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --type string                    The type of the workload (training, workspace)\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_exec/#see-also","title":"SEE ALSO","text":"
                                  • runai workload - workload management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_list/","title":"Runai workload list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workload_list/#runai-workload-list","title":"runai workload list","text":"

                                  List workloads

                                  runai workload list [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_list/#options","title":"Options","text":"
                                     -A, --all                list workloads from all projects\n      --framework string   filter by workload framework\n  -h, --help               help for list\n      --json               Output structure JSON\n      --limit int32        number of workloads in list (default 50)\n      --no-headers         Output structure table without headers\n      --offset int32       offset number of limit, default 0 (first offset)\n  -p, --project string     Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string      filter by workload state\n      --table              Output structure table\n      --type string        filter by workload type\n      --yaml               Output structure YAML\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_list/#see-also","title":"SEE ALSO","text":"
                                  • runai workload - workload management
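                                   No examples are generated for this command; the following sketch is illustrative only, and the type and status values are example filters.
                                   # List workloads in the default project
                                   runai workload list
                                   
                                   # List workloads from all projects in JSON format
                                   runai workload list -A --json
                                   
                                   # Filter by workload type and state
                                   runai workload list --type training --status Running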
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_logs/","title":"Runai workload logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workload_logs/#runai-workload-logs","title":"runai workload logs","text":"

                                  logs management

                                  runai workload logs WORKLOAD_NAME [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_logs/#examples","title":"Examples","text":"
                                     # Get logs for a workspace\n  runai workload logs workspace-01 --type=workspace\n\n  # Get logs for a specific pod in a workspace\n  runai workload logs workspace-01 --type=workspace --pod=workspace-01-0\n\n  # Get logs for a specific container in a workspace\n  runai workload logs workspace-01 --type=workspace --container=container-01\n\n  # Get the last 100 lines of logs\n  runai workload logs workspace-01 --type=workspace --tail=100\n\n  # Get logs with timestamps\n  runai workload logs workspace-01 --type=workspace --timestamps\n\n  # Follow the logs\n  runai workload logs workspace-01 --type=workspace --follow\n\n  # Get logs for the previous instance of the workspace\n  runai workload logs workspace-01 --type=workspace --previous\n\n  # Limit the logs to 1024 bytes\n  runai workload logs workspace-01 --type=workspace --limit-bytes=1024\n\n  # Get logs since the last 5 minutes\n  runai workload logs workspace-01 --type=workspace --since=5m\n\n  # Get logs since a specific timestamp\n  runai workload logs workspace-01 --type=workspace --since-time=2023-05-30T10:00:00Z\n\n  # Wait up to 30 seconds for workload to be ready for logs\n  runai workload logs workspace-01 --type=workspace --wait-timeout=30s\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_logs/#options","title":"Options","text":"
                                     -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --type string             The type of the workload (training, workspace)\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_logs/#see-also","title":"SEE ALSO","text":"
                                  • runai workload - workload management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_port-forward/","title":"Runai workload port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workload_port-forward/#runai-workload-port-forward","title":"runai workload port-forward","text":"

                                  port forward management

                                  runai workload port-forward WORKLOAD_NAME [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_port-forward/#examples","title":"Examples","text":"
                                   # Forward connections from localhost:8080 to <workload-name> on port 8090:\nrunai workload port-forward <workload-name> --type=<workload-type> --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to <workload-name> on port 8080:\nrunai workload port-forward <workload-name> --type=<workload-type> --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to <workload-name> on port 8090 and from localhost:6443 to <workload-name> on port 443:\nrunai workload port-forward <workload-name> --type=<workload-type> --port 8080:8090 --port 6443:443 --address localhost\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_port-forward/#options","title":"Options","text":"
                                        --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --type string                    The type of the workload (training, workspace)\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workload_port-forward/#see-also","title":"SEE ALSO","text":"
                                  • runai workload - workload management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace/","title":"Runai workspace","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace/#runai-workspace","title":"runai workspace","text":"

                                  workspace management

                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace/#options","title":"Options","text":"
                                    -h, --help   help for workspace\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  • runai workspace attach - attach to a running container in a workspace job
                                  • runai workspace bash - open a bash shell in a workspace job
                                  • runai workspace delete - delete workspace workload
                                  • runai workspace describe - describe workspace
                                  • runai workspace exec - execute a command in a workspace job
                                  • runai workspace list - list workspace
                                  • runai workspace logs - view logs of a workspace job
                                  • runai workspace port-forward - forward one or more local ports to a workspace job
                                  • runai workspace resume - resume workspace
                                  • runai workspace submit - submit workspace
                                  • runai workspace suspend - suspend workspace
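                                   Taken together, the subcommands above cover a typical workspace lifecycle; the following sequence is an illustrative sketch only, with placeholder workload name, project, and image.
                                   # Submit a workspace, inspect it, follow its logs, then delete it
                                   runai workspace submit <workspace-name> -p <project_name> -i <image>
                                   runai workspace list -p <project_name>
                                   runai workspace logs <workspace-name> -p <project_name> --follow
                                   runai workspace delete <workspace-name> -p <project_name>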
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_attach/","title":"Runai workspace attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_attach/#runai-workspace-attach","title":"runai workspace attach","text":"

                                  attach to a running container in a workspace job

                                  runai workspace attach [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_attach/#examples","title":"Examples","text":"
                                  # Attaching to the main worker of a workspace\nrunai workspace attach workspace-01 --tty --stdin\n\n# Attaching to a specific pod of a workspace\nrunai workspace attach workspace-01 --pod workspace-01-worker-1 --tty --stdin\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_attach/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_attach/#see-also","title":"SEE ALSO","text":"
                                  • runai workspace - workspace management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_bash/","title":"Runai workspace bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_bash/#runai-workspace-bash","title":"runai workspace bash","text":"

                                  open a bash shell in a workspace job

                                  runai workspace bash [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_bash/#examples","title":"Examples","text":"
                                  # Open a bash shell in the workspace's main worker\nrunai workspace bash workspace-01\n\n# Open a bash shell in a specific workspace worker\nrunai workspace bash workspace-01 --pod workspace-01-worker-1\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_bash/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_bash/#see-also","title":"SEE ALSO","text":"
                                  • runai workspace - workspace management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_delete/","title":"Runai workspace delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_delete/#runai-workspace-delete","title":"runai workspace delete","text":"

                                  delete workspace workload

                                  runai workspace delete [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_delete/#examples","title":"Examples","text":"
                                  # Delete a workspace workload with a default project\nrunai workspace delete <workspace-name>\n\n# Delete a workspace workload with a specific project\nrunai workspace delete <workspace-name> -p <project_name>\n\n# Delete a workspace workload by UUID\nrunai workspace delete --uuid=<workspace_uuid> -p <project_name>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_delete/#options","title":"Options","text":"
                                    -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_delete/#see-also","title":"SEE ALSO","text":"
                                  • runai workspace - workspace management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_describe/","title":"Runai workspace describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_describe/#runai-workspace-describe","title":"runai workspace describe","text":"

                                  describe workspace

                                  runai workspace describe [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_describe/#examples","title":"Examples","text":"
                                  # Describe a workspace workload with a default project\nrunai workspace describe <workspace-name>\n\n# Describe a workspace workload in a specific project\nrunai workspace describe <workspace-name> -p <project_name>\n\n# Describe a workspace workload by UUID\nrunai workspace describe --uuid=<workspace_uuid>\n\n# Describe a workspace workload with specific output format\nrunai workspace describe <workspace-name> -o json\n\n# Describe a workspace workload with specific sections\nrunai workspace describe <workspace-name> --general --compute --pods --events --networks\n\n# Describe a workspace workload with container details and custom limits\nrunai workspace describe <workspace-name> --containers --pod-limit 20 --event-limit 100\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_describe/#options","title":"Options","text":"
                                        --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_describe/#see-also","title":"SEE ALSO","text":"
                                  • runai workspace - workspace management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_exec/","title":"Runai workspace exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_exec/#runai-workspace-exec","title":"runai workspace exec","text":"

                                  execute a command in a workspace job

                                  runai workspace exec [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_exec/#examples","title":"Examples","text":"
                                  # Execute bash in the workspace's main worker\nrunai workspace exec workspace-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the workspace's main worker\nrunai workspace exec workspace-01 -- ls\n\n# Execute a command in a specific workspace worker\nrunai workspace exec workspace-01 --pod workspace-01-worker-1 -- nvidia-smi\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_exec/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_exec/#see-also","title":"SEE ALSO","text":"
                                  • runai workspace - workspace management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_list/","title":"Runai workspace list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_list/#runai-workspace-list","title":"runai workspace list","text":"

                                  list workspace

                                  runai workspace list [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_list/#examples","title":"Examples","text":"
                                  # List all workspace workloads\nrunai workspace list -A\n\n# List workspace workloads with default project\nrunai workspace list\n\n# List workspace workloads in a specific project\nrunai workspace list -p <project_name>\n\n# List all workspace workloads with a specific output format\nrunai workspace list -o wide\n\n# List workspace workloads with pagination\nrunai workspace list --limit 20 --offset 40\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_list/#options","title":"Options","text":"
                                     -A, --all              list workloads from all projects\n  -h, --help             help for list\n      --json             Output structure JSON\n      --limit int32      number of workloads in list (default 50)\n      --no-headers       Output structure table without headers\n      --offset int32     offset number of limit, default 0 (first offset)\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string    filter by workload state\n      --table            Output structure table\n      --yaml             Output structure YAML\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_list/#see-also","title":"SEE ALSO","text":"
                                  • runai workspace - workspace management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_logs/","title":"Runai workspace logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_logs/#runai-workspace-logs","title":"runai workspace logs","text":"

                                  view logs of a workspace job

                                  runai workspace logs [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_logs/#examples","title":"Examples","text":"
                                  # Get logs for a workspace\nrunai workspace logs workspace-01\n\n# Get logs for a specific pod in a workspace\nrunai workspace logs workspace-01 --pod=workspace-01-worker-0\n\n# Get logs for a specific container in a workspace\nrunai workspace logs workspace-01 --container=workspace-worker\n\n# Get the last 100 lines of logs\nrunai workspace logs workspace-01 --tail=100\n\n# Get logs with timestamps\nrunai workspace logs workspace-01 --timestamps\n\n# Follow the logs\nrunai workspace logs workspace-01 --follow\n\n# Get logs for the previous instance of the workspace\nrunai workspace logs workspace-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai workspace logs workspace-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai workspace logs workspace-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai workspace logs workspace-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for workspace to be ready for logs\nrunai workspace logs workspace-01 --wait-timeout=30s\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_logs/#options","title":"Options","text":"
                                     -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_logs/#see-also","title":"SEE ALSO","text":"
                                  • runai workspace - workspace management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_port-forward/","title":"Runai workspace port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_port-forward/#runai-workspace-port-forward","title":"runai workspace port-forward","text":"

                                  forward one or more local ports to a workspace job

                                  runai workspace port-forward [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_port-forward/#examples","title":"Examples","text":"
                                  # Forward connections from localhost:8080 to workspace on port 8090:\nrunai workspace port-forward workspace-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to workspace on port 8080:\nrunai workspace port-forward workspace-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to workspace on port 8090 and from localhost:6443 to workspace on port 443:\nrunai workspace port-forward workspace-01 --port 8080:8090 --port 6443:443 --address localhost\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_port-forward/#options","title":"Options","text":"
                                        --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_port-forward/#see-also","title":"SEE ALSO","text":"
                                  • runai workspace - workspace management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_resume/","title":"Runai workspace resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_resume/#runai-workspace-resume","title":"runai workspace resume","text":"

                                  resume workspace

                                  runai workspace resume [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_resume/#examples","title":"Examples","text":"
                                  # Resume a workspace workload\nrunai workspace resume <workspace-name>\n\n# Resume a workspace workload in a specific project\nrunai workspace resume <workspace-name> -p <project_name>\n\n# Resume a workspace workload by UUID\nrunai workspace resume --uuid=<workspace_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_resume/#options","title":"Options","text":"
                                    -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_resume/#see-also","title":"SEE ALSO","text":"
                                  • runai workspace - workspace management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_submit/","title":"Runai workspace submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_submit/#runai-workspace-submit","title":"runai workspace submit","text":"

                                  submit workspace

                                  runai workspace submit [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_submit/#examples","title":"Examples","text":"
                                  # Submit a workspace workload\nrunai workspace submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a workspace workload with arguments\nrunai workspace submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a workspace workload with a custom command\nrunai workspace submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a workspace Jupyter notebook\nrunai workspace submit <name> -p <project_name> -i jupyter/scipy-notebook --gpu-devices-request 1 --external-url container=8888 --name-prefix jupyter --command -- start-notebook.sh --NotebookApp.base_url='/${RUNAI_PROJECT}/${RUNAI_JOB_NAME}' --NotebookApp.token=''\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_submit/#options","title":"Options","text":"
                                        --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the format: name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. (Default: false)\n      --host-network                                   Whether to enable host networking. (Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preemptible                                    Workspace preemptible workloads can be scheduled above guaranteed quota but may be reclaimed at any time\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    Takes the uid, gid, and supplementary groups fields from the token; if none of the fields exist, uses the local running terminal user credentials, and if any of the fields exist, takes only the existing fields\n      --s3 stringArray                                 s3 storage details. Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma separated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group ids of the container, options: fromTheImage|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --working-dir string                             Set the container's working directory\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_submit/#see-also","title":"SEE ALSO","text":"
                                  • runai workspace - workspace management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_suspend/","title":"Runai workspace suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_workspace_suspend/#runai-workspace-suspend","title":"runai workspace suspend","text":"

                                  suspend workspace

                                  runai workspace suspend [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_suspend/#examples","title":"Examples","text":"
                                  # Suspend a workspace workload\nrunai workspace suspend <workspace-name>\n\n# Suspend a workspace workload in a specific project\nrunai workspace suspend <workspace-name> -p <project_name>\n\n# Suspend a workspace workload by UUID\nrunai workspace suspend --uuid=<workspace_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_suspend/#options","title":"Options","text":"
                                    -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_workspace_suspend/#see-also","title":"SEE ALSO","text":"
                                  • runai workspace - workspace management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost/","title":"Runai xgboost","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost/#runai-xgboost","title":"runai xgboost","text":"

                                  alias for xgboost management

                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost/#options","title":"Options","text":"
                                    -h, --help   help for xgboost\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost/#see-also","title":"SEE ALSO","text":"
                                  • runai - Run:ai Command-line Interface
                                  • runai xgboost attach - attach to a running container in a xgboost training job
                                  • runai xgboost bash - open a bash shell in a xgboost training job
                                  • runai xgboost delete - delete xgboost training workload
                                  • runai xgboost describe - describe xgboost training
                                  • runai xgboost exec - execute a command in a xgboost training job
                                  • runai xgboost list - list xgboost training
                                  • runai xgboost logs - view logs of a xgboost training job
                                  • runai xgboost port-forward - forward one or more local ports to a xgboost training job
                                  • runai xgboost resume - resume xgboost training
                                  • runai xgboost submit - submit xgboost training
                                  • runai xgboost suspend - suspend xgboost training
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_attach/","title":"Runai xgboost attach","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_attach/#runai-xgboost-attach","title":"runai xgboost attach","text":"

                                  attach to a running container in a xgboost training job

                                  runai xgboost attach [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_attach/#examples","title":"Examples","text":"
                                  # Attaching to the main worker of a xgboost training\nrunai training xgboost attach xgboost-01 --tty --stdin\n\n# Attaching to a specific pod of a xgboost training\nrunai training xgboost attach xgboost-01 --pod xgboost-01-worker-1 --tty --stdin\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_attach/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for attach\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_attach/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_attach/#see-also","title":"SEE ALSO","text":"
                                  • runai xgboost - alias for xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_bash/","title":"Runai xgboost bash","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_bash/#runai-xgboost-bash","title":"runai xgboost bash","text":"

                                  open a bash shell in a xgboost training job

                                  runai xgboost bash [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_bash/#examples","title":"Examples","text":"
                                  # Open a bash shell in the xgboost training's main worker\nrunai training xgboost bash xgboost-01\n\n# Open a bash shell in a specific xgboost training worker\nrunai training xgboost bash xgboost-01 --pod xgboost-01-worker-1\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_bash/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for bash\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_bash/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_bash/#see-also","title":"SEE ALSO","text":"
                                  • runai xgboost - alias for xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_delete/","title":"Runai xgboost delete","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_delete/#runai-xgboost-delete","title":"runai xgboost delete","text":"

                                  delete xgboost training workload

                                  runai xgboost delete [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_delete/#examples","title":"Examples","text":"
                                  # Delete a xgboost training workload with a default project\nrunai training xgboost delete <xgboost-name>\n\n# Delete a xgboost training workload with a specific project\nrunai training xgboost delete <xgboost-name> -p <project_name>\n\n# Delete a xgboost training workload by UUID\nrunai training xgboost delete --uuid=<xgboost_uuid> -p <project_name>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_delete/#options","title":"Options","text":"
                                    -h, --help             help for delete\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_delete/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_delete/#see-also","title":"SEE ALSO","text":"
                                  • runai xgboost - alias for xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_describe/","title":"Runai xgboost describe","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_describe/#runai-xgboost-describe","title":"runai xgboost describe","text":"

                                  describe xgboost training

                                  runai xgboost describe [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_describe/#examples","title":"Examples","text":"
                                  # Describe a xgboost training workload with a default project\nrunai training xgboost describe <xgboost-name>\n\n# Describe a xgboost training workload in a specific project\nrunai training xgboost describe <xgboost-name> -p <project_name>\n\n# Describe a xgboost training workload by UUID\nrunai training xgboost describe --uuid=<xgboost_uuid>\n\n# Describe a xgboost training workload with specific output format\nrunai training xgboost describe <xgboost-name> -o json\n\n# Describe a xgboost training workload with specific sections\nrunai training xgboost describe <xgboost-name> --general --compute --pods --events --networks\n\n# Describe a xgboost training workload with container details and custom limits\nrunai training xgboost describe <xgboost-name> --containers --pod-limit 20 --event-limit 100\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_describe/#options","title":"Options","text":"
                                        --compute             Show compute information (default true)\n      --containers          Include container information in pods\n      --event-limit int32   Limit the number of events displayed (-1 for no limit) (default 50)\n      --events              Show events information (default true)\n      --general             Show general information (default true)\n  -h, --help                help for describe\n      --networks            Show networks information (default true)\n  -o, --output string       Output format (table, json, yaml) (default \"table\")\n      --pod-limit int32     Limit the number of pods displayed (-1 for no limit) (default 10)\n      --pods                Show pods information (default true)\n  -p, --project string      Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_describe/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_describe/#see-also","title":"SEE ALSO","text":"
                                  • runai xgboost - alias for xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_exec/","title":"Runai xgboost exec","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_exec/#runai-xgboost-exec","title":"runai xgboost exec","text":"

                                  execute a command in a xgboost training job

                                  runai xgboost exec [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_exec/#examples","title":"Examples","text":"
                                  # Execute bash in the xgboost training's main worker\nrunai training xgboost exec xgboost-01 --tty --stdin -- /bin/bash \n\n# Execute ls command in the xgboost training's main worker\nrunai training xgboost exec xgboost-01 -- ls\n\n# Execute a command in a specific xgboost training worker\nrunai training xgboost exec xgboost-01 --pod xgboost-01-worker-1 -- nvidia-smi\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_exec/#options","title":"Options","text":"
                                    -c, --container string               Container name for log extraction\n  -h, --help                           help for exec\n      --pod string                     Workload pod ID for log extraction, default: master (0-0)\n      --pod-running-timeout duration   Pod check for running state timeout.\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -i, --stdin                          Pass stdin to the container\n  -t, --tty                            Stdin is a TTY\n      --wait-timeout duration          Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_exec/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_exec/#see-also","title":"SEE ALSO","text":"
                                  • runai xgboost - alias for xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_list/","title":"Runai xgboost list","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_list/#runai-xgboost-list","title":"runai xgboost list","text":"

                                  list xgboost training

                                  runai xgboost list [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_list/#examples","title":"Examples","text":"
                                  # List all xgboost training workloads\nrunai training xgboost list -A\n\n# List xgboost training workloads with default project\nrunai training xgboost list\n\n# List xgboost training workloads in a specific project\nrunai training xgboost list -p <project_name>\n\n# List all xgboost training workloads with a specific output format\nrunai training xgboost list -o wide\n\n# List xgboost training workloads with pagination\nrunai training xgboost list --limit 20 --offset 40\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_list/#options","title":"Options","text":"
                                    -A, --all              list workloads from all projects\n  -h, --help             help for list\n      --json             Output structure JSON\n      --limit int32      number of workloads in the list (default 50)\n      --no-headers       Output structure table without headers\n      --offset int32     offset into the list of workloads, default 0 (first offset)\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --status string    filter by workload state\n      --table            Output structure table\n      --yaml             Output structure YAML\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_list/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_list/#see-also","title":"SEE ALSO","text":"
                                  • runai xgboost - alias for xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_logs/","title":"Runai xgboost logs","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_logs/#runai-xgboost-logs","title":"runai xgboost logs","text":"

                                  view logs of a xgboost training job

                                  runai xgboost logs [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_logs/#examples","title":"Examples","text":"
                                  # Get logs for a xgboost training\nrunai training xgboost logs xgboost-01\n\n# Get logs for a specific pod in a xgboost training\nrunai training xgboost logs xgboost-01 --pod=xgboost-01-worker-0\n\n# Get logs for a specific container in a xgboost training\nrunai training xgboost logs xgboost-01 --container=xgboost-worker\n\n# Get the last 100 lines of logs\nrunai training xgboost logs xgboost-01 --tail=100\n\n# Get logs with timestamps\nrunai training xgboost logs xgboost-01 --timestamps\n\n# Follow the logs\nrunai training xgboost logs xgboost-01 --follow\n\n# Get logs for the previous instance of the xgboost training\nrunai training xgboost logs xgboost-01 --previous\n\n# Limit the logs to 1024 bytes\nrunai training xgboost logs xgboost-01 --limit-bytes=1024\n\n# Get logs since the last 5 minutes\nrunai training xgboost logs xgboost-01 --since=300s\n\n# Get logs since a specific timestamp\nrunai training xgboost logs xgboost-01 --since-time=2023-05-30T10:00:00Z\n\n# Wait up to 30 seconds for xgboost training to be ready for logs\nrunai training xgboost logs xgboost-01 --wait-timeout=30s\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_logs/#options","title":"Options","text":"
                                    -c, --container string        Container name for log extraction\n  -f, --follow                  Follow log output\n  -h, --help                    help for logs\n      --limit-bytes int         Limit the number of bytes returned from the server\n      --name string             Set workload name for log extraction\n      --pod string              Workload pod ID for log extraction, default: master (0-0)\n      --previous                Show previous pod log output\n  -p, --project string          Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --since duration          Return logs newer than a relative duration like 5s, 2m, or 3h. Defaults to all logs\n      --since-time string       Return logs after a specific date (RFC3339)\n  -t, --tail int                Number of tailed lines to fetch from the log, for no limit set to -1 (default -1)\n      --timestamps              Show timestamps in log output\n      --wait-timeout duration   Timeout for waiting for workload to be ready for log streaming\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_logs/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_logs/#see-also","title":"SEE ALSO","text":"
                                  • runai xgboost - alias for xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_port-forward/","title":"Runai xgboost port forward","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_port-forward/#runai-xgboost-port-forward","title":"runai xgboost port-forward","text":"

                                  forward one or more local ports to a xgboost training job

                                  runai xgboost port-forward [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_port-forward/#examples","title":"Examples","text":"
                                  # Forward connections from localhost:8080 to xgboost training on port 8090:\nrunai training xgboost port-forward xgboost-01 --port 8080:8090 --address localhost\n\n# Forward connections from 0.0.0.0:8080 to xgboost training on port 8080:\nrunai training xgboost port-forward xgboost-01 --port 8080 --address 0.0.0.0 [requires privileges]\n\n# Forward multiple connections from localhost:8080 to xgboost training on port 8090 and from localhost:6443 to xgboost training on port 443:\nrunai training xgboost port-forward xgboost-01 --port 8080:8090 --port 6443:443 --address localhost\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_port-forward/#options","title":"Options","text":"
                                        --address string                 --address [local-interface-ip\\host] --address localhost --address 0.0.0.0 [privileged] (default \"localhost\")\n  -h, --help                           help for port-forward\n      --pod string                     Workload pod ID for port-forward, default: distributed(master) otherwise(random)\n      --pod-running-timeout duration   Pod check for running state timeout.\n      --port stringArray               port\n  -p, --project string                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_port-forward/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_port-forward/#see-also","title":"SEE ALSO","text":"
                                  • runai xgboost - alias for xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_resume/","title":"Runai xgboost resume","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_resume/#runai-xgboost-resume","title":"runai xgboost resume","text":"

                                  resume xgboost training

                                  runai xgboost resume [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_resume/#examples","title":"Examples","text":"
                                  # Resume a xgboost training workload\nrunai training xgboost resume <xgboost-name>\n\n# Resume a xgboost training workload in a specific project\nrunai training xgboost resume <xgboost-name> -p <project_name>\n\n# Resume a xgboost training workload by UUID\nrunai training xgboost resume --uuid=<xgboost_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_resume/#options","title":"Options","text":"
                                    -h, --help             help for resume\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_resume/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_resume/#see-also","title":"SEE ALSO","text":"
                                  • runai xgboost - alias for xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_submit/","title":"Runai xgboost submit","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_submit/#runai-xgboost-submit","title":"runai xgboost submit","text":"

                                  submit xgboost training

                                  runai xgboost submit [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_submit/#examples","title":"Examples","text":"
                                  # Submit a xgboost training workload\nrunai training xgboost submit <name> -p <project_name> -i runai.jfrog.io/demo/quickstart-demo\n\n# Submit a xgboost training workload with arguments\nrunai training xgboost submit <name> -p <project_name> -i ubuntu -- ls -la\n\n# Submit a xgboost training workload with a custom command\nrunai training xgboost submit <name> -p <project_name> -i ubuntu --command -- echo \"Hello, World\"\n\n# Submit a xgboost training master args with worker args\nrunai training xgboost submit <name> -p <project_name> -i ubuntu --master-args \"-a master_arg_a -b master_arg_b\" -- '-a worker_arg_a'\n\n# Submit a xgboost training master command with worker args\nrunai training xgboost submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" -- '-a worker_arg_a'\n\n# Submit a xgboost training master command with worker command\nrunai training xgboost submit <name> -p <project_name> -i ubuntu --master-command \"echo -e 'master command'\" --command -- echo -e 'worker command'\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_submit/#options","title":"Options","text":"
                                        --allow-privilege-escalation                     Allow the job to gain additional privileges after starting\n      --annotation stringArray                         Set of annotations to populate into the container running the workspace\n      --attach                                         If true, wait for the pod to start running, and then attach to the pod as if 'runai attach' was called. Attach makes tty and stdin true by default. Defaults to false\n      --auto-deletion-time-after-completion duration   The length of time (like 5s, 2m, or 3h, higher than zero) after which a completed job is automatically deleted (default 0s)\n      --backoff-limit int                              The number of times the job will be retried before failing (default 6)\n      --capability stringArray                         The POSIX capabilities to add when running containers. Defaults to the default set of capabilities granted by the container runtime.\n      --clean-pod-policy string                        Specifies which pods will be deleted when the workload reaches a terminal state (completed/failed)\n  -c, --command                                        If true, override the image's entrypoint with the command supplied after '--'\n      --configmap-map-volume stringArray               Mount ConfigMap as a volume. Use the format: name=CONFIGMAP_NAME,path=PATH\n      --cpu-core-limit float                           CPU core limit (e.g. 0.5, 1)\n      --cpu-core-request float                         CPU core request (e.g. 0.5, 1)\n      --cpu-memory-limit string                        CPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --cpu-memory-request string                      CPU memory to allocate for the job (e.g. 1G, 500M)\n      --create-home-dir                                Create a temporary home directory. Defaults to true when --run-as-user is set, false otherwise\n  -e, --environment stringArray                        Set environment variables in the container\n      --existing-pvc stringArray                       Mount an existing persistent volume. Use the format: claimname=CLAIM_NAME,path=PATH\n      --extended-resource stringArray                  Request access to an extended resource. Use the format: resource_name=quantity\n      --external-url stringArray                       Expose URL from the job container. Use the format: container=9443,url=https://external.runai.com,authusers=user1,authgroups=group1\n      --git-sync stringArray                           Specifies git repositories to mount into the container. Use the format: name=NAME,repository=REPO,path=PATH,secret=SECRET,rev=REVISION\n  -g, --gpu-devices-request int32                      GPU units to allocate for the job (e.g. 1, 2)\n      --gpu-memory-limit string                        GPU memory limit to allocate for the job (e.g. 1G, 500M)\n      --gpu-memory-request string                      GPU memory to allocate for the job (e.g. 1G, 500M)\n      --gpu-portion-limit float                        GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-portion-request float                      GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --gpu-request-type string                        GPU request type (portion|memory|migProfile)\n  -h, --help                                           help for submit\n      --host-ipc                                       Whether to enable host IPC. (Default: false)\n      --host-network                                   Whether to enable host networking. (Default: false)\n      --host-path stringArray                          Volumes to mount into the container. Use the format: path=PATH,mount=MOUNT,mount-propagation=None|HostToContainer,readwrite\n  -i, --image string                                   The image for the workload\n      --image-pull-policy string                       Set image pull policy. One of: Always, IfNotPresent, Never. Defaults to Always (default \"Always\")\n      --label stringArray                              Set of labels to populate into the container running the workspace\n      --large-shm                                      Request large /dev/shm device to mount\n      --master-args string                             Specifies the arguments to pass to the master pod container command\n      --master-command string                          Specifies the command to run in the master pod container, overriding the image's default entrypoint. The command can include arguments following it.\n      --master-environment stringArray                 Set master environment variables in the container\n      --master-extended-resource stringArray           Request access to an extended resource. Use the format: resource_name=quantity\n      --master-gpu-devices-request int32               GPU units to allocate for the job (e.g. 1, 2)\n      --master-gpu-portion-limit float                 GPU portion limit, must be no less than the gpu-memory-request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-gpu-portion-request float               GPU portion request (between 0 and 1, e.g. 0.5, 0.2)\n      --master-no-pvcs                                 Do not mount any persistent volumes in the master pod\n      --name-prefix string                             Set defined prefix for the workload name and add index as a suffix\n      --new-pvc stringArray                            Mount a persistent volume, create it if it does not exist. Use the format: claimname=CLAIM_NAME,storageclass=STORAGE_CLASS,size=SIZE,path=PATH,accessmode-rwo,accessmode-rom,accessmode-rwm,ro,ephemeral\n      --nfs stringArray                                NFS storage details. Use the format: path=PATH,server=SERVER,mountpath=MOUNT_PATH,readwrite\n      --node-pools stringArray                         List of node pools to use for scheduling the job, ordered by priority\n      --node-type string                               Enforce node type affinity by setting a node-type label\n      --pod-running-timeout duration                   Pod check for running state timeout.\n      --port stringArray                               Expose ports from the job container. Use the format: service-type=NodePort,container=80,external=8080\n      --preferred-pod-topology-key string              If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values\n  -p, --project string                                 Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n      --required-pod-topology-key string               Enforce scheduling pods of this job onto nodes that have a label with this key and identical values\n      --run-as-gid int                                 The group ID the container will run with\n      --run-as-uid int                                 The user ID the container will run with\n      --run-as-user                                    Takes the uid, gid, and supplementary groups fields from the token; if none of the fields exist, uses the local running terminal user credentials, and if any of the fields exist, takes only the existing fields\n      --s3 stringArray                                 s3 storage details. Use the format: name=NAME,bucket=BUCKET,path=PATH,accesskey=ACCESS_KEY,url=URL\n      --seccomp-profile string                         Indicates which kind of seccomp profile will be applied to the container, options: RuntimeDefault|Unconfined|Localhost\n      --stdin                                          Keep stdin open on the container(s) in the pod, even if nothing is attached\n      --supplemental-groups ints                       Comma separated list of groups (IDs) that the user running the container belongs to\n      --toleration stringArray                         Toleration details. Use the format: operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]\n  -t, --tty                                            Allocate a TTY for the container\n      --user-group-source string                       Indicate the way to determine the user and group ids of the container, options: fromTheImage|fromIdpToken\n      --wait-for-submit duration                       Waiting duration for the workload to be created in the cluster. Defaults to 1 minute (1m)\n      --workers int32                                  The number of workers that will be allocated for running the workload\n      --working-dir string                             Set the container's working directory\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_submit/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_submit/#see-also","title":"SEE ALSO","text":"
                                  • runai xgboost - alias for xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_suspend/","title":"Runai xgboost suspend","text":""},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_suspend/#runai-xgboost-suspend","title":"runai xgboost suspend","text":"

                                  suspend xgboost training

                                  runai xgboost suspend [WORKLOAD_NAME] [flags]\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_suspend/#examples","title":"Examples","text":"
                                  # Suspend a xgboost training workload\nrunai training xgboost suspend <xgboost-name>\n\n# Suspend a xgboost training workload in a specific project\nrunai training xgboost suspend <xgboost-name> -p <project_name>\n\n# Suspend a xgboost training workload by UUID\nrunai training xgboost suspend --uuid=<xgboost_uuid>\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_suspend/#options","title":"Options","text":"
                                    -h, --help             help for suspend\n  -p, --project string   Specify the project to which the command applies. By default, commands apply to the default project. To change the default project use \u2018runai config project <project name>\u2019\n  -u, --uuid string      The UUID of the workload\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_suspend/#options-inherited-from-parent-commands","title":"Options inherited from parent commands","text":"
                                        --config-file string   config file name; can be set by environment variable RUNAI_CLI_CONFIG_FILE (default \"config.json\")\n      --config-path string   config path; can be set by environment variable RUNAI_CLI_CONFIG_PATH\n  -d, --debug                enable debug mode\n  -q, --quiet                enable quiet mode, suppress all output except error messages\n      --verbose              enable verbose mode\n
                                  "},{"location":"Researcher/cli-reference/new-cli/runai_xgboost_suspend/#see-also","title":"SEE ALSO","text":"
                                  • runai xgboost - alias for xgboost management
                                  "},{"location":"Researcher/cli-reference/new-cli/guides/set-kubeconfig-with-oidc-parameters/","title":"Add Run:ai authorization to kubeconfig","text":"

                                  The runai kubeconfig set command configures the user's kubeconfig file with a Run:ai authorization token. This setup enables seamless access to the Kubernetes (k8s) cluster.

                                  Note

                                  Setting the kubeconfig is not required in order to use the CLI. This command is used to enable third-party workloads under Run:ai authorization.

                                  "},{"location":"Researcher/cli-reference/new-cli/guides/set-kubeconfig-with-oidc-parameters/#usage","title":"Usage","text":"

                                  To set the token (it is fetched automatically) in the kubeconfig file, run the following command:

                                  runai kubeconfig set\n
                                  "},{"location":"Researcher/cli-reference/new-cli/guides/set-kubeconfig-with-oidc-parameters/#prerequisites","title":"Prerequisites","text":"

                                  Before executing the command, ensure that:

                                  1. Cluster authentication is configured and enabled.
                                  2. The user has a kubeconfig file configured.
                                  3. The user is logged in (use the runai login command).
                                  "},{"location":"Researcher/cli-reference/new-cli/guides/set-kubeconfig-with-oidc-parameters/#cluster-configuration","title":"Cluster configuration","text":"

                                  To enable cluster authentication, add the following flags to the Kubernetes API server of each cluster:

                                  spec:\n  containers:\n  - command:\n    ...\n    - --oidc-client-id=<OIDC_CLIENT_ID>\n    - --oidc-issuer-url=https://<HOST>/auth/realms/<REALM>\n    - --oidc-username-prefix=-\n
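
                                  If the API server runs as a static pod managed by kubeadm, one way to confirm the flags were applied is to inspect the running pod. This is only a sketch; the namespace and label below are the kubeadm defaults and may differ in your environment:

                                  kubectl -n kube-system get pod -l component=kube-apiserver -o yaml | grep oidc\n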
                                  "},{"location":"Researcher/cli-reference/new-cli/guides/set-kubeconfig-with-oidc-parameters/#user-kubeconfig-configuration","title":"User Kubeconfig configuration","text":"

                                  Add the following to the Kubernetes client configuration file (~/.kube/config). For the full command reference, see kubeconfig set.

                                  • Make sure to replace values with the actual cluster information and user credentials.
                                  • There can be multiple contexts in the kubeconfig file. The command will configure the current context.
                                  apiVersion: v1\nkind: Config\npreferences:\n  colors: true\ncurrent-context: <CONTEXT_NAME>\ncontexts:\n- context:\n    cluster: <CLUSTER_NAME>\n    user: <USER_NAME>\n  name: <CONTEXT_NAME>\nclusters:\n- cluster:\n    server: <CLUSTER_URL>\n    certificate-authority-data: <CLUSTER_CERT>\n  name: <CLUSTER_NAME>\nusers:\n- name: <USER_NAME>\n
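
                                  After the context is configured and you are logged in, a typical flow looks like the sketch below. The final kubectl check is only illustrative and assumes your user is permitted to list pods in the chosen namespace (the namespace name is a placeholder):

                                  # Inject the Run:ai token into the current kubeconfig context\nrunai kubeconfig set\n\n# Verify that kubectl now authenticates through Run:ai\nkubectl get pods -n <namespace>\n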
                                  "},{"location":"Researcher/scheduling/GPU-time-slicing-scheduler/","title":"GPU Time Slicing Scheduler","text":""},{"location":"Researcher/scheduling/GPU-time-slicing-scheduler/#new-time-slicing-scheduler-by-runai","title":"New Time-slicing scheduler by Run:ai","text":"

To provide customers with predictable and accurate GPU compute resource scheduling, Run:ai is introducing a new feature called the Time-slicing GPU scheduler, which adds fractional compute capabilities on top of Run:ai's existing memory fraction capabilities. Unlike the default NVIDIA GPU orchestrator, which does not provide the ability to split or limit the runtime of each workload, Run:ai created a new mechanism that gives each workload exclusive access to the full GPU for a limited amount of time (lease time) in each scheduling cycle (plan time). This cycle repeats itself for the lifetime of the workload.

                                  Using the GPU runtime this way guarantees a workload is granted its requested GPU compute resources proportionally to its requested GPU fraction.

                                  Run:ai offers two new Time-slicing modes:

1. Strict\u2014each workload gets its precise GPU compute fraction, which equals its requested GPU (memory) fraction. In terms of official Kubernetes resource specification, this means:
                                  gpu-compute-request = gpu-compute-limit = gpu-(memory-)fraction\n
2. Fair\u2014each workload is guaranteed at least its GPU compute fraction, but can also use additional GPU runtime compute slices that are not used by other, idle workloads. Those excess time slices are divided equally between all workloads running on that GPU (after each has received at least its requested GPU compute fraction). In terms of official Kubernetes resource specification, this means:
                                  gpu-compute-request = gpu-(memory-)fraction\n\ngpu-compute-limit = 1.0\n

                                  The figure below illustrates how Strict time-slicing mode is using the GPU from Lease (slice) and Plan (cycle) perspective:

                                  The figure below illustrates how Fair time-slicing mode is using the GPU from Lease (slice) and Plan (cycle) perspective:

                                  "},{"location":"Researcher/scheduling/GPU-time-slicing-scheduler/#setting-the-time-slicing-scheduler-policy","title":"Setting the Time-slicing scheduler policy","text":"

Time-slicing is a cluster flag which changes the default behavior of the Run:ai GPU fractions feature.

                                  Enable time-slicing by setting the following cluster flag in the runaiconfig file:

                                  global: \n    core: \n        timeSlicing: \n            mode: fair/strict\n
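If you prefer not to edit the runaiconfig file directly, a merge patch similar to the ones used elsewhere in this guide can apply the same setting. This is a sketch that assumes the timeSlicing flag lives under the same spec.global.core path used by the other cluster flags shown in this document:

# sketch: assumes timeSlicing sits under spec.global.core, as with the other runaiconfig flags in this document\nkubectl patch -n runai runaiconfigs.run.ai/runai --type='merge' --patch '{\"spec\":{\"global\":{\"core\":{\"timeSlicing\":{\"mode\":\"fair\"}}}}}'\n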

If the timeSlicing flag is not set, the system continues to use the default NVIDIA GPU orchestrator to maintain backward compatibility.

                                  "},{"location":"Researcher/scheduling/GPU-time-slicing-scheduler/#time-slicing-plan-and-lease-times","title":"Time-slicing Plan and Lease Times","text":"

Each GPU scheduling cycle is a plan; the plan time is determined by the lease time and granularity (precision). By default, the basic lease time is 250ms with 5% granularity (precision), which means the plan (cycle) time is: 250 / 0.05 = 5000ms (5 sec). Using these values, a workload that asked for gpu-fraction=0.5 gets 2.5s of runtime out of the 5s cycle time.
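As a quick worked example using the default values above:

# plan (cycle) time = lease time / granularity = 250ms / 0.05 = 5000ms (5s)\n# a workload with gpu-fraction=0.5 gets 0.5 * 5000ms = 2500ms of GPU runtime per cycle\n# a workload with gpu-fraction=0.2 gets 0.2 * 5000ms = 1000ms of GPU runtime per cycle\n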

Different workloads require different SLAs and precision, so it is also possible to tune the lease time and precision to customize the time-slicing capabilities for your cluster.

                                  Note

Decreasing the lease time makes time-slicing less accurate. Increasing the lease time makes the system more accurate, but each workload is less responsive.

Once timeSlicing is enabled, all submitted GPU fraction or GPU memory workloads will have their gpu-compute-request and gpu-compute-limit set automatically by the system, depending on the annotation used and the timeSlicing mode:

                                  "},{"location":"Researcher/scheduling/GPU-time-slicing-scheduler/#strict-compute-resources","title":"Strict Compute Resources","text":"Annotation Value GPU Compute Request GPU Compute Limit gpu-fraction x x x gpu-memory x 0 1.0"},{"location":"Researcher/scheduling/GPU-time-slicing-scheduler/#fair-compute-resources","title":"Fair Compute Resources","text":"Annotation Value GPU Compute Request GPU Compute Limit gpu-fraction x x 1.0 gpu-memory x 0 1.0

                                  Note

The above tables show that when submitting a workload using the gpu-memory annotation, the system splits the GPU compute time between the different workloads running on that GPU. This means the workload can get anything from very little compute time (>0) to the full GPU compute time (1.0).

                                  "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/","title":"Introduction","text":"

                                  When we discuss the allocation of deep learning compute resources, the discussion tends to focus on GPUs as the most critical resource. But two additional resources are no less important:

                                  • CPUs. Mostly needed for preprocessing and postprocessing tasks during a deep learning training run.
                                  • Memory. Has a direct influence on the quantities of data a training run can process in batches.

                                  GPU servers tend to come installed with a significant amount of memory and CPUs.

                                  "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#requesting-cpu-memory","title":"Requesting CPU & Memory","text":"

                                  When submitting a Job, you can request a guaranteed amount of CPUs and memory by using the --cpu and --memory flags in the runai submit command. For example:

                                  runai submit job1 -i ubuntu --gpu 2 --cpu 12 --memory 1G\n

                                  The system guarantees that if the Job is scheduled, you will be able to receive this amount of CPU and memory.

                                  For further details on these flags see: runai submit

                                  "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#cpu-over-allocation","title":"CPU over allocation","text":"

                                  The number of CPUs your Job will receive is guaranteed to be the number defined using the --cpu flag. In practice, however, you may receive more CPUs than you have asked for:

                                  • If you are currently alone on a node, you will receive all the node CPUs until such time when another workload has joined.
• However, when a second workload joins, each workload will receive a number of CPUs proportional to the number requested via the --cpu flag. For example, if the first workload asked for 1 CPU and the second for 3 CPUs, then on a node with 40 CPUs, the workloads will receive 10 and 30 CPUs respectively. If the --cpu flag is not specified, it is taken from the cluster default (see the section below).
                                  "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#memory-over-allocation","title":"Memory over allocation","text":"

                                  The amount of Memory your Job will receive is guaranteed to be the number defined using the --memory flag. In practice, however, you may receive more memory than you have asked for. This is along the same lines as described with CPU over allocation above.

                                  It is important to note, however, that if you have used this memory over-allocation, and new workloads have joined, your Job may receive an out-of-memory exception and terminate.

                                  "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#cpu-and-memory-limits","title":"CPU and Memory limits","text":"

                                  You can limit your Job's allocation of CPU and memory by using the --cpu-limit and --memory-limit flags in the runai submit command. For example:

                                  runai submit job1 -i ubuntu --gpu 2 --cpu 12 --cpu-limit 24 \\\n    --memory 1G --memory-limit 4G\n

                                  The limit behavior is different for CPUs and memory.

                                  • Your Job will never be allocated with more than the amount stated in the --cpu-limit flag
                                  • If your Job tries to allocate more than the amount stated in the --memory-limit flag it will receive an out-of-memory exception.

                                  The limit (for both CPU and memory) overrides the cluster default described in the section below

                                  For further details on these flags see: runai submit

                                  "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#flag-defaults","title":"Flag Defaults","text":""},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#defaults-for-cpu-flag","title":"Defaults for --cpu flag","text":"

                                  If your Job has not specified --cpu, the system will use a default. The default is cluster-wide and is defined as a ratio of GPUs to CPUs.

                                  If, for example, the default has been defined as 1:6 and your Job has specified --gpu 2 and has not specified --cpu, then the implied --cpu flag value is 12 CPUs.

                                  The system comes with a cluster-wide default of 1:1. To change the ratio see below.

If you did not request any GPUs for your Job and have not specified --cpu, the default is defined as a ratio of the CPU limit to CPUs.

                                  If, for example, the default has been defined as 1:0.2 and your Job has specified --cpu-limit 10 and has not specified --cpu, then the implied --cpu flag value is 2 CPUs.

                                  The system comes with a cluster-wide default of 1:0.1. To change the ratio see below.

                                  "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#defaults-for-memory-flag","title":"Defaults for --memory flag","text":"

                                  If your Job has not specified --memory, the system will use a default. The default is cluster-wide and is proportional to the number of requested GPUs.

                                  The system comes with a cluster-wide default of 100MiB of allocated CPU memory per GPU. To change the ratio see below.

If you did not request any GPUs for your Job and have not specified --memory, the default is defined as a ratio of the CPU memory limit to the CPU memory request.

                                  The system comes with a cluster-wide default of 1:0.1. To change the ratio see below.

                                  "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#defaults-for-cpu-limit-flag","title":"Defaults for --cpu-limit flag","text":"

                                  If your Job has not specified --cpu-limit, then by default, the system will not set a limit. You can set a cluster-wide limit as a ratio of GPUs to CPUs. See below on how to change the ratio.

                                  "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#defaults-for-memory-limit-flag","title":"Defaults for --memory-limit flag","text":"

                                  If your Job has not specified --memory-limit, then by default, the system will not set a limit. You can set a cluster-wide limit as a ratio of GPUs to Memory. See below on how to change the ratio.

                                  "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#changing-the-ratios","title":"Changing the ratios","text":"

To change the cluster-wide ratios, use the following process. The example shows:

                                  • a CPU request with a default ratio of 2:1 CPUs to GPUs.
                                  • a CPU Memory request with a default ratio of 200MB per GPU.
                                  • a CPU limit with a default ratio of 4:1 CPU to GPU.
                                  • a Memory limit with a default ratio of 2GB per GPU.
                                  • a CPU request with a default ratio of 0.1 CPUs per 1 CPU limit.
                                  • a CPU Memory request with a default ratio of 0.1:1 request per CPU Memory limit.

                                  You must edit the cluster installation values file:

                                  • When installing the Run:ai cluster, edit the values file.
                                  • On an existing installation, use the upgrade cluster instructions to modify the values file.
                                  • You must specify at least the first 4 values as follows:
                                  runai-operator:\n  config:\n    limitRange:\n      cpuDefaultRequestGpuFactor: 2\n      memoryDefaultRequestGpuFactor: 200Mi\n      cpuDefaultLimitGpuFactor: 4\n      memoryDefaultLimitGpuFactor: 2Gi\n      cpuDefaultRequestCpuLimitFactorNoGpu: 0.1\n      memoryDefaultRequestMemoryLimitFactorNoGpu: 0.1\n
                                  "},{"location":"Researcher/scheduling/allocation-of-cpu-and-memory/#validating-cpu-memory-allocations","title":"Validating CPU & Memory Allocations","text":"

                                  To review CPU & Memory allocations you need to look into Kubernetes. A Run:ai Job creates a Kubernetes pod. The pod declares its resource requests and limits. To see the memory and CPU consumption in Kubernetes:

                                  • Get the pod name for the Job by running:

                                  runai describe job <JOB_NAME>

                                  the pod will appear under the PODS category.

                                  • Run:

                                  kubectl describe pod <POD_NAME>

                                  The information will appear under Requests and Limits. For example:

                                  Limits:\n    nvidia.com/gpu:  2\nRequests:\n    cpu:             1\n    memory:          104857600\n    nvidia.com/gpu:  2\n
                                  "},{"location":"Researcher/scheduling/dynamic-gpu-fractions/","title":"Dynamic GPU Fractions","text":""},{"location":"Researcher/scheduling/dynamic-gpu-fractions/#introduction","title":"Introduction","text":"

                                  Many AI workloads are using GPU resources intermittently and sometimes these resources are not used at all. These AI workloads need these resources when they are running AI applications, or debugging a model in development. Other workloads such as Inference, might be using GPU resources at a lower utilization rate than requested, and may suddenly ask for higher guaranteed resources at peak utilization times.

                                  This pattern of resource request vs. actual resource utilization causes lower utilization of GPUs. This mainly happens if there are many workloads requesting resources to match their peak demand, even though the majority of the time they operate far below that peak.

Run:ai introduced Dynamic GPU fractions in v2.15 to cope with this gap between resource requests and actual resource utilization, enabling users to optimize GPU resource usage.

                                  Dynamic GPU fractions is part of Run:ai's core capabilities to enable workloads to optimize the use of GPU resources. This works by providing the ability to specify and consume GPU memory and compute resources dynamically by leveraging Kubernetes Request and Limit notations.

                                  Dynamic GPU fractions allow a workload to request a guaranteed fraction of GPU memory or GPU compute resource (similar to a Kubernetes request), and at the same time also request the ability to grow beyond that guaranteed request up to a specific limit (similar to a Kubernetes limit), if the resources are available.

                                  For example, with Dynamic GPU Fractions, a user can specify a workload with a GPU fraction Request of 0.25 GPU, and add the parameter gpu-fraction-limit of up to 0.80 GPU. The cluster/node-pool scheduler schedules the workload to a node that can provide the GPU fraction request (0.25), and then assigns the workload to a GPU. The GPU scheduler monitors the workload and allows it to occupy memory between 0 to 0.80 of the GPU memory (based on the parameter gpu-fraction-limit), where only 0.25 of the GPU memory is guaranteed to that workload. The rest of the memory (from 0.25 to 0.8) is \u201cloaned\u201d to the workload, as long as it is not needed by other workloads.
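For illustration only, a workload like the one described above could be expressed directly on a pod spec. This is a minimal sketch that assumes the gpu-fraction and gpu-fraction-limit values are set as pod annotations, in the same way as the gpu-fraction annotation discussed below; the pod name, container name, and image are placeholders:

apiVersion: v1\nkind: Pod\nmetadata:\n  name: dynamic-fraction-example   # hypothetical name\n  annotations:\n    gpu-fraction: \"0.25\"        # guaranteed GPU fraction (Request)\n    gpu-fraction-limit: \"0.80\"  # burstable ceiling (Limit); assumption: set as an annotation\nspec:\n  containers:\n  - name: main\n    image: ubuntu\n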

Run:ai automatically manages the state changes between Request and Limit as well as the reverse (when the balance needs to be \"returned\"), updating the metrics and the workloads' states and graphs.

                                  "},{"location":"Researcher/scheduling/dynamic-gpu-fractions/#setting-fractional-gpu-memory-limit","title":"Setting Fractional GPU Memory Limit","text":"

                                  With the fractional GPU memory limit, users can submit workloads using GPU fraction Request and Limit.

                                  You can either:

                                  1. Use a GPU Fraction parameter (use the gpu-fraction annotation)

                                    or

                                  2. Use an absolute GPU Memory parameter (gpu-memory annotation)

When setting a GPU memory limit, either as a GPU fraction or as a GPU memory size, the Limit must be equal to or greater than the GPU fraction memory request.

                                  Both GPU fraction and GPU memory are translated into the actual requested memory size of the Request (guaranteed resources) and the Limit (burstable resources).

                                  To guarantee fair quality of service between different workloads using the same GPU, Run:ai developed an extendable GPU OOMKiller (Out Of Memory Killer) component that guarantees the quality of service using Kubernetes semantics for resources Request and Limit.

                                  The OOMKiller capability requires adding CAP_KILL capabilities to the Dynamic GPU fraction and to the Run:ai core scheduling module (toolkit daemon). This capability is disabled by default.

                                  To change the state of Dynamic GPU Fraction in the cluster, edit the runaiconfig file and set:

                                  spec: \n  global: \n    core: \n      dynamicFraction: \n        enabled: true # Boolean field default is true.\n
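Alternatively, the same flag can be applied with a merge patch, following the same pattern used for the other runaiconfig flags in this document (shown here as a sketch):

# sketch: applies the dynamicFraction flag shown above via a merge patch\nkubectl patch -n runai runaiconfigs.run.ai/runai --type='merge' --patch '{\"spec\":{\"global\":{\"core\":{\"dynamicFraction\":{\"enabled\": true}}}}}'\n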

                                  To set the gpu memory limit per workload, add the RUNAI_GPU_MEMORY_LIMIT environment variable to the first container in the pod. This is the GPU consuming container.

                                  To use RUNAI_GPU_MEMORY_LIMIT environment variable:

1. Submit a workload yaml directly, and set the RUNAI_GPU_MEMORY_LIMIT environment variable (see the sketch after this list).

                                  2. Create a policy, per Project or globally. For example, set all Interactive workloads of Project=research_vision1 to always set the environment variable of RUNAI_GPU_MEMORY_LIMIT to 1.

                                  3. Pass the environment variable through the CLI or the UI.
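The following is a minimal, hypothetical sketch of option 1 above: a pod that requests a fraction of a GPU through the gpu-fraction annotation and sets RUNAI_GPU_MEMORY_LIMIT on the first (GPU-consuming) container. The pod name, container name, image, and values are placeholders:

apiVersion: v1\nkind: Pod\nmetadata:\n  name: gpu-memory-limit-example   # hypothetical name\n  annotations:\n    gpu-fraction: \"0.25\"           # guaranteed GPU fraction (Request)\nspec:\n  containers:\n  - name: main                     # the first container is the GPU-consuming one\n    image: ubuntu\n    env:\n    - name: RUNAI_GPU_MEMORY_LIMIT\n      value: \"4G\"                  # placeholder value; see the supported formats below\n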

                                  The supported values depend on the label used. You can use them in either the UI or the CLI. Use only one of the variables in the following table (they cannot be mixed):

| Variable | Input format |
| --- | --- |
| gpu-fraction | A fraction value (for example: 0.25, 0.75). |
| gpu-memory | A Kubernetes resource quantity, which must be larger than the gpu-memory request. For example, 500000000, 2500M, 4G. NOTE: The gpu-memory label values are always in MB, unlike the env variable. |

"},{"location":"Researcher/scheduling/dynamic-gpu-fractions/#compute-resources-ui-with-dynamic-fractions-support","title":"Compute Resources UI with Dynamic Fractions support","text":"

To enable the UI elements for Dynamic Fractions, press Settings, General, then open the Resources pane and toggle GPU Resource Optimization. This enables all the UI features related to GPU Resource Optimization for the whole tenant. There are other per-cluster or per-node-pool configurations that should be set in order to use the capabilities of \u2018GPU Resource Optimization\u2019. See the documentation for each of these features. Once the \u2018GPU Resource Optimization\u2019 feature is enabled, you will be able to create Compute Resources with the GPU Portion (Fraction) Limit and GPU Memory Limit. In addition, you will be able to view the workloads\u2019 utilization vs. the Request and Limit parameters in the Metrics pane for each workload.

                                  Note

When setting a workload with Dynamic Fractions (for example, when using it with a GPU Request or GPU memory Limit), you effectively make the workload burstable. This means it can use memory that is not guaranteed for that workload and is susceptible to an \u2018OOM Kill\u2019 signal if the actual owner of that memory requires it back. This applies to non-preemptive workloads as well. For that reason, it is recommended that you use Dynamic Fractions with Interactive workloads running Notebooks. Notebook pods are not evicted when their GPU process is OOM Kill\u2019ed. This behavior is the same as standard Kubernetes burstable CPU workloads.

                                  "},{"location":"Researcher/scheduling/dynamic-gpu-fractions/#multi-gpu-dynamic-fractions","title":"Multi-GPU Dynamic Fractions","text":"

Run:ai also supports workload submission using multi-GPU dynamic fractions. Multi-GPU dynamic fractions work similarly to dynamic fractions on a single GPU workload, however, instead of a single GPU device, the Run:ai Scheduler allocates the same dynamic fraction pair (Request and Limit) on multiple GPU devices within the same node. For example, if practitioners develop a new model that uses 8 GPUs and requires 40GB of memory per GPU, but may want to burst out and consume up to the full GPU memory, they can allocate 8\u00d740GB with multi-GPU fractions and a limit of 80GB (e.g. H100 GPU) instead of reserving the full memory of each GPU (e.g. 80GB). This leaves 40GB of GPU memory available on each of the 8 GPUs for other workloads within that node. This is useful during model development, where memory requirements are usually lower due to experimentation with smaller models or configurations.

                                  This approach significantly improves GPU utilization and availability, enabling more precise and often smaller quota requirements for the end user. Time sharing where single GPUs can serve multiple workloads with dynamic fractions remains unchanged, only now, it serves multiple workloads using multi-GPU per workload.

                                  "},{"location":"Researcher/scheduling/dynamic-gpu-fractions/#configuring-multi-gpu-dynamic-fractions","title":"Configuring Multi-GPU Dynamic Fractions","text":"

                                  You can configure multi-GPU dynamic fractions as follows:

                                  • Using the compute resources asset, you can define the compute requirement to run multiple GPU devices, by specifying either a fraction (percentage) of the overall memory or specifying the memory request (GB, MB), both with Request and Limit parameters:

                                  • You can submit a workload with dynamic fractions using the CLI V2:

                                  "},{"location":"Researcher/scheduling/fractions/","title":"Allocation of GPU Fractions","text":""},{"location":"Researcher/scheduling/fractions/#introduction","title":"Introduction","text":"

                                  A single GPU has a significant amount of memory. Ranging from a couple of gigabytes in older generations and up to 80GB per GPU in the later models of the latest NVIDIA GPU technology. A single GPU also has a vast amount of computing power.

                                  This amount of memory and computing power is important for processing large amounts of data, such as in training deep learning models. However, there are quite a few applications that do not need this power. Examples can be inference workloads and the model-creation phase. It would thus be convenient if we could divide up a GPU between various workloads, thus achieving better GPU utilization.

                                  This article describes a Run:ai technology called Fractions that allow the division of GPUs and how to use them with Run:ai.

                                  "},{"location":"Researcher/scheduling/fractions/#runai-fractions","title":"Run:ai Fractions","text":"

Run:ai provides the capability to allocate a container with a specific amount of GPU RAM. As a researcher, if you know that your code needs 4GB of GPU RAM, you can submit a job using the flag --gpu-memory 4G to specify the exact portion of the GPU memory that you need. Run:ai will allocate your container that specific amount of GPU RAM. Attempting to reach beyond your allotted RAM will result in an out-of-memory exception.

You can also use the flag --gpu 0.2 to get 20% of the GPU memory on the GPU assigned to you.
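For example, either of the following submissions (a sketch using the same flags described above; job names and the image are placeholders) would give the container a slice of a GPU:

# job names and image are placeholders\nrunai submit frac1 -i ubuntu --gpu-memory 4G\nrunai submit frac2 -i ubuntu --gpu 0.2\n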

                                  For more details on Run:ai fractions see the fractions quickstart.

                                  Limitation

                                  With the fraction technology all running workloads, which utilize the GPU, share the compute in parallel and on average get an even share of the compute. For example, assuming two containers, one with 0.25 GPU workload and the other with 0.75 GPU workload - both will get (on average) an equal part of the computation power. If one of the workloads does not utilize the GPU, the other workload will get the entire GPU's compute power.

                                  Info

                                  For interoperability with other Kubernetes schedulers, Run:ai creates special reservation pods. Once a workload has been allocated a fraction of a GPU, Run:ai will create a pod in a dedicated runai-reservation namespace with the full GPU as a resource. This would cause other schedulers to understand that the GPU is reserved.

                                  "},{"location":"Researcher/scheduling/fractions/#multi-gpu-fractions","title":"Multi-GPU Fractions","text":"

Run:ai also supports workload submission using multi-GPU fractions. Multi-GPU fractions work similarly to fractional single GPU workloads, however, the Run:ai Scheduler allocates the same fraction size on multiple GPU devices within the same node. For example, if practitioners develop a new model that uses 8 GPUs and requires 40GB of memory per GPU, they can allocate 8\u00d740GB with multi-GPU fractions instead of reserving the full memory of each GPU (e.g. 80GB). This leaves 40GB of GPU memory available on each of the 8 GPUs for other workloads within that node. This is useful during model development, where memory requirements are usually lower due to experimentation with smaller models or configurations.

                                  This approach significantly improves GPU utilization and availability, enabling more precise and often smaller quota requirements for the end user. Time sharing where single GPUs can serve multiple workloads with fractions remains unchanged, only now, it serves multiple workloads using multi-GPU per workload, single GPU per workload, or a mix of both.

                                  "},{"location":"Researcher/scheduling/fractions/#configuring-multi-gpu-fractions","title":"Configuring Multi-GPU Fractions","text":"

                                  Using the compute resources asset, you can define the compute requirement to run multiple GPU devices, by specifying either a fraction (percentage) of the overall memory or specifying the memory request (GB, MB):

                                  "},{"location":"Researcher/scheduling/fractions/#see-also","title":"See Also","text":"
                                  • Fractions quickstart.
                                  "},{"location":"Researcher/scheduling/gpu-memory-swap/","title":"GPU Memory SWAP","text":""},{"location":"Researcher/scheduling/gpu-memory-swap/#introduction","title":"Introduction","text":"

                                  To ensure efficient and effective usage of an organization\u2019s resources, Run:ai provides multiple features on multiple layers to help administrators and practitioners maximize their existing GPUs resource utilization.

                                  Run:ai\u2019s GPU memory swap feature helps administrators and AI practitioners to further increase the utilization of existing GPU hardware by improving GPU sharing between AI initiatives and stakeholders. This is done by expanding the GPU physical memory to the CPU memory which is typically an order of magnitude larger than that of the GPU.

Expanding the GPU physical memory helps the Run:ai system place more workloads on the same physical GPU hardware, and provides smooth workload context switching between GPU memory and CPU memory, eliminating the need to kill workloads when the memory requirement is larger than what the GPU physical memory can provide.

                                  "},{"location":"Researcher/scheduling/gpu-memory-swap/#benefits-of-gpu-memory-swap","title":"Benefits of GPU memory swap","text":"

                                  There are several use cases where GPU memory swap can benefit and improve the user experience and the system's overall utilization:

                                  "},{"location":"Researcher/scheduling/gpu-memory-swap/#sharing-a-gpu-between-multiple-interactive-workloads-notebooks","title":"Sharing a GPU between multiple interactive workloads (notebooks)","text":"

AI practitioners use notebooks to develop and test new AI models and to improve existing AI models. While developing or testing an AI model, notebooks use GPU resources intermittently; yet the GPU resources they require are pre-allocated by the notebook and cannot be used by other workloads once reserved. To overcome this inefficiency, Run:ai introduced Dynamic Fractions and Node Level Scheduler.

                                  When one or more workloads require more than their requested GPU resources, there\u2019s a high probability not all workloads can run on a single GPU because the total memory required is larger than the physical size of the GPU memory.

With GPU memory swap, several workloads can run on the same GPU, even if the sum of their used memory is larger than the size of the physical GPU memory. GPU memory swap can swap workloads in and out interchangeably, allowing multiple workloads to each use the full amount of GPU memory. The most common scenario is for one workload to run on the GPU (for example, an interactive notebook), while other notebooks are either idle or using the CPU to develop new code (while not using the GPU). From a user experience point of view, the swap in and out is a smooth process since the notebooks do not notice that they are being swapped in and out of the GPU memory. On rare occasions, when multiple notebooks need to access the GPU simultaneously, slower workload execution may be experienced.

                                  Notebooks typically use the GPU intermittently, therefore with high probability, only one workload (for example, an interactive notebook), will use the GPU at a time. The more notebooks the system puts on a single GPU, the higher the chances are that there will be more than one notebook requiring the GPU resources at the same time. Admins have a significant role here in fine tuning the number of notebooks running on the same GPU, based on specific use patterns and required SLAs. Using \u2018Node Level Scheduler\u2019 reduces GPU access contention between different interactive notebooks running on the same node.

                                  "},{"location":"Researcher/scheduling/gpu-memory-swap/#sharing-a-gpu-between-inferenceinteractive-workloads-and-training-workloads","title":"Sharing a GPU between inference/interactive workloads and training workloads","text":"

                                  A single GPU can be shared between an interactive or inference workload (for example, a Jupyter notebook, image recognition services, or an LLM service), and a training workload that is not time-sensitive or delay-sensitive. At times when the inference/interactive workload uses the GPU, both training and inference/interactive workloads share the GPU resources, each running part of the time swapped-in to the GPU memory, and swapped-out into the CPU memory the rest of the time.

                                  Whenever the inference/interactive workload stops using the GPU, the swap mechanism swaps out the inference/interactive workload GPU data to the CPU memory. Kubernetes wise, the POD is still alive and running using the CPU. This allows the training workload to run faster when the inference/interactive workload is not using the GPU, and slower when it does, thus sharing the same resource between multiple workloads, fully utilizing the GPU at all times, and maintaining uninterrupted service for both workloads.

                                  "},{"location":"Researcher/scheduling/gpu-memory-swap/#serving-inference-warm-models-with-gpu-memory-swap","title":"Serving inference warm models with GPU memory swap","text":"

                                  Running multiple inference models is a demanding task and you will need to ensure that your SLA is met. You need to provide high performance and low latency, while maximizing GPU utilization. This becomes even more challenging when the exact model usage patterns are unpredictable. You must plan for the agility of inference services and strive to keep models on standby in a ready state rather than an idle state.

Run:ai\u2019s GPU memory swap feature enables you to load multiple models to a single GPU, where each can use up to the full amount of GPU memory. Using an application load balancer, the administrator can control to which server each inference request is sent. The GPU can then be loaded with multiple models, where the model in use is loaded into the GPU memory and the rest of the models are swapped out to the CPU memory. The swapped models are stored as ready models to be loaded when required. GPU memory swap always maintains the context of the workload (model) on the GPU so it can easily and quickly switch between models. This is unlike industry-standard model servers that load models from scratch into the GPU whenever required.

                                  "},{"location":"Researcher/scheduling/gpu-memory-swap/#configuring-memory-swap","title":"Configuring memory swap","text":"

Prerequisites\u2014before configuring GPU memory swap, the administrator must configure the Dynamic Fractions feature, and optionally configure the Node Level Scheduler feature.

                                  The first enables you to make your workloads burstable, and both features will maximize your workloads\u2019 performance and GPU utilization within a single node.

To enable GPU memory swap in a Run:ai cluster, the administrator must update the runaiconfig file with the following parameters:

                                  spec: \n global: \n   core: \n     swap:\n       enabled: true\n       limits:\n         cpuRam: 100Gi\n

                                  The example above uses 100Gi as the size of the swap memory.

                                  You can also use the patch command from your terminal:

                                  kubectl patch -n runai runaiconfigs.run.ai/runai --type='merge' --patch '{\"spec\":{\"global\":{\"core\":{\"swap\":{\"enabled\": true, \"limits\": {\"cpuRam\": \"100Gi\"}}}}}}'\n

                                  To make a workload swappable, a number of conditions must be met:

                                  1. The workload MUST use Dynamic Fractions. This means the workload\u2019s memory request is less than a full GPU, but it may add a GPU memory limit to allow the workload to effectively use the full GPU memory.

2. The administrator must label each node intended to provide GPU memory swap with run.ai/swap-enabled=true; this enables the feature on that node (see the example after this list). Enabling the feature reserves CPU memory to serve the swapped GPU memory from all GPUs on that node. The administrator sets the size of the reserved CPU RAM using the runaiconfig file.

                                  3. Optionally, configure Node Level Scheduler. Using node level scheduler can help in the following ways:

                                    • The Node Level Scheduler automatically spreads workloads between the different GPUs on a node, ensuring maximum workload performance and GPU utilization.
                                    • In scenarios where Interactive notebooks are involved, if the CPU reserved memory for the GPU swap is full, the Node Level Scheduler preempts the GPU process of that workload and potentially routes the workload to another GPU to run.
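For step 2 above, labeling a node is a standard kubectl operation, for example (the node name is a placeholder):

# <NODE_NAME> is a placeholder for the node to enable GPU memory swap on\nkubectl label node <NODE_NAME> run.ai/swap-enabled=true\n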
                                  "},{"location":"Researcher/scheduling/gpu-memory-swap/#configure-system-reserved-gpu-resources","title":"Configure system reserved GPU Resources","text":"

Swappable workloads require reserving a small part of the GPU for non-swappable allocations like binaries and GPU context. To avoid out-of-memory (OOM) errors due to non-swappable memory regions, the system reserves 2GiB of GPU RAM by default, effectively truncating the total size of the GPU memory. For example, a 16GiB T4 will appear as 14GiB on a swap-enabled node. The exact reserved size is application-dependent, and 2GiB is a safe assumption for 2-3 applications sharing and swapping on a GPU. This value can be changed by editing the runaiconfig specification as follows:

                                  spec: \n global: \n   core: \n     swap:\n       limits:\n         reservedGpuRam: 2Gi\n

                                  You can also use the patch command from your terminal:

                                  kubectl patch -n runai runaiconfigs.run.ai/runai --type='merge' --patch '{\"spec\":{\"global\":{\"core\":{\"swap\":{\"limits\":{\"reservedGpuRam\": <quantity>}}}}}}'\n

                                  This configuration is in addition to the Dynamic Fractions configuration, and optional Node Level Scheduler configuration.

                                  "},{"location":"Researcher/scheduling/gpu-memory-swap/#preventing-your-workloads-from-getting-swapped","title":"Preventing your workloads from getting swapped","text":"

If you prefer that your workloads not be swapped into CPU memory, you can specify an anti-affinity to the run.ai/swap-enabled=true node label on the pod when submitting your workloads, and the Scheduler will ensure not to use swap-enabled nodes.
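A minimal sketch of such a pod-level constraint, using standard Kubernetes node affinity to keep the pod off nodes labeled run.ai/swap-enabled=true (the affinity fields are standard Kubernetes; everything else in your pod spec stays as is):

# sketch: pod-level constraint to avoid swap-enabled nodes\nspec:\n  affinity:\n    nodeAffinity:\n      requiredDuringSchedulingIgnoredDuringExecution:\n        nodeSelectorTerms:\n        - matchExpressions:\n          - key: run.ai/swap-enabled\n            operator: NotIn\n            values:\n            - \"true\"\n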

                                  "},{"location":"Researcher/scheduling/gpu-memory-swap/#known-limitations","title":"Known Limitations","text":"
• A pod created before the GPU memory swap feature was enabled in that cluster cannot be scheduled to a swap-enabled node. An appropriate event is generated in case no matching node is found. Users must re-submit those pods to make them swap-enabled.
• GPU memory swap cannot be enabled if fair time-slicing or strict time-slicing is used; GPU memory swap can only be used with the default time-slicing mechanism.
                                  • CPU RAM size cannot be decreased once GPU memory swap is enabled.
                                  "},{"location":"Researcher/scheduling/gpu-memory-swap/#what-happens-when-the-cpu-reserved-memory-for-gpu-swap-is-exhausted","title":"What happens when the CPU reserved memory for GPU swap is exhausted?","text":"

CPU memory is limited, and a single CPU serves multiple GPUs on a node, usually between 2 and 8 GPUs. For example, when using 80GB of GPU memory, each swapped workload consumes up to 80GB (but may use less), assuming each GPU is shared between 2-4 workloads. In this example, you can see how the swap memory can become very large. Therefore, we give administrators a way to limit the size of the CPU memory reserved for swapped GPU memory on each swap-enabled node.

Limiting the CPU reserved memory means that there may be scenarios where the GPU memory cannot be swapped out to the CPU reserved RAM. Whenever the CPU reserved memory for swapped GPU memory is exhausted, the workloads currently running will not be swapped out to the CPU reserved RAM; instead, Node Level Scheduler logic takes over and provides GPU resource optimization. See Node Level Scheduler.

                                  "},{"location":"Researcher/scheduling/gpu-memory-swap/#multi-gpu-memory-swap","title":"Multi-GPU Memory Swap","text":"

                                  Run:ai also supports workload submission using multi-GPU memory swap. Multi-GPU memory swap works similarly to single GPU memory swap, but instead of swapping memory for a single GPU workload, it swaps memory for workloads across multiple GPUs simultaneously and synchronously.

                                  The Run:ai Scheduler allocates the same dynamic fraction pair (Request and Limit) on multiple GPU devices in the same node. For example, if you want to run two LLM models, each consuming 8 GPUs that are not used simultaneously, you can use GPU memory swap to share their GPUs. This approach allows multiple models to be stacked on the same node.

                                  The following outlines the advantages of stacking multiple models on the same node:

                                  • Maximizes GPU utilization: Efficiently uses available GPU resources by enabling multiple workloads to share GPUs.
• Improves cold start times: Loading large LLM models to a node and its GPUs can take several minutes during a \u201ccold start\u201d. Using memory swap turns this process into a \u201cwarm start\u201d that takes only a fraction of a second to a few seconds (depending on the model size and the GPU model).
                                  • Increases GPU availability: Frees up and maximizes GPU availability for additional workloads (and users), enabling better resource sharing.
                                  • Smaller quota requirements: Enables more precise and often smaller quota requirements for the end user.
                                  "},{"location":"Researcher/scheduling/gpu-memory-swap/#configuring-multi-gpu-memory-swap","title":"Configuring multi-GPU memory swap","text":"

                                  You can configure multi-GPU memory swapping as follows:

• Using the compute resources asset, you can define the compute requirement to run multiple GPU devices, by specifying either a fraction (percentage) of the overall memory or specifying the memory request (GB, MB). Both options require defining the Request and Limit parameters. Workloads can then be scheduled to nodes or node pools where memory swap is enabled.

                                  • You can submit a workload with dynamic fractions using the CLI V2:

                                  "},{"location":"Researcher/scheduling/node-level-scheduler/","title":"Optimize performance with Node Level Scheduler","text":"

The Node Level Scheduler optimizes the performance of your pods and maximizes the utilization of GPUs by making optimal local decisions on GPU allocation to your pods. While the Cluster Scheduler chooses the specific node for a pod but has no visibility into the node\u2019s GPUs\u2019 internal state, the Node Level Scheduler is aware of the local GPU states and makes optimal local decisions, optimizing both GPU utilization and the performance of the pods running on the node\u2019s GPUs.

                                  Node Level Scheduler applies to all workload types, but will best optimize the performance of burstable workloads, giving those more GPU memory than requested and up to the limit specified. Be aware, burstable workloads are always susceptible to an OOM Kill signal if the owner of the excess memory requires it back. This means that using the Node Level Scheduler with Inference or Training workloads may cause pod preemption. Interactive workloads that are using notebooks behave differently since the OOM Kill signal will cause the Notebooks' GPU process to exit but not the notebook itself. This keeps the Interactive pod running and retrying to attach a GPU again. This makes Interactive workloads with notebooks a great use case for burstable workloads and Node Level Scheduler.

                                  "},{"location":"Researcher/scheduling/node-level-scheduler/#interactive-notebooks-use-case","title":"Interactive Notebooks Use Case","text":"

                                  Consider the following example of a node with 2 GPUs and 2 interactive pods that are submitted and want GPU resources.

The Scheduler instructs the node to put the two pods on a single GPU, bin packing a single GPU and leaving the other free for a workload that might want a full GPU or more than half a GPU. However, that would mean GPU#2 is idle while the two notebooks can only use up to half a GPU, even if they temporarily need more.

However, with Node Level Scheduler enabled, the local decision will be to spread those two pods on two GPUs and allow them to maximize both pods\u2019 performance and the GPUs\u2019 utilization by bursting out up to the full GPU memory and GPU compute resources.

The Cluster Scheduler still sees a node with a fully empty GPU. When a 3rd pod is scheduled, and it requires a full GPU (or more than 0.5 GPU), the scheduler will send it to that node, and the Node Level Scheduler will move one of the Interactive workloads to run with the other pod on GPU#1, as was the Cluster Scheduler\u2019s initial plan.

                                  This is an example of one scenario that shows how Node Level Scheduler locally optimizes and maximizes GPU utilization and pods\u2019 performance.

                                  "},{"location":"Researcher/scheduling/node-level-scheduler/#how-to-configure-node-level-scheduler","title":"How to configure Node Level Scheduler","text":"

                                  Node Level Scheduler can be enabled per Node-Pool, giving the Administrator the option to decide which Node-Pools will be used with this new feature.

To use the Node Level Scheduler, the Administrator should follow these steps:

                                  1. Enable Node Level Scheduler at the cluster level (per cluster), edit the runaiconfig file and set:

                                    spec: \n  global: \n      core: \n        nodeScheduler:\n          enabled: true\n

                                    The Administrator can also use this patch command to perform the change:

                                    kubectl patch -n runai runaiconfigs.run.ai/runai --type='merge' --patch '{\"spec\":{\"global\":{\"core\":{\"nodeScheduler\":{\"enabled\": true}}}}}'\n
2. To enable \u2018GPU resource optimization\u2019 on your tenant, go to your tenant\u2019s UI, press General settings, then open the Resources pane and toggle GPU Resource Optimization on.

                                  3. To enable \u2018Node Level Scheduler\u2019 on any of the Node Pools you want to use this feature, go to the tenant\u2019s UI \u2018Node Pools\u2019 tab (under \u2018Nodes\u2019), and either create a new Node-Pool or edit an existing Node-Pool. In the Node-Pool\u2019s form, under the \u2018Resource Utilization Optimization\u2019 tab, change the \u2018Number of workloads on each GPU\u2019 to any value other than \u2018Not Enforced\u2019 (i.e. 2, 3, 4, 5).

                                  The Node Level Scheduler is now ready to be used on that Node-Pool.

                                  "},{"location":"Researcher/scheduling/schedule-to-aws-groups/","title":"Scheduling workloads to AWS placement groups","text":"

                                  Run:ai supports AWS placement groups when building and submitting a job. AWS Placement Groups are used to maximize throughput and performance of distributed training workloads.

                                  To enable and configure this feature:

                                  1. Press Jobs | New job.
                                  2. In Scheduling and lifecycle enable the Topology aware scheduling.
                                  3. In Topology key, enter the label of the topology of the node.
                                  4. In Scheduling rule choose Required or Preferred from the drop down.

                                    • Required\u2014when enabled, all PODs must be scheduled to the same placement group.
• Preferred\u2014when enabled, this is a best effort to place as many PODs as possible on the same placement group.
                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/","title":"The Run:ai Scheduler","text":"

                                  Each time a user submits a workload via the Run:ai platform, through a 3rd party framework, or directly to Kubernetes APIs, the submitted workload goes to the selected Kubernetes cluster, and is handled by the Run:ai Scheduler.

The Scheduler\u2019s main role is to find the best-suited node or nodes for each submitted workload. The nodes must match the resources and other characteristics requested by the workload, while adhering to the quota and fairness principles of the Run:ai platform. A workload can be a single pod running on a single node, or a distributed workload using multiple pods, each running on a node (or part of a node). It is not rare to find large training workloads using 128 nodes or even more, or inference workloads using many pods (replicas) and nodes. There are numerous types of workloads; some are Kubernetes native and some are 3rd party extensions on top of Kubernetes native pods. The Run:ai Scheduler schedules any Kubernetes native workloads, Run:ai workloads, or any other type of 3rd party workload.

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#scheduler-basics","title":"Scheduler basics","text":"

                                  Set out below are some basic terms and information regarding the Run:ai Scheduler.

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#terminology","title":"Terminology","text":"

This section describes the terminology and building blocks of the Run:ai scheduler; it also explains some of the scheduling principles used by the Run:ai scheduler.

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#workloads-and-pod-groups","title":"Workloads and Pod-Groups","text":"

                                  The Run:ai scheduler attaches any newly created pod to a pod-group. A pod-group may contain one or more pods representing a workload. For example, if the submitted workload is a PyTorch distributed training with 32 workers, a single pod-group is created for the entire workload, and all pods are then attached to the pod-group with certain rules that may apply to the pod-group itself, for example, gang scheduling.

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#scheduling-queue","title":"Scheduling queue","text":"

A scheduling queue (or simply a queue) represents a scheduler primitive that manages the scheduling of workloads based on different parameters. A queue is created for each project/node pool pair and each department/node pool pair. The Run:ai scheduler supports hierarchical queueing; project queues are bound to department queues, per node pool. This allows an organization to manage quota, over-quota, and other characteristics for projects and their associated departments.

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#priority-and-preemption","title":"Priority and Preemption","text":"

                                  Run:ai supports scheduling workloads using different priorities and preemption policies. In the Run:ai scheduling system, higher priority workloads (pods) may preempt lower priority workloads (pods) within the same scheduling queue (project), according to their Preemption policy. Run:ai Scheduler implicitly assumes any PriorityClass >= 100 is non-preemptible and any PriorityClass < 100 is preemptible.

                                  Cross project and cross department workload preemptions are referred to as Resource reclaim and are based on fairness between queues rather than the priority of the workloads.

To make it easier for users to submit AI workloads, Run:ai preconfigured several Kubernetes PriorityClass objects. The Run:ai preset PriorityClass objects have their preemptionPolicy always set to PreemptLowerPriority, regardless of their actual Run:ai preemption policy within the Run:ai platform.
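For reference, a preset like the Train PriorityClass could be expressed in standard Kubernetes form roughly as follows. This is a sketch based on the table below, not an extract of the actual object installed by Run:ai; the object name and description are assumptions:

apiVersion: scheduling.k8s.io/v1\nkind: PriorityClass\nmetadata:\n  name: train              # assumption: actual object names may differ\nvalue: 50                   # priority value from the table below\npreemptionPolicy: PreemptLowerPriority\nglobalDefault: false\ndescription: \"Run:ai preset priority class for training workloads (sketch)\"\n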

| PriorityClass Name | PriorityClass | Run:ai preemption policy | K8S preemption policy |
| --- | --- | --- | --- |
| Inference | 125 | Non-preemptible | PreemptLowerPriority |
| Build | 100 | Non-preemptible | PreemptLowerPriority |
| Interactive-preemptible | 75 | Preemptible | PreemptLowerPriority |
| Train | 50 | Preemptible | PreemptLowerPriority |

"},{"location":"Researcher/scheduling/the-runai-scheduler/#quota","title":"Quota","text":"

                                  Each project and department includes a set of guaranteed resource quotas per node pool per resource type. For example, Project LLM-Train/Node Pool NV-H100 quota parameters specify the number of GPUs, CPUs(cores), and the amount of CPU memory that this project guarantees for that node pool.

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#over-quota","title":"Over-quota","text":"

Projects and departments can have a share in the unused resources of any node pool, beyond their quota of resources. We refer to these resources as over-quota resources. The admin configures the over-quota parameters per node pool for each project and department.

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#over-quota-weight","title":"Over quota weight","text":"

Projects can receive a share of the cluster/node pool unused resources when the over-quota weight setting is enabled. The portion each project receives depends on its over-quota weight value and the total over-quota weights of all other projects. The admin configures the over-quota weight parameters per node pool for each project and department.

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#fairshare-and-fairshare-balancing","title":"Fairshare and fairshare balancing","text":"

The Run:ai Scheduler calculates a numerical value per project (or department) for each node pool, representing the project's (or department's) sum of guaranteed resources plus its portion of the non-guaranteed resources in that node pool. This value is named fairshare.

The scheduler strives to provide each project (or department) the resources it deserves using two main parameters - deserved quota and deserved fairshare (i.e. quota + over-quota resources) - calculated per node pool. If one project's node pool queue is below its fairshare and another project's node pool queue is above its fairshare, the scheduler shifts resources between queues to balance fairness; this may result in the preemption of some over-quota preemptible workloads.

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#over-subscription","title":"Over-subscription","text":"

                                  Over-subscription is a scenario where the sum of all guaranteed resource quotas surpasses the physical resources of the cluster or node pool. In this case, there may be scenarios in which the scheduler cannot find matching nodes to all workload requests, even if those requests were within the resource quota of their associated projects.

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#gang-scheduling","title":"Gang scheduling","text":"

                                  Gang scheduling describes a scheduling principle where a workload composed of multiple pods is either fully scheduled (i.e. all pods are scheduled and running) or fully pending (i.e. all pods are not running). Gang scheduling refers to a single pod group.

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#fairness-fair-resource-distribution","title":"Fairness (fair resource distribution)","text":"

                                  Fairness is a major principle within the Run:ai scheduling system. In essence, it means that the Run:ai Scheduler always respects certain resource splitting rules (fairness) between projects and between departments.

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#preemption-of-lower-priority-workloads-within-a-project","title":"Preemption of lower priority workloads within a project","text":"

Workload priority is always respected within a project. This means higher priority workloads are scheduled before lower priority workloads. It also means that higher priority workloads may preempt lower priority workloads within the same project, provided the lower priority workloads are preemptible.

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#reclaim-of-resources-between-projects-and-departments","title":"Reclaim of resources between projects and departments","text":"

Reclaim is an inter-project (and inter-department) scheduling action that takes back resources from a project (or department) that has used them as over-quota, and returns them to a project (or department) that deserves those resources as part of its guaranteed quota, or to balance fairness between projects so that each gets its fairshare (i.e. its fair portion of the unused resources).

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#multi-level-quota-system","title":"Multi-Level quota system","text":"

                                  Each project has a set of guaranteed resource quotas (GPUs, CPUs, and CPU memory) per node pool. Projects can go over-quota and get a share of the unused resources (over-quota) in a node pool beyond their guaranteed quota in that node pool. The same applies to Departments. The Scheduler balances the amount of over quota between departments, and then between projects.

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#placement-strategy-bin-pack-and-spread","title":"Placement strategy - bin-pack and spread","text":"

The admin can set the scheduler's placement strategy per node pool, separately for GPU-based workloads and for CPU-only workloads.

Each type\u2019s strategy can be either bin-pack or spread (an illustrative sketch follows the lists below).

                                  GPU workloads:

                                  • Bin-pack means the Scheduler places as many workloads as possible in each GPU and each node to use fewer resources and maximize GPU and node vacancy.
                                  • Spread means the Scheduler spreads workloads across as many GPUs and nodes as possible to minimize the load and maximize the available resources per workload.
                                  • GPU workloads are workloads that request both GPU and CPU resources.

                                  CPU workloads:

                                  • Bin-pack means the scheduler places as many workloads as possible in each CPU and node to use fewer resources and maximize CPU and node vacancy.
                                  • Spread means the scheduler spreads workloads across as many CPUs and nodes as possible to minimize the load and maximize the available resources per workload.
• CPU workloads are workloads that request only CPU resources.
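
A minimal Python sketch of the difference between the two strategies when picking a node (the scoring below is purely illustrative and is not the Run:ai Scheduler's actual algorithm):

def pick_node(nodes, requested_gpus, strategy):\n    # nodes: list of dicts such as {\"name\": \"node-1\", \"free_gpus\": 3}\n    candidates = [n for n in nodes if n[\"free_gpus\"] >= requested_gpus]\n    if not candidates:\n        return None  # no single node can satisfy the request\n    if strategy == \"bin-pack\":\n        # prefer the node with the least free capacity, keeping other nodes vacant\n        return min(candidates, key=lambda n: n[\"free_gpus\"])\n    # \"spread\": prefer the node with the most free capacity to balance the load\n    return max(candidates, key=lambda n: n[\"free_gpus\"])\n
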
                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#scheduler-deep-dive","title":"Scheduler deep dive","text":""},{"location":"Researcher/scheduling/the-runai-scheduler/#allocation","title":"Allocation","text":"

When a user submits a workload, the workload controller creates one or more pods (for example, multiple pods for distributed training workloads or for deployment-based inference). When the scheduler gets a submit request with the first pod, it creates a pod group and allocates all the relevant building blocks of that workload. Subsequent pods of the same workload are attached to the same pod group.

                                  A workload, with its associated pod group, is queued in the appropriate queue. In every scheduling cycle, the Scheduler ranks the order of queues by calculating their precedence for scheduling.

                                  The next step is for the scheduler to find nodes for those pods, assign the pods to their nodes (bind operation), and bind other building blocks of the pods such as storage, ingress etc.

                                  If the pod-group has a gang scheduling rule attached to it, the scheduler either allocates and binds all pods together, or puts all of them into the pending state. It retries to schedule them all together in the next scheduling cycle.
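
A simplified Python sketch of this all-or-nothing principle (find_node_for is a hypothetical callable supplied by the caller; it is not a Run:ai API):

def gang_schedule(pods, find_node_for):\n    # Either every pod in the pod-group gets a node, or none of them are bound.\n    assignments = {}\n    for pod in pods:\n        node = find_node_for(pod)\n        if node is None:\n            return None  # leave the whole pod-group pending and retry next cycle\n        assignments[pod] = node\n    return assignments  # bind all pods together\n\n# Example with a toy lookup (None means no node fits that pod):\ntoy_nodes = {\"worker-0\": \"node-a\", \"worker-1\": None}\nprint(gang_schedule([\"worker-0\", \"worker-1\"], toy_nodes.get))  # -> None, the group stays pending\n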

The scheduler also updates the status of the pods and their associated pod group. Users are able to track the workload submission process in either the CLI or the Run:ai UI.

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#preemption","title":"Preemption","text":"

If the scheduler cannot find resources for a submitted workload (and all of its associated pods), and the workload deserves resources either because it is under its queue's quota or under its queue's fairshare, the scheduler first tries to reclaim resources from other queues; if this does not resolve the resource shortage, the scheduler tries to preempt lower priority preemptible workloads within the same queue.
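
The decision order can be sketched as follows in Python (the allocate, deserves, reclaim, and preempt arguments are hypothetical hooks standing in for the scheduler's internal logic):

def try_to_place(workload, allocate, deserves, reclaim, preempt):\n    if allocate(workload):\n        return \"scheduled\"            # free resources were found\n    if deserves(workload):             # under the queue's quota or under its fairshare\n        if reclaim(workload):          # take back over-quota resources from other queues\n            return \"scheduled\"\n        if preempt(workload):          # preempt lower priority preemptible workloads in the same queue\n            return \"scheduled\"\n    return \"pending\"                   # retried in the next scheduling cycle\n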

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#reclaim-preemption-between-projects-and-departments","title":"Reclaim preemption between projects (and departments)","text":"

Reclaim is an inter-project (and inter-department) resource balancing action that takes back resources from a project (or department) that has used them as over-quota, and returns them to a project (or department) that deserves those resources as part of its deserved quota, or to balance fairness between projects (or departments) so that no project (or department) exceeds its fairshare (its portion of the unused resources).

This mode of operation means that a lower priority workload submitted in one project (e.g. a training workload) can reclaim resources from a project that runs a higher priority workload (e.g. a preemptible workspace) if fairness balancing is required.

                                  Note

Only preemptible workloads can go over-quota, as they are subject to reclaim (cross-project preemption) of the over-quota resources they use. The amount of over-quota resources a project can gain depends on its over-quota weight, or on its quota if over-quota weight is disabled. A department's over-quota is always proportional to its quota.

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#priority-preemption-within-a-project","title":"Priority preemption within a project","text":"

Higher priority workloads may preempt lower priority preemptible workloads within the same project/node pool queue. For example, in a project that runs a training workload exceeding the project quota for a certain node pool, a newly submitted workspace within the same project/node pool may stop (preempt) the training workload if there are not enough over-quota resources in that node pool for the project to run both workloads (e.g. the workspace using in-quota resources and the training workload using over-quota resources).

                                  There is no priority notion between workloads of different projects.

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#quota-over-quota-and-fairshare","title":"Quota, over-quota, and fairshare","text":"

The Run:ai scheduler strives to ensure fairness between projects and between departments. This means each department and project always strives to receive its deserved quota, and unused resources are split between projects according to known rules (e.g. over-quota weights).

                                  If a project needs more resources even beyond its fairshare, and the scheduler finds unused resources that no other project needs, this project can consume resources even beyond its fairshare.

                                  Some scenarios can prevent the scheduler from fully providing the deserved quota and fairness promise, such as fragmentation or other scheduling constraints like affinities, taints etc.

                                  The example below illustrates a split of quota between different projects and departments, using several node pools:

                                  Legend:

                                  • OQW = Over-quota weight
                                  • OQ = Over-quota

                                  The example below illustrates how fairshare is calculated per project/node pool and per department/node pool for the above example:

                                  The Over quota (OQ) portion of each Project (per node pool) is calculated as:

                                  [(OQ-Weight) / (\u03a3 Projects OQ-Weights)] x (Unused Resource per node pool)\n

Fairshare (FS) is calculated as the sum of Quota + Over-quota.

                                  Let\u2019s see how Project 2 over quota and fairshare are calculated:

                                  For this example, we assume that out of the 40 available GPUs in node pool A, 20 GPUs are currently unused (unused means either not part of any project\u2019s quota, or part of a project\u2019s quota but not used by any workloads of that project).

                                  Project 2 over quota share:

                                  [(Project 2 OQ-Weight) / (\u03a3 all Projects OQ-Weights)] x (Unused Resource within node pool A)\n\n[(3) / (2 + 3 + 1)] x (20) = (3/6) x 20 = 10 GPUs\n

Fairshare = deserved quota + over-quota = 6 + 10 = 16 GPUs

                                  Similarly, fairshare is also calculated for CPU and CPU memory.
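
The same arithmetic can be written as a short Python sketch; the weights, unused GPUs, and quota below reproduce the Project 2 example above and are illustrative only:

def over_quota_share(project_weight, all_weights, unused_resources):\n    # [(OQ-Weight) / (sum of all projects' OQ-Weights)] x (unused resources in the node pool)\n    return (project_weight / sum(all_weights)) * unused_resources\n\ndef fairshare(quota, oq_share):\n    return quota + oq_share\n\n# Project 2 in node pool A: OQ-Weight 3 out of weights (2, 3, 1), 20 unused GPUs, 6 GPUs of quota\noq = over_quota_share(3, (2, 3, 1), 20)   # -> 10.0 GPUs\nprint(fairshare(6, oq))                   # -> 16.0 GPUs\n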

                                  The scheduler can grant a project more resources than its fairshare if the scheduler finds resources not required by other projects that may deserve those resources.

                                  One can also see in the above illustration that Project 3 has no guaranteed quota, but it still has a share of the excess resources in node pool A. Run:ai Scheduler ensures that Project 3 receives its part of the unused resources for over quota, even if this results in reclaiming resources from other projects and preempting preemptible workloads.

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#fairshare-balancing","title":"Fairshare balancing","text":"

                                  The Scheduler constantly re-calculates the fairshare of each project and department (per node pool, represented in the scheduler as queues), resulting in the re-balancing of resources between projects and between departments. This means that a preemptible workload that was granted resources to run in one scheduling cycle, can find itself preempted and go back to the pending state waiting for resources on the next cycle.

A queue, representing a scheduler-managed object for each project or department per node pool, can be in one of three states (a short classification sketch follows the list):

• In-quota: The queue's allocated resources <= the queue's deserved quota
• Over-quota (but below fairshare): The queue's deserved quota < the queue's allocated resources <= the queue's fairshare
• Over-fairshare (and over-quota): The queue's fairshare < the queue's allocated resources
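
A minimal Python sketch of this classification (the function name and the sample numbers are illustrative):

def queue_state(allocated, quota, fair_share):\n    # Classify a project/node-pool (or department/node-pool) queue by its allocation.\n    if allocated <= quota:\n        return \"in-quota\"\n    if allocated <= fair_share:\n        return \"over-quota\"        # above quota but still below fairshare\n    return \"over-fairshare\"        # above both quota and fairshare\n\nprint(queue_state(allocated=12, quota=6, fair_share=16))  # -> \"over-quota\"\n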

The scheduler's first priority is to ensure each queue (representing a project/node pool or department/node pool scheduler object) receives its deserved quota. The scheduler then tries to find and allocate more resources to queues that need resources beyond their deserved quota, up to their fairshare. Finally, the scheduler tries to allocate resources to queues that need even more resources, beyond their fairshare.

When re-balancing resources between queues of different projects and departments, the scheduler goes in the opposite direction, i.e. it first takes resources from over-fairshare queues, then from over-quota queues, and finally, in some scenarios, even from queues that are below their deserved quota.

                                  "},{"location":"Researcher/scheduling/the-runai-scheduler/#summary","title":"Summary","text":"

The scheduler's role is to bind any submitted pod to a node that satisfies the pod's requirements and constraints while adhering to the Run:ai quota and fairness system. In some scenarios, the scheduler finds a node for a pod (or nodes for a group of pods) immediately. In other scenarios, the scheduler has to preempt an already running workload to \u201cmake room\u201d, while sometimes a workload remains pending until resources are released by other workloads (e.g. it waits for other workloads to terminate), and only then is it scheduled and run.

Except in scenarios where the requested resources or other constraints cannot be met within the cluster, either because the resources physically do not exist (e.g. a node with 16 GPUs, or a GPU with 200GB of memory) or because a combination of constraints cannot be matched (e.g. a GPU with 80GB of memory together with a node with a specific label or storage type), the scheduler eventually finds matching nodes for every workload, although this process may take some time.

                                  The Run:ai scheduler adheres to Kubernetes standard rules, but it also adds a layer of fairness between queues, queue hierarchy, node pools, and many more features, making the scheduling and Quota management more sophisticated, granular, and robust. The combination of these scheduler capabilities results in higher efficiency, scale, and maximization of cluster utilization.

                                  "},{"location":"Researcher/tools/dev-jupyter/","title":"Use a Jupyter Notebook with a Run:ai Job","text":"

                                  See the Jupyter Notebook Quickstart here.

                                  "},{"location":"Researcher/tools/dev-pycharm/","title":"Use PyCharm with a Run:ai Job","text":"

                                  Once you launch a workload using Run:ai, you will want to connect to it. You can do so via command-line or via other tools such as a Jupyter Notebook

This document is about accessing the remote container created by Run:ai from JetBrains PyCharm.

                                  "},{"location":"Researcher/tools/dev-pycharm/#submit-a-workload","title":"Submit a Workload","text":"

You will need your image to run an SSH server (e.g. OpenSSH). For the purposes of this document, we have created an image named runai.jfrog.io/demo/pycharm-demo. The image runs both Python and SSH. Details on how to create the image are here. The image is configured to use the root user and password for SSH.

                                  Run the following command to connect to the container as if it were running locally:

                                  runai submit build-remote -i runai.jfrog.io/demo/pycharm-demo --interactive  \\\n        --service-type=portforward --port 2222:22\n

                                  The terminal will show the connection:

                                  The job 'build-remote' has been submitted successfully\nYou can run `runai describe job build-remote -p team-a` to check the job status\nINFO[0007] Waiting for job to start\nWaiting for job to start\nWaiting for job to start\nWaiting for job to start\nINFO[0045] Job started\nOpen access point(s) to service from localhost:2222\nForwarding from [::1]:2222 -> 22\n
                                  • The Job starts an sshd server on port 22.
                                  • The connection is redirected to the local machine (127.0.0.1) on port 2222
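
Once the port-forward is active, you can verify the SSH connection programmatically; below is a minimal sketch using the paramiko package (not required for PyCharm itself, shown only to illustrate that the container is reachable on localhost:2222 with the root/root credentials mentioned above):

import paramiko\n\nclient = paramiko.SSHClient()\nclient.set_missing_host_key_policy(paramiko.AutoAddPolicy())\nclient.connect(\"127.0.0.1\", port=2222, username=\"root\", password=\"root\")\n\n_, stdout, _ = client.exec_command(\"python --version\")\nprint(stdout.read().decode())\nclient.close()\n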

                                  Note

It is possible to connect to the container using a remote IP address. However, this is less convenient, as you will need to maintain port numbers manually and change them when accessing remotely using the development tool. As an example, run:

runai submit build-remote -i runai.jfrog.io/demo/pycharm-demo -g 1 --interactive --service-type=nodeport --port 30022:22\n

• The Job starts an sshd server on port 22.
• The Job redirects the external port 30022 to port 22 and uses a [Node Port](https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types) service type.
• Run: runai list workloads

• Next to the Job, under the \"Service URL\" column you will find the IP address and port. The port is 30022.
                                  "},{"location":"Researcher/tools/dev-pycharm/#pycharm","title":"PyCharm","text":"
                                  • Under PyCharm | Preferences go to: Project | Python Interpreter
                                  • Add a new SSH Interpreter.
                                  • As Host, use the IP address above. Change the port to the above and use the Username root
                                  • You will be prompted for a password. Enter root
                                  • Apply settings and run the code via this interpreter. You will see your project uploaded to the container and running remotely.
                                  "},{"location":"Researcher/tools/dev-tensorboard/","title":"Connecting to TensorBoard","text":"

                                  Once you launch a Deep Learning workload using Run:ai, you may want to view its progress. A popular tool for viewing progress is TensorBoard.

The document below explains how to use TensorBoard to view the progress of a Run:ai Job.

                                  "},{"location":"Researcher/tools/dev-tensorboard/#emitting-tensorboard-logs","title":"Emitting TensorBoard Logs","text":"

                                  When you submit a workload, your workload must save TensorBoard logs which can later be viewed. Follow this document on how to do this. You can also view the Run:ai sample code here.

                                  The code shows:

                                  • A reference to a log directory:
                                  log_dir = \"logs/fit/\" + datetime.datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n
                                  • A registered Keras callback for TensorBoard:
                                  tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)\n\nmodel.fit(x_train, y_train,\n        ....\n        callbacks=[..., tensorboard_callback])\n
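
For reference, a minimal self-contained version of the above, assuming TensorFlow 2.x and the Keras MNIST dataset (the model itself is illustrative):

import datetime\nimport tensorflow as tf\nfrom tensorflow.keras.callbacks import TensorBoard\n\n(x_train, y_train), _ = tf.keras.datasets.mnist.load_data()\nx_train = x_train / 255.0\n\nmodel = tf.keras.Sequential([\n    tf.keras.layers.Flatten(input_shape=(28, 28)),\n    tf.keras.layers.Dense(128, activation=\"relu\"),\n    tf.keras.layers.Dense(10, activation=\"softmax\"),\n])\nmodel.compile(optimizer=\"adam\", loss=\"sparse_categorical_crossentropy\", metrics=[\"accuracy\"])\n\n# Logs are written under logs/fit/<timestamp>, relative to the working directory (e.g. /mydir)\nlog_dir = \"logs/fit/\" + datetime.datetime.now().strftime(\"%Y%m%d-%H%M%S\")\ntensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)\n\nmodel.fit(x_train, y_train, epochs=2, callbacks=[tensorboard_callback])\n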

                                  The logs directory must be saved on a Network File Server such that it can be accessed by the TensorBoard Job. For example, by running the Job as follows:

                                  runai submit train-with-logs -i tensorflow/tensorflow:1.14.0-gpu-py3 \\\n  -v /mnt/nfs_share/john:/mydir -g 1  --working-dir /mydir --command -- ./startup.sh\n

                                  Note the volume flag (-v) and working directory flag (--working-dir). The logs directory will be created on /mnt/nfs_share/john/logs/fit.

                                  "},{"location":"Researcher/tools/dev-tensorboard/#submit-a-tensorboard-workload","title":"Submit a TensorBoard Workload","text":"

                                  There are two ways to submit a TensorBoard Workload: via the Command-line interface or the user interface

User Interface | CLI V1

                                  Browse to the provided Run:ai user interface and log in with your credentials.

                                  • In the Run:ai UI select Workloads
                                  • Select New Workload and then Workspace
                                  • You should already have Cluster, Project and a start from scratch Template selected. Enter tb as the name and press CONTINUE.
                                  • Under Environment, select jupyter-tensorboard.
                                  • Under Compute Resource, select one-gpu.
                                  • Select CREATE WORKSPACE.
                                  • In the workload list, add a column of Connections
                                  • When the workspace is running, you will see two connections:
1. Jupyter
                                    2. TensorBoard

                                  Run the following:

                                  runai submit tb -i tensorflow/tensorflow:latest --interactive --service-type=portforward --port 8888:8888  --working-dir /mydir  -v /mnt/nfs_share/john:/mydir  -- tensorboard --logdir logs/fit --port 8888 --host 0.0.0.0\n

                                  The terminal will show the following:

                                  The job 'tb' has been submitted successfully\nYou can run `runai describe job tb -p team-a` to check the job status\nINFO[0006] Waiting for job to start\nWaiting for job to start\nINFO[0014] Job started\nOpen access point(s) to service from localhost:8888\nForwarding from 127.0.0.1:8888 -> 8888\nForwarding from [::1]:8888 -> 8888\n

                                  Browse to http://localhost:8888/ to view TensorBoard.

                                  Note

                                  A single TensorBoard Job can be used to view multiple deep learning Jobs, provided it has access to the logs directory for these Jobs.

                                  "},{"location":"Researcher/tools/dev-vscode/","title":"Use Visual Studio Code with a Run:ai Job","text":"

                                  Once you launch a workload using Run:ai, you will want to connect to it. You can do so via command line or via other tools such as a Jupyter Notebook

                                  Important

                                  This document is about accessing the remote container created by Run:ai, from the installed version of Visual Studio Code. If you want to use Visual Studio Code for web, please see Visual Studio Code Web Quickstart.

                                  "},{"location":"Researcher/tools/dev-vscode/#submit-a-workload","title":"Submit a Workload","text":"

You will need your image to run an SSH server (e.g. OpenSSH). For this document, we have created an image named runai.jfrog.io/demo/pycharm-demo. The image runs both Python and SSH. Details on how to create the image are here. The image is configured to use the root user and password for SSH.

                                  Run the following command to connect to the container as if it were running locally:

                                  runai submit build-remote -i runai.jfrog.io/demo/pycharm-demo --interactive  \\\n        --service-type=portforward --port 2222:22\n

                                  The terminal will show the connection:

                                  The job 'build-remote' has been submitted successfully\nYou can run `runai describe job build-remote -p team-a` to check the job status\nINFO[0007] Waiting for job to start\nWaiting for job to start\nWaiting for job to start\nWaiting for job to start\nINFO[0045] Job started\nOpen access point(s) to service from localhost:2222\nForwarding from [::1]:2222 -> 22\n
                                  • The Job starts an sshd server on port 22.
                                  • The connection is redirected to the local machine (127.0.0.1) on port 2222

                                  Note

It is possible to connect to the container using a remote IP address. However, this is less convenient, as you will need to maintain port numbers manually and change them when accessing remotely using the development tool. As an example, run:

                                  runai submit build-remote -i runai.jfrog.io/demo/pycharm-demo -g 1 --interactive --service-type=nodeport --port 30022:22\n
                                  • The Job starts an sshd server on port 22.
                                  • The Job redirects the external port 30022 to port 22 and uses a Node Port service type.
                                  • Run: runai list jobs

• Next to the Job, under the \"Service URL\" column you will find the IP address and port. The port is 30022.

                                  "},{"location":"Researcher/tools/dev-vscode/#visual-studio-code","title":"Visual Studio Code","text":"
• In Visual Studio Code, install the Remote - SSH extension.
• Create an SSH entry for the service by editing the .ssh/config file, or use the command Remote-SSH: Connect to Host... from the Command Palette. Enter the IP address and port from above (e.g. ssh root@35.34.212.12 -p 30022 or ssh root@127.0.0.1 -p 2222). The user and password are root.
                                  • Using VS Code, install the Python extension on the remote machine
                                  • Write your first Python code and run it remotely.
                                  "},{"location":"Researcher/tools/dev-x11forward-pycharm/","title":"Use PyCharm with X11 Forwarding and Run:ai","text":"

X11 is a windowing system for Unix operating systems. X11 forwarding allows executing a program remotely through an SSH connection, meaning the executable itself is hosted on a different machine than the one where the graphical interface is displayed. The graphical windows are forwarded to your local machine through the SSH connection.

                                  This section is about setting up X11 forwarding from a Run:ai-based container to a PyCharm IDE on a remote machine.

                                  "},{"location":"Researcher/tools/dev-x11forward-pycharm/#submit-a-workload","title":"Submit a Workload","text":"

You will need your image to run an SSH server (e.g. OpenSSH). For the purposes of this document, we have created an image named runai.jfrog.io/demo/quickstart-x-forwarding. The image runs:

                                  • Python
                                  • SSH Daemon configured for X11Forwarding
                                  • OpenCV python library for image handling

                                  Details on how to create the image are here. The image is configured to use the root user and password for SSH.

                                  Run the following command to connect to the container as if it were running locally:

                                  runai submit xforward-remote -i runai.jfrog.io/demo/quickstart-x-forwarding --interactive  \\\n        --service-type=portforward --port 2222:22\n

                                  The terminal will show the connection:

                                  The job 'xforward-remote' has been submitted successfully\nYou can run `runai describe job xforward-remote -p team-a` to check the job status\nINFO[0007] Waiting for job to start\nWaiting for job to start\nWaiting for job to start\nWaiting for job to start\nINFO[0045] Job started\nOpen access point(s) to service from localhost:2222\nForwarding from [::1]:2222 -> 22\n
                                  • The Job starts an sshd server on port 22.
                                  • The connection is redirected to the local machine (127.0.0.1) on port 2222
                                  "},{"location":"Researcher/tools/dev-x11forward-pycharm/#setup-the-x11-forwarding-tunnel","title":"Setup the X11 Forwarding Tunnel","text":"

                                  Connect to the new Job by running:

                                  ssh -X root@127.0.0.1 -p 2222\n

                                  Note the -X flag.

                                  Run:

                                  echo $DISPLAY\n
                                  Copy the value. It will be used as a PyCharm environment variable.

                                  Important

                                  The ssh terminal should remain active throughout the session.

                                  "},{"location":"Researcher/tools/dev-x11forward-pycharm/#pycharm","title":"PyCharm","text":"
                                  • Under PyCharm | Preferences go to: Project | Python Interpreter
                                  • Add a new SSH Interpreter.
                                  • As Host, use localhost. Change the port to the above (2222) and use the Username root.
                                  • You will be prompted for a password. Enter root.
                                  • Make sure to set the correct path of the Python binary. In our case it's /usr/local/bin/python.
                                  • Apply your settings.

                                  • Under PyCharm configuration set the following environment variables:

1. DISPLAY - set it to the environment variable value you copied before
                                    2. HOME - In our case it's /root. This is required for the X11 authentication to work.

                                  Run your code. You can use our sample code here.

                                  "},{"location":"Researcher/workloads/assets/compute/","title":"Compute Resources","text":"

                                  This article explains what compute resources are and how to create and use them.

                                  Compute resources are one type of workload asset. A compute resource is a template that simplifies how workloads are submitted and can be used by AI practitioners when they submit their workloads.

                                  A compute resource asset is a preconfigured building block that encapsulates all the specifications of compute requirements for the workload including:

                                  • GPU devices and GPU memory
                                  • CPU memory and CPU compute
                                  "},{"location":"Researcher/workloads/assets/compute/#compute-resource-table","title":"Compute resource table","text":"

                                  The Compute resource table can be found under Workload manager in the Run:ai UI.

                                  The Compute resource table provides a list of all the compute resources defined in the platform and allows you to manage them.

                                  The Compute resource table consists of the following columns:

Column | Description
Compute resource | The name of the compute resource
Description | A description of the essence of the compute resource
GPU devices request per pod | The number of requested physical devices per pod of the workload that uses this compute resource
GPU memory request per device | The amount of GPU memory per requested device that is granted to each pod of the workload that uses this compute resource
CPU memory request | The minimum amount of CPU memory per pod of the workload that uses this compute resource
CPU memory limit | The maximum amount of CPU memory per pod of the workload that uses this compute resource
CPU compute request | The minimum number of CPU cores per pod of the workload that uses this compute resource
CPU compute limit | The maximum number of CPU cores per pod of the workload that uses this compute resource
Scope | The scope of this compute resource within the organizational tree. Click the name of the scope to view the organizational tree diagram
Workload(s) | The list of workloads associated with the compute resource
Template(s) | The list of workload templates that use this compute resource
Created by | The name of the user who created the compute resource
Creation time | The timestamp of when the compute resource was created
Last updated | The timestamp of when the compute resource was last updated
Cluster | The cluster that the compute resource is associated with

"},{"location":"Researcher/workloads/assets/compute/#workloads-associated-with-the-compute-resource","title":"Workloads associated with the compute resource","text":"

                                  Click one of the values in the Workload(s) column to view the list of workloads and their parameters.

Column | Description
Workload | The workload that uses the compute resource
Type | Workspace/Training/Inference
Status | Represents the workload lifecycle. See the full list of workload status.

"},{"location":"Researcher/workloads/assets/compute/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
                                  "},{"location":"Researcher/workloads/assets/compute/#adding-new-compute-resource","title":"Adding new compute resource","text":"

                                  To add a new compute resource:

                                  1. Go to the Compute resource table
                                  2. Click +NEW COMPUTE RESOURCE
                                  3. Select under which cluster to create the compute resource
                                  4. Select a scope
                                  5. Enter a name for the compute resource. The name must be unique.
                                  6. Optional: Provide a description of the essence of the compute resource
                                  7. Set the resource types needed within a single node (the Run:ai scheduler tries to match a single node that complies with the compute resource for each of the workload\u2019s pods)

                                    • GPU

                                      • GPU devices per pod The number of devices (physical GPUs) per pod (for example, if you requested 3 devices per pod and the running workload using this compute resource consists of 3 pods, there are 9 physical GPU devices used in total)

                                      Note

• When setting it to zero, the workload using this compute resource neither requests nor uses GPU resources while running
• You can set any number of GPU devices, and specify the memory requirement per device either as a portion of a device (1..100) or as a memory size value in GB or MB units
                                      • GPU memory per device
                                        • Select the memory request format
                                          • % (of device) - Fraction of a GPU device\u2019s memory
                                          • MB (memory size) - An explicit GPU memory unit
                                          • GB (memory size) - An explicit GPU memory unit
                                        • Set the memory Request - The minimum amount of GPU memory that is provisioned per device. This means that any pod of a running workload that uses this compute resource, receives this amount of GPU memory for each device(s) the pod utilizes
                                        • Optional: Set the memory Limit - The maximum amount of GPU memory that is provisioned per device. This means that any pod of a running workload that uses this compute resource, receives at most this amount of GPU memory for each device(s) the pod utilizes. To set a Limit, first enable the limit toggle. The limit value must be equal to or higher than the request.

                                      Note

                                      • GPU memory limit is disabled by default. If you cannot see the Limit toggle in the compute resource form, then it must be enabled by your Administrator, under General settings \u2192 Resources \u2192 GPU resource optimization
                                      • When a Limit is set and is bigger than the Request, the scheduler allows each pod to reach the maximum amount of GPU memory in an opportunistic manner (only upon availability).
• If the GPU memory Limit is bigger than the Request, the pod is prone to be killed by the Run:ai toolkit (out of memory signal). The greater the difference between the GPU memory used and the request, the higher the risk of being killed
                                      • If GPU resource optimization is turned off, the minimum and maximum are in fact equal
                                    • CPU

                                      • CPU compute per pod
                                        • Select the units for the CPU compute (Cores / Millicores)
                                        • Set the CPU compute Request - the minimum amount of CPU compute that is provisioned per pod. This means that any pod of a running workload that uses this compute resource, receives this amount of CPU compute for each pod.
                                        • Optional: Set the CPU compute Limit - The maximum amount of CPU compute that is provisioned per pod. This means that any pod of a running workload that uses this compute resource, receives at most this amount of CPU compute. To set a Limit, first enable the limit toggle. The limit value must be equal to or higher than the request. By default, the limit is set to \u201cUnlimited\u201d - which means that the pod may consume all the node's free CPU compute resources.
                                      • CPU memory per pod
                                        • Select the units for the CPU memory (MB / GB)
                                        • Set the CPU memory Request - The minimum amount of CPU memory that is provisioned per pod. This means that any pod of a running workload that uses this compute resource, receives this amount of CPU memory for each pod.
                                        • Optional: Set the CPU memory Limit - The maximum amount of CPU memory that is provisioned per pod. This means that any pod of a running workload that uses this compute resource, receives at most this amount of CPU memory. To set a Limit, first enable the limit toggle. The limit value must be equal to or higher than the request. By default, the limit is set to \u201cUnlimited\u201d - Meaning that the pod may consume all the node's free CPU memory resources.

                                      Note

If the CPU memory Limit is bigger than the Request, the pod is prone to be killed by the operating system (out of memory signal). The greater the difference between the CPU memory used and the request, the higher the risk of being killed.

                                  8. Optional: More settings

                                    • Increase shared memory size When enabled, the shared memory size available to the pod is increased from the default 64MB to the node's total available memory or the CPU memory limit, if set above.
                                    • Set extended resource(s) Click +EXTENDED RESOURCES to add resource/quantity pairs. For more information on how to set extended resources, see the Extended resources and Quantity guides
                                  9. Click CREATE COMPUTE RESOURCE

                                    Note

                                    It is also possible to add compute resources directly when creating a specific Workspace, training or inference workload.

                                  "},{"location":"Researcher/workloads/assets/compute/#editing-a-compute-resource","title":"Editing a compute resource","text":"

                                  To edit a compute resource:

                                  1. Select the compute resource you want to edit
                                  2. Click Edit
                                  3. Click SAVE COMPUTE RESOURCE

                                  Note

Workloads that are already bound to this asset are not affected.

                                  "},{"location":"Researcher/workloads/assets/compute/#copying-a-compute-resource","title":"Copying a compute resource","text":"

                                  To make a copy of an existing compute resource:

                                  1. Select the compute resource you want to copy
                                  2. Click MAKE A COPY
3. Enter a name for the compute resource. The name must be unique.
4. Update the compute resource as needed
                                  5. Click CREATE COMPUTE RESOURCE
                                  "},{"location":"Researcher/workloads/assets/compute/#deleting-a-compute-resource","title":"Deleting a compute resource","text":"
                                  1. Select the compute resource you want to delete
                                  2. Click DELETE
                                  3. On the dialog, click DELETE to confirm

                                  Note

Workloads that are already bound to this asset are not affected.

                                  "},{"location":"Researcher/workloads/assets/compute/#using-api","title":"Using API","text":"

                                  Go to the Compute resources API reference to view the available actions

                                  "},{"location":"Researcher/workloads/assets/credentials/","title":"Credentials","text":"

                                  This article explains what credentials are and how to create and use them.

Credentials are a workload asset that simplifies the complexities of Kubernetes secrets. They store and mask sensitive access information, such as passwords, tokens, and access keys, which are necessary for gaining access to various resources.

                                  Credentials are crucial for the security of AI workloads and the resources they require, as they restrict access to authorized users, verify identities, and ensure secure interactions. By enforcing the protection of sensitive data, credentials help organizations comply with industry regulations, fostering a secure environment overall.

                                  Essentially, credentials enable AI practitioners to access relevant protected resources, such as private data sources and Docker images, thereby streamlining the workload submission process.

                                  "},{"location":"Researcher/workloads/assets/credentials/#credentials-table","title":"Credentials table","text":"

                                  The Credentials table can be found under Workload manager in the Run:ai User interface.

                                  The Credentials table provides a list of all the credentials defined in the platform and allows you to manage them.

                                  The Credentials table comprises the following columns:

Column | Description
Credentials | The name of the credentials
Description | A description of the credentials
Type | The type of credentials, e.g., Docker registry
Status | The different lifecycle phases and representation of the credentials' condition
Scope | The scope of these credentials within the organizational tree. Click the name of the scope to view the organizational tree diagram
Kubernetes name | The unique Kubernetes name of the credentials as it appears in the cluster
Environment(s) | The environment(s) that are associated with the credentials
Data source(s) | The private data source(s) that are accessed using the credentials
Created by | The user who created the credentials
Creation time | The timestamp of when the credentials were created
Cluster | The cluster with which the credentials are associated

"},{"location":"Researcher/workloads/assets/credentials/#credentials-status","title":"Credentials status","text":"

                                  The following table describes the credentials\u2019 condition and whether they were created successfully for the selected scope.

Status | Description
No issues found | No issues were found while creating the credentials (this status may change while propagating the credentials to the selected scope)
Issues found | Issues found while propagating the credentials
Issues found | Failed to access the cluster
Creating... | Credentials are being created
Deleting... | Credentials are being deleted
No status | When the credentials' scope is an account, or the current version of the cluster is not up to date, the status cannot be displayed

"},{"location":"Researcher/workloads/assets/credentials/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
                                  • Download table - Click MORE and then click \u2018Download as CSV\u2019. Export to CSV is limited to 20,000 rows.
                                  • Refresh - Click REFRESH to update the table with the latest data
                                  "},{"location":"Researcher/workloads/assets/credentials/#adding-new-credentials","title":"Adding new credentials","text":"

                                  Creating credentials is limited to specific roles.

                                  To add a new credential:

                                  1. Go to the Credentials table:
                                  2. Click +NEW CREDENTIALS
3. Select the credential type from the list. Follow the step-by-step guide for each credential type:
                                  "},{"location":"Researcher/workloads/assets/credentials/#docker-registry","title":"Docker registry","text":"

                                  These credentials allow users to authenticate and pull images from a Docker registry, enabling access to containerized applications and services.

After the credentials are created, they are used automatically when pulling images.

                                  1. Select a scope.
                                  2. Enter a name for the credential. The name must be unique.
                                  3. Optional: Provide a description of the credentials
                                  4. Set how the credential is created
                                    • Existing secret (in the cluster) This option applies when the purpose is to create credentials based on an existing secret
                                      • Select a secret from the list (The list is empty if no secrets were created in advance)
                                    • New secret (recommended) A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
                                      • Enter the username, password, and Docker registry URL
                                  5. Click CREATE CREDENTIALS

                                  After the credentials are created, check their status to monitor their proper creation across the selected scope.

                                  "},{"location":"Researcher/workloads/assets/credentials/#access-key","title":"Access key","text":"

                                  These credentials are unique identifiers used to authenticate and authorize access to cloud services or APIs, ensuring secure communication between applications. They typically consist of two parts:

                                  • An access key ID
                                  • A secret access key

                                  The purpose of this credential type is to allow access to restricted data.

                                  1. Select a scope.
                                  2. Enter a name for the credential. The name must be unique.
                                  3. Optional: Provide a description of the credential
                                  4. Set how the credential is created
                                    • Existing secret (in the cluster) This option applies when the purpose is to create credentials based on an existing secret
                                      • Select a secret from the list (The list is empty if no secrets were created in advance)
                                    • New secret (recommended) A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
                                      • Enter the Access key and Access secret
                                  5. Click CREATE CREDENTIALS

                                  After the credentials are created, check their status to monitor their proper creation across the selected scope.

                                  "},{"location":"Researcher/workloads/assets/credentials/#username-password","title":"Username & password","text":"

                                  These credentials require a username and corresponding password to access various resources, ensuring that only authorized users can log in.

                                  The purpose of this credential type is to allow access to restricted data.

                                  1. Select a scope
                                  2. Enter a name for the credential. The name must be unique.
                                  3. Optional: Provide a description of the credentials
                                  4. Set how the credential is created
                                    • Existing secret (in the cluster) This option applies when the purpose is to create credentials based on an existing secret
                                      • Select a secret from the list (The list is empty if no secrets were created in advance)
                                    • New secret (recommended) A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
                                      • Enter the username and password
                                  5. Click CREATE CREDENTIALS

                                  After the credentials are created, check their status to monitor their proper creation across the selected scope.

                                  "},{"location":"Researcher/workloads/assets/credentials/#generic-secret","title":"Generic secret","text":"

                                  These credentials are a flexible option that consists of multiple keys & values and can store various sensitive information, such as API keys or configuration data, to be used securely within applications.

                                  The purpose of this credential type is to allow access to restricted data.

                                  1. Select a scope
                                  2. Enter a name for the credential. The name must be unique.
                                  3. Optional: Provide a description of the credentials
                                  4. Set how the credential is created
                                    • Existing secret (in the cluster) This option applies when the purpose is to create credentials based on an existing secret
                                      • Select a secret from the list (The list is empty if no secrets were created in advance)
                                    • New secret (recommended) A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
                                      • Click +KEY & VALUE - to add key/value pairs to store in the new secret
                                  5. Click CREATE CREDENTIALS
                                  "},{"location":"Researcher/workloads/assets/credentials/#editing-credentials","title":"Editing credentials","text":"

                                  To rename a credential:

                                  1. Select the credential from the table
                                  2. Click Rename to edit its name and description
                                  "},{"location":"Researcher/workloads/assets/credentials/#deleting-credentials","title":"Deleting credentials","text":"

                                  To delete a credential:

                                  1. Select the credential you want to delete
                                  2. Click DELETE
                                  3. In the dialog, click DELETE to confirm

                                  Note

Credentials cannot be deleted if they are being used by a workload or a template.

                                  "},{"location":"Researcher/workloads/assets/credentials/#using-credentials","title":"Using credentials","text":"

                                  You can use credentials (secrets) in various ways within the system

                                  "},{"location":"Researcher/workloads/assets/credentials/#access-private-data-sources","title":"Access private data sources","text":"

                                  To access private data sources, attach credentials to data sources of the following types: Git, S3 Bucket

                                  "},{"location":"Researcher/workloads/assets/credentials/#use-directly-within-the-container","title":"Use directly within the container","text":"

To use the secret directly from within the container, choose one of the following options. A minimal verification from inside the container is sketched after the list.

1. Get the secret mounted to the file system by using the Generic secret data source
2. Get the secret as an environment variable injected into the container. There are two equivalent ways to inject the environment variable:

  a. By adding it to the Environment asset
  b. By adding it ad-hoc as part of the workload
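The following is a minimal sketch of verifying both options from inside the running container. The variable name MY_API_KEY and the mount path /etc/secrets are hypothetical and depend on how you configured the environment variable or the Generic secret data source:

```shell
# Run these inside the workload's container (for example, from a terminal tool exposed by the environment).
# MY_API_KEY and /etc/secrets are hypothetical names used for illustration only.
printenv MY_API_KEY        # the secret injected as an environment variable
ls /etc/secrets            # the secret mounted to the file system via a Generic secret data source
cat /etc/secrets/api-key   # each key of the secret appears as a file under the mount path
```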

                                  "},{"location":"Researcher/workloads/assets/credentials/#creating-secrets-in-advance","title":"Creating secrets in advance","text":"

                                  Add secrets in advance to be used when creating credentials via the Run:ai UI.

Follow the steps below for each required scope. A sample kubectl sequence for the cluster scope is shown after the steps.

Cluster scope:

1. Create the secret in the Run:ai namespace (runai)
2. To authorize Run:ai to use the secret, label it: run.ai/cluster-wide: \"true\"
3. Label the secret with the correct credential type:
  1. Docker registry - run.ai/resource: \"docker-registry\"
  2. Access key - run.ai/resource: \"access-key\"
  3. Username and password - run.ai/resource: \"password\"
  4. Generic secret - run.ai/resource: \"generic\"

Department scope:

1. Create the secret in the Run:ai namespace (runai)
2. To authorize Run:ai to use the secret, label it: run.ai/department: \"<department id>\"
3. Label the secret with the correct credential type:
  1. Docker registry - run.ai/resource: \"docker-registry\"
  2. Access key - run.ai/resource: \"access-key\"
  3. Username and password - run.ai/resource: \"password\"
  4. Generic secret - run.ai/resource: \"generic\"

Project scope:

1. Create the secret in the project\u2019s namespace
2. Label the secret with the correct credential type:
  1. Docker registry - run.ai/resource: \"docker-registry\"
  2. Access key - run.ai/resource: \"access-key\"
  3. Username and password - run.ai/resource: \"password\"
  4. Generic secret - run.ai/resource: \"generic\"

                                  The secret is now displayed for that scope in the list of existing secrets.
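As an illustration, the cluster scope flow above could look as follows with kubectl. The secret name and values are placeholders; only the namespace and labels come from the steps above:

```shell
# Create a username/password secret in the Run:ai namespace (name and values are placeholders)
kubectl create secret generic demo-creds \
  --namespace runai \
  --from-literal=username=jane \
  --from-literal=password='s3cr3t'

# Authorize Run:ai to use the secret cluster-wide and mark its credential type
kubectl label secret demo-creds --namespace runai \
  run.ai/cluster-wide="true" \
  run.ai/resource="password"

# For the department scope, replace the first label with run.ai/department="<department id>";
# for the project scope, create the secret in the project's namespace and set only run.ai/resource.
```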

                                  "},{"location":"Researcher/workloads/assets/credentials/#using-api","title":"Using API","text":"

                                  To view the available actions, go to the Credentials API reference

                                  "},{"location":"Researcher/workloads/assets/data-volumes/","title":"Data Volumes","text":"

                                  Data volumes offer a powerful solution for storing, managing, and sharing AI training data within the Run:ai platform. They promote collaboration, simplify data access control, and streamline the AI development lifecycle.

                                  Data volumes are snapshots of datasets stored in Kubernetes Persistent Volume Claims (PVCs). They act as a central repository for training data.

                                  "},{"location":"Researcher/workloads/assets/data-volumes/#why-use-a-data-volume","title":"Why use a data volume?","text":"
                                  1. Sharing with multiple scopes Unlike other Run:ai data sources, data volumes can be shared across projects, departments, or clusters, encouraging data reuse and collaboration within the organization.
                                  2. Storage saving A single copy of the data can be used across multiple scopes
                                  "},{"location":"Researcher/workloads/assets/data-volumes/#typical-use-cases","title":"Typical use cases","text":"
                                  1. Sharing large data sets In large organizations, the data is often stored in a remote location, which can be a barrier for large model training. Even if the data is transferred into the cluster, sharing it easily with multiple users is still challenging. Data volumes can help share the data seamlessly, with maximum security and control.
                                  2. Sharing data with colleagues When sharing training results, generated data sets, or other artifacts with team members is needed, data volumes can help make the data available easily.
                                  "},{"location":"Researcher/workloads/assets/data-volumes/#prerequisites","title":"Prerequisites","text":"

                                  To create a data volume, there must be a project with a PVC in its namespace.

                                  Working with data volumes is currently available using the API. To view the available actions, go to the Data volumes API reference.
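As a quick sanity check for the prerequisite, you can list the PVCs in the project's namespace. The namespace name below assumes the default Run:ai convention of runai-<project name> and is otherwise a placeholder:

```shell
# A data volume is based on a PVC that already exists in the project's namespace.
# "runai-my-project" is a placeholder; adjust it to your project's actual namespace.
kubectl get pvc --namespace runai-my-project
```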

                                  "},{"location":"Researcher/workloads/assets/data-volumes/#adding-a-new-data-volume","title":"Adding a new data volume","text":"

                                  Data volume creation is limited to specific roles

                                  "},{"location":"Researcher/workloads/assets/data-volumes/#adding-scopes-for-a-data-volume","title":"Adding scopes for a data volume","text":"

                                  Data volume sharing (adding scopes) is limited to specific roles

                                  Once created, the data volume is available to its originating project (see the prerequisites above).

                                  Data volumes can be shared with additional scopes in the organization.

                                  "},{"location":"Researcher/workloads/assets/data-volumes/#who-can-use-a-data-volume","title":"Who can use a data volume?","text":"

                                  Data volumes are used when submitting workloads. Any user, application or SSO group with a role that has permissions to create workloads can also use data volumes.

                                  Researchers can list available data volumes within their permitted scopes for easy selection.

                                  "},{"location":"Researcher/workloads/assets/datasources/","title":"Data Sources","text":"

                                  This article explains what data sources are and how to create and use them.

                                  Data sources are a type of workload asset and represent a location where data is actually stored. They may represent a remote data location, such as NFS, Git, or S3, or a Kubernetes local resource, such as PVC, ConfigMap, HostPath, or Secret.

                                  This configuration simplifies the mapping of the data into the workload\u2019s file system and handles the mounting process during workload creation for reading and writing. These data sources are reusable and can be easily integrated and used by AI practitioners while submitting workloads across various scopes.

                                  "},{"location":"Researcher/workloads/assets/datasources/#data-sources-table","title":"Data sources table","text":"

                                  The data sources table can be found under Workload manager in the Run:ai platform.

                                  The data sources table provides a list of all the data sources defined in the platform and allows you to manage them.

                                  The data sources table comprises the following columns:

• Data source - The name of the data source
• Description - A description of the data source
• Type - The type of data source connected \u2013 e.g., S3 bucket, PVC, or others
• Status - The different lifecycle phases and representation of the data source condition
• Scope - The scope of the data source within the organizational tree. Click the scope name to view the organizational tree diagram
• Kubernetes name - The data source\u2019s unique Kubernetes name, as it appears in the cluster
• Workload(s) - The list of existing workloads that use the data source
• Template(s) - The list of workload templates that use the data source
• Created by - The user who created the data source
• Creation time - The timestamp for when the data source was created
• Cluster - The cluster that the data source is associated with
"},{"location":"Researcher/workloads/assets/datasources/#data-sources-status","title":"Data sources status","text":"

                                  The following table describes the data sources' condition and whether they were created successfully for the selected scope.

• No issues found - No issues were found while creating the data source
• Issues found - Issues were found while propagating the data source credentials
• Issues found - The data source couldn\u2019t be created at the cluster
• Creating\u2026 - The data source is being created
• No status / \u201c-\u201d - When the data source\u2019s scope is an account, the current version of the cluster is not up to date, or the asset is not a cluster-syncing entity, the status can\u2019t be displayed
"},{"location":"Researcher/workloads/assets/datasources/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
                                  • Download table - Click MORE and then click \u2018Download as CSV\u2019
                                  • Refresh - Click REFRESH to update the table with the latest data
                                  "},{"location":"Researcher/workloads/assets/datasources/#adding-a-new-data-source","title":"Adding a new data source","text":"

                                  To create a new data source:

                                  1. Click +NEW DATA SOURCE
                                  2. Select the data source type from the list. Follow the step-by-step guide for each data source type:
                                  "},{"location":"Researcher/workloads/assets/datasources/#nfs","title":"NFS","text":"

A Network File System (NFS) is a Kubernetes concept used for sharing storage in the cluster among different pods. Like a PVC, the NFS volume\u2019s content remains preserved, even outside the lifecycle of a single pod. However, unlike PVCs, which abstract storage management, NFS provides a method for network-based file sharing. The NFS volume can be pre-populated with data and can be mounted by multiple pod writers simultaneously. In Run:ai, an NFS-type data source is an abstraction that maps directly to a Kubernetes NFS volume, allowing multiple workloads under various scopes to mount and present the NFS data source. A conceptual Kubernetes mapping of the fields below is sketched after the steps.

                                  1. Select the cluster under which to create this data source
                                  2. Select a scope
                                  3. Enter a name for the data source. The name must be unique.
                                  4. Optional: Provide a description of the data source
                                  5. Set the data origin
                                    • Enter the NFS server (host name or host IP)
                                    • Enter the NFS path
                                  6. Set the data target location
                                    • Container path
                                  7. Optional: Restrictions
                                    • Prevent data modification - When enabled, the data will be mounted with read-only permissions
                                  8. Click CREATE DATA SOURCE
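For orientation only, the fields above correspond roughly to a standard Kubernetes NFS volume. The server, export path, and container path below are placeholders, not values required by Run:ai:

```shell
# Conceptual mapping of the NFS data source fields onto a plain Kubernetes pod (placeholders throughout).
# --dry-run=client only validates the manifest; nothing is created in the cluster.
kubectl apply --dry-run=client -f - <<'EOF'
apiVersion: v1
kind: Pod
metadata:
  name: nfs-mapping-example
spec:
  containers:
    - name: main
      image: busybox
      command: ["sleep", "3600"]
      volumeMounts:
        - name: data
          mountPath: /data            # the data target location (container path)
          readOnly: true              # "Prevent data modification"
  volumes:
    - name: data
      nfs:
        server: nfs.example.com       # the NFS server (host name or IP)
        path: /exports/datasets       # the NFS path
EOF
```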
                                  "},{"location":"Researcher/workloads/assets/datasources/#pvc","title":"PVC","text":"

A Persistent Volume Claim (PVC) is a Kubernetes concept used for managing storage in the cluster, which can be provisioned by an administrator or dynamically by Kubernetes using a StorageClass. PVCs allow users to request specific sizes and access modes (e.g., read-write by one node, read-only by many nodes, read-write by many nodes). Run:ai ensures that data remains consistent and accessible across various scopes and workloads, beyond the lifecycle of individual pods, which is efficient when working with the large datasets typically associated with AI projects.

                                  1. Select the cluster under which to create this data source
                                  2. Select a scope
                                  3. Enter a name for the data source. The name must be unique.
                                  4. Optional: Provide a description of the data source
                                  5. Select PVC:
  • Existing PVC This option is relevant when the purpose is to create a PVC-type data source based on an existing PVC in the cluster
    • Select a PVC from the list (the list is empty if no existing PVCs were created in advance; a sample manifest for pre-creating a PVC is sketched after the steps)
  • New PVC - Creates a new PVC in the cluster. New PVCs are not added to the Existing PVCs list. When creating a PVC-type data source and selecting the \u2018New PVC\u2019 option, the PVC is created in the cluster immediately, even if no workload has requested it yet.
                                  6. Select the storage class
                                    • None - Proceed without defining a storage class
                                    • Custom storage class - This option applies when selecting a storage class based on existing storage classes. To add new storage classes to the storage class list, and for additional information, check Kubernetes storage classes
                                  7. Select the access mode(s) (multiple modes can be selected)
                                    • Read-write by one node - The volume can be mounted as read-write by a single node.
                                    • Read-only by many nodes - The volume can be mounted as read-only by many nodes.
                                    • Read-write by many nodes - The volume can be mounted as read-write by many nodes.
                                  8. Set the claim size and its units
                                  9. Select the volume mode
                                    • File system (default) - allows the volume to be mounted as a filesystem, enabling the usage of directories and files.
                                    • Block - exposes the volume as a block storage, which can be formatted or used by applications directly without a filesystem.
                                  10. Set the data target location
                                    • container path
                                  11. Optional: Prevent data modification - When enabled, the data will be mounted with read-only permission.
                                  12. Click CREATE DATA SOURCE

                                  After the data source is created, check its status to monitor its proper creation across the selected scope.
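If you prefer to prepare a PVC in advance so that it appears under Existing PVC, a minimal manifest might look like the following. The name, namespace, storage class, and size are placeholders:

```shell
# Pre-create a PVC in the project's namespace so it can be selected as an "Existing PVC".
# All names and sizes below are placeholders.
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: training-data
  namespace: runai-my-project        # the project's namespace (placeholder)
spec:
  accessModes:
    - ReadWriteMany                  # "Read-write by many nodes"
  volumeMode: Filesystem             # or Block
  storageClassName: standard         # a custom storage class (placeholder)
  resources:
    requests:
      storage: 100Gi                 # the claim size
EOF
```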

                                  "},{"location":"Researcher/workloads/assets/datasources/#s3-bucket","title":"S3 Bucket","text":"

                                  The S3 bucket data source enables the mapping of a remote S3 bucket into the workload\u2019s file system. Similar to a PVC, this mapping remains accessible across different workload executions, extending beyond the lifecycle of individual pods. However, unlike PVCs, data stored in an S3 bucket resides remotely, which may lead to decreased performance during the execution of heavy machine learning workloads. As part of the Run:ai connection to the S3 bucket, you can create credentials in order to access and map private buckets.

                                  1. Select the cluster under which to create this data source
                                  2. Select a scope
                                  3. Enter a name for the data source. The name must be unique.
                                  4. Optional: Provide a description of the data source
                                  5. Set the data origin
                                    • Set the S3 service URL
                                    • Select the credentials
                                      • None - for public buckets
                                      • Credential names - This option is relevant for private buckets based on existing credentials that were created for the scope. To add new credentials to the credentials list, and for additional information, check the Credentials article.
                                    • Enter the bucket name
                                  6. Set the data target location
                                    • container path
                                  7. Click CREATE DATA SOURCE

                                  After a private data source is created, check its status to monitor its proper creation across the selected scope.

                                  "},{"location":"Researcher/workloads/assets/datasources/#git","title":"Git","text":"

A Git-type data source is a Run:ai integration that enables code to be copied from a Git branch into a dedicated folder in the container. It is mainly used to provide the workload with the latest code repository. As part of the integration with Git, you can add predefined credentials to the data source mapping in order to access private repositories.

                                  1. Select the cluster under which to create this data source
                                  2. Select a scope
                                  3. Enter a name for the data source. The name must be unique.
                                  4. Optional: Provide a description of the data source
                                  5. Set the data origin
                                    • Set the Repository URL
  • Set the Revision (branch, tag, or hash) - If left empty, the 'HEAD' (latest) revision is used
                                    • Select the credentials
                                      • None - for public repositories
                                      • Credential names - This option applies to private repositories based on existing credentials that were created for the scope. To add new credentials to the credentials list, and for additional information, check the Credentials article.
                                  6. Set the data target location
                                    • container path
                                  7. Click CREATE DATA SOURCE

                                  After a private data source is created, check its status to monitor its proper creation across the selected scope.

                                  "},{"location":"Researcher/workloads/assets/datasources/#host-path","title":"Host path","text":"

                                  A Host path volume is a Kubernetes concept that enables mounting a host path file or a directory on the workload\u2019s file system. Like a PVC, the host path volume\u2019s data persists across workloads under various scopes. It also enables data serving from the hosting node.

                                  1. Select the cluster under which to create this data source
                                  2. Select a scope
                                  3. Enter a name for the data source. The name must be unique.
                                  4. Optional: Provide a description of the data source
                                  5. Set the data origin
                                    • host path
                                  6. Set the data target location
                                    • container path
                                  7. Optional: Prevent data modification - When enabled, the data will be mounted with read-only permissions.
                                  8. Click CREATE DATA SOURCE
                                  "},{"location":"Researcher/workloads/assets/datasources/#configmap","title":"ConfigMap","text":"

A ConfigMap data source is a Run:ai abstraction for the Kubernetes ConfigMap concept. The ConfigMap is used mainly for storage that can be mounted on the workload container for non-confidential data. It is usually represented in key-value pairs (e.g., environment variables, command-line arguments, etc.). It allows you to decouple environment-specific system configurations from your container images, so that your applications are easily portable. ConfigMaps must be created on the cluster before they can be used within the Run:ai system; a sample command is shown after the steps.

                                  1. Select the cluster under which to create this data source
                                  2. Select a scope
                                  3. Enter a name for the data source. The name must be unique.
                                  4. Optional: Provide a description of the data source
                                  5. Set the data origin
                                    • Select the ConfigMap name (The list is empty if no existing ConfigMaps were created in advance).
                                  6. Set the data target location
                                    • container path
                                  7. Click CREATE DATA SOURCE
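Since the list only shows ConfigMaps that already exist in the cluster, you may need to create one first. A minimal example, where the name, namespace, and keys are placeholders and the ConfigMap is created in the namespace where the workload will run:

```shell
# Create a ConfigMap so it can be selected as a data source (all names and keys are placeholders).
kubectl create configmap training-config \
  --namespace runai-my-project \
  --from-literal=LOG_LEVEL=info \
  --from-file=app-config.yaml=./app-config.yaml
```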
                                  "},{"location":"Researcher/workloads/assets/datasources/#secret","title":"Secret","text":"

                                  A secret-type data source enables the mapping of a credential into the workload\u2019s file system. Credentials are a workload asset that simplify the complexities of Kubernetes Secrets. The credentials mask sensitive access information, such as passwords, tokens, and access keys, which are necessary for gaining access to various resources.

                                  1. Select the cluster under which to create this data source
                                  2. Select a scope
                                  3. Enter a name for the data source. The name must be unique.
                                  4. Optional: Provide a description of the data source
                                  5. Set the data origin
                                    • Select the credentials To add new credentials, and for additional information, check the Credentials article.
                                  6. Set the data target location
                                    • container path
                                  7. Click CREATE DATA SOURCE

                                  After the data source is created, check its status to monitor its proper creation across the selected scope.

                                  Note

                                  It is also possible to add data sources directly when creating a specific workspace, training or inference workload

                                  "},{"location":"Researcher/workloads/assets/datasources/#editing-a-data-source","title":"Editing a data source","text":"

                                  To edit a data source:

                                  1. Select the data source from the table
                                  2. Click Rename to provide it with a new name
                                  3. Click Copy & Edit to make any changes to the data source
                                  "},{"location":"Researcher/workloads/assets/datasources/#deleting-a-data-source","title":"Deleting a data source","text":"

                                  To delete a data source:

                                  1. Select the data source you want to delete
                                  2. Click DELETE
                                  3. Confirm you want to delete the data source

                                  Note

It is not possible to delete a data source that is being used by an existing workload or template.

                                  "},{"location":"Researcher/workloads/assets/datasources/#using-api","title":"Using API","text":"

                                  To view the available actions, go to the Data sources API reference.

                                  "},{"location":"Researcher/workloads/assets/environments/","title":"Environments","text":"

                                  This article explains what environments are and how to create and use them.

                                  Environments are one type of workload asset. An environment consists of a configuration that simplifies how workloads are submitted and can be used by AI practitioners when they submit their workloads.

                                  An environment asset is a preconfigured building block that encapsulates aspects for the workload such as:

                                  • Container image and container configuration
                                  • Tools and connections
                                  • The type of workload it serves
                                  "},{"location":"Researcher/workloads/assets/environments/#environments-table","title":"Environments table","text":"

                                  The Environments table can be found under Workload manager in the Run:ai platform.

The Environments table provides a list of all the environments defined in the platform and allows you to manage them.

                                  The Environments table consists of the following columns:

• Environment - The name of the environment
• Description - A description of the environment
• Scope - The scope of this environment within the organizational tree. Click the name of the scope to view the organizational tree diagram
• Image - The application or service to be run by the workload
• Workload Architecture - This can be either standard for running workloads on a single node or distributed for running distributed workloads on multiple nodes
• Tool(s) - The tools and connection types the environment exposes
• Workload(s) - The list of existing workloads that use the environment
• Workload types - The workload types that can use the environment (Workspace / Training / Inference)
• Template(s) - The list of workload templates that use this environment
• Created by - The user who created the environment. By default, the Run:ai UI comes with preinstalled environments created by Run:ai
• Creation time - The timestamp of when the environment was created
• Last updated - The timestamp of when the environment was last updated
• Cluster - The cluster with which the environment is associated
"},{"location":"Researcher/workloads/assets/environments/#tools-associated-with-the-environment","title":"Tools associated with the environment","text":"

                                  Click one of the values in the tools column to view the list of tools and their connection type.

• Tool name - The name of the tool or application the AI practitioner can set up within the environment
• Connection type - The method by which you can access and interact with the running workload. It's essentially the \"doorway\" through which you can reach and use the tools the workload provides (e.g., node port, external URL, etc.)
"},{"location":"Researcher/workloads/assets/environments/#workloads-associated-with-the-environment","title":"Workloads associated with the environment","text":"

                                  Click one of the values in the Workload(s) column to view the list of workloads and their parameters.

• Workload - The workload that uses the environment
• Type - The workload type (Workspace/Training/Inference)
• Status - Represents the workload lifecycle. See the full list of workload statuses
"},{"location":"Researcher/workloads/assets/environments/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
                                  • Download table - Click MORE and then Click Download as CSV. Export to CSV is limited to 20,000 rows.
                                  "},{"location":"Researcher/workloads/assets/environments/#environments-created-by-runai","title":"Environments created by Run:ai","text":"

When installing Run:ai, you automatically get the environments created by Run:ai to ease the onboarding process and support different use cases out of the box. These environments are created at the scope of the account.

Environment - Image:
• Jupyter-lab - jupyter/scipy-notebook
• jupyter-tensorboard - gcr.io/run-ai-demo/jupyter-tensorboard
• tensorboard - tensorflow/tensorflow:latest
• llm-server - runai.jfrog.io/core-llm/runai-vllm:v0.6.4-0.10.0
• chatbot-ui - runai.jfrog.io/core-llm/llm-app
• gpt2 - runai.jfrog.io/core-llm/quickstart-inference:gpt2-cpu
"},{"location":"Researcher/workloads/assets/environments/#adding-a-new-environment","title":"Adding a new environment","text":"

                                  Environment creation is limited to specific roles

                                  To add a new environment:

                                  1. Go to the Environments table
                                  2. Click +NEW ENVIRONMENT
                                  3. Select under which cluster to create the environment
                                  4. Select a scope
                                  5. Enter a name for the environment. The name must be unique.
                                  6. Optional: Provide a description of the essence of the environment
7. Enter the Image URL. If a token or secret is required to pull the image, it is possible to create it via credentials of type docker registry. These credentials are automatically used once the image is pulled (which happens when the workload is submitted)
                                  8. Set the image pull policy - the condition for when to pull the image from the registry
                                  9. Set the workload architecture:
                                    • Standard Only standard workloads can use the environment. A standard workload consists of a single process.
                                    • Distributed Only distributed workloads can use the environment. A distributed workload consists of multiple processes working together. These processes can run on different nodes.
                                    • Select a framework from the list.
                                  10. Set the workload type:
                                    • Workspace
                                    • Training
                                    • Inference
                                    • When inference is selected, define the endpoint of the model by providing both the protocol and the container\u2019s serving port
                                  11. Optional: Set the connection for your tool(s). The tools must be configured in the image. When submitting a workload using the environment, it is possible to connect to these tools
  • Select the tool from the list (the available tools vary from IDE, experiment tracking, and more, including a custom tool of your choice)
                                    • Select the connection type
                                      • External URL
                                        • Auto generate A unique URL is automatically created for each workload using the environment
                                        • Custom URL The URL is set manually
                                      • Node port
                                        • Auto generate A unique port is automatically exposed for each workload using the environment
        • Custom port The port is set manually
                                      • Set the container port
                                  12. Optional: Set a command and arguments for the container running the pod
                                    • When no command is added, the default command of the image is used (the image entrypoint)
                                    • The command can be modified while submitting a workload using the environment
                                    • The argument(s) can be modified while submitting a workload using the environment
                                  13. Optional: Set the environment variable(s)
                                    • Click +ENVIRONMENT VARIABLE
                                    • Enter a name
                                    • Select the source for the environment variable
                                    • Custom
                                      • Enter a value
                                      • Leave empty
                                      • Add instructions for the expected value if any
                                    • Credentials - Select existing credentials as the environment variable
                                      • Select a credential name To add new credentials to the credentials list, and for additional information, see Credentials.
                                      • Select a secret key
                                    • The environment variables can be modified and new variables can be added while submitting a workload using the environment
                                  14. Optional: Set the container\u2019s working directory to define where the container\u2019s process starts running. When left empty, the default directory is used.
15. Optional: Set where the UID, GID and supplementary groups are taken from (the pod-spec sketch after these steps shows roughly how the container settings in this section map to standard Kubernetes fields). This can be:
  • From the image
  • From the IdP token (only available in SSO installations)
  • Custom (manually set) - decide whether the submitter can modify these values upon submission
  • Set the User ID (UID), Group ID (GID) and the supplementary groups that can run commands in the container
    • Enter UID
    • Enter GID
    • Add Supplementary groups (multiple groups can be added, separated by commas)
    • Disable \u2018Allow the values above to be modified within the workload\u2019 if you want the values above to be used as the defaults
                                  16. Optional: Select Linux capabilities - Grant certain privileges to a container without granting all the privileges of the root user.
                                  17. Click CREATE ENVIRONMENT
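For orientation, the container-level settings above (image, pull policy, command, arguments, environment variables, working directory, UID/GID, supplementary groups, and Linux capabilities) correspond roughly to standard Kubernetes pod-spec fields. The sketch below is illustrative only; every name and value is a placeholder, and --dry-run=client validates without creating anything:

```shell
kubectl apply --dry-run=client -f - <<'EOF'
apiVersion: v1
kind: Pod
metadata:
  name: environment-mapping-example
spec:
  securityContext:
    supplementalGroups: [2000]            # supplementary groups
  containers:
    - name: main
      image: python:3.11                  # Image URL
      imagePullPolicy: IfNotPresent       # image pull policy
      command: ["python"]                 # command
      args: ["train.py", "--epochs=10"]   # arguments
      workingDir: /workspace              # container working directory
      env:
        - name: LOG_LEVEL                 # custom environment variable
          value: debug
        - name: REGISTRY_PASSWORD         # environment variable sourced from credentials
          valueFrom:
            secretKeyRef:
              name: demo-creds            # credential-backed secret (placeholder)
              key: password
      securityContext:
        runAsUser: 1000                   # UID
        runAsGroup: 1000                  # GID
        capabilities:
          add: ["SYS_PTRACE"]             # a Linux capability (placeholder)
EOF
```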

                                  Note

                                  It is also possible to add environments directly when creating a specific workspace, training or inference workload.

                                  "},{"location":"Researcher/workloads/assets/environments/#editing-an-environment","title":"Editing an environment","text":"

                                  To edit an environment:

                                  1. Select the environment you want to edit
                                  2. Click Edit
                                  3. Click SAVE ENVIRONMENT

                                  Note

• Workloads that are already using this asset are not affected.
                                  • llm-server and chatbot-ui environments cannot be edited.
                                  "},{"location":"Researcher/workloads/assets/environments/#copying-an-environment","title":"Copying an environment","text":"

                                  To make a copy of an existing environment:

                                  1. Select the environment you want to copy
                                  2. Click MAKE A COPY
                                  3. Enter a name for the environment. The name must be unique.
                                  4. Update the environment
                                  5. Click CREATE ENVIRONMENT
                                  "},{"location":"Researcher/workloads/assets/environments/#deleting-an-environment","title":"Deleting an environment","text":"

                                  To delete an environment:

                                  1. Select the environment you want to delete
                                  2. Click DELETE
3. In the dialog, click DELETE to confirm

                                  Note

Workloads that are already using this asset are not affected.

                                  "},{"location":"Researcher/workloads/assets/environments/#using-api","title":"Using API","text":"

                                  Go to the Environment API reference to view the available actions

                                  "},{"location":"Researcher/workloads/assets/overview/","title":"Overview","text":"

                                  Workload assets enable organizations to:

• Create and reuse a preconfigured setup for code, data, storage, and resources, to be used by AI practitioners to simplify the process of submitting workloads
                                  • Share the preconfigured setup with a wide audience of AI practitioners with similar needs

                                  Note

• The creation of assets is possible only via the API and the Run:ai UI
• The submission of workloads using assets is possible only via the Run:ai UI
                                  "},{"location":"Researcher/workloads/assets/overview/#workload-asset-types","title":"Workload asset types","text":"

                                  There are four workload asset types used by the workload:

                                  • Environments The container image, tools and connections for the workload
                                  • Data sources The type of data, its origin and the target storage location such as PVCs or cloud storage buckets where datasets are stored
                                  • Compute resources The compute specification, including GPU and CPU compute and memory
                                  • Credentials The secrets to be used to access sensitive data, services, and applications such as docker registry or S3 buckets
                                  "},{"location":"Researcher/workloads/assets/overview/#asset-scope","title":"Asset scope","text":"

                                  When a workload asset is created, a scope is required. The scope defines who in the organization can view and/or use the asset.

                                  Note

When an asset is created via the API, the scope can be the entire account. This is currently an experimental feature.

                                  "},{"location":"Researcher/workloads/assets/overview/#who-can-create-an-asset","title":"Who can create an asset?","text":"

                                  Any subject (user, application, or SSO group) with a role that has permissions to Create an asset, can do so within their scope.

                                  "},{"location":"Researcher/workloads/assets/overview/#who-can-use-an-asset","title":"Who can use an asset?","text":"

                                  Assets are used when submitting workloads. Any subject (user, application or SSO group) with a role that has permissions to Create workloads, can also use assets.

                                  "},{"location":"Researcher/workloads/assets/overview/#who-can-view-an-asset","title":"Who can view an asset?","text":"

                                  Any subject (user, application, or SSO group) with a role that has permission to View an asset, can do so within their scope.

                                  "},{"location":"Researcher/workloads/assets/templates/","title":"Workspace Templates","text":"

                                  This article explains the procedure to manage templates.

                                  A template is a pre-set configuration that is used to quickly configure and submit workloads using existing assets. A template consists of all the assets a workload needs, allowing researchers to submit a workload in a single click, or make subtle adjustments to differentiate them from each other.

                                  "},{"location":"Researcher/workloads/assets/templates/#workspace-templates-table","title":"Workspace templates table","text":"

                                  The Templates table can be found under Workload manager in the Run:ai User interface.

                                  The Templates table provides a list of all the templates defined in the platform, and allows you to manage them.

                                  Flexible Management

                                  It is also possible to manage templates directly for a specific user, application, project, or department.

                                  The Templates table consists of the following columns:

• Scope - The scope to which the subject has access. Click the name of the scope to see the scope and its subordinates
• Environment - The name of the environment related to the workspace template
• Compute resource - The name of the compute resource connected to the workspace template
• Data source(s) - The name of the data source(s) connected to the workspace template
• Created by - The subject that created the template
• Creation time - The timestamp for when the template was created
• Cluster - The cluster name containing the template
"},{"location":"Researcher/workloads/assets/templates/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
                                  • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
                                  • Refresh (optional) - Click REFRESH to update the table with the latest data
                                  • Show/Hide details (optional) - Click to view additional information on the selected row
                                  "},{"location":"Researcher/workloads/assets/templates/#adding-a-new-workspace-template","title":"Adding a new workspace template","text":"

                                  To add a new template:

                                  1. Click +NEW TEMPLATE
                                  2. Set the scope for the template
                                  3. Enter a name for the template
                                  4. Select the environment for your workload
5. Select the compute resource needed to run your workload - or - Click +NEW COMPUTE RESOURCE

                                  6. Set the volume needed for your workload

                                  7. Create a new data source
                                  8. Set auto-deletion, annotations and labels, as required
                                  9. Click CREATE TEMPLATE
                                  "},{"location":"Researcher/workloads/assets/templates/#editing-a-template","title":"Editing a template","text":"

                                  To edit a template:

                                  1. Select the template from the table
                                  2. Click Rename to provide it with a new name
                                  3. Click Copy & Edit to make any changes to the template
                                  "},{"location":"Researcher/workloads/assets/templates/#deleting-a-template","title":"Deleting a template","text":"

                                  To delete a template:

                                  1. Select the template you want to delete
                                  2. Click DELETE
                                  3. Confirm you want to delete the template
                                  "},{"location":"Researcher/workloads/assets/templates/#using-api","title":"Using API**","text":"

                                  Go to the Workload template API reference to view the available actions

                                  "},{"location":"Researcher/workloads/inference/custom-inference/","title":"Deploy a custom inference workload","text":"

                                  This article explains how to create a custom inference workload via the Run:ai UI.

                                  An inference workload provides the setup and configuration needed to deploy your trained model for real-time or batch predictions. It includes specifications for the container image, data sets, network settings, and resource requests required to serve your models.

                                  The inference workload is assigned to a project and is affected by the project\u2019s quota.

To learn more about the inference workload type in Run:ai and determine whether it is the most suitable workload type for your goals, see Workload types.

                                  "},{"location":"Researcher/workloads/inference/custom-inference/#creating-a-custom-inference-workload","title":"Creating a custom inference workload","text":"

                                  Before you start, make sure you have a project.

                                  To add a new custom inference workload:

                                  1. Go to the Workload manager \u2192 Workloads
2. Click +NEW WORKLOAD and select Inference. Within the new inference form:
                                  3. Select under which cluster to create the inference workload
                                  4. Select the project in which your inference will run
                                  5. Select custom inference from Inference type

                                    Note

                                    Selecting the Inference type is disabled by default. If you cannot see it in the menu, then it must be enabled by your Administrator, under General settings \u2192 Workloads \u2192 Models.

                                  6. Enter a unique name for the inference workload (if the name already exists in the project, you will be requested to submit a different name)

                                  7. Click CONTINUE In the next step:
                                  8. Select the environment for your inference workload

                                    • Select an environment or click +NEW ENVIRONMENT to add a new environment to the gallery. For a step-by-step guide on adding environments to the gallery, see Environments. Once created, the new environment will be automatically selected.
                                    • Set an inference serving endpoint. The connection protocol and the container port are defined within the environment

                                      • Optional: Modify who can access the endpoint

                                        • Public (default)

                                          Everyone within the network can access the endpoint with no authentication

                                        • All authenticated users

                                          Everyone within the organization\u2019s account that can log in (to Run:ai or SSO)

                                        • Specific group(s)

                                          • Click +GROUP
                                          • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the endpoint.
                                        • Specific user(s)

                                          • Click +USER
                                          • Enter a valid email address or username. If you remove yourself, you will lose access to the endpoint.
                                    • Set the connection for your tool(s). The tools are configured as part of the environment.

                                      • External URL
                                        • Custom URL
                                          • Set the URL
                                        • Optional: Modify who can access the tool:
                                          • All authenticated users (default) Everyone within the organization\u2019s account
                                          • Specific group(s)
                                            • Click +GROUP
                                            • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the tool.
                                          • Specific user(s)
                                            • Click +USER
                                            • Enter a valid email address or username. If you remove yourself, you will lose access to the tool.
                                      • Node port
                                        • Custom port
                                          • Set the node port (enter a port between 30000 and 32767; if the node port is already in use, the workload will fail and display an error message)
                                    • Optional: Set the command and arguments for the container running the workload If no command is added, the container will use the image\u2019s default command (entry-point).
                                      • Modify the existing command or click +COMMAND & ARGUMENTS to add a new command.
                                      • Set multiple arguments separated by spaces, using the following format (e.g.: --arg1=val1).
                                    • Set the environment variable(s)
                                      • Modify the existing environment variable(s). The existing environment variables may include instructions to guide you with entering the correct values.
                                      • (Optional) Add new variables
                                        • Click +ENVIRONMENT VARIABLE
                                        • Enter a name
                                        • Select the source for the environment variable
                                          • Custom
                                            • Enter a value according to the provided instructions
                                          • Credentials - Select existing credentials as the environment variable
                                            • Select a credential name To add new credentials to the credentials list, and for additional information, see Credentials.
                                            • Select a secret key
                                  9. Select the compute resource for your inference workload

                                    • Select a compute resource or click +NEW COMPUTE RESOURCE to add a new compute resource to the gallery. For a step-by-step guide on adding compute resources to the gallery, see compute resources. Once created, the new compute resource will be automatically selected.
                                    • Optional: Set the minimum and maximum number of replicas to be scaled up and down to meet the changing demands of inference services.
  • If the minimum and maximum numbers of replicas are different, autoscaling will be triggered and you'll need to set conditions for creating a new replica. A replica will be created every time a condition is met. When a condition is no longer met after a replica was created, the replica will be automatically deleted to save resources.

                                      • Select a variable - The variable's values will be monitored via the container's port.
                                        • Latency (milliseconds)
                                        • Throughput (Requests/sec)
                                        • Concurrency (Requests)
                                      • Set a value - This value is the threshold at which autoscaling is triggered.
  • Optional: Set when the replicas should be automatically scaled down to zero. This allows compute resources to be freed up when the model is inactive (i.e., there are no requests being sent). When automatic scaling to zero is enabled, the minimum number of replicas set in the previous step automatically changes to 0

                                    • Optional: Set the order of priority for the node pools on which the scheduler tries to run the workload. When a workload is created, the scheduler will try to run it on the first node pool on the list. If the node pool doesn't have free resources, the scheduler will move on to the next one until it finds one that is available.
                                      • Drag and drop them to change the order, remove unwanted ones, or reset to the default order defined in the project.
                                      • Click +NODE POOL to add a new node pool from the list of node pools that were defined on the cluster. To configure a new node pool and for additional information, see node pools.
  • Select a node affinity to schedule the workload on a specific node type. If the administrator added a \u2018node type (affinity)\u2019 scheduling rule to the project/department, then this field is mandatory. Otherwise, entering a node type (affinity) is optional. Nodes must be tagged with a label that matches the node type key and value (a kubectl sketch of node labels and taints follows these steps).
  • Optional: Set toleration(s) to allow the workload to be scheduled on a node with a matching taint

                                      Note

Tolerations are disabled by default. If you cannot see Tolerations in the menu, they must be enabled by your Administrator, under General settings \u2192 Workloads \u2192 Tolerations

                                      • Click +TOLERATION
                                      • Enter a key
                                      • Select the operator
                                        • Exists - If the key exists on the node, the effect will be applied.
    • Equals - If the key and the value set below match the value on the node, the effect will be applied
                                          • Enter a value matching the value on the node
                                      • Select the effect for the toleration
                                        • NoExecute - Pods that do not tolerate this taint are evicted immediately.
                                        • NoSchedule - No new pods will be scheduled on the tainted node unless they have a matching toleration. Pods currently running on the node will not be evicted.
                                        • PreferNoSchedule - The control plane will try to avoid placing a pod that does not tolerate the taint on the node, but it is not guaranteed.
                                        • Any - All effects above match.
                                       • Optional: Select data sources for your inference workload. Select a data source or click +NEW DATA SOURCE to add a new data source to the gallery. If there are issues with the connectivity to the cluster, or issues while creating the data source, the data source won't be available for selection. For a step-by-step guide on adding data sources to the gallery, see data sources. Once created, the new data source will be automatically selected.
                                        • Optional: Modify the data target location for the selected data source(s).
                                  10. Optional - General settings:

                                     • Set the timeframe for auto-deletion after workload completion or failure. This is the time after which a completed or failed workload is deleted; if this field is set to 0 seconds, the workload is deleted automatically as soon as it completes or fails.
                                     • Set annotation(s). Kubernetes annotations are key-value pairs attached to the workload. They are used for storing additional descriptive metadata to enable documentation, monitoring, and automation.
                                      • Click +ANNOTATION
                                      • Enter a name
                                      • Enter a value
                                     • Set label(s). Kubernetes labels are key-value pairs attached to the workload. They are used for categorizing to enable querying.
                                      • Enter a name
                                      • Enter a value
                                  11. Click CREATE INFERENCE
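                                   The toleration fields in the steps above map to standard Kubernetes taints and tolerations. As a minimal sketch, assuming a hypothetical node name and a taint key/value chosen by your administrator, a matching taint could be applied to a node like this:

                                       kubectl taint nodes <node-name> dedicated=inference:NoSchedule

                                   A toleration with key dedicated, operator Equals, value inference, and effect NoSchedule would then allow the workload's pods to be scheduled on that node.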
                                  "},{"location":"Researcher/workloads/inference/custom-inference/#managing-and-monitoring","title":"Managing and monitoring","text":"

                                  After the inference workload is created, it is added to the Workloads table, where it can be managed and monitored.

                                  "},{"location":"Researcher/workloads/inference/custom-inference/#using-api","title":"Using API","text":"

                                  To view the available actions, see the Inferences API reference.

                                  "},{"location":"Researcher/workloads/inference/hugging-face-inference/","title":"Deploy inference workloads from Hugging Face","text":"

                                  This article explains how to create an inference workload via the Run:ai UI using Hugging Face inference models.

                                  An inference workload provides the setup and configuration needed to deploy your trained model for real-time or batch predictions. It includes specifications for the container image, data sets, network settings, and resource requests required to serve your models.

                                   The inference workload is assigned to a project and is affected by the project's quota.

                                  To learn more about the inference workload type in Run:ai and determine that it is the most suitable workload type for your goals, see Workload types.

                                  "},{"location":"Researcher/workloads/inference/hugging-face-inference/#creating-a-hugging-face-inference-workload","title":"Creating a Hugging Face inference workload","text":"

                                  Before you start, make sure you have a project.

                                  To add a new inference workload:

                                   1. Go to the Workload manager → Workloads
                                   2. Click +NEW WORKLOAD and select Inference. Within the new inference form:
                                  3. Select under which cluster to create the inference workload
                                  4. Select the project in which your inference will run
                                  5. Select Hugging Face from Inference type

                                    Note

                                     Selecting the Inference type is disabled by default. If you cannot see it in the menu, it must be enabled by your Administrator, under General settings → Workloads → Models.

                                  6. Enter a unique name for the inference workload (if the name already exists in the project, you will be requested to submit a different name)

                                   7. Click CONTINUE. In the next step:
                                   8. Set the model and how to access it

                                    • Set the model name as displayed in Hugging Face. The model must be supported by vLLM version 0.6.4.
                                      • Enter a name
                                    • Set how to access Hugging Face

                                      • Provide a token
                                        • Access token
                                          • Enter a token
                                      • Select credentials
                                        • Select existing credentials. Make sure the existing credentials contain an HF_TOKEN key
                                         • Add new credentials with an HF_TOKEN (a kubectl sketch for preparing such a secret in advance appears after these steps)

                                          Within the new credentials form:

                                          • Enter a name for the credential. The name must be unique.
                                          • Optional: Provide a description of the credentials
                                          • Set how the credential is created

                                            • Existing secret (in the cluster)

                                              This option applies when the purpose is to create credentials based on an existing secret

                                              • Select a secret from the list (the list is empty if no secrets were created in advance)
                                            • New secret

                                              A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.

                                              • Enter a key
                                              • Enter the HF_TOKEN as the value
                                    • Optional: Modify who can access the inference serving endpoint

                                      • Public (default)

                                        Everyone within the network can access the endpoint with no authentication

                                      • All authenticated users

                                         Everyone within the organization's account who can log in (to Run:ai or SSO)

                                      • Specific group(s)

                                        • Click +GROUP
                                        • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the endpoint.
                                      • Specific user(s)

                                        • Click +USER
                                        • Enter a valid email address or username. If you remove yourself, you will lose access to the endpoint.
                                  9. Select the compute resource for your inference workload

                                    • Select a compute resource or click +NEW COMPUTE RESOURCE to add a new compute resource to the gallery. For a step-by-step guide on adding compute resources to the gallery, see compute resources. Once created, the new compute resource will be automatically selected.
                                    • Optional: Set the minimum and maximum number of replicas to be scaled up and down to meet the changing demands of inference services.
                                     • If the minimum and maximum number of replicas differ, autoscaling will be triggered and you'll need to set conditions for creating a new replica. A replica will be created every time a condition is met. When a condition is no longer met after a replica was created, the replica will be automatically deleted to save resources.

                                      • Select a variable - The variable's values will be monitored via the container's port.
                                        • Latency (milliseconds)
                                        • Throughput (Requests/sec)
                                        • Concurrency (Requests)
                                      • Set a value - This value is the threshold at which autoscaling is triggered.
                                     • Optional: Set when the replicas should be automatically scaled down to zero. This allows the compute resources to be freed up when the model is inactive (i.e., there are no requests being sent). When automatic scaling to zero is enabled, the minimum number of replicas set in the previous step automatically changes to 0

                                    • Optional: Set the order of priority for the node pools on which the scheduler tries to run the workload. When a workload is created, the scheduler will try to run it on the first node pool on the list. If the node pool doesn't have free resources, the scheduler will move on to the next one until it finds one that is available.
                                      • Drag and drop them to change the order, remove unwanted ones, or reset to the default order defined in the project.
                                      • Click +NODE POOL to add a new node pool from the list of node pools that were defined on the cluster. To configure a new node pool and for additional information, see node pools.
                                     • Select a node affinity to schedule the workload on a specific node type. If the administrator added a 'node type (affinity)' scheduling rule to the project/department, then this field is mandatory. Otherwise, entering a node type (affinity) is optional. Nodes must be tagged with a label that matches the node type key and value.
                                    • Optional: Set toleration(s) to allow the workload to be scheduled on a node with a matching taint

                                      Note

                                       Tolerations are disabled by default. If you cannot see Tolerations in the menu, they must be enabled by your Administrator, under General settings → Workloads → Tolerations

                                      • Click +TOLERATION
                                      • Enter a key
                                      • Select the operator
                                        • Exists - If the key exists on the node, the effect will be applied.
                                         • Equals - If the key and the value set below match the value on the node, the effect will be applied
                                          • Enter a value matching the value on the node
                                      • Select the effect for the toleration
                                        • NoExecute - Pods that do not tolerate this taint are evicted immediately.
                                        • NoSchedule - No new pods will be scheduled on the tainted node unless they have a matching toleration. Pods currently running on the node will not be evicted.
                                        • PreferNoSchedule - The control plane will try to avoid placing a pod that does not tolerate the taint on the node, but it is not guaranteed.
                                        • Any - All effects above match.
                                  10. Optional - General settings:

                                     • Set the timeframe for auto-deletion after workload completion or failure. This is the time after which a completed or failed workload is deleted; if this field is set to 0 seconds, the workload is deleted automatically as soon as it completes or fails.
                                     • Set annotation(s). Kubernetes annotations are key-value pairs attached to the workload. They are used for storing additional descriptive metadata to enable documentation, monitoring, and automation.
                                      • Click +ANNOTATION
                                      • Enter a name
                                      • Enter a value
                                     • Set label(s). Kubernetes labels are key-value pairs attached to the workload. They are used for categorizing to enable querying.
                                      • Enter a name
                                      • Enter a value
                                  11. Click CREATE INFERENCE
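                                   If you prefer to prepare the secret in advance so that it appears under Existing secret above, a minimal kubectl sketch might look as follows; the secret name and namespace are placeholders, and additional labeling or placement requirements may apply for the secret to be visible to Run:ai - check with your administrator:

                                       kubectl create secret generic hf-token \
                                         --from-literal=HF_TOKEN=<your-hugging-face-token> \
                                         -n <project-namespace>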
                                  "},{"location":"Researcher/workloads/inference/hugging-face-inference/#managing-and-monitoring","title":"Managing and monitoring","text":"

                                  After the inference workload is created, it is added to the Workloads table, where it can be managed and monitored.

                                  "},{"location":"Researcher/workloads/inference/inference-overview/","title":"Inference overview","text":""},{"location":"Researcher/workloads/inference/inference-overview/#what-is-inference","title":"What is Inference","text":"

                                  Machine learning (ML) inference is the process of running live data points into a machine-learning algorithm to calculate an output.

                                   With Inference workloads, you take a trained model and deploy it into a production environment. The deployment must align with the organization's production standards, such as average and 95th-percentile response time, as well as uptime.

                                  "},{"location":"Researcher/workloads/inference/inference-overview/#inference-and-gpus","title":"Inference and GPUs","text":"

                                   The Inference process is a subset of the original Training algorithm, run on a single datum (e.g., one sentence or one image) or a small batch. As such, GPU memory requirements are typically smaller than those of a full-blown Training process.

                                   Given that, Inference lends itself nicely to the use of Run:ai Fractions. You can, for example, run 4 instances of an Inference server on a single GPU, each using a quarter of the GPU memory.
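                                   For illustration only, the same fraction mechanism can be expressed with the legacy Run:ai CLI; the workload name and image below are placeholders, flags may differ between CLI versions, and deploying inference itself is done via the UI or API as described in the inference articles:

                                       runai submit frac-example -i <image> -g 0.25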

                                  "},{"location":"Researcher/workloads/inference/inference-overview/#inference-runai","title":"Inference @Run:ai","text":"

                                  Run:ai provides Inference services as an equal part together with the other two Workload types: Train and Build.

                                  • Inference is considered a high-priority workload as it is customer-facing. Running an Inference workload (within the Project's quota) will preempt any Run:ai Workload marked as Training.

                                  • Inference workloads will receive priority over Train and Build workloads during scheduling.

                                  • Inference is implemented as a Kubernetes Deployment object with a defined number of replicas. The replicas are load-balanced by Kubernetes so adding more replicas will improve the overall throughput of the system.

                                  • Multiple replicas will appear in Run:ai as a single Inference workload. The workload will appear in all Run:ai dashboards and views as well as the Command-line interface.

                                  • Inference workloads can be submitted via Run:ai user interface as well as Run:ai API. Internally, spawning an Inference workload also creates a Kubernetes Service. The service is an end-point to which clients can connect.

                                  "},{"location":"Researcher/workloads/inference/inference-overview/#autoscaling","title":"Autoscaling","text":"

                                   To meet SLAs, Inference workloads are typically configured with autoscaling. Autoscaling is the ability to add more computing power (Kubernetes pods) when the load increases and to shrink allocated resources when the system is idle. There are several ways to trigger autoscaling. Run:ai supports the following:

                                   • Latency (milliseconds)
                                   • Throughput (requests/sec)
                                   • Concurrency (requests)

                                  The Minimum and Maximum number of replicas can be configured as part of the autoscaling configuration.
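                                   As a rough mental model only (assuming Knative-style autoscaling, which later sections reference), the desired number of replicas for a given metric can be thought of as:

                                       $\text{replicas} = \min\left(\text{max},\ \max\left(\text{min},\ \left\lceil \frac{\text{observed metric value}}{\text{target value}} \right\rceil\right)\right)$

                                   For example, with a concurrency target of 10 and roughly 35 concurrent requests, about 4 replicas would be provisioned, bounded by the configured minimum and maximum. The exact behavior depends on the autoscaler's configuration and averaging window.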

                                  Autoscaling also supports a scale-to-zero policy with Throughput and Concurrency metrics, meaning that given enough time under the target threshold, the number of replicas will be scaled down to 0.

                                   This has the benefit of conserving resources at the risk of a delay from "cold starting" the model when traffic resumes.

                                  "},{"location":"Researcher/workloads/inference/inference-overview/#rolling-inference-updates","title":"Rolling inference updates","text":"

                                  When deploying models and running inference workloads, you may need to update the workload configuration in a live manner, without impacting the important services that are provided by the workload.

                                  This means you can submit updates to an existing inference workload whether it is currently running, pending, or any other status.

                                  The following are a few examples of updates that can be implemented:

                                  • Changing the container image to deploy a new version of the model
                                  • Changing different parameters (such as environment variables)
                                  • Changing compute resources to improve performance
                                  • Changing the number of replicas and scale plan to adapt to requirement changes and scales

                                  During the update and until its successful completion, the service that the workload provides is not jeopardized as these are production-grade workloads. Hence, consumers can continue using the model (interact with the LLM) during the update process.

                                   During the update process of an inference workload, a new revision of pod(s) is created. This revision is the new desired specification of the workload. Several updates can be submitted consecutively, even if the previous update has not yet completed, but the target is always the last submitted update; earlier updates are ignored.

                                   Once the new revision is fully created and up and running, all request traffic is routed to the new revision, the original revision is terminated, and its resources are returned to the shared pool. Only then is the update process considered complete.
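                                   Because inference workloads are backed by Knative revisions (see the following sections), one way to observe an update from the cluster side is to list the revisions of the underlying Knative Service. This is a sketch only; the namespace and workload name are placeholders and the namespace convention may differ in your installation:

                                       kubectl get revisions -n <project-namespace> \
                                         -l serving.knative.dev/service=<workload-name>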

                                  It is important to note that:

                                   • To finish the inference workload update successfully, the project must have sufficient free GPU quota to accommodate the update. For example:

                                    • The existing workload uses 3 replicas: A running inference workload with 3 replicas, assuming that each replica is equal to 1 GPU, means the project is already using 3 GPUs of its quota. For the sake of simplicity, we will refer to this revision as revision #1.

                                     • The workload is updated to use 8 replicas: This means that, to complete the update, an additional 8 GPUs of free quota are needed. Only when the update is complete are the 3 GPUs used for the initial revision (revision #1) reclaimed.

                                   • In the UI, the Workloads table displays the configuration of the latest submitted update. For example, if you change the container image, the image column in the running / requested pods will display the name of the updated image. The status of the workload continues to reflect the operational state of the service the workload exposes. For instance, during an update, the workload status remains "Running" if the service is still being delivered to consumers. Hovering over the workload's status in the grid displays the phase message for the update, offering additional insight into its update state.

                                  • The submission of inference updates is currently possible only via API. The following are the API fields that can be updated:

                                    • Command
                                    • Args
                                    • Image
                                    • imagePullPolicy
                                    • workingDir
                                    • createHomeDir
                                    • Probes
                                    • environmentVariables
                                    • Autoscaling
                                   • As long as the update process is not complete, GPUs are not allocated to the replicas of the new revision. This prevents the allocation of idle GPUs, so that others are not deprived of using them.

                                  • If the update process is not completed within the default time limit of 10 minutes, it will automatically stop. At that point, all replicas of the new revision will be removed, and the original revision will continue to run normally.
                                  • The default time limit for updates is configurable. Consider setting a longer duration if your workload requires extended time to pull the image due to its size, if the workload takes additional time to reach a 'READY' state due to a long initialization process, or if your cluster depends on autoscaling to allocate resources for new replicas. For example, to set the time limit to 30 minutes, you can run the following command:
                                     kubectl patch ConfigMap config-deployment -n knative-serving --type='merge' -p '{"data": {"progress-deadline": "1800s"}}'
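                                     To confirm that the new value was applied, it can be read back from the same ConfigMap:

                                       kubectl get configmap config-deployment -n knative-serving \
                                         -o jsonpath='{.data.progress-deadline}'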
                                  "},{"location":"Researcher/workloads/inference/inference-overview/#inference-workloads-with-knative-new-behavior-in-v219","title":"Inference workloads with Knative new behavior in v2.19","text":"

                                  Starting in version 2.19, all pods of a single Knative revision are grouped under a single Pod-Group. This means that when a new Knative revision is created:

                                  • It either succeeds in allocating the minimum number of pods; or
                                  • It fails and moves into a pending state, to retry again later to allocate all pods with their resources.

                                  The resources (GPUs, CPUs) are not occupied by a new Knative revision until it succeeds in allocating all pods. The older revision pods are then terminated and release their resources (GPUs, CPUs) back to the cluster to be used by other workloads.

                                  "},{"location":"Researcher/workloads/inference/inference-overview/#see-also","title":"See Also","text":"
                                  • To set up Inference, see Cluster installation prerequisites.
                                  • For running Inference see Inference quick-start.
                                  • To run Inference using API see Workload overview.
                                  "},{"location":"Researcher/workloads/inference/nim-inference/","title":"Deploy inference workloads with NVIDIA NIM","text":"

                                   This article explains how to deploy a GenAI model from NVIDIA NIM as an inference workload via the Run:ai UI.

                                  An inference workload provides the setup and configuration needed to deploy your trained model for real-time or batch predictions. It includes specifications for the container image, data sets, network settings, and resource requests required to serve your models.

                                   The inference workload is assigned to a project and is affected by the project's quota.

                                  To learn more about the inference workload type in Run:ai and determine that it is the most suitable workload type for your goals, see Workload types.

                                  "},{"location":"Researcher/workloads/inference/nim-inference/#creating-a-nim-inference-workload","title":"Creating a NIM inference workload","text":"

                                  Before you start, make sure you have a project.

                                  To add a new inference workload:

                                   1. Go to the Workload manager → Workloads
                                   2. Click +NEW WORKLOAD and select Inference. Within the new inference form:
                                  3. Select under which cluster to create the inference workload
                                  4. Select the project in which your inference will run
                                  5. Select NIM from Inference type

                                    Note

                                     Selecting the Inference type is disabled by default. If you cannot see it in the menu, it must be enabled by your Administrator, under General settings → Workloads → Models.

                                  6. Enter a unique name for the inference workload (if the name already exists in the project, you will be requested to submit a different name)

                                   7. Click CONTINUE. In the next step:
                                   8. Select the NIM model and set how to access it

                                    • Set the model name by selecting a model or entering the model name as displayed in NIM
                                    • Set how the model profile should be selected

                                      A NIM model profile sets compatible model engines and criteria for engine selection, such as precision, latency, throughput optimization, and GPU requirements. Profiles are optimized to balance either latency or throughput, with quantized profiles (e.g., fp8) preferred to reduce memory usage and enhance performance.

                                       • Automatically (recommended) - NIM is designed to automatically select the most suitable profile from the list of compatible profiles based on the detected hardware. Each profile consists of different parameters that influence the selection process.
                                      • Manually
                                        • Enter profile name or hash
                                    • Optional: Modify who can access the inference serving endpoint

                                      • Public (default)

                                        Everyone within the network can access the endpoint with no authentication

                                      • All authenticated users

                                         Everyone within the organization's account who can log in (to Run:ai or SSO)

                                      • Specific group(s)

                                        • Click +GROUP
                                        • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the endpoint.
                                      • Specific user(s)

                                        • Click +USER
                                        • Enter a valid email address or username. If you remove yourself, you will lose access to the endpoint.
                                  9. Select how to access the model store

                                    • From NVIDIA NGC - The model is downloaded when the workload starts running
                                      • Set the NVIDIA NGC API key
                                        • Enter a key
                                         • (Optional) Click Storage - When downloading a model from NVIDIA NGC, selecting storage is recommended. Select a data source where the model is already cached to reduce loading time, or click +NEW DATA SOURCE to add a new data source to the gallery; this caches the model and reduces loading time for future use. If there are issues with the connectivity to the cluster, or issues while creating the data source, the data source won't be available for selection. For a step-by-step guide on adding data sources to the gallery, see data sources. Once created, the new data source will be automatically selected.
                                    • From storage - The model is accessed directly and without being downloaded
                                       • Storage - Set where to load the model. Select a data source or click +NEW DATA SOURCE to add a new data source to the gallery. If there are issues with the connectivity to the cluster, or issues while creating the data source, the data source won't be available for selection. For a step-by-step guide on adding data sources to the gallery, see data sources. Once created, the new data source will be automatically selected.
                                  10. Select the compute resource for your inference workload

                                    • Select a compute resource or click +NEW COMPUTE RESOURCE to add a new compute resource to the gallery. For a step-by-step guide on adding compute resources to the gallery, see compute resources. Once created, the new compute resource will be automatically selected.
                                    • Optional: Set the minimum and maximum number of replicas to be scaled up and down to meet the changing demands of inference services.
                                     • If the minimum and maximum number of replicas differ, autoscaling will be triggered and you'll need to set conditions for creating a new replica. A replica will be created every time a condition is met. When a condition is no longer met after a replica was created, the replica will be automatically deleted to save resources.

                                      • Select a variable - The variable's values will be monitored via the container's port.
                                        • Latency (milliseconds)
                                        • Throughput (Requests/sec)
                                        • Concurrency (Requests)
                                      • Set a value - This value is the threshold at which autoscaling is triggered
                                     • Optional: Set when the replicas should be automatically scaled down to zero. This allows the compute resources to be freed up when the model is inactive (i.e., there are no requests being sent). When automatic scaling to zero is enabled, the minimum number of replicas set in the previous step automatically changes to 0

                                    • Optional: Set the order of priority for the node pools on which the scheduler tries to run the workload. When a workload is created, the scheduler will try to run it on the first node pool on the list. If the node pool doesn't have free resources, the scheduler will move on to the next one until it finds one that is available.
                                      • Drag and drop them to change the order, remove unwanted ones, or reset to the default order defined in the project.
                                      • Click +NODE POOL to add a new node pool from the list of node pools that were defined on the cluster. To configure a new node pool and for additional information, see node pools.
                                     • Select a node affinity to schedule the workload on a specific node type. If the administrator added a 'node type (affinity)' scheduling rule to the project/department, then this field is mandatory. Otherwise, entering a node type (affinity) is optional. Nodes must be tagged with a label that matches the node type key and value.
                                    • Optional: Set toleration(s) to allow the workload to be scheduled on a node with a matching taint

                                      Note

                                       Tolerations are disabled by default. If you cannot see Tolerations in the menu, they must be enabled by your Administrator, under General settings → Workloads → Tolerations

                                      • Click +TOLERATION
                                      • Enter a key
                                      • Select the operator
                                        • Exists - If the key exists on the node, the effect will be applied.
                                         • Equals - If the key and the value set below match the value on the node, the effect will be applied
                                          • Enter a value matching the value on the node
                                      • Select the effect for the toleration
                                        • NoExecute - Pods that do not tolerate this taint are evicted immediately.
                                        • NoSchedule - No new pods will be scheduled on the tainted node unless they have a matching toleration. Pods currently running on the node will not be evicted.
                                        • PreferNoSchedule - The control plane will try to avoid placing a pod that does not tolerate the taint on the node, but it is not guaranteed.
                                        • Any - All effects above match.
                                  11. Optional - General settings:

                                     • Set the timeframe for auto-deletion after workload completion or failure. This is the time after which a completed or failed workload is deleted; if this field is set to 0 seconds, the workload is deleted automatically as soon as it completes or fails.
                                     • Set annotation(s). Kubernetes annotations are key-value pairs attached to the workload. They are used for storing additional descriptive metadata to enable documentation, monitoring, and automation.
                                      • Click +ANNOTATION
                                      • Enter a name
                                      • Enter a value
                                     • Set label(s). Kubernetes labels are key-value pairs attached to the workload. They are used for categorizing to enable querying.
                                      • Enter a name
                                      • Enter a value
                                  12. Click CREATE INFERENCE
                                  "},{"location":"Researcher/workloads/inference/nim-inference/#managing-and-monitoring","title":"Managing and monitoring","text":"

                                  After the inference workload is created, it is added to the Workloads table, where it can be managed and monitored.
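                                   Once the workload is running, you can send requests to its endpoint (shown in the Connection(s) column). NIM for LLMs exposes an OpenAI-compatible API, so a minimal request sketch might look as follows; the endpoint URL, token, and model name are placeholders, and the Authorization header is only needed if the endpoint is not public:

                                       curl -X POST https://<endpoint-url>/v1/chat/completions \
                                         -H "Authorization: Bearer <access-token>" \
                                         -H "Content-Type: application/json" \
                                         -d '{"model": "<model-name>", "messages": [{"role": "user", "content": "Hello"}]}'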

                                  "},{"location":"Researcher/workloads/overviews/introduction-to-workloads/","title":"Introduction to Workloads","text":"

                                   Run:ai enhances visibility and simplifies management by monitoring, presenting, and orchestrating all AI workloads in the clusters it is installed on. Workloads are the fundamental building blocks for consuming resources, enabling AI practitioners such as researchers, data scientists, and engineers to efficiently support the entire life cycle of an AI initiative.

                                  "},{"location":"Researcher/workloads/overviews/introduction-to-workloads/#workloads-across-the-ai-lifecycle","title":"Workloads across the AI lifecycle","text":"

                                  A typical AI initiative progresses through several key stages, each with distinct workloads and objectives. With Run:ai, research and engineering teams can host and manage all these workloads to achieve the following:

                                  • Data preparation: Aggregating, cleaning, normalizing, and labeling data to prepare for training.
                                  • Training: Conducting resource-intensive model development and iterative performance optimization.
                                  • Fine-tuning: Adapting pre-trained models to domain-specific data sets while balancing efficiency and performance.
                                  • Inference: Deploying models for real-time or batch predictions with a focus on low latency and high throughput.
                                  • Monitoring and optimization: Ensuring ongoing performance by addressing data drift, usage patterns, and retraining as needed.
                                  "},{"location":"Researcher/workloads/overviews/introduction-to-workloads/#what-is-a-workload","title":"What is a workload?","text":"

                                  A workload runs in the cluster, is associated with a namespace, and operates to fulfill its targets, whether that is running to completion for a batch job, allocating resources for experimentation in an integrated development environment (IDE)/notebook, or serving inference requests in production.

                                  The workload, defined by the AI practitioner, consists of:

                                  • Container images: This includes the application, its dependencies, and the runtime environment.
                                   • Compute resources: CPU, GPU, and RAM to execute efficiently and address the workload's needs.
                                  • Data sets: The data needed for processing, such as training data sets or input from external databases.
                                  • Credentials: The access to certain data sources or external services, ensuring proper authentication and authorization.
                                  "},{"location":"Researcher/workloads/overviews/introduction-to-workloads/#workload-scheduling-and-orchestration","title":"Workload scheduling and orchestration","text":"

                                   Run:ai's core mission is to optimize AI resource usage at scale. This is achieved through efficient scheduling and orchestration of all cluster workloads using the Run:ai Scheduler. The Scheduler allows the prioritization of workloads across different departments and projects within the organization at large scale, based on the resource distribution set by the system administrator.

                                  "},{"location":"Researcher/workloads/overviews/introduction-to-workloads/#runai-and-third-party-workloads","title":"Run:ai and third-party workloads","text":"
                                  • Run:ai workloads: These workloads are submitted via the Run:ai platform. They are represented by Kubernetes Custom Resource Definitions (CRDs) and APIs. When using Run:ai workloads, a complete Workload and Scheduling Policy solution is offered for administrators to ensure optimizations, governance and security standards are applied.
                                  • Third-party workloads: These workloads are submitted via third-party applications that use the Run:ai Scheduler. The Run:ai platform manages and monitors these workloads. They enable seamless integrations with external tools, allowing teams and individuals flexibility.
                                  "},{"location":"Researcher/workloads/overviews/introduction-to-workloads/#levels-of-support","title":"Levels of support","text":"

                                   Different types of workloads have different levels of support. It is important to understand which capabilities are needed before selecting the workload type to work with. The table below details the level of support for each workload type in Run:ai. Run:ai workloads are fully supported with all of Run:ai's advanced features and capabilities, while third-party workloads are only partially supported. The list of capabilities can change between different Run:ai versions.

                                   Functionality | Training - Standard (Run:ai) | Workspace (Run:ai) | Inference (Run:ai) | Training - distributed (Run:ai) | Third-party workloads
                                   Fairness | v | v | v | v | v
                                   Priority and preemption | v | v | v | v | v
                                   Over quota | v | v | v | v | v
                                   Node pools | v | v | v | v | v
                                   Bin packing / Spread | v | v | v | v | v
                                   Multi-GPU fractions | v | v | v | v | v
                                   Multi-GPU dynamic fractions | v | v | v | v | v
                                   Node level scheduler | v | v | v | v | v
                                   Multi-GPU memory swap | v | v | v | v | v
                                   Elastic scaling | NA | NA | v | v | v
                                   Gang scheduling | v | v | v | v | v
                                   Monitoring | v | v | v | v | v
                                   RBAC | v | v | v | v | -
                                   Workload awareness | v | v | v | v | -
                                   Workload submission | v | v | v | v | -
                                   Workload actions (stop/run) | v | v | v | v | -
                                   Workload Policies | v | v | v | v | -
                                   Scheduling rules | v | v | v | v | -
                                   (v = supported, NA = not applicable, - = not supported)

                                  Note

                                  Workload awareness

                                  Specific workload-aware visibility, so that different pods are identified and treated as a single workload (for example GPU utilization, workload view, dashboards).

                                  "},{"location":"Researcher/workloads/overviews/managing-workloads/","title":"Workloads","text":"

                                  This article explains the procedure for managing workloads.

                                  "},{"location":"Researcher/workloads/overviews/managing-workloads/#workloads-table","title":"Workloads table","text":"

                                  The Workloads table can be found under Workload manager in the Run:ai platform.

                                   The Workloads table provides a list of all the workloads scheduled by the Run:ai Scheduler and allows you to manage them.

                                  The Workloads table consists of the following columns:

                                   • Workload - The name of the workload
                                   • Type - The workload type
                                   • Preemptible - Whether the workload is preemptible
                                   • Status - The different phases in a workload life cycle
                                   • Project - The project in which the workload runs
                                   • Department - The department that the workload is associated with. This column is visible only if the department toggle is enabled by your administrator
                                   • Created by - The user who created the workload
                                   • Running/requested pods - The number of running pods out of the requested
                                   • Creation time - The timestamp for when the workload was created
                                   • Completion time - The timestamp when the workload reached a terminal state (failed/completed)
                                   • Connection(s) - The method by which you can access and interact with the running workload. It is essentially the "doorway" through which you can reach and use the tools the workload provides (e.g., node port, external URL, etc.). Click one of the values in the column to view the list of connections and their parameters
                                   • Data source(s) - Data resources used by the workload
                                   • Environment - The environment used by the workload
                                   • Workload architecture - Standard or distributed. A standard workload consists of a single process. A distributed workload consists of multiple processes working together. These processes can run on different nodes.
                                   • GPU compute request - Amount of GPU devices requested
                                   • GPU compute allocation - Amount of GPU devices allocated
                                   • GPU memory request - Amount of GPU memory requested
                                   • GPU memory allocation - Amount of GPU memory allocated
                                   • Idle GPU devices - The number of allocated GPU devices that have been idle for more than 5 minutes
                                   • CPU compute request - Amount of CPU cores requested
                                   • CPU compute allocation - Amount of CPU cores allocated
                                   • CPU memory request - Amount of CPU memory requested
                                   • CPU memory allocation - Amount of CPU memory allocated
                                   • Cluster - The cluster that the workload is associated with
                                   "},{"location":"Researcher/workloads/overviews/managing-workloads/#workload-status","title":"Workload status","text":"

                                  The following table describes the different phases in a workload life cycle. The UI provides additional details for some of the below workload statuses which can be viewed by clicking the icon next to the status.

                                   • Creating - Description: Workload setup is initiated in the cluster; resources and pods are now provisioning. Entry condition: A workload is submitted. Exit condition: A multi-pod group is created.
                                   • Pending - Description: Workload is queued and awaiting resource allocation. Entry condition: A pod group exists. Exit condition: All pods are scheduled.
                                   • Initializing - Description: Workload is retrieving images, starting containers, and preparing pods. Entry condition: All pods are scheduled. Exit condition: All pods are initialized or a failure to initialize is detected.
                                   • Running - Description: Workload is currently in progress with all pods operational. Entry condition: All pods initialized (all containers in pods are ready). Exit condition: Workload completion or failure.
                                   • Degraded - Description: Pods may not align with specifications, network services might be incomplete, or persistent volumes may be detached; check your logs for specific details. Entry condition: Pending - all pods are running but have issues; Running - all pods are running with no issues. Exit condition: Running - all resources are OK; Completed - workload finished with fewer resources; Failed - workload failure or user-defined rules.
                                   • Deleting - Description: Workload and its associated resources are being decommissioned from the cluster. Entry condition: Deleting the workload. Exit condition: Resources are fully deleted.
                                   • Stopped - Description: Workload is on hold; resources are intact but inactive. Entry condition: Stopping the workload without deleting resources. Exit condition: Transitioning back to the initializing phase or proceeding to deleting the workload.
                                   • Failed - Description: Image retrieval failed or containers experienced a crash; check your logs for specific details. Entry condition: An error occurs preventing the successful completion of the workload. Exit condition: Terminal state.
                                   • Completed - Description: Workload has successfully finished its execution. Entry condition: The workload has finished processing without errors. Exit condition: Terminal state.
                                   "},{"location":"Researcher/workloads/overviews/managing-workloads/#pods-associated-with-workload","title":"Pods Associated with Workload","text":"

                                  Click one of the values in the Running/requested pods column, to view the list of pods and their parameters.

                                   • Pod - Pod name
                                   • Status - Pod lifecycle stages
                                   • Node - The node on which the pod resides
                                   • Node pool - The node pool in which the pod resides (applicable if node pools are enabled)
                                   • Image - The pod's main image
                                   • GPU compute allocation - Amount of GPU devices allocated for the pod
                                   • GPU memory allocation - Amount of GPU memory allocated for the pod
                                   "},{"location":"Researcher/workloads/overviews/managing-workloads/#connections-associated-with-workload","title":"Connections Associated with Workload","text":"

                                   A connection refers to the method by which you can access and interact with the running workloads. It is essentially the "doorway" through which you can reach and use the applications (tools) these workloads provide.

                                  Click one of the values in the Connection(s) column, to view the list of connections and their parameters. Connections are network interfaces that communicate with the application running in the workload. Connections are either the URL the application exposes or the IP and the port of the node that the workload is running on.

                                   • Name - The name of the application running on the workload
                                   • Connection type - The network connection type selected for the workload
                                   • Access - Who is authorized to use this connection (everyone, specific groups/users)
                                   • Address - The connection URL
                                   • Copy button - Copy URL to clipboard
                                   • Connect button - Enabled only for supported tools
                                   "},{"location":"Researcher/workloads/overviews/managing-workloads/#data-sources-associated-with-workload","title":"Data Sources Associated with Workload","text":"

                                  Click one of the values in the Data source(s) column, to view the list of data sources and their parameters.

                                   • Data source - The name of the data source mounted to the workload
                                   • Type - The data source type
                                   "},{"location":"Researcher/workloads/overviews/managing-workloads/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
                                   • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
                                  • Refresh - Click REFRESH to update the table with the latest data
                                  • Show/Hide details - Click to view additional information on the selected row
                                  "},{"location":"Researcher/workloads/overviews/managing-workloads/#showhide-details","title":"Show/Hide details","text":"

                                  Click a row in the Workloads table and then click the SHOW DETAILS button at the upper-right side of the action bar. The details pane appears, presenting the following tabs:

                                  "},{"location":"Researcher/workloads/overviews/managing-workloads/#event-history","title":"Event History","text":"

                                  Displays the workload status over time. It displays events describing the workload lifecycle and alerts on notable events. Use the filter to search through the history for specific events.

                                  "},{"location":"Researcher/workloads/overviews/managing-workloads/#metrics","title":"Metrics","text":"
                                   • GPU utilization - A per-GPU graph and an average-of-all-GPUs graph, shown on the same chart over an adjustable period, let you see the trends of GPU compute utilization (percentage of GPU compute).
                                   • GPU memory utilization - A per-GPU graph and an average-of-all-GPUs graph, shown on the same chart over an adjustable period, let you see the trends of GPU memory usage (percentage of GPU memory).
                                   • CPU compute utilization - The average compute utilization of all CPU cores, shown over an adjustable period, lets you see the trends of CPU compute utilization (percentage of CPU compute).
                                   • CPU memory utilization - The utilization of all CPU memory in a single graph, shown over an adjustable period, lets you see the trends of CPU memory utilization (percentage of CPU memory).
                                   • CPU memory usage - The usage of all CPU memory in a single graph, shown over an adjustable period, lets you see the trends of CPU memory usage (in GB or MB of CPU memory).

                                  • For GPUs charts - Click the GPU legend on the right-hand side of the chart, to activate or deactivate any of the GPU lines.

                                  • You can click the date picker to change the presented period
                                  • You can use your mouse to mark a sub-period in the graph for zooming in, and use Reset zoom to go back to the preset period
                                  • Changes in the period affect all graphs on this screen.
                                  "},{"location":"Researcher/workloads/overviews/managing-workloads/#logs","title":"Logs","text":"

                                   Workload events are listed in chronological order. The logs contain events from the workload's lifecycle to help monitor and debug issues.

                                  "},{"location":"Researcher/workloads/overviews/managing-workloads/#adding-new-workload","title":"Adding new workload","text":"

                                  Before starting, make sure you have created a project or have one created for you to work with workloads.

                                  To create a new workload:

                                  1. Click +NEW WORKLOAD
                                  2. Select a workload type - Follow the links below to view the step-by-step guide for each workload type:
                                    • Workspace. Used for data preparation and model-building tasks.
                                    • Training. Used for standard training tasks of all sorts
                                    • Distributed Training. Used for distributed tasks of all sorts
                                    • Inference. Used for inference and serving tasks
                                     • Job (legacy). This type is displayed only if enabled by your Administrator, under General settings → Workloads → Workload policies
                                  3. Click CREATE WORKLOAD
                                  "},{"location":"Researcher/workloads/overviews/managing-workloads/#stopping-a-workload","title":"Stopping a workload","text":"

                                  Stopping a workload kills the workload pods and releases the workload resources.

                                  1. Select the workload you want to stop
                                  2. Click STOP
                                  "},{"location":"Researcher/workloads/overviews/managing-workloads/#running-a-workload","title":"Running a workload","text":"

                                   Running a workload spins up new pods and resumes the workload after it was stopped.

                                  1. Select the workload you want to run again
                                  2. Click RUN
                                  "},{"location":"Researcher/workloads/overviews/managing-workloads/#connecting-to-a-workload","title":"Connecting to a workload","text":"

                                   To connect to an application running in the workload (for example, Jupyter Notebook):

                                  1. Select the workload you want to connect
                                  2. Click CONNECT
                                  3. Select the tool from the drop-down list
                                  4. The selected tool is opened in a new tab on your browser
                                  "},{"location":"Researcher/workloads/overviews/managing-workloads/#deleting-a-workload","title":"Deleting a workload","text":"
                                  1. Select the workload you want to delete
                                  2. Click DELETE
                                  3. On the dialog, click DELETE to confirm the deletion

                                  Note

                                   Once a workload is deleted, you can view it in the Deleted tab in the workloads view. This tab is displayed only if enabled by your Administrator, under General settings → Workloads → Deleted workloads

                                  "},{"location":"Researcher/workloads/overviews/managing-workloads/#copy-edit-a-workload","title":"Copy & Edit a workload","text":"
                                  1. Select the workload you want to copy and edit
                                  2. Click COPY & EDIT
                                  3. Update the workload and click CREATE WORKLOAD
                                  "},{"location":"Researcher/workloads/overviews/managing-workloads/#using-api","title":"Using API","text":"

                                  Go to the Workloads API reference to view the available actions
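                                   As a hypothetical sketch only (consult the API reference for the exact paths and parameters), listing workloads via the REST API generally looks like this; the control-plane URL and token are placeholders:

                                       curl -H "Authorization: Bearer <api-token>" \
                                         "https://<control-plane-url>/api/v1/workloads"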

                                  "},{"location":"Researcher/workloads/overviews/managing-workloads/#troubleshooting","title":"Troubleshooting","text":"

                                   To understand the condition of the workload, review the workload status in the Workloads table. For more information, check the workload's event history.

                                  Listed below are a number of known issues when working with workloads and how to fix them:

                                   • Issue: Cluster connectivity issues (there are issues with your connection to the cluster error message). Mediation: Verify that you are on a network that has been granted access to the cluster. Reach out to your cluster admin for instructions on verifying this. If you are an admin, see the troubleshooting section in the cluster documentation.
                                   • Issue: Workload in \u201cInitializing\u201d status for some time. Mediation: Check that you have access to the Container image registry. Check the statuses of the pods in the pods\u2019 modal. Check the event history for more details.
                                   • Issue: Workload has been pending for some time. Mediation: Check that you have the required quota. Check the project\u2019s available quota in the project dialog. Check that all services needed to run are bound to the workload. Check the event history for more details.
                                   • Issue: PVCs created using the K8s API or kubectl are not visible or mountable in Run:ai. Mediation: This is by design. Create a new data source of type PVC in the Run:ai UI. In the Data mount section, select Existing PVC. Select the PVC you created via the K8s API. You can now select and mount this PVC in your Run:ai submitted workloads.
                                   • Issue: Workload is not visible in the UI. Mediation: Check that the workload hasn\u2019t been deleted. See the \u201cDeleted\u201d tab in the workloads view"},{"location":"Researcher/workloads/overviews/workload-types/","title":"Run:ai Workload Types","text":"

                                  In the world of machine learning (ML), the journey from raw data to actionable insights is a complex process that spans multiple stages. Each stage of the AI lifecycle requires different tools, resources, and frameworks to ensure optimal performance. Run:ai simplifies this process by offering specialized workload types tailored to each phase, facilitating a smooth transition across various stages of the ML workflows.

                                   The ML lifecycle usually begins with experimental work on data and exploration of different modeling techniques to identify the best approach for accurate predictions. At this stage, resource consumption is usually moderate, as experimentation is done on a smaller scale. As confidence in the model's potential and accuracy grows, the demand for compute resources increases. This is especially true during the training phase, where vast amounts of data must be processed. Complex models such as large language models (LLMs), with their huge parameter counts, often require distributed training across multiple GPUs to handle the intensive computational load.

                                  Finally, once the model is ready, it moves to the inference stage, where it is deployed to make predictions on new, unseen data. Run:ai's workload types are designed to correspond with the natural stages of this lifecycle. They are structured to align with the specific resource and framework requirements of each phase, ensuring that AI researchers and data scientists can focus on advancing their models without worrying about infrastructure management.

                                   Run:ai offers three workload types, each corresponding to a specific phase of the researcher\u2019s work:

                                  • Workspaces \u2013 For experimentation with data and models.
                                  • Training \u2013 For resource-intensive tasks such as model training and data preparation.
                                  • Inference \u2013 For deploying and serving the trained model.
                                  "},{"location":"Researcher/workloads/overviews/workload-types/#workspaces-the-experimentation-phase","title":"Workspaces: the experimentation phase","text":"

                                  The Workspace is where data scientists conduct initial research, experiment with different data sets, and test various algorithms. This is the most flexible stage in the ML lifecycle, where models and data are explored, tuned, and refined. The value of workspaces lies in the flexibility they offer, allowing the researcher to iterate quickly without being constrained by rigid infrastructure.

                                  • Framework flexibility

                                    Workspaces support a variety of machine learning frameworks, as researchers need to experiment with different tools and methods.

                                  • Resource requirements

                                    Workspaces are often lighter on resources compared to the training phase, but they still require significant computational power for data processing, analysis, and model iteration.

                                     Hence, by default, Run:ai schedules workspaces as non-preemptible: once resources are allocated, the workload is not preempted. In return, this non-preemptible state does not allow the workload to use resources beyond the project\u2019s deserved quota.

                                  See Running workspaces to learn more about how to submit a workspace via the Run:ai platform. For quick starts, see Running Jupyter Notebook using workspaces.
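
                                   As an illustration only, a workspace can also be submitted from the CLI v2. The sketch below assumes the runai workspace submit command accepts the same image and GPU flags as the training commands shown later in this article, and jupyter/scipy-notebook is just an example image:

                                   runai project set \"project-name\"\nrunai workspace submit \"my-workspace\" -i jupyter/scipy-notebook --gpu-devices-request 1\n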

                                  "},{"location":"Researcher/workloads/overviews/workload-types/#training-scaling-resources-for-model-development","title":"Training: scaling resources for model development","text":"

                                  As models mature and the need for more robust data processing and model training increases, Run:ai facilitates this shift through the Training workload. This phase is resource-intensive, often requiring distributed computing and high-performance clusters to process vast data sets and train models.

                                  • Training architecture

                                     For training workloads, Run:ai allows you to specify the architecture - standard or distributed. The distributed architecture is relevant for larger data sets and more complex models that require multiple nodes. For the distributed architecture, Run:ai allows you to specify different configurations for the master and workers and to select which framework to use - PyTorch, XGBoost, MPI, or TensorFlow. In addition, as part of the distributed configuration, Run:ai enables researchers to schedule their distributed workloads on nodes within the same region, zone, placement group, or any other topology.

                                  • Resource requirements

                                     Training tasks demand high memory, compute power, and storage. Run:ai ensures that the allocated resources match the scale of the task and allows those workloads to utilize more compute resources than the project\u2019s deserved quota. If you do not want your training workload to be preempted, make sure to request no more GPUs than are available in your project\u2019s quota (see the example below).

                                  See Standard training and Distributed training to learn more about how to submit a training workload via the Run:ai UI. For quick starts, see Run your first standard training and Run your first distributed training.
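
                                   For example, the CLI v1 command from the standard training quick start later in this article submits a training workload that requests a single GPU; keeping the request within the project\u2019s deserved quota means the workload does not run over quota and is therefore not exposed to over-quota preemption:

                                   runai config project \"project-name\"  \nrunai submit \"workload-name\" -i runai.jfrog.io/demo/quickstart -g 1\n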

                                  "},{"location":"Researcher/workloads/overviews/workload-types/#inference-deploying-and-serving-models","title":"Inference: deploying and serving models","text":"

                                  Once a model is trained and validated, it moves to the Inference stage, where it is deployed to make predictions (usually in a production environment). This phase is all about efficiency and responsiveness, as the model needs to serve real-time or batch predictions to end-users or other systems.

                                  • Inference-specific use cases

                                     Inference workloads must continuously adapt to changing demand in order to meet their SLAs. For example, additional replicas may be deployed, manually or automatically, to increase compute resources as part of a horizontal scaling approach, or a new version of the deployment may need to be rolled out without affecting the running services.

                                  • Resource requirements

                                    Inference models differ in size and purpose, leading to varying computational requirements. For example, small OCR models can run efficiently on CPUs, whereas LLMs typically require significant GPU memory for deployment and serving. Inference workloads are considered production-critical and are given the highest priority to ensure compliance with SLAs. Additionally, Run:ai ensures that inference workloads cannot be preempted, maintaining consistent performance and reliability.

                                  See Deploy a custom inference workload to learn more about how to submit an inference workload via the Run:ai UI.
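
                                   As a rough sketch only, an inference workload can also be created through the REST API, in the same style as the training API calls later in this article. The endpoint path and spec fields below (/api/v1/workloads/inferences, image, compute) are assumptions modeled on those examples; consult the Inference API reference for the exact schema:

                                   curl -L 'https://<COMPANY-URL>/api/v1/workloads/inferences' \\\n-H 'Content-Type: application/json' \\\n-H 'Authorization: Bearer <TOKEN>' \\\n-d '{\n    \"name\": \"workload-name\",\n    \"projectId\": \"<PROJECT-ID>\",\n    \"clusterId\": \"<CLUSTER-UUID>\",\n    \"spec\": {\n        \"image\": \"<INFERENCE-IMAGE>\",\n        \"compute\": { \"gpuDevicesRequest\": 1 }\n    }\n}'\n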

                                  "},{"location":"Researcher/workloads/training/distributed-training/distributed-training/","title":"Train models using a distributed training workload","text":"

                                  This article explains how to create a distributed training workload via the Run:ai UI.

                                  A training workload contains the setup and configuration needed for building your model, including the container, images, data sets, and resource requests, as well as the required tools for the research, all in a single place.

                                  The distributed training workload is assigned to a project and is affected by the project\u2019s quota.

                                  To learn more about the distributed training workload type in Run:ai and determine that it is the most suitable workload type for your goals, see Workload types.

                                  "},{"location":"Researcher/workloads/training/distributed-training/distributed-training/#creating-a-distributed-training-workload","title":"Creating a distributed training workload","text":"

                                  Before you start, make sure you have a project.

                                  To add a new distributed training workload:

                                  1. Go to the Workload manager \u2192 Workloads
                                   2. Click +NEW WORKLOAD and select Training. Within the new training form:
                                  3. Select under which cluster to create the training workload
                                  4. Select the project in which your training will run
                                   5. Set the training workload architecture as distributed, which consists of multiple processes working together. These processes can run on different nodes. This workload uses environments that support distributed training workloads only.

                                    • Set the framework for the distributed workload. Select from -

                                      • PyTorch
                                      • TensorFlow
                                       • XGBoost
                                      • MPI

                                       If one of the above frameworks is not enabled, see Distributed training prerequisites for details on how to enable it.

                                    • Set the distributed workload configuration that defines how distributed training workloads are divided across multiple machines or processes. Choose a configuration based on your training requirements and infrastructure -

                                      • Workers & master
                                      • Workers only
                                  6. Select a preconfigured template or select Start from scratch to launch a new training workload quickly

                                  7. Enter a unique name for the training workload (if the name already exists in the project, you will be requested to submit a different name)
                                   8. Click CONTINUE. In the next step:
                                  9. Select the environment for your training workload
                                    • Select an environment or click +NEW ENVIRONMENT to add a new environment to the gallery. For a step-by-step guide on adding environments to the gallery, see Environments. Once created, the new environment will be automatically selected.
                                    • Set the connection for your tool(s). The tools are configured as part of the environment.
                                      • External URL
                                        • Custom URL
                                          • Set the URL
                                        • Optional: Modify who can access the tool:
                                           • All authenticated users (default) - Everyone within the organization\u2019s account
                                          • Specific group(s)
                                            • Click +GROUP
                                            • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the tool.
                                          • Specific user(s)
                                            • Click +USER
                                            • Enter a valid email address or username. If you remove yourself, you will lose access to the tool.
                                      • Node port
                                        • Custom port
                                          • Set the node port (enter a port between 30000 and 32767; if the node port is already in use, the workload will fail and display an error message)
                                    • Set the User ID (UID), Group ID (GID) and the supplementary groups that can run commands in the container
                                      • Enter UID
                                      • Enter GID
                                      • Add Supplementary groups (multiple groups can be added, separated by commas).
                                     • Optional: Set the command and arguments for the container running the workload. If no command is added, the container will use the image\u2019s default command (entry-point).
                                      • Modify the existing command or click +COMMAND & ARGUMENTS to add a new command.
                                      • Set multiple arguments separated by spaces, using the following format (e.g.: --arg1=val1).
                                    • Set the environment variable(s)
                                      • Modify the existing environment variable(s). The existing environment variables may include instructions to guide you with entering the correct values.
                                      • (Optional) Add new variables
                                        • Click +ENVIRONMENT VARIABLE
                                        • Enter a name
                                        • Select the source for the environment variable
                                          • Custom
                                            • Enter a value according to the provided instructions
                                          • Credentials - Select existing credentials as the environment variable
                                             • Select a credential name. To add new credentials to the credentials list, and for additional information, see Credentials.
                                            • Select a secret key
                                  10. Select the compute resource for your training workload

                                    • Set the number of workers for your workload
                                    • Select a compute resource or click +NEW COMPUTE RESOURCE to add a new compute resource to the gallery. For a step-by-step guide on adding compute resources to the gallery, see compute resources. Once created, the new compute resource will be automatically selected.
                                    • Optional: Set the order of priority for the node pools on which the scheduler tries to run the workload. When a workload is created, the scheduler will try to run it on the first node pool on the list. If the node pool doesn't have free resources, the scheduler will move on to the next one until it finds one that is available.
                                      • Drag and drop them to change the order, remove unwanted ones, or reset to the default order defined in the project.
                                      • Click +NODE POOL to add a new node pool from the list of node pools that were defined on the cluster. To configure a new node pool and for additional information, see node pools.
                                    • Select a node affinity to schedule the workload on a specific node type. If the administrator added a \u2018node type (affinity)\u2019 scheduling rule to the project/department, then this field is mandatory. Otherwise, entering a node type (affinity) is optional. Nodes must be tagged with a label that matches the node type key and value.
                                    • Optional: Set toleration(s) to allow the workload to be scheduled on a node with a matching taint

                                      Note

                                      Tolerations are disabled, by default. If you cannot see Tolerations in the menu, then it must be enabled by your Administrator, under General settings \u2192 Workloads \u2192 Tolerations

                                      • Click +TOLERATION
                                      • Enter a key
                                      • Select the operator
                                        • Exists - If the key exists on the node, the effect will be applied.
                                         • Equals - If the key and the value set below match the value on the node, the effect will be applied
                                          • Enter a value matching the value on the node
                                      • Select the effect for the toleration
                                        • NoExecute - Pods that do not tolerate this taint are evicted immediately.
                                        • NoSchedule - No new pods will be scheduled on the tainted node unless they have a matching toleration. Pods currently running on the node will not be evicted.
                                        • PreferNoSchedule - The control plane will try to avoid placing a pod that does not tolerate the taint on the node, but it is not guaranteed.
                                        • Any - All effects above match.
                                     • Optional: Set topology to let the workload be scheduled on nodes with a matching topology. Topology lets the workload be scheduled on nodes within the same region, zone, placement group, or any other topology you define.

                                      Note

                                      Setting topology is disabled, by default. If you cannot see Topology in the menu, then it must be enabled by your Administrator, under General settings \u2192 Workloads \u2192 Topology

                                      • Click +TOPOLOGY
                                      • Enter a key
                                      • Select the operator
                                        • Required - If the scheduler can\u2019t schedule all pods within the same topology, the workload will be pending.
                                        • Preferred - The scheduler will try to schedule all pods within the same topology but may schedule some pods on nodes that are not part of the same topology.
                                   11. Optional: Set the volume needed for your workload. A volume allocates storage space to your workload that is persistent across restarts.

                                    • Click +VOLUME
                                    • Select the storage class
                                      • None - Proceed without defining a storage class.
                                      • Custom storage class - This option applies when selecting a storage class based on existing storage classes. To add new storage classes to the storage class list, and for additional information, see Kubernetes storage classes.
                                    • Select the access mode(s) (multiple modes can be selected)
                                      • Read-write by one node - The volume can be mounted as read-write by a single node.
                                      • Read-only by many nodes - The volume can be mounted as read-only by many nodes.
                                      • Read-write by many nodes - The volume can be mounted as read-write by many nodes.
                                    • Set the claim size and its units
                                    • Select the volume mode
                                      • File system (default) - This allows the volume to be mounted as a file system, enabling the usage of directories and files.
                                      • Block - This exposes the volume as a block storage, which can be formatted or used directly by applications without a file system.
                                    • Set the Container path with the volume target location
                                    • Set the volume persistency
                                      • Persistent - The volume and its data will be deleted only when the workload is deleted.
                                      • Ephemeral - The volume and its data will be deleted every time the workload\u2019s status changes to \u201cStopped.\u201d
                                  12. Optional: Select data sources for your training workload

                                    Select a data source or click +NEW DATA SOURCE to add a new data source to the gallery. If there are issues with the connectivity to the cluster, or issues while creating the data source, the data source won't be available for selection. For a step-by-step guide on adding data sources to the gallery, see data sources. Once created, the new data source will be automatically selected.

                                    • Optional: Modify the data target location for the selected data source(s).
                                  13. Optional - General settings:

                                    • Set the grace period for workload preemption. This is a buffer that allows a preempted workload to reach a safe checkpoint before it is forcibly preempted. Enter a timeframe between 0 sec and 5 min.
                                    • Set the backoff limit before workload failure. The backoff limit is the maximum number of retry attempts for failed workloads. After reaching the limit, the workload status will change to \"Failed.\" Enter a value between 1 and 100.
                                     • Set the timeframe for auto-deletion after workload completion or failure. This is the time after which a completed or failed workload is deleted; if this field is set to 0 seconds, the workload is deleted immediately after it completes or fails.
                                     • Set annotation(s). Kubernetes annotations are key-value pairs attached to the workload. They are used for storing additional descriptive metadata to enable documentation, monitoring and automation.
                                      • Click +ANNOTATION
                                      • Enter a name
                                      • Enter a value
                                     • Set label(s). Kubernetes labels are key-value pairs attached to the workload. They are used for categorizing to enable querying.
                                      • Enter a name
                                      • Enter a value
                                  14. Click CONTINUE
                                   15. Decide whether to define a different setup for the master and the workers via the toggle. When disabled, the master inherits the workers\u2019 setup.

                                     • If a different setup is required, repeat steps 9-13 above with the necessary changes.
                                  16. Click CREATE TRAINING

                                  "},{"location":"Researcher/workloads/training/distributed-training/distributed-training/#workload-policies","title":"Workload Policies","text":"

                                  When creating a new workload, fields and assets may have limitations or defaults. These rules and defaults are derived from a policy your administrator set.

                                  Policies allow you to control, standardize, and simplify the workload submission process. For additional information, see Workload Policies and Rules.

                                  The effects of the policy are reflected in the training creation form:

                                  • Defaults derived from the policy will be displayed automatically for specific fields.
                                   • Some actions may be disabled, and some values may be restricted to a certain range.
                                  • Rules and defaults for entire sections (such as environments, compute resources, or data sources) may prevent selection and will appear on the entire library card with an option for additional information via an external modal.
                                  "},{"location":"Researcher/workloads/training/distributed-training/distributed-training/#managing-and-monitoring","title":"Managing and monitoring","text":"

                                  After the training workload is created, it is added to the Workloads table, where it can be managed and monitored.

                                  "},{"location":"Researcher/workloads/training/distributed-training/distributed-training/#using-cli","title":"Using CLI","text":"

                                   To view the available actions, see the distributed training commands in the CLI v2 reference or the CLI v1 reference.
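
                                   For example, the CLI v2 command from the quick start below submits a PyTorch distributed training workload with one master and two workers:

                                   runai project set \"project-name\"\nrunai distributed submit \"workload-name\" --framework PyTorch \\\n   -i kubeflow/pytorch-dist-mnist:latest --workers 2\n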

                                  "},{"location":"Researcher/workloads/training/distributed-training/distributed-training/#using-api","title":"Using API","text":"

                                  To view the available actions, see the Distributed workload API reference.

                                  "},{"location":"Researcher/workloads/training/distributed-training/quickstart-distributed-training/","title":"Run your First Distributed Training","text":"

                                  This article provides a step-by-step walkthrough for running a PyTorch distributed training workload.

                                   Distributed training is the ability to split the training of a model among multiple processors. Each processor is called a worker. Worker nodes work in parallel to speed up model training. There is also a master, which coordinates the workers.

                                  "},{"location":"Researcher/workloads/training/distributed-training/quickstart-distributed-training/#prerequisites","title":"Prerequisites","text":"

                                  Before you start, make sure:

                                  • You have created a project or have one created for you.
                                  • The project has an assigned quota of at least 1 GPU.
                                  "},{"location":"Researcher/workloads/training/distributed-training/quickstart-distributed-training/#step-1-logging-in","title":"Step 1: Logging in","text":"User InterfaceCLI V1CLI V2API

                                  Browse to the provided Run:ai user interface and log in with your credentials.

                                  Log in using the following command. You will be prompted to enter your username and password:

                                  runai login\n

                                  Run the below --help command to obtain the login options and log in according to your setup:

                                  runai login --help  \n

                                  To use the API, you will need to obtain a token. Please follow the API authentication article.

                                  "},{"location":"Researcher/workloads/training/distributed-training/quickstart-distributed-training/#step-2-submitting-a-distributed-training-workload","title":"Step 2: Submitting a distributed training workload","text":"User InterfaceCLI V1CLI V2API
                                  1. Go to the Workload manager \u2192 Workloads
                                  2. Click +NEW WORKLOAD and select Training
                                  3. Select under which cluster to create the workload
                                  4. Select the project in which your workload will run
                                  5. Under Workload architecture, select Distributed and choose PyTorch. Set the distributed training configuration to Worker & master
                                   6. Select a preconfigured template or select Start from scratch to launch a new workload quickly
                                  7. Enter a name for the distributed training workload (if the name already exists in the project, you will be requested to submit a different name)
                                  8. Click CONTINUE
                                  9. Click +NEW ENVIRONMENT

                                    a. Enter pytorch-dt as the name

                                    b. Enter kubeflow/pytorch-dist-mnist:latest as the Image URL

                                    c. Click CREATE ENVIRONMENT

                                  10. When the previous screen comes up, enter 2 workers and select \u2018small-fraction\u2019 as the compute resource for your workload

                                    • If the \u2018small-fraction\u2019 is not displayed in the gallery, follow the step-by-step guide:
                                    Create a small-fraction compute resource
                                    1. Click +NEW COMPUTE RESOURCE
                                    2. Select under which cluster to create the compute resource
                                    3. Select a scope
                                    4. Enter a name for the compute resource. The name must be unique.
                                    5. Set GPU devices per pod - 1
                                    6. Set GPU memory per device

                                       • Select % (of device) - Fraction of a GPU device\u2019s memory
                                       • Set the memory Request - 10 (The workload will allocate 10% of the GPU memory)
                                    7. Optional: set the CPU compute per pod - 0.1 cores (default)

                                    8. Optional: set the CPU memory per pod - 100 MB (default)
                                    9. Click CREATE COMPUTE RESOURCE
                                    • The newly created small-fraction compute resource will be selected automatically
                                  11. Click CONTINUE

                                  12. Click CREATE TRAINING

                                    After the distributed training workload is created, it is added to the workloads table.

                                   Copy the following command to your terminal. Make sure to update the command below with the name of your project and workload:

                                  runai config project \"project-name\"  \nrunai submit-dist pytorch \"workload-name\" --workers=2 -g 0.1 \\\n   -i kubeflow/pytorch-dist-mnist:latest\n

                                  This would start a distributed training workload based on kubeflow/pytorch-dist-mnist:latest with one master and two workers.

                                   Copy the following command to your terminal. Make sure to update the command below with the name of your project and workload:

                                  runai project set \"project-name\"\nrunai distributed submit \"workload-name\" --framework PyTorch \\\n   -i kubeflow/pytorch-dist-mnist:latest --workers 2 \n   --gpu-request-type portion --gpu-portion-request 0.1 --gpu-devices-request 1 --cpu-memory-request 100M\n

                                  This would start a distributed training workload based on kubeflow/pytorch-dist-mnist:latest with one master and two workers.

                                   Copy the following command to your terminal. Make sure to update the placeholders below as described in the numbered notes that follow. For more details, see Distributed API reference:

                                   curl -L 'https://<COMPANY-URL>/api/v1/workloads/distributed' \\ # (1)\n-H 'Content-Type: application/json' \\\n-H 'Authorization: Bearer <TOKEN>' \\ # (2)\n-d '{\n    \"name\": \"workload-name\",\n    \"projectId\": \"<PROJECT-ID>\",\n    \"clusterId\": \"<CLUSTER-UUID>\",\n    \"spec\": {\n        \"compute\": {\n            \"cpuCoreRequest\": 0.1,\n            \"gpuRequestType\": \"portion\",\n            \"cpuMemoryRequest\": \"100M\",\n            \"gpuDevicesRequest\": 1,\n            \"gpuPortionRequest\": 0.1\n        },\n        \"image\": \"kubeflow/pytorch-dist-mnist:latest\",\n        \"numWorkers\": 2,\n        \"distributedFramework\": \"PyTorch\"\n    }\n}'\n
                                  1. <COMPANY-URL> is the link to the Run:ai user interface.
                                  2. <TOKEN> is the API access token obtained in Step 1.
                                   3. <PROJECT-ID> is the ID of the Project the workload is running on. You can get the Project ID via the Get Projects API.
                                  4. <CLUSTER-UUID> is the unique identifier of the Cluster. You can get the Cluster UUID by adding the \"Cluster ID\" column to the Clusters view.

                                  This would start a distributed training workload based on kubeflow/pytorch-dist-mnist:latest with one master and two workers.

                                  Note

                                   The above API snippet will only work with Run:ai clusters of version 2.18 and above. For older clusters, use the now-deprecated Cluster API.

                                  "},{"location":"Researcher/workloads/training/distributed-training/quickstart-distributed-training/#next-steps","title":"Next Steps","text":"
                                  • Manage and monitor your newly created workload using the workloads table.
                                  • After validating your training performance and results, deploy your model using inference.
                                  "},{"location":"Researcher/workloads/training/standard-training/quickstart-standard-training/","title":"Run your First Standard Training","text":"

                                  This article provides a step-by-step walkthrough for running a standard training workload.

                                  A training workload contains the setup and configuration needed for building your model, including the container, images, data sets, and resource requests, as well as the required tools for the research, all in a single place.

                                  "},{"location":"Researcher/workloads/training/standard-training/quickstart-standard-training/#prerequisites","title":"Prerequisites","text":"

                                  Before you start, make sure:

                                  • You have created a project or have one created for you.
                                  • The project has an assigned quota of at least 1 GPU.
                                  "},{"location":"Researcher/workloads/training/standard-training/quickstart-standard-training/#step-1-logging-in","title":"Step 1: Logging in","text":"User InterfaceCLI V1CLI V2API

                                  Browse to the provided Run:ai user interface and log in with your credentials.

                                  Log in using the following command. You will be prompted to enter your username and password:

                                  runai login\n

                                  Run the below --help command to obtain the login options and log in according to your setup:

                                  runai login --help  \n

                                  To use the API, you will need to obtain a token. Please follow the API authentication article.

                                  "},{"location":"Researcher/workloads/training/standard-training/quickstart-standard-training/#step-2-submitting-a-standard-training-workload","title":"Step 2: Submitting a standard training workload","text":"User InterfaceCLI V1CLI V2API
                                  1. Go to the Workload manager \u2192 Workloads
                                  2. Click +NEW WORKLOAD and select Training
                                  3. Select under which cluster to create the workload
                                  4. Select the project in which your workload will run
                                  5. Under Workload architecture, select Standard
                                   6. Select a preconfigured template or select Start from scratch to launch a new workload quickly
                                  7. Enter a name for the standard training workload (if the name already exists in the project, you will be requested to submit a different name)
                                  8. Click CONTINUE
                                  9. Click +NEW ENVIRONMENT

                                    a. Enter quickstart as the name

                                    b. Enter runai.jfrog.io/demo/quickstart as the Image URL

                                    c. Click CREATE ENVIRONMENT

                                  10. Select the \u2018one-gpu\u2019 compute resource for your workload (GPU devices: 1)

                                    • If the \u2018one-gpu\u2019 is not displayed in the gallery, follow the step-by-step guide:
                                    Create a one-gpu compute resource
                                    1. Click +NEW COMPUTE RESOURCE
                                    2. Select under which cluster to create the compute resource
                                    3. Select a scope
                                    4. Enter a name for the compute resource. The name must be unique.
                                    5. Set GPU devices per pod - 1
                                    6. Set GPU memory per device

                                      • Select % (of device) - Fraction of a GPU device\u2019s memory
                                      • Set the memory Request - 100 (The workload will allocate 100% of the GPU memory)
                                    7. Optional: set the CPU compute per pod - 0.1 cores (default)

                                    8. Optional: set the CPU memory per pod - 100 MB (default)
                                    9. Click CREATE COMPUTE RESOURCE
                                    • The newly created one-gpu compute resource will be selected automatically
                                  11. Click CONTINUE

                                  12. Click CREATE TRAINING

                                    After the standard training workload is created, it is added to the workloads table.

                                   Copy the following command to your terminal. Make sure to update the command below with the name of your project:

                                  runai config project \"project-name\"  \nrunai submit \"workload-name\" -i runai.jfrog.io/demo/quickstart -g 1\n

                                  This would start a standard training workload based on a sample docker image, runai.jfrog.io/demo/quickstart, with one GPU allocated.

                                   Copy the following command to your terminal. Make sure to update the command below with the name of your project and workload:

                                  runai project set \"project-name\"\nrunai training submit \"workload-name\" -i runai.jfrog.io/demo/quickstart -g 1\n

                                  This would start a standard training workload based on a sample docker image, runai.jfrog.io/demo/quickstart, with one GPU allocated.

                                   Copy the following command to your terminal. Make sure to update the placeholders below as described in the numbered notes that follow. For more details, see Trainings API reference:

                                   curl -L 'https://<COMPANY-URL>/api/v1/workloads/trainings' \\ # (1)\n-H 'Content-Type: application/json' \\\n-H 'Authorization: Bearer <TOKEN>' \\ # (2)\n-d '{\n    \"name\": \"workload-name\",\n    \"projectId\": \"<PROJECT-ID>\",\n    \"clusterId\": \"<CLUSTER-UUID>\",\n    \"spec\": {\n        \"image\": \"runai.jfrog.io/demo/quickstart\",\n        \"compute\": {\n            \"gpuDevicesRequest\": 1\n        }\n    }\n}'\n
                                  1. <COMPANY-URL> is the link to the Run:ai user interface.
                                  2. <TOKEN> is the API access token obtained in Step 1.
                                   3. <PROJECT-ID> is the ID of the Project the workload is running on. You can get the Project ID via the Get Projects API.
                                  4. <CLUSTER-UUID> is the unique identifier of the Cluster. You can get the Cluster UUID by adding the \"Cluster ID\" column to the Clusters view.

                                  This would start a standard training workload based on a sample docker image, runai.jfrog.io/demo/quickstart, with one GPU allocated.

                                  Note

                                   The above API snippet will only work with Run:ai clusters of version 2.18 and above. For older clusters, use the now-deprecated Cluster API.

                                  "},{"location":"Researcher/workloads/training/standard-training/quickstart-standard-training/#next-steps","title":"Next Steps","text":"
                                  • Manage and monitor your newly created workload using the workloads table.
                                  • After validating your training performance and results, deploy your model using inference.
                                  "},{"location":"Researcher/workloads/training/standard-training/trainings-v2/","title":"Train models using a standard training workload","text":"

                                  This article explains how to create a standard training workload via the Run:ai UI.

                                  A training workload contains the setup and configuration needed for building your model, including the container, images, data sets, and resource requests, as well as the required tools for the research, all in a single place.

                                  The training workload is assigned to a project and is affected by the project\u2019s quota.

                                  To learn more about the training workload type in Run:ai and determine that it is the most suitable workload type for your goals, see Workload types.

                                  "},{"location":"Researcher/workloads/training/standard-training/trainings-v2/#creating-a-standard-training-workload","title":"Creating a standard training workload","text":"

                                  Before you start, make sure you have a project.

                                  To add a new training workload:

                                  1. Go to the Workload manager \u2192 Workloads
                                   2. Click +NEW WORKLOAD and select Training. Within the new training form:
                                  3. Select under which cluster to create the training workload
                                  4. Select the project in which your training will run
                                  5. Set the training workload architecture as standard, which consists of a single main running process. This workload uses environments that support standard training workloads only.
                                  6. Select a preconfigured template or select Start from scratch to launch a new training workload quickly
                                  7. Enter a unique name for the training workload (if the name already exists in the project, you will be requested to submit a different name)
                                   8. Click CONTINUE. In the next step:
                                  9. Select the environment for your training workload
                                    • Select an environment or click +NEW ENVIRONMENT to add a new environment to the gallery. For a step-by-step guide on adding environments to the gallery, see Environments. Once created, the new environment will be automatically selected.
                                    • Set the connection for your tool(s). The tools are configured as part of the environment.
                                      • External URL
                                        • Custom URL
                                          • Set the URL
                                        • Optional: Modify who can access the tool:
                                           • All authenticated users (default) - Everyone within the organization\u2019s account
                                          • Specific group(s)
                                            • Click +GROUP
                                            • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the tool.
                                          • Specific user(s)
                                            • Click +USER
                                            • Enter a valid email address or username. If you remove yourself, you will lose access to the tool.
                                      • Node port
                                        • Custom port
                                          • Set the node port (enter a port between 30000 and 32767; if the node port is already in use, the workload will fail and display an error message)
                                    • Set the User ID (UID), Group ID (GID) and the supplementary groups that can run commands in the container
                                      • Enter UID
                                      • Enter GID
                                      • Add Supplementary groups (multiple groups can be added, separated by commas).
                                     • Optional: Set the command and arguments for the container running the workload. If no command is added, the container will use the image\u2019s default command (entry-point).
                                      • Modify the existing command or click +COMMAND & ARGUMENTS to add a new command.
                                      • Set multiple arguments separated by spaces, using the following format (e.g.: --arg1=val1).
                                    • Set the environment variable(s)
                                      • Modify the existing environment variable(s). The existing environment variables may include instructions to guide you with entering the correct values.
                                      • (Optional) Add new variables
                                        • Click +ENVIRONMENT VARIABLE
                                        • Enter a name
                                        • Select the source for the environment variable
                                          • Custom
                                            • Enter a value according to the provided instructions
                                          • Credentials - Select existing credentials as the environment variable
                                             • Select a credential name. To add new credentials to the credentials list, and for additional information, see Credentials.
                                            • Select a secret key
                                  10. Select the compute resource for your training workload

                                    • Select a compute resource or click +NEW COMPUTE RESOURCE to add a new compute resource to the gallery. For a step-by-step guide on adding compute resources to the gallery, see compute resources. Once created, the new compute resource will be automatically selected.
                                    • Optional: Set the order of priority for the node pools on which the scheduler tries to run the workload. When a workload is created, the scheduler will try to run it on the first node pool on the list. If the node pool doesn't have free resources, the scheduler will move on to the next one until it finds one that is available.
                                      • Drag and drop them to change the order, remove unwanted ones, or reset to the default order defined in the project.
                                      • Click +NODE POOL to add a new node pool from the list of node pools that were defined on the cluster. To configure a new node pool and for additional information, see node pools.
                                    • Select a node affinity to schedule the workload on a specific node type. If the administrator added a \u2018node type (affinity)\u2019 scheduling rule to the project/department, then this field is mandatory. Otherwise, entering a node type (affinity) is optional. Nodes must be tagged with a label that matches the node type key and value.
                                    • Optional: Set toleration(s) to allow the workload to be scheduled on a node with a matching taint

                                      Note

                                      Tolerations are disabled, by default. If you cannot see Tolerations in the menu, then it must be enabled by your Administrator, under General settings \u2192 Workloads \u2192 Tolerations

                                      • Click +TOLERATION
                                      • Enter a key
                                      • Select the operator
                                        • Exists - If the key exists on the node, the effect will be applied.
                                         • Equals - If the key and the value set below match the value on the node, the effect will be applied
                                          • Enter a value matching the value on the node
                                      • Select the effect for the toleration
                                        • NoExecute - Pods that do not tolerate this taint are evicted immediately.
                                        • NoSchedule - No new pods will be scheduled on the tainted node unless they have a matching toleration. Pods currently running on the node will not be evicted.
                                        • PreferNoSchedule - The control plane will try to avoid placing a pod that does not tolerate the taint on the node, but it is not guaranteed.
                                        • Any - All effects above match.
                                   11. Optional: Set the volume needed for your workload. A volume allocates storage space to your workload that is persistent across restarts.

                                    • Click +VOLUME
                                    • Select the storage class
                                      • None - Proceed without defining a storage class.
                                      • Custom storage class - This option applies when selecting a storage class based on existing storage classes. To add new storage classes to the storage class list, and for additional information, see Kubernetes storage classes.
                                    • Select the access mode(s) (multiple modes can be selected)
                                      • Read-write by one node - The volume can be mounted as read-write by a single node.
                                      • Read-only by many nodes - The volume can be mounted as read-only by many nodes.
                                      • Read-write by many nodes - The volume can be mounted as read-write by many nodes.
                                    • Set the claim size and its units
                                    • Select the volume mode
                                      • File system (default) - This allows the volume to be mounted as a file system, enabling the usage of directories and files.
                                      • Block - This exposes the volume as a block storage, which can be formatted or used directly by applications without a file system.
                                    • Set the Container path with the volume target location
                                    • Set the volume persistency
                                      • Persistent - The volume and its data will be deleted only when the workload is deleted.
                                      • Ephemeral - The volume and its data will be deleted every time the workload\u2019s status changes to \u201cStopped.\u201d
                                  12. Optional: Select data sources for your training workload

                                    Select a data source or click +NEW DATA SOURCE to add a new data source to the gallery. If there are issues with the connectivity to the cluster, or issues while creating the data source, the data source won't be available for selection. For a step-by-step guide on adding data sources to the gallery, see data sources. Once created, the new data source will be automatically selected.

                                    • Optional: Modify the data target location for the selected data source(s).
                                  13. Optional - General settings:

                                    • Set the grace period for workload preemption. This is a buffer that allows a preempted workload to reach a safe checkpoint before it is forcibly preempted. Enter a timeframe between 0 sec and 5 min.
                                    • Set the number of runs the workload must finish to be considered complete. Multiple runs enhance the reliability and validity of the training results.
                                    • If the number of runs is above 1, enter a value under Parallelism to specify how many runs may be scheduled in parallel. The value must be less than or equal to the number of runs.
                                    • Set the backoff limit before workload failure. The backoff limit is the maximum number of retry attempts for failed workloads. After reaching the limit, the workload status will change to \"Failed.\" Enter a value between 1 and 100.
                                     • Set the timeframe for auto-deletion after workload completion or failure. This is the time after which a completed or failed workload is deleted; if this field is set to 0 seconds, the workload is deleted immediately after it completes or fails.
                                     • Set annotation(s). Kubernetes annotations are key-value pairs attached to the workload. They are used for storing additional descriptive metadata to enable documentation, monitoring and automation.
                                      • Click +ANNOTATION
                                      • Enter a name
                                      • Enter a value
                                     • Set label(s). Kubernetes labels are key-value pairs attached to the workload. They are used for categorizing to enable querying.
                                      • Enter a name
                                      • Enter a value
                                  14. Click CREATE TRAINING
                                  "},{"location":"Researcher/workloads/training/standard-training/trainings-v2/#workload-policies","title":"Workload Policies","text":"

                                  When creating a new workload, fields and assets may have limitations or defaults. These rules and defaults are derived from a policy your administrator set.

                                  Policies allow you to control, standardize, and simplify the workload submission process. For additional information, see Workload Policies and Rules.

                                  The effects of the policy are reflected in the training creation form:

                                  • Defaults derived from the policy will be displayed automatically for specific fields.
• Actions may be disabled, or values may be restricted to a certain range.
                                  • Rules and defaults for entire sections (such as environments, compute resources, or data sources) may prevent selection and will appear on the entire library card with an option for additional information via an external modal.
                                  "},{"location":"Researcher/workloads/training/standard-training/trainings-v2/#managing-and-monitoring","title":"Managing and monitoring","text":"

                                  After the training workload is created, it is added to the Workloads table, where it can be managed and monitored.

                                  "},{"location":"Researcher/workloads/training/standard-training/trainings-v2/#using-cli","title":"Using CLI","text":"

                                  To view the available actions, see the training workload CLI v2 reference or the CLI v1 reference.
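For example, a standard training can be submitted from the CLI v2 along the lines of the following sketch. The image and flags here are illustrative assumptions modeled on the workspace submit example later in this documentation; verify the exact flags against the CLI v2 reference.

runai project set "project-name"
runai training submit "workload-name" --image pytorch/pytorch --gpu-devices-request 1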

                                  "},{"location":"Researcher/workloads/training/standard-training/trainings-v2/#using-api","title":"Using API","text":"

                                  To view the available actions, see the Trainings workload API reference.
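As a rough sketch, submitting a training via the API mirrors the workspace example shown in the quickstart; the endpoint path and field names below are assumptions based on that example, so confirm them against the Trainings API reference.

curl -L 'https://<COMPANY-URL>/api/v1/workloads/trainings' \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer <TOKEN>' \
  -d '{
    "name": "workload-name",
    "projectId": "<PROJECT-ID>",
    "clusterId": "<CLUSTER-UUID>",
    "spec": {
      "image": "pytorch/pytorch",
      "compute": { "gpuDevicesRequest": 1 }
    }
  }'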

                                  "},{"location":"Researcher/workloads/workspaces/quickstart-jupyter/","title":"Running Jupyter Notebook Using Workspaces","text":"

                                  This guide provides a step-by-step walkthrough for running a Jupyter Notebook using workspaces.

                                  A workspace contains the setup and configuration needed for building your model, including the container, images, data sets, and resource requests, as well as the required tools for the research, all in one place. See Running workspaces for more information.

                                  "},{"location":"Researcher/workloads/workspaces/quickstart-jupyter/#prerequisites","title":"Prerequisites","text":"

                                  Before you start, make sure:

                                  • You have created a project or have one created for you.
                                  • The project has an assigned quota of at least 1 GPU.
                                  "},{"location":"Researcher/workloads/workspaces/quickstart-jupyter/#step-1-logging-in","title":"Step 1: Logging in","text":"User InterfaceCLI V1CLI V2API

                                  Browse to the provided Run:ai user interface and log in with your credentials.

                                  Log in using the following command. You will be prompted to enter your username and password:

runai login

Run the --help command below to obtain the login options, and log in according to your setup:

runai login --help

                                  To use the API, you will need to obtain a token. Please follow the API authentication article.

                                  "},{"location":"Researcher/workloads/workspaces/quickstart-jupyter/#step-2-submitting-a-workspace","title":"Step 2: Submitting a workspace","text":"User InterfaceCLI V1CLI V2API
1. Go to the Workload manager → Workloads
                                  2. Select +NEW WORKLOAD and then Workspace
                                  3. Select under which cluster to create the workload
                                  4. Select the project in which your workspace will run
5. Select a preconfigured template or select Start from scratch to quickly launch a new workspace
                                  6. Enter a name for the workspace (If the name already exists in the project, you will be requested to submit a different name)
                                  7. Click CONTINUE
8. Select the 'jupyter-lab' environment for your workspace (Image URL: jupyter/scipy-notebook)

• If the 'jupyter-lab' environment is not displayed in the gallery, follow the step-by-step guide:
                                    Create a jupyter-lab environment
                                    1. Click +NEW ENVIRONMENT
                                    2. Select under which cluster to create the environment
                                    3. Select a scope.
                                    4. Enter a name for the environment. The name must be unique.
                                    5. Enter the jupyter-lab Image URL - jupyter/scipy-notebook
                                    6. Tools - Set the connection for your tool

                                      • Click +TOOL
                                      • Select Jupyter tool from the list
                                    7. Set the runtime settings for the environment

                                      • Click +COMMAND
                                      • Enter command - start-notebook.sh
                                      • Enter arguments - --NotebookApp.base_url=/${RUNAI_PROJECT}/${RUNAI_JOB_NAME} --NotebookApp.token=''

                                    Note

                                    If host-based routing is enabled on the cluster, enter the argument --NotebookApp.token='' only.

8. Click CREATE ENVIRONMENT
                                    • The newly created jupyter-lab will be selected automatically
9. Select the 'one-gpu' compute resource for your workspace (GPU devices: 1)

• If the 'one-gpu' compute resource is not displayed in the gallery, follow the step-by-step guide:
                                    Create a one-gpu compute resource
                                    1. Click +NEW COMPUTE RESOURCE
                                    2. Select under which cluster to create the compute resource
                                    3. Select a scope
                                    4. Enter a name for the compute resource. The name must be unique.
                                    5. Set GPU devices per pod - 1
                                    6. Set GPU memory per device

• Select % (of device) - Fraction of a GPU device's memory
                                      • Set the memory Request - 100 (The workload will allocate 100% of the GPU memory)
                                    7. Optional: set the CPU compute per pod - 0.1 cores (default)

                                    8. Optional: set the CPU memory per pod - 100 MB (default)
                                    9. Click CREATE COMPUTE RESOURCE
                                    • The newly created one-gpu compute resource will be selected automatically
                                  10. Click CREATE WORKSPACE

                                    After the workspace is created, it is added to the workloads table.

Copy the following command to your terminal. Make sure to update the command below with the name of your project and workload:

runai config project "project-name"
runai submit "workload-name" --jupyter -g 1

This starts a workspace with a pre-configured Jupyter image and one GPU allocated.
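To check on the workload afterwards, you can list and describe it with CLI v1; a brief sketch using the same placeholder names:

runai list jobs -p "project-name"
runai describe job "workload-name" -p "project-name"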

Copy the following command to your terminal. Make sure to update the command below with the name of your project and workload:

runai project set "project-name"
runai workspace submit "workload-name" --image jupyter/scipy-notebook --gpu-devices-request 1 \
    --external-url container=8888 --command start-notebook.sh \
    -- --NotebookApp.base_url=/\${RUNAI_PROJECT}/\${RUNAI_JOB_NAME} --NotebookApp.token=''

                                  Copy the following command to your terminal. Make sure to update the below parameters according to the comments. For more details, see Workspaces API reference:

curl -L 'https://<COMPANY-URL>/api/v1/workloads/workspaces' \ # (1)
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer <TOKEN>' \ # (2)
-d '{
    "name": "workload-name",
    "projectId": "<PROJECT-ID>", \ # (3)
    "clusterId": "<CLUSTER-UUID>", \ # (4)
    "spec": {
        "command" : "start-notebook.sh",
        "args" : "--NotebookApp.base_url=/${RUNAI_PROJECT}/${RUNAI_JOB_NAME} --NotebookApp.token=''",
        "image": "jupyter/scipy-notebook",
        "compute": {
            "gpuDevicesRequest": 1
        },
        "exposedUrls" : [
            {
                "container" : 8888,
                "toolType": "jupyter-notebook", \ # (5)
                "toolName": "Jupyter" \ # (6)
            }
        ]
    }
}'
                                  1. <COMPANY-URL> is the link to the Run:ai user interface.
                                  2. <TOKEN> is the API access token obtained in Step 1.
3. <PROJECT-ID> is the ID of the project on which the workspace is running. You can get the project ID via the Get Projects API.
4. <CLUSTER-UUID> is the unique identifier of the Cluster. You can get the Cluster UUID by adding the "Cluster ID" column to the Clusters view.
                                  5. toolType will show the Jupyter icon when connecting to the Jupyter tool via the user interface.
                                  6. toolName text will show when connecting to the Jupyter tool via the user interface.

                                  Note

The above API snippet only works with Run:ai clusters version 2.18 and above. For older clusters, use the now-deprecated Cluster API.

                                  "},{"location":"Researcher/workloads/workspaces/quickstart-jupyter/#step-3-connecting-to-the-jupyter-notebook","title":"Step 3: Connecting to the Jupyter Notebook","text":"User InterfaceCLI V1CLI V1API
                                  1. Select the newly created workspace with the Jupyter application that you want to connect to
                                  2. Click CONNECT
                                  3. Select the Jupyter tool
                                  4. The selected tool is opened in a new tab on your browser

                                  To connect to the Jupyter Notebook, browse directly to https://<COMPANY-URL>/<PROJECT-NAME>/jup1.

                                  To connect to the Jupyter Notebook, browse directly to https://<COMPANY-URL>/<PROJECT-NAME>/jup1.

                                  To connect to the Jupyter Notebook, browse directly to https://<COMPANY-URL>/<PROJECT-NAME>/jup1.

                                  "},{"location":"Researcher/workloads/workspaces/quickstart-jupyter/#next-steps","title":"Next Steps","text":"

                                  Manage and monitor your newly created workspace using the workloads table.

                                  "},{"location":"Researcher/workloads/workspaces/workspace-v2/","title":"Running Workspaces","text":"

                                  This article explains how to create a workspace via the Run:ai UI.

                                  A workspace contains the setup and configuration needed for building your model, including the container, images, data sets, and resource requests, as well as the required tools for the research, all in a single place.

To learn more about the workspace workload type in Run:ai and determine whether it is the most suitable workload type for your goals, see Workload types.

                                  "},{"location":"Researcher/workloads/workspaces/workspace-v2/#creating-a-new-workspace","title":"Creating a new Workspace","text":"

                                  Before you start, make sure you have a project.

                                  To add a new workspace:

1. Go to the Workload manager → Workloads
2. Click +NEW WORKLOAD and select Workspace. Within the new workspace form:
                                  3. Select under which cluster to create the workload
                                  4. Select the project in which your workspace will run
                                  5. Select a preconfigured template or select Start from scratch to launch a new workspace quickly
                                  6. Enter a unique name for the workspace (if the name already exists in the project, you will be requested to submit a different name)
7. Click CONTINUE. In the next step:
                                  8. Select the environment for your workspace

                                    • Select an environment or click +NEW ENVIRONMENT to add a new environment to the gallery. For a step-by-step guide on adding environments to the gallery, see Environments. Once created, the new environment will be automatically selected.
                                    • Set the connection for your tool(s). The tools are configured as part of the environment.
                                      • External URL
                                        • Custom URL
                                          • Set the URL
                                        • Optional: Modify who can access the tool:
• All authenticated users (default) - Everyone within the organization's account
                                          • Specific group(s)
                                            • Click +GROUP
                                            • Enter group names as they appear in your identity provider. You must be a member of one of the groups listed to have access to the tool.
                                          • Specific user(s)
                                            • Click +USER
                                            • Enter a valid email address or username. If you remove yourself, you will lose access to the tool.
                                      • Node port
                                        • Custom port
                                          • Set the node port (enter a port between 30000 and 32767; if the node port is already in use, the workload will fail and display an error message)
                                    • Set the User ID (UID), Group ID (GID) and the supplementary groups that can run commands in the container
                                      • Enter UID
                                      • Enter GID
                                      • Add Supplementary groups (multiple groups can be added, separated by commas).
• Optional: Set the command and arguments for the container running the workload. If no command is added, the container will use the image's default command (entry-point).
                                      • Modify the existing command or click +COMMAND & ARGUMENTS to add a new command.
                                      • Set multiple arguments separated by spaces, using the following format (e.g.: --arg1=val1).
                                    • Set the environment variable(s)
                                      • Modify the existing environment variable(s). The existing environment variables may include instructions to guide you with entering the correct values.
                                      • (Optional) Add new variables
                                      • Click +ENVIRONMENT VARIABLE
                                        • Enter a name
                                        • Select the source for the environment variable

                                          • Custom
                                            • Enter a value according to the provided instructions
• Credentials - Select an existing credential as the environment variable
  • Select a credential name. To add new credentials to the credentials list, and for additional information, see Credentials.
                                            • Select a secret key
                                  9. Select the compute resource for your workspace

                                    • Select a compute resource or click +NEW COMPUTE RESOURCE to add a new compute resource to the gallery. For a step-by-step guide on adding compute resources to the gallery, see compute resources. Once created, the new compute resource will be automatically selected.
                                    • Optional: Set the order of priority for the node pools on which the scheduler tries to run the workload. When a workload is created, the scheduler will try to run it on the first node pool on the list. If the node pool doesn't have free resources, the scheduler will move on to the next one until it finds one that is available.
                                      • Drag and drop them to change the order, remove unwanted ones, or reset to the default order defined in the project.
                                      • Click +NODE POOL to add a new node pool from the list of node pools that were defined on the cluster. To configure a new node pool and for additional information, see node pools.
• Select a node affinity to schedule the workload on a specific node type. If the administrator added a 'node type (affinity)' scheduling rule to the project/department, then this field is mandatory. Otherwise, entering a node type (affinity) is optional. Nodes must be tagged with a label that matches the node type key and value.
                                    • Optional: Set toleration(s) to allow the workload to be scheduled on a node with a matching taint

                                      Note

Tolerations are disabled by default. If you cannot see Tolerations in the menu, they must be enabled by your Administrator under General settings → Workloads → Tolerations.

                                      • Click +TOLERATION
                                      • Enter a key
                                      • Select the operator
                                        • Exists - If the key exists on the node, the effect will be applied.
• Equals - If the key and the value set below match the key and value on the node, the effect will be applied
                                          • Enter a value matching the value on the node
                                      • Select the effect for the toleration
                                        • NoExecute - Pods that do not tolerate this taint are evicted immediately.
                                        • NoSchedule - No new pods will be scheduled on the tainted node unless they have a matching toleration. Pods currently running on the node will not be evicted.
                                        • PreferNoSchedule - The control plane will try to avoid placing a pod that does not tolerate the taint on the node, but it is not guaranteed.
                                        • Any - All effects above match.
10. Optional: Set the volume needed for your workload. A volume allocates storage space to your workload that is persistent across restarts.

                                    • Click +VOLUME
                                    • Select the storage class
                                      • None - Proceed without defining a storage class.
                                      • Custom storage class - This option applies when selecting a storage class based on existing storage classes. To add new storage classes to the storage class list, and for additional information, see Kubernetes storage classes
                                    • Select the access mode(s) (multiple modes can be selected)
                                      • Read-write by one node - The volume can be mounted as read-write by a single node.
                                      • Read-only by many nodes - The volume can be mounted as read-only by many nodes.
                                      • Read-write by many nodes - The volume can be mounted as read-write by many nodes.
                                    • Set the claim size and its units
                                    • Select the volume mode
                                      • File system (default) - This allows the volume to be mounted as a file system, enabling the usage of directories and files.
                                      • Block - This exposes the volume as a block storage, which can be formatted or used directly by applications without a file system.
                                    • Set the Container path with the volume target location
                                    • Set the volume persistency
                                      • Persistent - The volume and its data will be deleted only when the workload is deleted.
• Ephemeral - The volume and its data will be deleted every time the workload's status changes to "Stopped."
11. Optional: Select data sources for your workspace. Select a data source or click +NEW DATA SOURCE to add a new data source to the gallery. If there are issues with the connectivity to the cluster, or issues while creating the data source, the data source won't be available for selection. For a step-by-step guide on adding data sources to the gallery, see data sources. Once created, the new data source will be automatically selected.
                                    • Optional: Modify the data target location for the selected data source(s).
                                  12. Optional - General settings:
                                    • Allow the workload to exceed the project quota. Workloads running over quota may be preempted and stopped at any time.
• Set the backoff limit before workload failure. The backoff limit is the maximum number of retry attempts for failed workloads. After reaching the limit, the workload status will change to "Failed." Enter a value between 1 and 100.
• Set the timeframe for auto-deletion after workload completion or failure. The time after which a completed or failed workload is deleted; if this field is set to 0 seconds, the workload will be deleted immediately after it completes or fails.
• Set annotation(s). Kubernetes annotations are key-value pairs attached to the workload. They are used for storing additional descriptive metadata to enable documentation, monitoring and automation.
                                      • Click +ANNOTATION
                                      • Enter a name
                                      • Enter a value
• Set label(s). Kubernetes labels are key-value pairs attached to the workload. They are used to categorize workloads and enable querying.
                                      • Enter a name
                                      • Enter a value
                                  13. Click CREATE WORKSPACE
                                  "},{"location":"Researcher/workloads/workspaces/workspace-v2/#workload-policies","title":"Workload Policies","text":"

                                  When creating a new workload, fields and assets may have limitations or defaults. These rules and defaults are derived from a policy your administrator set.

                                  Policies allow you to control, standardize, and simplify the workload submission process. For additional information, see Workload Policies and Rules.

                                  The effects of the policy are reflected in the workspace creation form:

                                  • Defaults derived from the policy will be displayed automatically for specific fields.
• Actions may be disabled, or values may be restricted to a certain range.
                                  • Rules and defaults for entire sections (such as environments, compute resources, or data sources) may prevent selection and will appear on the entire library card with an option for additional information via an external modal.
                                  "},{"location":"Researcher/workloads/workspaces/workspace-v2/#managing-and-monitoring","title":"Managing and monitoring","text":"

                                  After the workspace is created, it is added to the Workloads table, where it can be managed and monitored.

                                  "},{"location":"Researcher/workloads/workspaces/workspace-v2/#using-cli","title":"Using CLI","text":"

                                  To view the available actions on workspaces, see the Workspaces CLI v2 reference or the CLI v1 reference.
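As an illustrative sketch only, common day-to-day actions follow the pattern below; the exact subcommand names are assumptions, so verify them against the CLI v2 reference:

runai workspace list
runai workspace describe "workload-name"
runai workspace logs "workload-name"
runai workspace delete "workload-name"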

                                  "},{"location":"Researcher/workloads/workspaces/workspace-v2/#using-api","title":"Using API","text":"

                                  To view the available actions on workspaces, see the Workspaces API reference.

                                  "},{"location":"admin/overview-administrator/","title":"Overview: Infrastructure Administrator","text":"

                                  The Infrastructure Administrator is an IT person, responsible for the installation, setup and IT maintenance of the Run:ai product.

                                  As part of the Infrastructure Administrator documentation you will find:

                                  • Install Run:ai
                                    • Understand the Run:ai installation
                                    • Set up a Run:ai Cluster.
                                    • Set up Researchers to work with Run:ai.
                                  • IT Configuration of the Run:ai system
                                  • Connect Run:ai to an identity provider.
                                  • Maintenance & monitoring of the Run:ai system
                                  • Troubleshooting.
                                  "},{"location":"admin/authentication/accessrules/","title":"Access Rules","text":"

                                  This article explains the procedure to manage Access rules.

                                  Access rules provide users, groups, or applications privileges to system entities.

                                  An access rule is the assignment of a role to a subject in a scope: <Subject> is a <Role> in a <Scope>.

                                  For example, user user@domain.com is a department admin in department A.

                                  "},{"location":"admin/authentication/accessrules/#access-rules-table","title":"Access rules table","text":"

                                  The Access rules table can be found under Access in the Run:ai platform.

                                  The Access rules table provides a list of all the access rules defined in the platform and allows you to manage them.

                                  Note

                                  Flexible management

                                  It is also possible to manage access rules directly for a specific user, application, project, or department.

                                  The Access rules table consists of the following columns:

• Type - The type of subject assigned to the access rule (user, SSO group, or application)
• Subject - The user, SSO group, or application assigned with the role
• Role - The role assigned to the subject
• Scope - The scope to which the subject has access. Click the name of the scope to see the scope and its subordinates
• Authorized by - The user who granted the access rule
• Creation time - The timestamp for when the rule was created
• Last updated - The last time the access rule was updated
"},{"location":"admin/authentication/accessrules/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
• Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
                                  "},{"location":"admin/authentication/accessrules/#adding-new-access-rules","title":"Adding new access rules","text":"

                                  To add a new access rule:

                                  1. Click +NEW ACCESS RULE
                                  2. Select a subject User, SSO Group, or Application
                                  3. Select or enter the subject identifier:
                                    • User Email for a local user created in Run:ai or for SSO user as recognized by the IDP
                                    • Group name as recognized by the IDP
                                    • Application name as created in Run:ai
                                  4. Select a role
                                  5. Select a scope
                                  6. Click SAVE RULE

                                  Note

                                  An access rule consists of a single subject with a single role in a single scope. To assign multiple roles or multiple scopes to the same subject, multiple access rules must be added.

                                  "},{"location":"admin/authentication/accessrules/#editing-an-access-rule","title":"Editing an access rule","text":"

                                  Access rules cannot be edited. To change an access rule, you must delete the rule, and then create a new rule to replace it.

                                  "},{"location":"admin/authentication/accessrules/#deleting-an-access-rule","title":"Deleting an access rule","text":"
                                  1. Select the access rule you want to delete
                                  2. Click DELETE
                                  3. On the dialog, click DELETE to confirm the deletion
                                  "},{"location":"admin/authentication/accessrules/#using-api","title":"Using API","text":"

                                  Go to the Access rules API reference to view the available actions
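For illustration only, creating an access rule programmatically might look like the sketch below; the endpoint path and field names are assumptions and should be verified against the Access rules API reference:

curl -L 'https://<COMPANY-URL>/api/v1/authorization/access-rules' \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer <TOKEN>' \
  -d '{
    "subjectId": "user@domain.com",
    "subjectType": "user",
    "roleId": 1,
    "scopeId": "<PROJECT-ID>",
    "scopeType": "project"
  }'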

                                  "},{"location":"admin/authentication/applications/","title":"Applications","text":"

                                  This article explains the procedure to manage your organization's applications.

                                  Applications are used for API integrations with Run:ai. An application contains a client ID and a client secret. With the client credentials, you can obtain a token as detailed in API authentication and use it within subsequent API calls.
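As a rough sketch of that flow, a token request using the application's client credentials could look like the following; the endpoint and body field names here are assumptions, so follow the API authentication article for the exact request:

curl -L 'https://<COMPANY-URL>/api/v1/token' \
  -H 'Content-Type: application/json' \
  -d '{
    "grantType": "app_token",
    "AppId": "<CLIENT-ID>",
    "AppSecret": "<CLIENT-SECRET>"
  }'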

                                  Applications are assigned with Access Rules to manage permissions. For example, application ci-pipeline-prod is assigned with a Researcher role in Cluster: A.

                                  "},{"location":"admin/authentication/applications/#applications-table","title":"Applications table","text":"

                                  The Applications table can be found under Access in the Run:ai platform.

                                  The Applications table provides a list of all the applications defined in the platform, and allows you to manage them.

                                  The Applications table consists of the following columns:

• Application - The name of the application
• Client ID - The client ID of the application
• Access rule(s) - The access rules assigned to the application
• Last login - The timestamp for the last time the user signed in
• Created by - The user who created the application
• Creation time - The timestamp for when the application was created
• Last updated - The last time the application was updated
"},{"location":"admin/authentication/applications/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
• Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
                                  "},{"location":"admin/authentication/applications/#creating-an-application","title":"Creating an application","text":"

                                  To create an application:

                                  1. Click +NEW APPLICATION
2. Enter the application's name
                                  3. Click CREATE
                                  4. Copy the Client ID and Client secret and store them securely
                                  5. Click DONE

                                  Note

                                  The client secret is visible only at the time of creation. It cannot be recovered but can be regenerated.

                                  "},{"location":"admin/authentication/applications/#adding-an-access-rule-to-an-application","title":"Adding an access rule to an application","text":"

                                  To create an access rule:

                                  1. Select the application you want to add an access rule for
                                  2. Click ACCESS RULES
                                  3. Click +ACCESS RULE
                                  4. Select a role
                                  5. Select a scope
                                  6. Click SAVE RULE
                                  7. Click CLOSE
                                  "},{"location":"admin/authentication/applications/#deleting-an-access-rule-from-an-application","title":"Deleting an access rule from an application","text":"

                                  To delete an access rule:

                                  1. Select the application you want to remove an access rule from
                                  2. Click ACCESS RULES
3. Find the access rule you would like to delete
                                  4. Click on the trash icon
                                  5. Click CLOSE
                                  "},{"location":"admin/authentication/applications/#regenerating-client-secret","title":"Regenerating client secret","text":"

                                  To regenerate a client secret:

1. Locate the application whose client secret you want to regenerate
                                  2. Click REGENERATE CLIENT SECRET
                                  3. Click REGENERATE
                                  4. Copy the New client secret and store it securely
                                  5. Click DONE

                                  Warning

                                  Regenerating a client secret revokes the previous one.

                                  "},{"location":"admin/authentication/applications/#deleting-an-application","title":"Deleting an application","text":"
                                  1. Select the application you want to delete
                                  2. Click DELETE
                                  3. On the dialog, click DELETE to confirm
                                  "},{"location":"admin/authentication/applications/#using-api","title":"Using API","text":"

Go to the Applications and Access rules API references to view the available actions

                                  "},{"location":"admin/authentication/authentication-overview/","title":"Authentication & Authorization","text":"

                                  Run:ai Authentication & Authorization enables a streamlined experience for the user with precise controls covering the data each user can see and the actions each user can perform in the Run:ai platform.

                                  Authentication verifies user identity during login, and Authorization assigns the user with specific permissions according to the assigned access rules.

                                  Authenticated access is required to use all aspects of the Run:ai interfaces, including the Run:ai platform, the Run:ai Command Line Interface (CLI) and APIs.

                                  "},{"location":"admin/authentication/authentication-overview/#authentication","title":"Authentication","text":"

                                  There are multiple methods to authenticate and access Run:ai.

                                  "},{"location":"admin/authentication/authentication-overview/#single-sign-on-sso","title":"Single Sign-On (SSO)","text":"

                                  Single Sign-On (SSO) is the preferred authentication method by large organizations, as it avoids the need to manage duplicate sets of user identities.

                                  Run:ai offers SSO integration, enabling users to utilize existing organizational credentials to access Run:ai without requiring dedicated credentials.

                                  Run:ai supports three methods to set up SSO:

                                  • SAML
                                  • OpenID Connect (OIDC)
                                  • OpenShift

                                  When using SSO, it is highly recommended to manage at least one local user, as a breakglass account (an emergency account), in case access to SSO is not possible.

                                  "},{"location":"admin/authentication/authentication-overview/#username-and-password","title":"Username and password","text":"

                                  Username and password access can be used when SSO integration is not possible.

                                  "},{"location":"admin/authentication/authentication-overview/#secret-key-for-application-programmatic-access","title":"Secret key (for Application programmatic access)","text":"

                                  A Secret is the authentication method for Applications. Applications use the Run:ai APIs to perform automated tasks including scripts and pipelines based on their assigned access rules.

                                  "},{"location":"admin/authentication/authentication-overview/#authorization","title":"Authorization","text":"

The Run:ai platform uses Role-Based Access Control (RBAC) to manage authorization.

                                  Once a user or an application is authenticated, they can perform actions according to their assigned access rules.

                                  "},{"location":"admin/authentication/authentication-overview/#role-based-access-control-rbac-in-runai","title":"Role Based Access Control (RBAC) in Run:ai","text":"

                                  While Kubernetes RBAC is limited to a single cluster, Run:ai expands the scope of Kubernetes RBAC, making it easy for administrators to manage access rules across multiple clusters.

                                  RBAC at Run:ai is configured using access rules.

                                  An access rule is the assignment of a role to a subject in a scope: <Subject> is a <Role> in a <Scope>.

                                  • Subject
                                  • A user, a group, or an application assigned with the role
                                  • Role
                                  • A set of permissions that can be assigned to subjects
                                  • A permission is a set of actions (view, edit, create and delete) over a Run:ai entity (e.g. projects, workloads, users)
                                    • For example, a role might allow a user to create and read Projects, but not update or delete them
                                    • Roles at Run:ai are system defined and cannot be created, edited or deleted
                                  • Scope
                                  • A scope is part of an organization in which a set of permissions (roles) is effective. Scopes include Projects, Departments, Clusters, Account (all clusters).

                                  Below is an example of an access rule: username@company.com is a Department admin in Department: A

                                  "},{"location":"admin/authentication/non-root-containers/","title":"User Identity in Container","text":"

                                  The identity of the user in the container determines its access to resources. For example, network file storage solutions typically use this identity to determine the container's access to network volumes. This document explains multiple ways for propagating the user identity into the container.

                                  "},{"location":"admin/authentication/non-root-containers/#the-default-root-access","title":"The Default: Root Access","text":"

In Docker, as well as in Kubernetes, the default for running containers is to run as root. The implication of running as root is that processes running within the container have enough permissions to change anything in the container and, if propagated to network resources, can have permissions outside the container as well.

This gives a lot of power to the Researcher but does not sit well with modern enterprise security standards.

                                  By default, if you run:

runai submit -i ubuntu --attach --interactive -- bash
                                  then run id, you will see the root user.
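Typical id output in this case:

uid=0(root) gid=0(root) groups=0(root)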

                                  "},{"location":"admin/authentication/non-root-containers/#use-runai-flags-to-limit-root-access","title":"Use Run:ai flags to limit root access","text":"

There are two runai submit flags that control user identity at the Researcher level:

                                  • The flag --run-as-user starts the container with a specific user. The user is the current Linux user (see below for other behaviors if used in conjunction with Single sign-on).
• The flag --prevent-privilege-escalation prevents the container from elevating its own privileges into root (e.g., running sudo or changing system files).

                                  Equivalent flags exist in the Researcher User Interface.

                                  "},{"location":"admin/authentication/non-root-containers/#run-as-current-user","title":"Run as Current User","text":"

                                  From a Linux/Mac box, run:

runai submit -i ubuntu --attach --interactive --run-as-user -- bash

then run id, and you will see the user and groups of the machine from which you launched the Job.

                                  "},{"location":"admin/authentication/non-root-containers/#prevent-escalation","title":"Prevent Escalation","text":"

                                  From a Linux/Mac box, run:

runai submit -i ubuntu --attach --interactive --run-as-user \
  --prevent-privilege-escalation -- bash

                                  then verify that you cannot run su to become root within the container.

                                  "},{"location":"admin/authentication/non-root-containers/#setting-a-cluster-wide-default","title":"Setting a Cluster-Wide Default","text":"

The two flags are optional and are not enforced by the system. It is, however, possible to enforce them using Policies. Policies allow an Administrator to force compliance on both the User Interface and Command-line interface.

                                  "},{"location":"admin/authentication/non-root-containers/#passing-user-identity","title":"Passing user identity","text":""},{"location":"admin/authentication/non-root-containers/#passing-user-identity-from-identity-provider","title":"Passing user identity from Identity Provider","text":"

                                  A best practice is to store the user identifier (UID) and the group identifier (GID) in the organization's directory. Run:ai allows you to pass these values to the container and use them as the container identity.

                                  To perform this, you must:

                                  • Set up single sign-on. Perform the steps for UID/GID integration.
                                  • Run: runai login and enter your credentials
                                  • Use the flag --run-as-user

                                  Running id should show the identifier from the directory.

                                  "},{"location":"admin/authentication/non-root-containers/#passing-user-identity-explicitly-via-the-researcher-ui","title":"Passing user identity explicitly via the Researcher UI","text":"

                                  Via the Researcher User Interface, it is possible to explicitly provide the user id and group id:

                                  "},{"location":"admin/authentication/non-root-containers/#using-openshift-or-gatekeeper-to-provide-cluster-level-controls","title":"Using OpenShift or Gatekeeper to provide Cluster Level Controls","text":"

                                  Run:ai supports OpenShift as a Kubernetes platform. In OpenShift the system will provide a random UID to containers. The flags --run-as-user and --prevent-privilege-escalation are disabled on OpenShift. It is possible to achieve a similar effect on Kubernetes systems that are not OpenShift. A leading tool is Gatekeeper. Gatekeeper similarly enforces non-root on containers at the system level.

                                  "},{"location":"admin/authentication/non-root-containers/#creating-a-temporary-home-directory","title":"Creating a Temporary Home Directory","text":"

                                  When containers run as a specific user, the user needs to have a pre-created home directory within the image. Otherwise, when running a shell, you will not have a home directory:

runai submit -i ubuntu --attach --interactive --run-as-user -- bash
The job 'job-0' has been submitted successfully
You can run `runai describe job job-0 -p team-a` to check the job status
Waiting for pod to start running...
INFO[0007] Job started
Connecting to pod job-0-0-0
If you don't see a command prompt, try pressing enter.
I have no name!@job-0-0-0:/$

                                  Adding home directories to an image per user is not a viable solution. To overcome this, Run:ai provides an additional flag --create-home-dir. Adding this flag creates a temporary home directory for the user within the container.
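For example, a sketch combining the flags discussed above:

runai submit -i ubuntu --attach --interactive --run-as-user --create-home-dir -- bash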

                                  Notes

                                  • Data saved in this directory will not be saved when the container exits.
                                  • This flag is set by default to true when the --run-as-user flag is used, and false if not.
                                  "},{"location":"admin/authentication/researcher-authentication/","title":"Setup Researcher Access Control","text":""},{"location":"admin/authentication/researcher-authentication/#introduction","title":"Introduction","text":"

                                  The following instructions explain how to complete the configuration of access control for Researchers. This requires several steps:

                                  • (Mandatory) Modify the Kubernetes entry point (called the Kubernetes API server) to validate the credentials of incoming requests against the Run:ai Authentication authority.
                                  • (Command-line Interface usage only) Modify the Kubernetes profile to prompt the Researcher for credentials when running runai login (or oc login for OpenShift).

                                  Important

                                  • As of Run:ai version 2.16, you only need to perform these steps when accessing Run:ai from the command-line interface or sending YAMLs directly to Kubernetes
• As of Run:ai version 2.18, you only need to perform these steps if you are using the older command-line interface or sending YAMLs directly to Kubernetes.
                                  "},{"location":"admin/authentication/researcher-authentication/#kubernetes-configuration","title":"Kubernetes Configuration","text":"

You must direct the Kubernetes API server to authenticate via Run:ai. This requires adding flags to the Kubernetes API Server. The flags are shown in the Run:ai user interface under Settings | General | Researcher Authentication | Server configuration.

                                  Modifying the API Server configuration differs between Kubernetes distributions:

                                  Vanilla KubernetesOpenShiftRKERKE2GKEEKSBCMAKSOther
                                  • Locate the Kubernetes API Server configuration file. The file's location may differ between different Kubernetes distributions. The location for vanilla Kubernetes is /etc/kubernetes/manifests/kube-apiserver.yaml
                                  • Edit the document, under the command tag, add the server configuration text described above.
• Verify that the kube-apiserver-<master-node-name> pod in the kube-system namespace has been restarted and that the changes have been incorporated. Run the command below and verify that the oidc flags you added are present:
kubectl get pods -n kube-system kube-apiserver-<master-node-name> -o yaml
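For reference, the added entries under the command section of kube-apiserver.yaml typically take the following shape; the values shown are placeholders and must be copied from the Server configuration section in the Run:ai user interface:

- --oidc-client-id=runai
- --oidc-issuer-url=https://<COMPANY-URL>/auth/realms/<REALM>
- --oidc-username-prefix=-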

                                  No configuration is needed. Instead, Run:ai assumes that an Identity Provider has been defined at the OpenShift level and that the Run:ai Cluster installation has set the OpenshiftIdp flag to true. For more information see the Run:ai OpenShift control-plane setup.

                                  Edit Rancher cluster.yml (with Rancher UI, follow this). Add the following:

                                  cluster.yml
kube-api:
    always_pull_images: false
    extra_args:
        oidc-client-id: runai  # (1)
        ...
                                  1. These are example parameters. Copy the actual parameters from Settings | General | Researcher Authentication as described above.

You can verify that the flags have been incorporated into the RKE cluster by following the instructions here and running docker inspect <kube-api-server-container-id>, where <kube-api-server-container-id> is the container ID of the api-server container obtained in the Rancher document.

                                  If working via the RKE2 Quickstart, edit /etc/rancher/rke2/config.yaml. Add the parameters provided in the server configuration section as described above in the following fashion:

                                  /etc/rancher/rke2/config.yaml
kube-apiserver-arg:
- "oidc-client-id=runai" # (1)
...
                                  1. These are example parameters. Copy the actual parameters from Settings | General | Researcher Authentication as described above.

If working via the Rancher UI, you need to add the flags as part of the cluster provisioning.

Under Cluster Management | Create, turn on RKE2 and select a platform. Then, under Cluster Configuration | Advanced | Additional API Server Args, add the Run:ai flags as <key>=<value> (e.g. oidc-username-prefix=-).

                                  Install Anthos identity service by running:

gcloud container clusters update <gke-cluster-name> \
    --enable-identity-service --project=<gcp-project-name> --zone=<gcp-zone-name>

                                  Install the yq utility and run:

                                  For username-password authentication, run:

kubectl get clientconfig default -n kube-public -o yaml > login-config.yaml
yq -i e ".spec +={\"authentication\":[{\"name\":\"oidc\",\"oidc\":{\"clientID\":\"runai\",\"issuerURI\":\"$OIDC_ISSUER_URL\",\"kubectlRedirectURI\":\"http://localhost:8000/callback\",\"userClaim\":\"sub\",\"userPrefix\":\"-\"}}]}" login-config.yaml
kubectl apply -f login-config.yaml

                                  For single-sign-on, run:

kubectl get clientconfig default -n kube-public -o yaml > login-config.yaml
yq -i e ".spec +={\"authentication\":[{\"name\":\"oidc\",\"oidc\":{\"clientID\":\"runai\",\"issuerURI\":\"$OIDC_ISSUER_URL\",\"groupsClaim\":\"groups\",\"kubectlRedirectURI\":\"http://localhost:8000/callback\",\"userClaim\":\"email\",\"userPrefix\":\"-\"}}]}" login-config.yaml
kubectl apply -f login-config.yaml

                                  Where the OIDC flags are provided in the Run:ai server configuration section as described above.

Then update runaiconfig with the Anthos endpoint, gke-oidc-envoy. Get the external IP of the service in the anthos-identity-service namespace:

kubectl get svc -n anthos-identity-service
NAME               TYPE           CLUSTER-IP    EXTERNAL-IP     PORT(S)              AGE
gke-oidc-envoy     LoadBalancer   10.37.3.111   39.201.319.10   443:31545/TCP        12h

                                  Add the IP to runaiconfig

kubectl -n runai patch runaiconfig runai -p '{"spec": {"researcher-service": {"args": {"gkeOidcEnvoyHost": "35.236.229.19"}}}}' --type="merge"

                                  To create a kubeconfig profile for Researchers run:

kubectl oidc login --cluster=CLUSTER_NAME --login-config=login-config.yaml \
    --kubeconfig=developer-kubeconfig

(this requires installing the kubectl oidc plug-in as described in the Anthos document above: gcloud components install kubectl-oidc)

Then modify the developer-kubeconfig file as described in the Command-line Interface Access section below.

                                  • In the AWS Console, under EKS, find your cluster.
                                  • Go to Configuration and then to Authentication.
                                  • Associate a new identity provider. Use the parameters provided in the server configuration section as described above. The process can take up to 30 minutes.

Please follow the "Vanilla Kubernetes" instructions

                                  Please contact Run:ai customer support.

                                  See specific instructions in the documentation of the Kubernetes distribution.

                                  "},{"location":"admin/authentication/researcher-authentication/#command-line-interface-access","title":"Command-line Interface Access","text":"

                                  To control access to Run:ai (and Kubernetes) resources, you must modify the Kubernetes configuration file. The file is distributed to users as part of the Command-line interface installation.

                                  When making changes to the file, keep a copy of the original file to be used for cluster administration. After making the modifications, distribute the modified file to Researchers.

                                  • Under the ~/.kube directory edit the config file, remove the administrative user, and replace it with text from Settings | General | Researcher Authentication | Client Configuration.
                                  • Under contexts | context | user change the user to runai-authenticated-user.

                                  Important

• After adding the new user, delete the following fields from the kubeconfig file to prevent unauthorized access: client-certificate-data, client-key-data, and any remaining references to the admin user. A sketch of the resulting file is shown below.
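For reference, a minimal sketch of what the modified kubeconfig might look like, assuming the user entry pasted from the UI is named runai-authenticated-user; the cluster name, context name, and server URL are placeholders:

apiVersion: v1
kind: Config
clusters:
- name: my-cluster                      # placeholder
  cluster:
    server: https://<api-server-url>
users:
- name: runai-authenticated-user
  user:
    # contents pasted from Settings | General | Researcher Authentication | Client Configuration
contexts:
- name: my-context                      # placeholder
  context:
    cluster: my-cluster
    user: runai-authenticated-user
current-context: my-context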
                                  "},{"location":"admin/authentication/researcher-authentication/#test-via-command-line-interface","title":"Test via Command-line interface","text":"
                                  • Run: runai login (in OpenShift environments use oc login rather than runai login).
                                  • You will be prompted for a username and password. In a single sign-on flow, you will be asked to copy a link to a browser, log in and return a code.
• Once login is successful, submit a Job (see the example after this list).
• If the Job was submitted with a Project to which you have no access, the submission will be denied.
• If the Job was submitted with a Project to which you have access, the submission will be accepted.
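For reference, a minimal command-line test might look like the following. The project name and image are placeholders, and flags vary slightly between CLI versions; see the CLI reference for the exact syntax:

runai login
runai submit test-job -i ubuntu -p team-a   # placeholder job name, image, and project
runai list jobs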

                                  You can also submit a Job from the Run:ai User interface and verify that the new job shows on the job list with your user name.

                                  "},{"location":"admin/authentication/researcher-authentication/#test-via-user-interface","title":"Test via User Interface","text":"
                                  • Open the Run:ai user interface, go to Workloads.
                                  • On the top-right, select Submit Workload.
                                  "},{"location":"admin/authentication/roles/","title":"Roles","text":"

                                  This article explains the available roles in the Run:ai platform.

                                  A role is a set of permissions that can be assigned to a subject in a scope.

                                  A permission is a set of actions (View, Edit, Create and Delete) over a Run:ai entity (e.g. projects, workloads, users).

                                  "},{"location":"admin/authentication/roles/#roles-table","title":"Roles table","text":"

                                  The Roles table can be found under Access in the Run:ai platform.

The Roles table displays a list of predefined roles available to users in the Run:ai platform. It is not possible to create additional roles, or to edit or delete existing roles.

                                  The Roles table consists of the following columns:

Role - The name of the role
Created by - The name of the role creator
Creation time - The timestamp when the role was created
"},{"location":"admin/authentication/roles/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
• Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
                                  "},{"location":"admin/authentication/roles/#reviewing-a-role","title":"Reviewing a role","text":"
1. To review a role, click the role name in the table
2. In the role form, review the following:
  • Role name - The name of the role
  • Entity - A system-managed object that can be viewed, edited, created or deleted by a user based on their assigned role and scope
  • Actions - The actions that the role assignee is authorized to perform for each entity
    • View - If checked, an assigned user with this role can view instances of this type of entity within their defined scope
    • Edit - If checked, an assigned user with this role can change the settings of an instance of this type of entity within their defined scope
    • Create - If checked, an assigned user with this role can create new instances of this type of entity within their defined scope
    • Delete - If checked, an assigned user with this role can delete instances of this type of entity within their defined scope
                                  "},{"location":"admin/authentication/roles/#roles-in-runai","title":"Roles in Run:ai","text":"

Run:ai supports the following roles and their permissions. Under each role is a detailed list of the actions that the role assignee is authorized to perform for each entity.

                                  Compute resource administrator

                                  Data source administrator

                                  Data volume administrator

                                  Department administrator

                                  Department viewer

                                  Editor

                                  Environment administrator

                                  L1 researcher

                                  L2 researcher

                                  ML engineer

                                  Research manager

                                  System administrator

                                  Template administrator

                                  Viewer

                                  Notes

                                  Keep the following in mind when upgrading from versions 2.13 or earlier:

                                  • Admin becomes System Admin with full access to all managed objects and scopes
                                  • Research Manager is not automatically assigned to all projects, but to projects set by the relevant Admin when assigning this role to a user, group or app
                                  • To preserve backwards compatibility, users with the role of Research Manager are assigned to all current projects, but not to new projects
• To allow the Department Admin to assign a Researcher role to a user, group or app, the Department Admin must have VECD (View, Edit, Create and Delete) permissions for jobs and workspaces. This creates a broader span of managed objects
• To preserve backwards compatibility, users with the role of Editor are assigned to the same scope they had before the upgrade. However, with new user assignments, the Admin can limit the scope to only part of the organizational scope.
                                  "},{"location":"admin/authentication/roles/#permitted-workloads","title":"Permitted workloads","text":"

When assigning a role with any combination of the View, Edit, Create and Delete permissions for workloads, the subject has permission to manage not only Run:ai native workloads (Workspace, Training, Inference), but also the following 3rd party workloads:

                                  • k8s: StatefulSet
                                  • k8s: ReplicaSet
                                  • k8s: Pod
                                  • k8s: Deployment
                                  • batch: Job
                                  • batch: CronJob
                                  • machinelearning.seldon.io: SeldonDeployment
                                  • kubevirt.io: VirtualMachineInstance
                                  • kubeflow.org: TFJob
                                  • kubeflow.org: PyTorchJob
                                  • kubeflow.org: XGBoostJob
                                  • kubeflow.org: MPIJob
                                  • kubeflow.org: Notebook
                                  • kubeflow.org: ScheduledWorkflow
                                  • amlarc.azureml.com: AmlJob
                                  • serving.knative.dev: Service
                                  • workspace.devfile.io: DevWorkspace
                                  • ray.io: RayCluster
                                  • ray.io: RayJob
                                  • ray.io: RayService
                                  • tekton.dev: TaskRun
                                  • tekton.dev: PipelineRun
                                  • argoproj.io: Workflow
                                  "},{"location":"admin/authentication/roles/#using-api","title":"Using API","text":"

                                  Go to the Roles API reference to view the available actions.
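As an illustration only, listing the available roles through the REST API might look like the command below. The base URL, endpoint path, and token handling are assumptions here; the Roles API reference is authoritative:

curl -s "https://<company-url>/api/v1/authorization/roles" \
    -H "Authorization: Bearer <API_TOKEN>"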

                                  "},{"location":"admin/authentication/users/","title":"Users","text":"

                                  This article explains the procedure to manage users and their permissions.

Users can be managed locally or via the identity provider, and are assigned access rules to manage their permissions.

                                  For example, user user@domain.com is a department admin in department A.

                                  "},{"location":"admin/authentication/users/#users-table","title":"Users table","text":"

                                  The Users table can be found under Access in the Run:ai platform.

                                  The users table provides a list of all the users in the platform. You can manage local users and manage user permissions (access rules) for both local and SSO users.

                                  Note

                                  Single Sign-On users

                                  SSO users are managed by the identity provider and appear once they have signed in to Run:ai

                                  The Users table consists of the following columns:

User - The unique identity of the user (email address)
Type - The type of the user - SSO / local
Last login - The timestamp for the last time the user signed in
Access rule(s) - The access rules assigned to the user
Created By - The user who created the user
Creation time - The timestamp for when the user was created
Last updated - The last time the user was updated
"},{"location":"admin/authentication/users/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
• Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
                                  "},{"location":"admin/authentication/users/#creating-a-local-user","title":"Creating a local user","text":"

                                  To create a local user:

                                  1. Click +NEW LOCAL USER
                                  2. Enter the user\u2019s Email address
                                  3. Click CREATE
                                  4. Review and copy the user\u2019s credentials:
                                    • User Email
                                    • Temporary password to be used on first sign-in
                                  5. Click DONE

                                  Note

                                  The temporary password is visible only at the time of user\u2019s creation, and must be changed after the first sign-in

                                  "},{"location":"admin/authentication/users/#adding-an-access-rule-to-a-user","title":"Adding an access rule to a user","text":"

                                  To create an access rule:

                                  1. Select the user you want to add an access rule for
                                  2. Click ACCESS RULES
                                  3. Click +ACCESS RULE
                                  4. Select a role
                                  5. Select a scope
                                  6. Click SAVE RULE
                                  7. Click CLOSE
                                  "},{"location":"admin/authentication/users/#deleting-users-access-rule","title":"Deleting user\u2019s access rule","text":"

                                  To delete an access rule:

                                  1. Select the user you want to remove an access rule from
                                  2. Click ACCESS RULES
                                  3. Find the access rule assigned to the user you would like to delete
                                  4. Click on the trash icon
                                  5. Click CLOSE
                                  "},{"location":"admin/authentication/users/#resetting-a-user-password","title":"Resetting a user password","text":"

                                  To reset a user\u2019s password:

1. Select the user whose password you want to reset
                                  2. Click RESET PASSWORD
                                  3. Click RESET
                                  4. Review and copy the user\u2019s credentials:
                                    • User Email
                                    • Temporary password to be used on next sign-in
                                  5. Click DONE
                                  "},{"location":"admin/authentication/users/#deleting-a-user","title":"Deleting a user","text":"
                                  1. Select the user you want to delete
                                  2. Click DELETE
                                  3. In the dialog, click DELETE to confirm the deletion

                                  Note

                                  To ensure administrative operations are always available, at least one local user with System Administrator role should exist.

                                  "},{"location":"admin/authentication/users/#using-api","title":"Using API","text":"

Go to the Users and Access rules API references to view the available actions.
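As an illustration only, creating an access rule for a user through the REST API might look like the request below. The endpoint path, field names, and identifiers are assumptions here; the Access rules API reference is authoritative:

curl -s -X POST "https://<company-url>/api/v1/authorization/access-rules" \
    -H "Authorization: Bearer <API_TOKEN>" \
    -H "Content-Type: application/json" \
    -d '{"subjectId": "user@domain.com", "subjectType": "user", "roleId": <role-id>, "scopeType": "project", "scopeId": "<project-id>"}'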

                                  "},{"location":"admin/authentication/sso/openidconnect/","title":"Setup SSO with OpenID Connect","text":"

                                  Single Sign-On (SSO) is an authentication scheme, allowing users to log-in with a single pair of credentials to multiple, independent software systems.

                                  This article explains the procedure to configure single sign-on to Run:ai using the OpenID Connect protocol.

                                  "},{"location":"admin/authentication/sso/openidconnect/#prerequisites","title":"Prerequisites","text":"

                                  Before starting, make sure you have the following available from your identity provider:

                                  • Discovery URL - the OpenID server where the content discovery information is published.
                                  • ClientID - the ID used to identify the client with the Authorization Server.
                                  • Client Secret - a secret password that only the Client and Authorization server know.
                                  • Optional: Scopes - a set of user attributes to be used during authentication to authorize access to a user's details.
                                  "},{"location":"admin/authentication/sso/openidconnect/#setup","title":"Setup","text":"

                                  Follow the steps below to setup SSO with OpenID Connect.

                                  "},{"location":"admin/authentication/sso/openidconnect/#adding-the-identity-provider","title":"Adding the identity provider","text":"
                                  1. Go to General settings
                                  2. Open the Security section and click +IDENTITY PROVIDER
                                  3. Select Custom OpenID Connect
                                  4. Enter the Discovery URL, Client ID, and Client Secret
                                  5. Copy the Redirect URL to be used in your identity provider
                                  6. Optional: Add the OIDC scopes
                                  7. Optional: Enter the user attributes and their value in the identity provider (see the user attributes table below)
8. Click SAVE

User attributes

User role groups (default value in Run:ai: GROUPS) - If it exists in the IDP, it allows you to assign Run:ai role groups via the IDP. The IDP attribute must be a list of strings.
Linux User ID (default value in Run:ai: UID) - If it exists in the IDP, it allows Researcher containers to start with the Linux User UID. Used to map access to network resources such as file systems to users. The IDP attribute must be of type integer.
Linux Group ID (default value in Run:ai: GID) - If it exists in the IDP, it allows Researcher containers to start with the Linux Group GID. The IDP attribute must be of type integer.
Supplementary Groups (default value in Run:ai: SUPPLEMENTARYGROUPS) - If it exists in the IDP, it allows Researcher containers to start with the relevant Linux supplementary groups. The IDP attribute must be a list of integers.
Email (default value in Run:ai: email) - Defines the user attribute in the IDP holding the user's email address, which is the user identifier in Run:ai.
User first name (default value in Run:ai: firstName) - Used as the user's first name appearing in the Run:ai user interface.
User last name (default value in Run:ai: lastName) - Used as the user's last name appearing in the Run:ai user interface.
"},{"location":"admin/authentication/sso/openidconnect/#testing-the-setup","title":"Testing the setup","text":"
                                  1. Log-in to the Run:ai platform as an admin
                                  2. Add Access Rules to an SSO user defined in the IDP
                                  3. Open the Run:ai platform in an incognito browser tab
4. On the sign-in page click CONTINUE WITH SSO. You are redirected to the identity provider sign-in page
5. In the identity provider sign-in page, log in with the SSO user to whom you granted access rules
6. If you are unable to sign in to the identity provider, see the Troubleshooting section below
                                  "},{"location":"admin/authentication/sso/openidconnect/#editing-the-identity-provider","title":"Editing the identity provider","text":"

                                  You can view the identity provider details and edit its configuration:

                                  1. Go to General settings
                                  2. Open the Security section
                                  3. On the identity provider box, click Edit identity provider
                                  4. You can edit either the Discovery URL, Client ID, Client Secret, OIDC scopes, or the User attributes
                                  "},{"location":"admin/authentication/sso/openidconnect/#removing-the-identity-provider","title":"Removing the identity provider","text":"

                                  You can remove the identity provider configuration:

                                  1. Go to General settings
                                  2. Open the Security section
                                  3. On the identity provider card, click Remove identity provider
                                  4. In the dialog, click REMOVE to confirm the action

                                  Note

                                  To avoid losing access, removing the identity provider must be carried out by a local user.

                                  "},{"location":"admin/authentication/sso/openidconnect/#troubleshooting","title":"Troubleshooting","text":"

                                  If testing the setup was unsuccessful, try the different troubleshooting scenarios according to the error you received.

                                  "},{"location":"admin/authentication/sso/openidconnect/#troubleshooting-scenarios","title":"Troubleshooting scenarios","text":"403 - Sorry, we can\u2019t let you see this page. Something about permissions\u2026

                                  Description: The authenticated user is missing permissions

                                  Mitigation:

                                  1. Validate either the user or its related group/s are assigned with access rules
                                  2. Validate groups attribute is available in the configured OIDC Scopes
                                  3. Validate the user\u2019s groups attribute is mapped correctly

                                  Advanced:

1. Open the Chrome DevTools: Right-click on page → Inspect → Console tab
2. Run the following command to retrieve and copy the user's token: localStorage.token;
3. Paste it in https://jwt.io
4. Under the Payload section, validate the values of the user's attributes (an illustrative payload is shown below)
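For reference, a correctly mapped token payload might include attributes similar to the following; the names and values are illustrative and depend on your IDP mapping:

{
  "email": "user@domain.com",
  "groups": ["ml-team"],
  "uid": 1234,
  "gid": 1234,
  "firstName": "Jane",
  "lastName": "Doe"
}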
                                  401 - We\u2019re having trouble identifying your account because your email is incorrect or can\u2019t be found.

                                  Description: Authentication failed because email attribute was not found.

                                  Mitigation:

                                  1. Validate email attribute is available in the configured OIDC Scopes
                                  2. Validate the user\u2019s email attribute is mapped correctly
                                  Unexpected error when authenticating with identity provider

                                  Description: User authentication failed

                                  Mitigation:

                                  1. Validate that the configured OIDC Scopes exist and match the Identity Provider\u2019s available scopes

                                  Advanced:

                                  1. Look for the specific error message in the URL address
                                  Unexpected error when authenticating with identity provider (SSO sign-in is not available)

                                  Description: User authentication failed

                                  Mitigation:

                                  1. Validate that the configured OIDC scope exists in the Identity Provider
2. Validate that the configured Client Secret matches the Client Secret in the Identity Provider

                                  Advanced:

                                  1. Look for the specific error message in the URL address
                                  Client not found

                                  Description: OIDC Client ID was not found in the Identity Provider

                                  Mitigation:

                                  1. Validate that the configured Client ID matches the Identity Provider Client ID
                                  "},{"location":"admin/authentication/sso/openshift/","title":"Setup SSO with OpenShift","text":"

                                  Single Sign-On (SSO) is an authentication scheme, allowing users to log-in with a single pair of credentials to multiple, independent software systems.

                                  This article explains the procedure to configure single sign-on to Run:ai using the OpenID Connect protocol in OpenShift V4.

                                  "},{"location":"admin/authentication/sso/openshift/#prerequisites","title":"Prerequisites","text":"

                                  Before starting, make sure you have the following available from your OpenShift cluster:

                                  • OpenShift OAuth client - see Registering an additional OAuth client
                                  • ClientID - the ID used to identify the client with the Authorization Server.
                                  • Client Secret - a secret password that only the Client and Authorization Server know.
                                  • Base URL - the OpenShift API Server endpoint (example: https://api.<cluster-url>:6443)
                                  "},{"location":"admin/authentication/sso/openshift/#setup","title":"Setup","text":"

                                  Follow the steps below to setup SSO with OpenShift.

                                  "},{"location":"admin/authentication/sso/openshift/#adding-the-identity-provider","title":"Adding the identity provider","text":"
                                  1. Go to General settings
                                  2. Open the Security section and click +IDENTITY PROVIDER
                                  3. Select OpenShift V4
                                  4. Enter the Base URL, Client ID, and Client Secret from your OpenShift OAuth client.
                                  5. Copy the Redirect URL to be used in your OpenShift OAuth client
                                  6. Optional: Enter the user attributes and their value in the identity provider (see the user attributes table below)
7. Click SAVE

User attributes

User role groups (default value in Run:ai: GROUPS) - If it exists in the IDP, it allows you to assign Run:ai role groups via the IDP. The IDP attribute must be a list of strings.
Linux User ID (default value in Run:ai: UID) - If it exists in the IDP, it allows researcher containers to start with the Linux User UID. Used to map access to network resources such as file systems to users. The IDP attribute must be of type integer.
Linux Group ID (default value in Run:ai: GID) - If it exists in the IDP, it allows researcher containers to start with the Linux Group GID. The IDP attribute must be of type integer.
Supplementary Groups (default value in Run:ai: SUPPLEMENTARYGROUPS) - If it exists in the IDP, it allows researcher containers to start with the relevant Linux supplementary groups. The IDP attribute must be a list of integers.
Email (default value in Run:ai: email) - Defines the user attribute in the IDP holding the user's email address, which is the user identifier in Run:ai.
User first name (default value in Run:ai: firstName) - Used as the user's first name appearing in the Run:ai platform.
User last name (default value in Run:ai: lastName) - Used as the user's last name appearing in the Run:ai platform.
"},{"location":"admin/authentication/sso/openshift/#testing-the-setup","title":"Testing the setup","text":"
                                  1. Open the Run:ai platform as an admin
                                  2. Add Access Rules to an SSO user defined in the IDP
                                  3. Open the Run:ai platform in an incognito browser tab
4. On the sign-in page click CONTINUE WITH SSO. You are redirected to the OpenShift IDP sign-in page
5. In the identity provider sign-in page, log in with the SSO user to whom you granted access rules
6. If you are unable to sign in to the identity provider, see the Troubleshooting section below
                                  "},{"location":"admin/authentication/sso/openshift/#editing-the-identity-provider","title":"Editing the identity provider","text":"

                                  You can view the identity provider details and edit its configuration:

                                  1. Go to General settings
                                  2. Open the Security section
                                  3. On the identity provider box, click Edit identity provider
                                  4. You can edit either the Base URL, Client ID, Client Secret, or the User attributes
                                  "},{"location":"admin/authentication/sso/openshift/#removing-the-identity-provider","title":"Removing the identity provider","text":"

                                  You can remove the identity provider configuration:

                                  1. Go to General settings
                                  2. Open the Security section
                                  3. On the identity provider card, click Remove identity provider
                                  4. In the dialog, click REMOVE to confirm the action

                                  Note

                                  To avoid losing access, removing the identity provider must be carried out by a local user.

                                  "},{"location":"admin/authentication/sso/openshift/#troubleshooting","title":"Troubleshooting","text":"

                                  If testing the setup was unsuccessful, try the different troubleshooting scenarios according to the error you received.

                                  "},{"location":"admin/authentication/sso/openshift/#troubleshooting-scenarios","title":"Troubleshooting scenarios","text":"403 - Sorry, we can\u2019t let you see this page. Something about permissions\u2026

                                  Description: The authenticated user is missing permissions

                                  Mitigation:

                                  1. Validate either the user or its related group/s are assigned with access rules
                                  2. Validate groups attribute is available in the configured OIDC Scopes
                                  3. Validate the user\u2019s groups attribute is mapped correctly

                                  Advanced:

                                  1. Open the Chrome DevTools: Right-click on page \u2192 Inspect \u2192 Console tab
                                  2. Run the following command to retrieve and copy the user\u2019s token: localStorage.token;
                                  3. Paste in https://jwt.io
                                  4. Under the Payload section validate the value of the user\u2019s attributes
                                  401 - We\u2019re having trouble identifying your account because your email is incorrect or can\u2019t be found.

Description: Authentication failed because the email attribute was not found.

                                  Mitigation:

                                  1. Validate email attribute is available in the configured OIDC Scopes
                                  2. Validate the user\u2019s email attribute is mapped correctly
                                  Unexpected error when authenticating with identity provider

                                  Description: User authentication failed

                                  Mitigation:

1. Validate that the configured OIDC Scopes exist and match the Identity Provider's available scopes

                                  Advanced:

                                  1. Look for the specific error message in the URL address
                                  Unexpected error when authenticating with identity provider (SSO sign-in is not available)

                                  Description: User authentication failed

                                  Mitigation:

                                  1. Validate that the configured OIDC scope exists in the Identity Provider
                                  2. Validate that the configured Client Secret matches the Client Secret value in the OAuthclient Kubernetes object.

                                  Advanced:

                                  1. Look for the specific error message in the URL address
                                  unauthorized_client

                                  Description: OIDC Client ID was not found in the OpenShift IDP

                                  Mitigation:

                                  1. Validate that the configured Client ID matches the value in the OAuthclient Kubernetes object.
                                  "},{"location":"admin/authentication/sso/saml/","title":"Setup SSO with SAML","text":"

                                  Single Sign-On (SSO) is an authentication scheme, allowing users to log-in with a single pair of credentials to multiple, independent software systems.

                                  This article explains the procedure to configure SSO to Run:ai using the SAML 2.0 protocol.

                                  "},{"location":"admin/authentication/sso/saml/#prerequisites","title":"Prerequisites","text":"

                                  Before starting, ensure you have the following available from your identity provider:

                                  • SAML XML Metadata
                                  "},{"location":"admin/authentication/sso/saml/#setup","title":"Setup","text":"

                                  Follow the steps below to setup SSO with SAML.

                                  "},{"location":"admin/authentication/sso/saml/#adding-the-identity-provider","title":"Adding the identity provider","text":"
                                  1. Go to General settings
                                  2. Open the Security section and click +IDENTITY PROVIDER
                                  3. Select Custom SAML 2.0
                                  4. Select either From computer or From URL
                                    • From computer - click the Metadata XML file field, then select your file for upload
                                    • From URL - in the Metadata XML URL field, enter the URL to the XML Metadata file
                                  5. Copy the Redirect URL and Entity ID to be used in your identity provider
                                  6. Optional: Enter the user attributes and their value in the identity provider (see the user attributes table below)
User role groups (default value in Run:ai: GROUPS) - If it exists in the IDP, it allows you to assign Run:ai role groups via the IDP. The IDP attribute must be a list of strings.
Linux User ID (default value in Run:ai: UID) - If it exists in the IDP, it allows Researcher containers to start with the Linux User UID. Used to map access to network resources such as file systems to users. The IDP attribute must be of type integer.
Linux Group ID (default value in Run:ai: GID) - If it exists in the IDP, it allows Researcher containers to start with the Linux Group GID. The IDP attribute must be of type integer.
Supplementary Groups (default value in Run:ai: SUPPLEMENTARYGROUPS) - If it exists in the IDP, it allows Researcher containers to start with the relevant Linux supplementary groups. The IDP attribute must be a list of integers.
Email (default value in Run:ai: email) - Defines the user attribute in the IDP holding the user's email address, which is the user identifier in Run:ai.
User first name (default value in Run:ai: firstName) - Used as the user's first name appearing in the Run:ai platform.
User last name (default value in Run:ai: lastName) - Used as the user's last name appearing in the Run:ai platform.
7. Click SAVE
                                  "},{"location":"admin/authentication/sso/saml/#testing-the-setup","title":"Testing the setup","text":"
                                  1. Open the Run:ai platform as an admin
                                  2. Add Access Rules to an SSO user defined in the IDP
                                  3. Open the Run:ai platform in an incognito browser tab
                                  4. On the sign-in page click CONTINUE WITH SSO. You are redirected to the identity provider sign in page
5. In the identity provider sign-in page, log in with the SSO user to whom you granted access rules
6. If you are unable to sign in to the identity provider, see the Troubleshooting section below
                                  "},{"location":"admin/authentication/sso/saml/#editing-the-identity-provider","title":"Editing the identity provider","text":"

                                  You can view the identity provider details and edit its configuration:

1. Go to General settings
                                  2. Open the Security section
                                  3. On the identity provider box, click Edit identity provider
                                  4. You can edit either the metadata file or the user attributes
                                  5. You can view the identity provider URL, identity provider entity ID, and the certificate expiration date
                                  "},{"location":"admin/authentication/sso/saml/#removing-the-identity-provider","title":"Removing the identity provider","text":"

                                  You can remove the identity provider configuration:

                                  1. Go to General settings
                                  2. Open the Security section
                                  3. On the identity provider card, click Remove identity provider
                                  4. In the dialog, click REMOVE to confirm the action

                                  Note

                                  To avoid losing access, removing the identity provider must be carried out by a local user.

                                  "},{"location":"admin/authentication/sso/saml/#downloading-the-xml-metadata-file","title":"Downloading the XML metadata file","text":"

                                  You can download the XML file to view the identity provider settings:

                                  1. Go to General settings
                                  2. Open the Security section
                                  3. On the identity provider card, click Download metadata XML file
                                  "},{"location":"admin/authentication/sso/saml/#troubleshooting","title":"Troubleshooting","text":"

                                  If testing the setup was unsuccessful, try the different troubleshooting scenarios according to the error you received. If an error still occurs, check the advanced troubleshooting section.

                                  "},{"location":"admin/authentication/sso/saml/#troubleshooting-scenarios","title":"Troubleshooting scenarios","text":"Invalid signature in response from identity provider

Description: After trying to log in, the following message is received on the Run:ai log-in page.

Mitigation:

1. Go to General settings
2. Open the Security section
3. In the identity provider box, check for a "Certificate expired" error
4. If it is expired, update the SAML metadata file to include a valid certificate

                                  401 - We\u2019re having trouble identifying your account because your email is incorrect or can\u2019t be found.

                                  Description: Authentication failed because email attribute was not found.

                                  Mitigation:

                                  1. Validate the user\u2019s email attribute is mapped correctly
                                  403 - Sorry, we can\u2019t let you see this page. Something about permissions\u2026

                                  Description: The authenticated user is missing permissions

                                  Mitigation:

                                  1. Validate either the user or its related group/s are assigned with access rules
                                  2. Validate the user\u2019s groups attribute is mapped correctly

                                  Advanced:

                                  1. Open the Chrome DevTools: Right-click on page \u2192 Inspect \u2192 Console tab
                                  2. Run the following command to retrieve and paste the user\u2019s token: localStorage.token;
                                  3. Paste in https://jwt.io
                                  4. Under the Payload section validate the values of the user\u2019s attributes
                                  "},{"location":"admin/authentication/sso/saml/#advanced-troubleshooting","title":"Advanced Troubleshooting","text":"Validating the SAML request

                                  The SAML login flow can be separated into two parts:

                                  • Run:ai redirects to the IDP for log-ins using a SAML Request
                                  • On successful log-in, the IDP redirects back to Run:ai with a SAML Response

                                  Validate the SAML Request to ensure the SAML flow works as expected:

                                  1. Go to the Run:ai login screen
                                  2. Open the Chrome Network inspector: Right-click \u2192 Inspect on the page \u2192 Network tab
                                  3. On the sign-in page click CONTINUE WITH SSO.
                                  4. Once redirected to the Identity Provider, search in the Chrome network inspector for an HTTP request showing the SAML Request. Depending on the IDP url, this would be a request to the IDP domain name. For example, accounts.google.com/idp?1234.
                                  5. When found, go to the Payload tab and copy the value of the SAML Request
                                  6. Paste the value into a SAML decoder (e.g. https://www.samltool.com/decode.php)
                                  7. Validate the request:
                                    • The content of the <saml:Issuer> tag is the same as Entity ID given when adding the identity provider
                                    • The content of the AssertionConsumerServiceURL is the same as the Redirect URI given when adding the identity provider
                                  8. Validate the response:
                                    • The user email under the <saml2:Subject> tag is the same as the logged-in user
                                    • Make sure that under the <saml2:AttributeStatement> tag, there is an Attribute named email (lowercase). This attribute is mandatory.
• If other, optional user attributes (groups, firstName, lastName, uid, gid) are mapped, make sure they also exist under <saml2:AttributeStatement> along with their respective values (see the example below).
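For reference, a minimal AttributeStatement carrying the mandatory email attribute might look like the following; the values are illustrative, and the exact namespace prefixes and attribute format depend on your identity provider:

<saml2:AttributeStatement>
  <saml2:Attribute Name="email">
    <saml2:AttributeValue>user@domain.com</saml2:AttributeValue>
  </saml2:Attribute>
  <saml2:Attribute Name="groups">
    <saml2:AttributeValue>ml-team</saml2:AttributeValue>
  </saml2:Attribute>
</saml2:AttributeStatement>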
                                  "},{"location":"admin/config/access-roles/","title":"Understand the Kubernetes Cluster Access provided to Run:ai","text":"

Run:ai has configuration flags that control specific behavioral aspects of Run:ai, specifically those that require additional permissions, such as automatic namespace/project creation, secret propagation, and more.

                                  The purpose of this document is to provide security officers with the ability to review what cluster-wide access Run:ai requires, and verify that it is in line with organizational policy, before installing the Run:ai cluster.

                                  "},{"location":"admin/config/access-roles/#review-cluster-access-roles","title":"Review Cluster Access Roles","text":"

                                  Run the following:

helm repo add runai https://run-ai-charts.storage.googleapis.com
helm repo update
helm install runai-cluster runai/runai-cluster -n runai -f runai-<cluster-name>.yaml \
        --dry-run > cluster-all.yaml

The file cluster-all.yaml can then be reviewed. You can use the internal filenames (provided in comments within the file) to gain more understanding according to the table below:
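For example, one quick way to locate the cluster-scoped roles inside the rendered file is with grep; adjust the context size as needed:

grep -n -B 2 -A 5 "kind: ClusterRole" cluster-all.yaml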

clusterroles / base.yaml - Mandatory Kubernetes Cluster Roles and Cluster Role Bindings
clusterroles / project-controller-ns-creation.yaml - Automatic Project Creation and Maintenance. Provides Run:ai with the ability to create Kubernetes namespaces when the Run:ai administrator creates new Projects. Can be turned on/off via flag
clusterroles / project-controller-rb-creation.yaml - Automatically assign Users to Projects. Can be turned on/off via flag
clusterroles / project-controller-limit-range.yaml - Disables the usage of the Kubernetes Limit Range feature. Can be turned on/off via flag
ocp / scc.yaml - OpenShift-specific Security Contexts
priorityclasses / 4 files - Folder contains a list of Priority Classes used by Run:ai
"},{"location":"admin/config/admin-messages/","title":"Administrator Messages","text":"

System administrators can use Administrator messages to make announcements to users once they have logged in. These messages are typically used to keep users informed about different aspects of the platform.

                                  To configure an Administrator message:

                                  1. Press General settings.
                                  2. Expand the Message from administrator pane.
                                  3. Press Message.
                                  4. Enter your message in the text box. Use the formatting tools in the toolbar to add special formatting or links to the message.
5. Enable the Display "Don't show this again" checkbox on message option so that users can choose to see the message only once.
                                  6. Press Publish when complete.
                                  "},{"location":"admin/config/advanced-cluster-config/","title":"Advanced Cluster Configuration","text":"

                                  Advanced cluster configurations can be used to tailor your Run:ai cluster deployment to meet specific operational requirements and optimize resource management. By fine-tuning these settings, you can enhance functionality, ensure compatibility with organizational policies, and achieve better control over your cluster environment. This article provides guidance on implementing and managing these configurations to adapt the Run:ai cluster to your unique needs.

                                  After the Run:ai cluster is installed, you can adjust various settings to better align with your organization's operational needs and security requirements.

                                  "},{"location":"admin/config/advanced-cluster-config/#edit-cluster-configurations","title":"Edit cluster configurations","text":"

                                  Advanced cluster configurations are managed through the runaiconfig Kubernetes Custom Resource. To modify the cluster configurations, use the following command:

kubectl edit runaiconfig runai -n runai
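In addition to interactive editing, a specific key from the table in the next section can be set non-interactively with kubectl patch. For example, to enable memory swap for GPU workloads and adjust its CPU memory limit (the values shown are illustrative):

kubectl patch runaiconfig runai -n runai --type="merge" \
    -p '{"spec":{"global":{"core":{"swap":{"enabled":true,"limits":{"cpuRam":"200Gi"}}}}}}'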
                                  "},{"location":"admin/config/advanced-cluster-config/#configurations","title":"Configurations","text":"

                                  The following configurations allow you to enable or disable features, control permissions, and customize the behavior of your Run:ai cluster:

spec.project-controller.createNamespaces (boolean) - Allows Kubernetes namespace creation for new projects. Default: true
spec.mps-server.enabled (boolean) - Enabled when using NVIDIA MPS. Default: false
spec.global.subdomainSupport (boolean) - Allows the creation of subdomains for ingress endpoints, enabling access to workloads via unique subdomains on the Fully Qualified Domain Name (FQDN). For details, see External Access to Container. Default: false
spec.runai-container-toolkit.enabled (boolean) - Allows workloads to use GPU fractions. Default: true
spec.prometheus.spec.retention (string) - Defines how long Prometheus retains Run:ai metrics locally, which is useful in case of potential connectivity issues. For more information, see Prometheus Storage. Default: 2h
spec.prometheus.spec.retentionSize (string) - Allocates storage space for Run:ai metrics in Prometheus, which is useful in case of potential connectivity issues. For more information, see Prometheus Storage. Default: ""
spec.prometheus.logLevel (string) - Sets the Prometheus log level. Possible values: [debug, info, warn, error]. Default: "info"
spec.prometheus.additionalAlertLabels (object) - Sets additional custom labels for the built-in alerts. Example: {"env": "prod"}. Default: {}
spec.global.schedulingServices (object) - Defines resource constraints uniformly for the entire set of Run:ai scheduling services. For more information, see Resource requests and limits of Pod and container. Default: {resources: {}}
spec.global.syncServices (object) - Defines resource constraints uniformly for the entire set of Run:ai sync services. For more information, see Resource requests and limits of Pod and container. Default: {resources: {}}
spec.global.workloadServices (object) - Defines resource constraints uniformly for the entire set of Run:ai workload services. For more information, see Resource requests and limits of Pod and container. Default: {resources: {}}
spec.global.nodeAffinity.restrictScheduling (boolean) - Enables setting node roles and restricting workload scheduling to designated nodes. Default: false
spec.global.affinity (object) - Sets the system nodes where Run:ai system-level services are scheduled. Using global.affinity will overwrite the node roles set using the Administrator CLI (runai-adm). Default: prefer to schedule on nodes that are labeled with node-role.kubernetes.io/runai-system
spec.global.tolerations (object) - Configures Kubernetes tolerations for Run:ai system-level services.
spec.daemonSetsTolerations (object) - Configures Kubernetes tolerations for Run:ai daemonSets / engine.
spec.runai-container-toolkit.logLevel (boolean) - Specifies the run:ai-container-toolkit logging level: either 'SPAM', 'DEBUG', 'INFO', 'NOTICE', 'WARN', or 'ERROR'. Default: INFO
node-scale-adjuster.args.gpuMemoryToFractionRatio (object) - A scaling-pod requesting a single GPU device will be created for every 1 to 10 pods requesting fractional GPU memory (1/gpuMemoryToFractionRatio). This value represents the ratio (0.1-0.9) of fractional GPU memory (any size) to GPU fraction (portion) conversion. Default: 0.1
spec.global.core.dynamicFractions.enabled (boolean) - Enables dynamic GPU fractions. Default: true
spec.global.core.swap.enabled (boolean) - Enables memory swap for GPU workloads. Default: false
spec.global.core.swap.limits.cpuRam (string) - Sets the CPU memory size used to swap GPU workloads. Default: 100Gi
spec.global.core.swap.limits.reservedGpuRam (string) - Sets the reserved GPU memory size used to swap GPU workloads. Default: 2Gi
spec.global.core.nodeScheduler.enabled (boolean) - Enables the node-level scheduler. Default: false
spec.global.replicaCount (int) - Sets a global number of pod replicas to be created for services that support replication. Default: 1
spec.limitRange.cpuDefaultRequestCpuLimitFactorNoGpu (string) - Sets a default ratio between the CPU request and the limit for workloads without GPU requests. Default: 0.1
spec.limitRange.memoryDefaultRequestMemoryLimitFactorNoGpu (string) - Sets a default ratio between the memory request and the limit for workloads without GPU requests. Default: 0.1
spec.limitRange.cpuDefaultRequestGpuFactor (string) - Sets a default amount of CPU allocated per GPU when the CPU is not specified.
spec.limitRange.cpuDefaultLimitGpuFactor (int) - Sets a default CPU limit based on the number of GPUs requested when no CPU limit is specified. Default: NO DEFAULT
spec.limitRange.memoryDefaultRequestGpuFactor (string) - Sets a default amount of memory allocated per GPU when the memory is not specified. Default: 100Mi
spec.limitRange.memoryDefaultLimitGpuFactor (string) - Sets a default memory limit based on the number of GPUs requested when no memory limit is specified. Default: NO DEFAULT
spec.global.core.timeSlicing.mode (string) - Sets the GPU time-slicing mode. Possible values: timesharing - all pods on a GPU share the GPU compute time evenly; strict - each pod gets an exact time slice according to its memory fraction value; fair - each pod gets an exact time slice according to its memory fraction value and any unused GPU compute time is split evenly between the running pods. Default: timesharing
runai-scheduler.fullHierarchyFairness (boolean) - Enables fairness between departments, on top of projects fairness. Default: true
spec.pod-grouper.args.gangSchedulingKnative (boolean) - Enables gang scheduling for inference workloads. For backward compatibility with versions earlier than v2.19, change the value to false. Default: true
runai-scheduler.args.defaultStalenessGracePeriod - Sets the timeout in seconds before the scheduler evicts a stale pod-group (gang) that went below its min-members in running state: 0s - immediately (no timeout), -1 - never. Default: 60s
spec.runai-scheduler.args.verbosity (int) - Configures the level of detail in the logs generated by the scheduler service. Default: 4
pod-grouper.args.gangScheduleArgoWorkflow (boolean) - Groups all pods of a single ArgoWorkflow workload into a single Pod-Group for gang scheduling. Default: true
"},{"location":"admin/config/advanced-cluster-config/#runai-managed-nodes","title":"Run:ai Managed Nodes","text":"

                                  To include or exclude specific nodes from running workloads within a cluster managed by Run:ai, use the nodeSelectorTerms flag. For additional details, see Kubernetes nodeSelector.

Define the selector terms using the following fields:

                                  • key: Label key (e.g., zone, instance-type).
                                  • operator: Operator defining the inclusion/exclusion condition (In, NotIn, Exists, DoesNotExist).
                                  • values: List of values for the key when using In or NotIn.

The example below shows how to include only NVIDIA GPU nodes and exclude all other GPU types in a cluster with mixed nodes, based on the GPU product-type label:

nodeSelectorTerms:
- matchExpressions:
  - key: nvidia.com/gpu.product
    operator: Exists
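Similarly, a minimal sketch of excluding nodes by a label value with the NotIn operator; the label key and value below are placeholders:

nodeSelectorTerms:
- matchExpressions:
  - key: topology.kubernetes.io/zone   # placeholder label key
    operator: NotIn
    values:
    - zone-a                           # placeholder label value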

                                  Tip

                                  To view the full runaiconfig object structure, use the following command:

kubectl get crds/runaiconfigs.run.ai -n runai -o yaml

                                  "},{"location":"admin/config/allow-external-access-to-containers/","title":"External access to Containers","text":""},{"location":"admin/config/allow-external-access-to-containers/#introduction","title":"Introduction","text":"

                                  Researchers working with containers may at times need to remotely access the container. Some examples:

                                  • Using a Jupyter notebook that runs within the container
                                  • Using PyCharm to run python commands remotely.
                                  • Using TensorBoard to view machine learning visualizations

                                  This requires exposing container ports. When using docker, the way Researchers expose ports is by declaring them when starting the container. Run:ai has similar syntax.

Run:ai is based on Kubernetes, which abstracts away the container's location; this complicates the exposure of ports. Kubernetes offers several options:

Port Forwarding - Simple port forwarding allows access to the container via local and/or remote port. Prerequisites: none
NodePort - Exposes the service on each Node's IP at a static port (the NodePort). You'll be able to contact the NodePort service from outside the cluster by requesting <NODE-IP>:<NODE-PORT>, regardless of which node the container actually resides in. Prerequisites: none
LoadBalancer - Exposes the service externally using a cloud provider's load balancer. Prerequisites: only available with cloud providers

                                  See https://kubernetes.io/docs/concepts/services-networking/service for further details on these options.

                                  "},{"location":"admin/config/allow-external-access-to-containers/#workspaces-configuration","title":"Workspaces configuration","text":"

                                  Workspaces allow the Researcher to build AI models interactively.

                                  Workspaces allow the Researcher to launch tools such as Visual Studio Code, TensorFlow, TensorBoard, etc. These tools require access to the container, which is provided via URLs.

                                  Run:ai uses the Cluster URL provided to dynamically create SSL-secured URLs for researchers\u2019 workspaces in the format of https://<CLUSTER_URL>/project-name/workspace-name.

                                  While this form of path-based routing conveniently works with applications like Jupyter Notebooks, it is often not compatible with other applications. Such applications assume they are running at the root path, so hardcoded paths and settings within the container may become invalid when running at a path other than the root. For instance, if the container expects to find a file at /etc/config.json but is running at /project-name/workspace-name, the file will not be found. This can cause the container to fail or not function as intended.

                                  To address this issue, Run:ai provides support for host-based routing. When enabled, Run:ai creates workspace URLs in a subdomain format (https://project-name-workspace-name.<CLUSTER_URL>/), which allows all workspaces to run at the root path and function properly.

                                  To enable host-based routing you must perform the following steps:

                                  Note

                                  For OpenShift, editing the RunaiConfig is the only step required to generate workspace URLs. Refer to the last step below.

                                  1. Create a second DNS entry (A record) for *.<CLUSTER_URL>, pointing to the same IP as the cluster Fully Qualified Domain Name (FQDN)
                                  2. Obtain a wildcard SSL certificate for this DNS.

                                  3. Add the certificate as a secret:

                                  kubectl create secret tls runai-cluster-domain-star-tls-secret -n runai \\ \n    --cert /path/to/fullchain.pem --key /path/to/private.pem\n
                                  4. Create the following ingress rule:
                                  apiVersion: networking.k8s.io/v1\nkind: Ingress\nmetadata:\n  name: runai-cluster-domain-star-ingress\n  namespace: runai\nspec:\n  ingressClassName: nginx\n  rules:\n  - host: '*.<CLUSTER_URL>'\n  tls:\n  - hosts:\n    - '*.<CLUSTER_URL>'\n    secretName: runai-cluster-domain-star-tls-secret\n

                                  Replace <CLUSTER_URL> as described above and run: kubectl apply -f <filename>.

                                  5. Edit the RunaiConfig to generate the URLs correctly:
                                  kubectl patch RunaiConfig runai -n runai --type=\"merge\" \\\n    -p '{\"spec\":{\"global\":{\"subdomainSupport\": true}}}' \n
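                                  To confirm that the setting was applied, a standard kubectl query against the RunaiConfig resource patched above can be used (shown here as a sketch):

                                  kubectl get runaiconfig runai -n runai -o jsonpath='{.spec.global.subdomainSupport}'\n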

                                  Once these requirements have been met, all workspaces will automatically be assigned a secured URL with a subdomain, ensuring full functionality for all researcher applications.

                                  "},{"location":"admin/config/allow-external-access-to-containers/#see-also","title":"See Also","text":"
                                  • To learn how to use port forwarding see the Quickstart document: Launch an Interactive Build Workload with Connected Ports.
                                  • See CLI command runai submit.
                                  "},{"location":"admin/config/cli-admin-install/","title":"Administrator CLI","text":"

                                  The Run:ai Administrator CLI (runai-adm) is a lightweight tool designed to support infrastructure administrators by simplifying two key tasks:

                                  • Collecting logs for troubleshooting and sharing with Run:ai support.
                                  • Configuring node roles in the cluster for optimal performance and reliability.

                                  This article outlines the installation and usage of the Run:ai Administrator CLI to help you get started quickly.

                                  "},{"location":"admin/config/cli-admin-install/#prerequisites","title":"Prerequisites","text":"

                                  Before installing the CLI, review the following:

                                  • Operating system: The CLI is supported on Mac and Linux.
                                  • Kubectl: The Kubernetes command-line interface must be installed and configured to access your cluster. Follow the official guide.
                                  • Cluster administrative permissions: The CLI requires a Kubernetes profile with administrative privileges.
                                  "},{"location":"admin/config/cli-admin-install/#installation","title":"Installation","text":"

                                  To install the Run:ai Administrator CLI, ensure that the CLI version matches the version of your Run:ai cluster. You can either install the latest version or a specific version from the list.

                                  "},{"location":"admin/config/cli-admin-install/#installing-the-latest-version","title":"Installing the latest version","text":"

                                  Use the following commands to download and install the latest version of the CLI:

                                  MacLinux
                                  wget --content-disposition https://app.run.ai/v1/k8s/admin-cli/darwin # (1) \nchmod +x runai-adm  \nsudo mv runai-adm /usr/local/bin/runai-adm\n
                                  1. In a self-hosted environment, use the control-plane URL instead of app.run.ai
                                  wget --content-disposition https://app.run.ai/v1/k8s/admin-cli/linux # (1) \nchmod +x runai-adm  \nsudo mv runai-adm /usr/local/bin/runai-adm\n
                                  1. In a self-hosted environment, use the control-plane URL instead of app.run.ai
                                  "},{"location":"admin/config/cli-admin-install/#installing-a-specific-version","title":"Installing a specific version","text":"

                                  To install a specific version of the Administrator CLI that matches your Run:ai cluster version, append the version number to the download URL. Refer to the list of available versions linked above for the correct version number.

                                  MacLinux
                                  wget --content-disposition https://app.run.ai/v1/k8s/admin-cli/<version>/darwin # Replace <version> with the desired version in the format vX.X.X (e.g., v2.19.5) \nchmod +x runai-adm  \nsudo mv runai-adm /usr/local/bin/runai-adm\n
                                  wget --content-disposition https://app.run.ai/v1/k8s/admin-cli/<version>/linux # Replace <version> with the desired version in the format vX.X.X (e.g., v2.19.5)\nchmod +x runai-adm  \nsudo mv runai-adm /usr/local/bin/runai-adm\n
                                  "},{"location":"admin/config/cli-admin-install/#verifying-installation","title":"Verifying installation","text":"

                                  Verify your installation completed successfully by running the following command:

                                  runai-adm version\n
                                  "},{"location":"admin/config/cli-admin-install/#reference","title":"Reference","text":""},{"location":"admin/config/cli-admin-install/#node-roles","title":"Node roles","text":"

                                  To set or remove node roles using the runai-adm tool, run the following:

                                  runai-adm set node-role [--runai-system-worker | --gpu-worker | --cpu-worker] <node-name>\n
                                  runai-adm remove node-role [--runai-system-worker | --gpu-worker | --cpu-worker] <node-name>\n

                                  Note

                                  Use the --all flag to set or remove a role on all nodes.

                                  "},{"location":"admin/config/cli-admin-install/#collect-logs","title":"Collect logs","text":"

                                  To collect logs using the runai-adm tool:

                                  1. Run the following command:

                                    runai-adm collect-logs\n
                                  2. Locate the generated compressed log file.

                                  "},{"location":"admin/config/cluster-wide-pvc/","title":"Cluster wide PVCs","text":"

                                  A PersistentVolumeClaim (PVC) is a request for storage by a user. It is similar to a Pod. Pods consume node resources and PVCs consume PV resources. Pods can request specific levels of resources (CPU and Memory). Claims can request specific size and access modes. For more information about PVCs, see Persistent Volumes.

                                  PVCs are namespace-specific. If your PVC relates to all Run:ai Projects, do the following to propagate the PVC to all Projects:

                                  Create a PVC within the Run:ai namespace (runai), then run the following once to propagate the PVC to all Run:ai Projects:

                                  kubectl label persistentvolumeclaims -n runai <PVC_NAME> runai/cluster-wide=true\n
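                                  For reference, the following is a minimal sketch of a PVC that could be created in the runai namespace before labeling; the name, access mode, size, and storage class are placeholders:

                                  apiVersion: v1\nkind: PersistentVolumeClaim\nmetadata:\n  name: <PVC_NAME>\n  namespace: runai\nspec:\n  accessModes:\n    - ReadWriteMany   # placeholder access mode\n  resources:\n    requests:\n      storage: 10Gi   # placeholder size\n  storageClassName: <STORAGE_CLASS>\n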

                                  To delete a PVC from all Run:ai Projects, run:

                                  kubectl label persistentvolumeclaims -n runai <PVC_NAME> runai/cluster-wide-\n

                                  You can add a PVC to a job using the New job form.

                                  To add a PVC to a new job:

                                  1. On the New job form, press Storage.
                                  2. In Persistent Volume Claims press Add.
                                  3. Enable Existing PVC.
                                  4. Enter the name (claim name) of the PVC.
                                  5. Enter the storage class. (Optional)
                                  6. Enter the size.
                                  7. Enable / disable access modes.
                                  "},{"location":"admin/config/clusters/","title":"Clusters","text":"

                                  This article explains the procedure to view and manage Clusters.

                                  The Cluster table provides a quick and easy way to see the status of your cluster.

                                  "},{"location":"admin/config/clusters/#clusters-table","title":"Clusters table","text":"

                                  The Clusters table can be found under Resources in the Run:ai platform.

                                  The clusters table provides a list of the clusters added to Run:ai platform, along with their status.

                                  The clusters table consists of the following columns:

                                  • Cluster: The name of the cluster
                                  • Status: The status of the cluster. For more information see the table below. Hover over the information icon for a short description and links to troubleshooting
                                  • Creation time: The timestamp when the cluster was created
                                  • URL: The URL that was given to the cluster
                                  • Run:ai cluster version: The Run:ai version installed on the cluster
                                  • Kubernetes distribution: The flavor of Kubernetes distribution
                                  • Kubernetes version: The version of Kubernetes installed
                                  • Run:ai cluster UUID: The unique ID of the cluster
                                  "},{"location":"admin/config/clusters/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
                                  • Download table - Click MORE and then Click Download as CSV. Export to CSV is limited to 20,000 rows.
                                  "},{"location":"admin/config/clusters/#cluster-status","title":"Cluster status","text":"Status Description Waiting to connect The cluster has never been connected. Disconnected There is no communication from the cluster to the {{glossary.Control plane}}. This may be due to a network issue. See the troubleshooting scenarios. Missing prerequisites Some prerequisites are missing from the cluster. As a result, some features may be impacted. See the troubleshooting scenarios. Service issues At least one of the services is not working properly. You can view the list of nonfunctioning services for more information. See the troubleshooting scenarios. Connected The Run:ai cluster is connected, and all Run:ai services are running."},{"location":"admin/config/clusters/#adding-a-new-cluster","title":"Adding a new cluster","text":"

                                  To add a new cluster see the installation guide.

                                  "},{"location":"admin/config/clusters/#removing-a-cluster","title":"Removing a cluster","text":"
                                  1. Select the cluster you want to remove
                                  2. Click REMOVE
                                  3. A dialog appears: Make sure to carefully read the message before removing
                                  4. Click REMOVE to confirm the removal.
                                  "},{"location":"admin/config/clusters/#using-the-api","title":"Using the API","text":"

                                  Go to the Clusters API reference to view the available actions

                                  "},{"location":"admin/config/clusters/#troubleshooting","title":"Troubleshooting","text":"

                                  Before starting, make sure you have the following:

                                  • Access to the Kubernetes cluster where Run:ai is deployed with the necessary permissions
                                  • Access to the Run:ai Platform
                                  "},{"location":"admin/config/clusters/#troubleshooting-scenarios","title":"Troubleshooting scenarios","text":"Cluster disconnected

                                  Description: When the cluster's status is \u2018disconnected\u2019, there is no communication from the cluster services reaching the Run:ai Platform. This may be due to networking issues or issues with Run:ai services.

                                  Mitigation:

                                  1. Check Run:ai\u2019s services status:

                                    • Open your terminal
                                    • Make sure you have access to the Kubernetes cluster with permission to view pods
                                    • Copy and paste the following command to verify that Run:ai\u2019s services are running:

                                    kubectl get pods -n runai | grep -E 'runai-agent|cluster-sync|assets-sync'\n
                                    • If any of the services are not running, see the \u2018cluster has service issues\u2019 scenario.

                                  2. Check the network connection

                                    • Open your terminal
                                    • Make sure you have access to the Kubernetes cluster with permissions to create pods
                                    • Copy and paste the following command to create a connectivity check pod:
                                    kubectl run control-plane-connectivity-check -n runai --image=wbitt/network-multitool \\\n    --command -- /bin/sh -c 'curl -sSf <control-plane-endpoint> > /dev/null && echo \"Connection Successful\" \\\n    || echo \"Failed connecting to the Control Plane\"'\n
                                    • Replace <control-plane-endpoint> with the URL of the Control Plane in your environment. If the pod fails to connect to the Control Plane, check for potential network policies
                                  3. Check and modify the network policies

                                    • Open your terminal
                                    • Copy and paste the following command to check the existence of network policies:

                                      kubectl get networkpolicies -n runai\n

                                    • Review the policies to ensure that they allow traffic from the Run:ai namespace to the Control Plane. If necessary, update the policies to allow the required traffic. Example of allowing traffic:

                                     apiVersion: networking.k8s.io/v1\nkind: NetworkPolicy\nmetadata:\n  name: allow-control-plane-traffic\n  namespace: runai\nspec:\n  podSelector:\n    matchLabels:\n      app: runai\n  policyTypes:\n    - Ingress\n    - Egress\n  egress:\n    - to:\n        - ipBlock:\n            cidr: <control-plane-ip-range>\n      ports:\n        - protocol: TCP\n          port: <control-plane-port>\n  ingress:\n    - from:\n        - ipBlock:\n            cidr: <control-plane-ip-range>\n      ports:\n        - protocol: TCP\n          port: <control-plane-port>\n
                                    • Check infrastructure-level configurations:

                                      • Ensure that firewall rules and security groups allow traffic between your Kubernetes cluster and the Control Plane
                                      • Verify required ports and protocols:
                                        • Ensure that the necessary ports and protocols for Run:ai\u2019s services are not blocked by any firewalls or security groups
                                  4. Check Run:ai services logs

                                    • Open your terminal
                                    • Make sure you have access to the Kubernetes cluster with permissions to view logs
                                    • Copy and paste the following commands to view the logs of the Run:ai services:
                                    kubectl logs deployment/runai-agent -n runai\nkubectl logs deployment/cluster-sync -n runai\nkubectl logs deployment/assets-sync -n runai\n
                                    • Try to identify the problem from the logs. If you cannot resolve the issue, continue to the next step.
                                  5. Contact Run:ai\u2019s support

                                    • If the issue persists, contact Run:ai\u2019s support for assistance.
                                  Cluster has service issues

                                  Description: When a cluster's status is Has service issues, it means that one or more Run:ai services running in the cluster are not available.

                                  Mitigation:

                                  1. Verify non-functioning services

                                    • Open your terminal
                                    • Make sure you have access to the Kubernetes cluster with permissions to view the runaiconfig resource
                                    • Copy and paste the following command to determine which services are not functioning:
                                    kubectl get runaiconfig -n runai runai -ojson | jq -r '.status.conditions | map(select(.type == \"Available\"))'\n
                                  2. Check for Kubernetes events

                                    • Open your terminal
                                    • Make sure you have access to the Kubernetes cluster with permissions to view events
                                    • Copy and paste the following command to get all Kubernetes events:
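                                    A standard command for this is shown below as a sketch; the -n runai flag limits the output to the Run:ai namespace, and -A can be used instead to list events across all namespaces:
                                    kubectl get events -n runai --sort-by='.lastTimestamp'\n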
                                  3. Inspect resource details

                                    • Open your terminal
                                    • Make sure you have access to the Kubernetes cluster with permissions to describe resources
                                    • Copy and paste the following command to check the details of the required resource:
                                    kubectl describe <resource_type> <name>\n
                                  4. Contact Run:ai\u2019s Support

                                    • If the issue persists, contact Run:ai\u2019s support for assistance.
                                  Cluster is waiting to connect

                                  Description: When the cluster's status is \u2018waiting to connect\u2019, it means that no communication from the cluster services reaches the Run:ai Platform. This may be due to networking issues or issues with Run:ai services.

                                  Mitigation:

                                  1. Check Run:ai\u2019s services status

                                    • Open your terminal
                                    • Make sure you have access to the Kubernetes cluster with permissions to view pods
                                    • Copy and paste the following command to verify that Run:ai\u2019s services are running:
                                    kubectl get pods -n runai | grep -E 'runai-agent|cluster-sync|assets-sync'\n
                                    • If any of the services are not running, see the \u2018cluster has service issues\u2019 scenario.
                                  2. Check the network connection

                                    • Open your terminal
                                    • Make sure you have access to the Kubernetes cluster with permissions to create pods
                                    • Copy and paste the following command to create a connectivity check pod:
                                    kubectl run control-plane-connectivity-check -n runai --image=wbitt/network-multitool --command -- /bin/sh -c 'curl -sSf <control-plane-endpoint> > /dev/null && echo \"Connection Successful\" || echo \"Failed connecting to the Control Plane\"'\n
                                    • Replace <control-plane-endpoint> with the URL of the Control Plane in your environment. If the pod fails to connect to the Control Plane, check for potential network policies:
                                  3. Check and modify the network policies

                                    • Open your terminal
                                    • Copy and paste the following command to check the existence of network policies:
                                    kubectl get networkpolicies -n runai\n
                                    • Review the policies to ensure that they allow traffic from the Run:ai namespace to the Control Plane. If necessary, update the policies to allow the required traffic. Example of allowing traffic:
                                    apiVersion: networking.k8s.io/v1\nkind: NetworkPolicy\nmetadata:\n  name: allow-control-plane-traffic\n  namespace: runai\nspec:\n  podSelector:\n    matchLabels:\n      app: runai\n  policyTypes:\n    - Ingress\n    - Egress\n  egress:\n    - to:\n        - ipBlock:\n            cidr: <control-plane-ip-range>\n      ports:\n        - protocol: TCP\n          port: <control-plane-port>\n  ingress:\n    - from:\n        - ipBlock:\n            cidr: <control-plane-ip-range>\n      ports:\n        - protocol: TCP\n          port: <control-plane-port>\n
                                    • Check infrastructure-level configurations:
                                    • Ensure that firewall rules and security groups allow traffic between your Kubernetes cluster and the Control Plane
                                    • Verify required ports and protocols:
                                      • Ensure that the necessary ports and protocols for Run:ai\u2019s services are not blocked by any firewalls or security groups
                                  4. Check Run:ai services logs

                                    • Open your terminal
                                    • Make sure you have access to the Kubernetes cluster with permission to view logs
                                    • Copy and paste the following commands to view the logs of the Run:ai services:
                                    kubectl logs deployment/runai-agent -n runai\nkubectl logs deployment/cluster-sync -n runai\nkubectl logs deployment/assets-sync -n runai\n
                                    • Try to identify the problem from the logs. If you cannot resolve the issue, continue to the next step
                                  5. Contact Run:ai\u2019s support

                                    • If the issue persists, contact Run:ai\u2019s support for assistance.
                                  Cluster is missing prerequisites

                                  Description: When a cluster's status displays Missing prerequisites, it indicates that at least one of the Mandatory Prerequisites has not been fulfilled. In such cases, Run:ai services may not function properly.

                                  Mitigation:

                                  If you have ensured that all prerequisites are installed and the status still shows missing prerequisites, follow these steps:

                                  1. Check the message in the Run:ai platform for further details regarding the missing prerequisites.
                                  2. Inspect the runai-public ConfigMap:

                                    • Open your terminal and type the following command to list all ConfigMaps in the runai namespace:
                                    kubectl get configmap -n runai\n
                                  3. Describe the ConfigMap

                                    • Locate the ConfigMap named runai-public from the list
                                    • To view the detailed contents of this ConfigMap, type the following command:
                                    kubectl describe configmap runai-public -n runai\n
                                  4. Find Missing Prerequisites

                                    • In the output displayed, look for a section labeled dependencies.required
                                    • This section provides detailed information about any missing resources or prerequisites. Review this information to identify what is needed
                                  5. Contact Run:ai\u2019s support

                                    • If the issue persists, contact Run:ai\u2019s support for assistance.
                                  "},{"location":"admin/config/create-k8s-assets-in-advance/","title":"Creating Kubernetes Assets in Advance","text":"

                                  This article describes how to mark Kubernetes assets for use by Run:ai.

                                  "},{"location":"admin/config/create-k8s-assets-in-advance/#creating-pvcs-in-advance","title":"Creating PVCs in advance","text":"

                                  Add PVCs in advance to be used when creating a PVC-type data source via the Run:ai UI.

                                  Follow the steps below for each required scope:

                                  "},{"location":"admin/config/create-k8s-assets-in-advance/#cluster-scope","title":"Cluster scope","text":"
                                  1. Locate the PVC in the Run:ai namespace (runai)
                                  2. To authorize Run:ai to use the PVC, label it: run.ai/cluster-wide: "true". The PVC is now displayed for that scope in the list of existing PVCs.
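                                  For example, assuming kubectl access and a PVC named my-pvc (a placeholder), the label can be applied with:

                                  kubectl label persistentvolumeclaims -n runai my-pvc run.ai/cluster-wide=true\n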
                                  "},{"location":"admin/config/create-k8s-assets-in-advance/#department-scope","title":"Department scope","text":"
                                  1. Locate the PVC in the Run:ai namespace (runai)
                                  2. To authorize Run:ai to use the PVC, label it: run.ai/department: "<department-id>". The PVC is now displayed for that scope in the list of existing PVCs.
                                  "},{"location":"admin/config/create-k8s-assets-in-advance/#project-scope","title":"Project scope","text":"
                                  1. Locate the PVC in the project\u2019s namespace. The PVC is now displayed for that scope in the list of existing PVCs.
                                  "},{"location":"admin/config/create-k8s-assets-in-advance/#creating-configmaps-in-advance","title":"Creating ConfigMaps in advance","text":"

                                  Add ConfigMaps in advance to be used when creating a ConfigMap-type data source via the Run:ai UI.

                                  "},{"location":"admin/config/create-k8s-assets-in-advance/#cluster-scope_1","title":"Cluster scope","text":"
                                  1. Locate the ConfigMap in the Run:ai namespace (runai)
                                  2. To authorize Run:ai to use the ConfigMap, label it: run.ai/cluster-wide: "true"
                                  3. The ConfigMap must have a label of run.ai/resource: <resource-name>

                                    The ConfigMap is now displayed for that scope in the list of existing ConfigMaps.
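                                  For example, assuming kubectl access and a ConfigMap named my-config (a placeholder), both labels can be applied with:

                                  kubectl label configmap my-config -n runai run.ai/cluster-wide=true run.ai/resource=<resource-name>\n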

                                  "},{"location":"admin/config/create-k8s-assets-in-advance/#department-scope_1","title":"Department scope","text":"
                                  1. Locate the ConfigMap in the Run:ai namespace (runai)
                                  2. To authorize Run:ai to use the ConfigMap, label it: run.ai/department: \"<department-id>\"
                                  3. The ConfigMap must have a label of run.ai/resource: <resource-name>

                                    The ConfigMap is now displayed for that scope in the list of existing ConfigMaps.

                                  "},{"location":"admin/config/create-k8s-assets-in-advance/#project-scope_1","title":"Project scope","text":"
                                  1. Locate the ConfigMap in the project\u2019s namespace
                                  2. The ConfigMap must have a label of run.ai/resource: <resource-name>

                                    The ConfigMap is now displayed for that scope in the list of existing ConfigMaps.

                                  "},{"location":"admin/config/default-scheduler/","title":"Setting Run:ai as the default scheduler per Namespace (Project)","text":""},{"location":"admin/config/default-scheduler/#introduction","title":"Introduction","text":"

                                  Kubernetes has a default scheduler that decides where to place Kubernetes Pods. Run:ai has implemented its own scheduler, called runai-scheduler, and uses it by default for Run:ai workloads.

                                  You can decide to use the Run:ai scheduler for other, non-Run:ai, workloads by adding the following to the workload's YAML file:

                                  schedulerName: runai-scheduler\n
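                                  As a minimal sketch, a non-Run:ai Pod that opts into the Run:ai scheduler might look like the following; the name, namespace, and image are placeholders:

                                  apiVersion: v1\nkind: Pod\nmetadata:\n  name: my-workload          # placeholder name\n  namespace: runai-proj-a    # placeholder Run:ai project namespace\nspec:\n  schedulerName: runai-scheduler   # use the Run:ai scheduler for this pod\n  containers:\n  - name: main\n    image: ubuntu            # placeholder image\n    command: [sleep, infinity]\n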
                                  "},{"location":"admin/config/default-scheduler/#making-runai-the-default-scheduler","title":"Making Run:ai the default scheduler","text":"

                                  There may be cases where you cannot change the YAML file but still want to use the Run:ai Scheduler to schedule those workloads.

                                  For such cases, another option is to configure the Run:ai Scheduler as the default scheduler for a specific namespace. This will now make any workload type that is submitted to that namespace (equivalent to a Run:ai Project) use the Run:ai scheduler.

                                  To configure this, add the following annotation to the namespace itself:

                                  runai/enforce-scheduler-name: true

                                  "},{"location":"admin/config/default-scheduler/#example","title":"Example","text":"

                                  To annotate a project named proj-a, use the following command:

                                  kubectl annotate ns runai-proj-a runai/enforce-scheduler-name=true\n

                                  Verify the namespace in YAML format to see the annotation:

                                  kubectl get ns runai-proj-a -o yaml\n

                                  Output:

                                  apiVersion: v1\nkind: Namespace\nmetadata:\n  annotations:\n    runai/enforce-scheduler-name: \"true\"\n  creationTimestamp: \"2024-04-09T08:15:50Z\"\n  labels:\n    kubernetes.io/metadata.name: runai-proj-a\n    runai/namespace-version: v2\n    runai/queue: proj-a\n  name: runai-proj-a\n  resourceVersion: \"388336\"\n  uid: c53af666-7989-43df-9804-42bf8965ce83\nspec:\n  finalizers:\n  - kubernetes\nstatus:\n  phase: Active\n
                                  "},{"location":"admin/config/dr/","title":"Backup & Restore","text":""},{"location":"admin/config/dr/#runai-cluster-restore","title":"Run:ai Cluster Restore","text":"

                                  This article explains how to restore a Run:ai cluster on a different Kubernetes environment.

                                  In the event of a critical Kubernetes failure, or if you want to migrate a Run:ai cluster to a new Kubernetes environment, simply reinstall the Run:ai cluster. Once you have reinstalled and reconnected the cluster, projects, workloads and other cluster data are synced automatically.

                                  Advanced features and customized deployment configurations of the Run:ai cluster are stored locally on the Kubernetes cluster. Backing up and restoring these configurations is optional and can be done separately.

                                  "},{"location":"admin/config/dr/#backup","title":"Backup","text":"

                                  As explained above, backing up cluster data is not required; the backup procedure below is optional and relevant only for advanced or customized deployments.

                                  "},{"location":"admin/config/dr/#backup-cluster-configurations","title":"Backup cluster configurations","text":"

                                  To backup Run:ai cluster configurations:

                                  1. Run the following command in your terminal:
                                    kubectl get runaiconfig runai -n runai -o yaml -o=jsonpath='{.spec}' > runaiconfig_backup.yaml\n
                                  2. Once the runaiconfig_backup.yaml backup file is created, save the file externally so that it can be retrieved later.
                                  "},{"location":"admin/config/dr/#restore","title":"Restore","text":"

                                  Follow the steps below to restore the Run:ai cluster on a new Kubernetes environment.

                                  "},{"location":"admin/config/dr/#prerequisites","title":"Prerequisites","text":"

                                  Before restoring the Run:ai cluster, it is essential to validate that it is both disconnected and uninstalled.

                                  1. If the Kubernetes cluster is still available, uninstall the Run:ai cluster - make sure not to remove the cluster from the Control Plane
                                  2. Navigate to the Cluster page in the Run:ai platform
                                  3. Search for the cluster, and make sure its status is Disconnected
                                  "},{"location":"admin/config/dr/#re-installing-runai-cluster","title":"Re-installing Run:ai Cluster","text":"
                                  1. Follow the Run:ai cluster installation instructions and ensure all prerequisites are met
                                  2. If you have a back-up of the cluster configurations, reload it once the installation is complete
                                    kubectl apply -f runaiconfig_backup.yaml -n runai\n
                                  3. Navigate to the Cluster page in the Run:ai platform
                                  4. Search for the cluster, and make sure its status is Connected
                                  "},{"location":"admin/config/dr/#runai-control-plane","title":"Run:ai Control Plane","text":"

                                  The self-hosted variant of Run:ai also installs the control-plane at the customer site. As such, it becomes the responsibility of the IT organization to verify that the system is configured for proper backup and learn how to recover the data when needed.

                                  "},{"location":"admin/config/dr/#database-storage","title":"Database Storage","text":"

                                  Run:ai uses an internal PostgreSQL database. The database is stored on a Kubernetes Persistent Volume (PV). You must provide a backup solution for the database. Some options:

                                  • Backing up PostgreSQL itself (a matching restore sketch appears after this list). Example: kubectl -n runai-backend exec -it runai-backend-postgresql-0 -- env PGPASSWORD=password pg_dump -U postgres backend > cluster_name_db_backup.sql
                                  • Backing up the persistent volume holding the database storage.
                                  • Using third-party backup solutions.
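                                  A matching restore, assuming the same pod name, credentials, and database name used in the backup example above, might look like:

                                  kubectl -n runai-backend exec -i runai-backend-postgresql-0 -- env PGPASSWORD=password psql -U postgres backend < cluster_name_db_backup.sql\n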

                                  Run:ai also supports an external PostgreSQL database. For details on how to configure an external database please contact Run:ai customer support.

                                  "},{"location":"admin/config/dr/#metrics-storage","title":"Metrics Storage","text":"

                                  Run:ai stores metric history using Thanos. Thanos is configured to store data on a persistent volume. The recommendation is to back up the PV.

                                  "},{"location":"admin/config/dr/#backing-up-control-plane-configuration","title":"Backing up Control-Plane Configuration","text":"

                                  The installation of the Run:ai control plane can be configured. The configuration is provided as --set flags in the helm installation command. These changes are preserved on upgrade, but not on uninstall or if Kubernetes is damaged. Thus, it is best to back up these customizations. For a list of customizations used during the installation, run:

                                  helm get values runai-backend -n runai-backend
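                                  To keep these values for later recovery, the output can be redirected to a file; the filename below is arbitrary:

                                  helm get values runai-backend -n runai-backend > control-plane-values.yaml\n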

                                  "},{"location":"admin/config/dr/#recovery","title":"Recovery","text":"

                                  To recover Run:ai:

                                  • Re-create the Kubernetes/OpenShift cluster.
                                  • Recover the persistent volumes for metrics and database.
                                  • Re-install the Run:ai control plane. Use the additional configuration previously saved and connect to the restored PostgreSQL PV. Connect Prometheus to the stored metrics PV.
                                  • Re-install the cluster. Add additional configuration post-install.
                                  • If the cluster is configured such that Projects do not create a namespace automatically, you will need to re-create namespaces and apply role bindings as discussed in Kubernetes or OpenShift.
                                  "},{"location":"admin/config/ha/","title":"High Availability","text":"

                                  The purpose of this document is to configure Run:ai such that it will continue to provide service even if parts of the system are down.

                                  A frequent failure scenario is a physical node in the system becoming non-responsive due to physical problems or lack of resources. In such a case, Kubernetes will attempt to relocate the running pods, but the process may take time, during which Run:ai will be down.

                                  A different scenario is a high transaction load, leading to system overload. To address such a scenario, please review the article: scaling the Run:ai system.

                                  "},{"location":"admin/config/ha/#runai-control-plane","title":"Run:ai Control Plane","text":""},{"location":"admin/config/ha/#runai-system-workers","title":"Run:ai system workers","text":"

                                  The Run:ai control plane allows the optional gathering of Run:ai pods into specific nodes. When this feature is used, it is important to set more than one node as a Run:ai system worker. Otherwise, the horizontal scaling described below will not span multiple nodes, and the system will remain with a single point of failure.

                                  "},{"location":"admin/config/ha/#horizontal-scalability-of-runai-services","title":"Horizontal Scalability of Run:ai services","text":"

                                  Horizontal scalability is the ability to instruct the system to create more pods as the incoming load grows and to scale down when the load subsides.

                                  The Run:ai control plane is running on a single Kubernetes namespace named runai-backend. The namespace contains various Kubernetes Deployments and StatefulSets. Each of these services can be scaled horizontally.

                                  "},{"location":"admin/config/ha/#deployments","title":"Deployments","text":"

                                  Each of the Run:ai deployments can be set to scale up by adding a helm setting on install/upgrade, e.g. --set frontend.autoscaling.enabled=true. For a full list of settings, please contact Run:ai customer support.

                                  "},{"location":"admin/config/ha/#statefulsets","title":"StatefulSets","text":"

                                  Run:ai uses three third-party services that are managed as Kubernetes StatefulSets:

                                  • Keycloak\u2014Stores the Run:ai authentication configuration as well as user identities. To scale Keycloak, use the Run:ai control-plane helm flag --set keycloakx.autoscaling.enabled=true. By default, Keycloak sets a minimum of 3 pods and will scale to more on transaction load.
                                  • PostgreSQL\u2014It is not possible to configure an internal PostgreSQL to scale horizontally. If this is of importance, please contact Customer Support to understand how to connect Run:ai to an external PostgreSQL service which can be configured for high availability.
                                  • Thanos\u2014To enable Thanos autoscaling, use the following Run:ai control-plane helm flags:
                                  --set thanos.query.autoscaling.enabled=true  \n--set thanos.query.autoscaling.maxReplicas=2\n--set thanos.query.autoscaling.minReplicas=2 \n
                                  "},{"location":"admin/config/ha/#runai-cluster","title":"Run:ai Cluster","text":""},{"location":"admin/config/ha/#runai-system-workers_1","title":"Run:ai system workers","text":"

                                  The Run:ai cluster allows the mandatory gathering of Run:ai pods into specific nodes. When this feature is used, it is important to set more than one node as a Run:ai system worker. Otherwise, the horizontal scaling described below may not span multiple nodes, and the system will remain with a single point of failure.

                                  "},{"location":"admin/config/ha/#prometheus","title":"Prometheus","text":"

                                  The default Prometheus installation uses a single pod replica. If the node running the pod is unresponsive, metrics will not be scraped from the cluster and will not be sent to the Run:ai control-plane.

                                  Prometheus supports high availability by allowing to run multiple instances. The tradeoff of this approach is that all instances will scrape and send the same data. The Run:ai control plane will identify duplicate metric series and ignore them. This approach will thus increase network, CPU and memory consumption.

                                  To change the number of Prometheus instances, edit the runaiconfig as described under customizing the Run:ai cluster. Under prometheus.spec, set replicas to 2.
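                                  As a minimal sketch, assuming prometheus.spec sits under the runaiconfig spec as in the other examples in this document, the relevant section would look similar to:

                                  spec:\n  prometheus:\n    spec:\n      replicas: 2   # run two Prometheus instances for high availability\n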

                                  "},{"location":"admin/config/large-clusters/","title":"Scaling the Run:ai system","text":"

                                  The purpose of this document is to provide information on how to scale the Run:ai cluster and the Run:ai control-plane to withstand large transaction loads.

                                  "},{"location":"admin/config/large-clusters/#scaling-the-runai-control-plane","title":"Scaling the Run:ai Control Plane","text":"

                                  The Control plane deployments which may encounter load are:

                                  • Backend (Kubernetes Deployment runai-backend-backend): Main control-plane service
                                  • Frontend (Kubernetes Deployment runai-backend-frontend): Serving of the Run:ai console
                                  • Grafana (Kubernetes Deployment runai-backend-grafana): Serving of the Run:ai metrics inside the Run:ai console

                                  To increase the number of replicas, use the following Run:ai control-plane helm flags:

                                  --set backend.autoscaling.enabled=true \n--set frontend.autoscaling.enabled=true\n--set grafana.autoscaling.enabled=true --set grafana.autoscaling.minReplicas=2\n

                                  Important

                                  If you have chosen to mark some of the nodes as Run:ai System Workers, the new replicas will attempt to use these nodes first. Thus, for high availability purposes, you will want to mark more than one node as a Run:ai System Worker.

                                  "},{"location":"admin/config/large-clusters/#thanos","title":"Thanos","text":"

                                  Thanos is the third-party service used by Run:ai to store metrics. Under a significant user load, you may also need to increase resources for the Thanos query function. Use the following Run:ai control-plane helm flags:

                                  --set thanos.query.resources.limits.memory=3G\n--set thanos.query.resources.requests.memory=3G\n--set thanos.query.resources.limits.cpu=1\n--set thanos.query.resources.requests.cpu=1\n\n--set thanos.receive.resources.limits.memory=6G \n--set thanos.receive.resources.requests.memory=6G\n--set thanos.receive.resources.limits.cpu=1 \n--set thanos.receive.resources.requests.cpu=1\n
                                  "},{"location":"admin/config/large-clusters/#scaling-the-runai-cluster","title":"Scaling the Run:ai Cluster","text":""},{"location":"admin/config/large-clusters/#cpu-memory-resources","title":"CPU & Memory Resources","text":"

                                  Under Kubernetes, each of the Run:ai containers, has default resource requirements that reflect an average customer load. With significantly larger cluster loads, certain Run:ai services will require more CPU and memory resources. Run:ai now supports the ability to configure these resources and to do so for each Run:ai service group separately.

                                  "},{"location":"admin/config/large-clusters/#service-groups","title":"Service Groups","text":"

                                  Run:ai supports setting requests and limits configurations for CPU and memory for Run:ai containers. The configuration is set per service group. Each service group reflects a certain load type:

                                  • SchedulingServices: Containers associated with the Run:ai scheduler. Run:ai containers: Scheduler, StatusUpdater, MetricsExporter, PodGrouper, PodGroupAssigner, Binder
                                  • SyncServices: Containers associated with syncing updates between the Run:ai cluster and the Run:ai control plane. Run:ai containers: Agent, ClusterSync, AssetsSync
                                  • WorkloadServices: Containers associated with submitting Run:ai Workloads. Run:ai containers: WorkloadController, JobController
                                  "},{"location":"admin/config/large-clusters/#configuration-steps","title":"Configuration Steps","text":"

                                  To configure resource requirements for a group of services, update the RunaiConfig. Set the spec.global.<service-group>.resources section. The following example shows the configuration of scheduling services resource requirements:

                                  apiVersion: run.ai/v1\nkind: RunaiConfig\nmetadata:\nspec:\n global:\n   schedulingServices:\n     resources:\n       limits:\n         cpu: 1000m\n         memory: 1Gi\n       requests:\n         cpu: 100m\n         memory: 512Mi\n

                                  Use syncServices and workloadServices for the other two service groups.

                                  "},{"location":"admin/config/large-clusters/#recommended-resource-specifications-for-large-clusters","title":"Recommended Resource Specifications For Large Clusters","text":"

                                  In large clusters (100 nodes or 1500 GPUs or more), we recommend the following configuration for SchedulingServices and SyncServices groups:

                                  resources:\n requests:\n   cpu: 1\n   memory: 1Gi\n limits:\n   cpu: 2\n   memory: 2Gi\n
                                  "},{"location":"admin/config/large-clusters/#sending-metrics","title":"Sending Metrics","text":"

                                  Run:ai uses Prometheus to scrape metrics from the Run:ai cluster and to send them to the Run:ai control plane. The number of metrics is a function of the number of Nodes, Jobs and Projects which the system contains. When reaching hundreds of Nodes and Projects, the system will be sending large quantities of metrics which, in turn, will create a strain on the network as well as the receiving side in the control plane (SaaS or self-hosted).

                                  To reduce this strain, we suggest configuring Prometheus to send information in larger bulks and reduce the number of network connections:

                                  • Edit the runaiconfig as described under customizing the cluster.
                                  • Under prometheus.remoteWrite add the following:
                                  queueConfig:\n  capacity: 5000\n  maxSamplesPerSend: 1000\n  maxShards: 100\n

                                  This article provides additional details and insight.

                                  Also, note that this configuration enlarges the Prometheus queues and thus increases the required memory. It is hence suggested to reduce the metrics retention period as described here

                                  "},{"location":"admin/config/limit-to-node-group/","title":"Group Nodes","text":""},{"location":"admin/config/limit-to-node-group/#why","title":"Why?","text":"

                                  In some business scenarios, you may want to direct the Run:ai scheduler to schedule a Workload to a specific node or a node group. For example, in some academic institutions, hardware is bought using a specific grant and thus "belongs" to a specific research group. Another example is an inference workload that is optimized for a specific GPU type and must have dedicated resources reserved to ensure enough capacity.

                                  Run:ai provides two methods to designate, and group, specific resources:

                                  • Node Pools: Run:ai allows administrators to group specific nodes into a node pool. A node pool is a group of nodes identified by a given name (node pool name) and grouped by any label (key and value combination). The label can be chosen by the administrator or can be an existing, pre-set, label (such as an NVIDIA GPU type label).
                                  • Node Affinity: Run:ai allows labeling a node, or a set of nodes, and then, during scheduling, using the flag --node-type <label> to force the workload to be allocated to those nodes.

                                  Important

                                  One can set and use both node pool and node affinity combined as a prerequisite to the scheduler. For example, if a researcher wants to use a T4 node with an InfiniBand card, they can use a T4 node pool and, from that group, choose only the nodes with an InfiniBand card (node-type = infiniband).

                                  There is a tradeoff in place when allowing Researchers to designate specific nodes. Overuse of this feature limits the scheduler in finding an optimal resource and thus reduces overall cluster utilization.

                                  "},{"location":"admin/config/limit-to-node-group/#configuring-node-groups","title":"Configuring Node Groups","text":"

                                  To configure a node pool:

                                  • Find the label key & value you want to use for Run:ai to create the node pool.
                                  • Check that the nodes you want to group as a pool have a unique label to use, otherwise you should mark those nodes with your own uniquely identifiable label.
                                  • Get the names of the nodes you want Run:ai to group together. To get a list of nodes, run:
                                  kubectl get nodes\nkubectl get nodes --show-labels\n
                                  • If you chose to set your own label, run the following:
                                  kubectl label node <node-name> <label-key>=<label-value>\n

                                  The same value can be set to a single node or multiple nodes. Node Pool can only use one label (key & value) at a time.

                                  • To create a node pool use the create node pool Run:ai API.

                                  To configure a node affinity:

                                  • Get the names of the nodes where you want to limit Run:ai. To get a list of nodes, run:
                                  kubectl get nodes\n
                                  • For each node run the following:
                                  kubectl label node <node-name> run.ai/type=<label>\n

                                  The same value can be set to a single node, or for multiple nodes. A node can only be set with a single value.

                                  "},{"location":"admin/config/limit-to-node-group/#using-node-groups-via-the-cli","title":"Using Node Groups via the CLI","text":"

                                  To use a Run:ai node pool with a workload, use the Run:ai CLI flag --node-pools:

                                  runai submit job1 ... --node-pools \"my-pool\" ...\n

                                  To use multiple node pools with a workload, use the Run:ai CLI command:

                                  runai submit job1 ... --node-pools \"my-pool my-pool2 my-pool3\" ...\n

                                  With multiple node pools, the researcher creates a list of prioritized node pools and lets the scheduler try and choose from any of the node pools in the list, according to the given priority.

                                  To use node affinity, use the node type label with the --node-type flag:

                                  runai submit job1 ... --node-type \"my-nodes\"\n

                                  A researcher may combine the two flags to select both a node pool and a specific set of nodes out of that node pool (e.g. gpu-type=t4 and node-type=infiniband):

                                  runai submit job1 ... --node-pool-name "my pool" --node-type "my-nodes"\n

                                  Note

                                  When submitting a workload, if you choose a node pool label and a node affinity (node type) label which does not intersect, the Run:ai scheduler will not be able to schedule that workload as it represents an empty nodes group.

                                  See the runai submit documentation for further information.

                                  "},{"location":"admin/config/limit-to-node-group/#assigning-node-groups-to-a-project","title":"Assigning Node Groups to a Project","text":"

                                  Node Pools are automatically assigned to all Projects and Departments with zero resource allocation as default. Allocating resources to a node pool can be done for each Project and Department. Submitting a workload to a node pool that has zero allocation for a specific project (or department) results in that workload running as an over-quota workload.

To assign and configure specific node affinity groups or node pools to a Project, see Working with Projects.

                                  When the command-line interface flag is used in conjunction with Project-based affinity, the flag is used to refine the list of allowable node groups set in the Project.

                                  "},{"location":"admin/config/node-affinity-with-cloud-node-pools/","title":"Node affinity with cloud node pools","text":"

Run:ai allows for node affinity. Node affinity is the ability to assign a Project to run on specific nodes. To use the node affinity feature, you will need to label the target nodes with the run.ai/type label. Most cloud clusters allow configuring node labels for the node pools in the cluster. This guide shows how to apply this configuration to different cloud providers.

To make node affinity work with node pools on various cloud providers, make sure the node pools are configured with the appropriate Kubernetes label (run.ai/type=<TYPE_VALUE>).

                                  "},{"location":"admin/config/node-affinity-with-cloud-node-pools/#setting-node-labels-while-creating-a-new-cluster","title":"Setting node labels while creating a new cluster","text":"

You can configure node-pool labels at cluster creation time.

GKE

• At the first creation screen, you will see a menu on the left side named node-pools.
• Expand the node pool you want to label.
• Click Metadata.
• Near the bottom, you will find the Kubernetes labels section. Add the key run.ai/type and the value <TYPE_VALUE>.

AKS

• When creating the AKS cluster, at the node-pools page click Create new node-pool.
• Go to the Labels section and add the key run.ai/type and the value <TYPE_VALUE>.

EKS

• Create a regular EKS cluster.
• Click Compute.
• Click Add node group.
• In the Kubernetes labels section click Add label. Add the key run.ai/type and the value <TYPE_VALUE>.
                                  "},{"location":"admin/config/node-affinity-with-cloud-node-pools/#setting-node-labels-for-a-new-node-pool","title":"Setting node labels for a new node pool","text":"GKEAKSEKS
                                  • At the node pool creation screen, go to the metadata section.
                                  • Near the bottom, you will find the Kubernetes label section. Add the key run.ai/type and the value <TYPE_VALUE>.
                                  • Go to your AKS page at Azure.
                                  • On the left menu click the node-pools button.
                                  • Click on Add Node Pool.
                                  • In the new Node Pool page go to Optional settings.
                                  • In the Kubernetes labels section click on Add label. Add the key run.ai/type and the value <TYPE_VALUE>.
                                  • Go to Add node group screen.
                                  • In the Kubernetes labels section click on Add label. Add the key run.ai/type and the value <TYPE_VALUE>.
                                  "},{"location":"admin/config/node-affinity-with-cloud-node-pools/#editing-node-labels-for-an-existing-node-pool","title":"Editing node labels for an existing node pool","text":"GKEAKSEKS
                                  • Go to the Google Kubernetes Engine page in the Google Cloud console.
                                  • Go to Google Kubernetes Engine.
                                  • In the cluster list, click the name of the cluster you want to modify.
                                  • Click the Nodes tab
                                  • Under Node Pools, click the name of the node pool you want to modify, then click Edit.
                                  • Near the bottom, you will find the Kubernetes label section. Add the key run.ai/type and the value <TYPE_VALUE>.

                                  To update an existing node pool label you must use the azure cli. Run the following command:

                                  az aks nodepool update \\\n    --resource-group [RESOURCE GROUP] \\\n    --cluster-name [CLUSTER NAME] \\\n    --name labelnp \\\n    --labels run.ai/type=[TYPE_VALUE] \\\n    --no-wait\n
                                  • Go to the node group page and click on Edit.
                                  • In the Kubernetes labels section click on Add label. Add the key run.ai/type and the value <TYPE_VALUE>.
                                  "},{"location":"admin/config/node-roles/","title":"Node roles","text":"

                                  This article explains how to designate specific node roles in a Kubernetes cluster to ensure optimal performance and reliability in production deployments.

                                  For optimal performance in production clusters, it is essential to avoid extensive CPU usage on GPU nodes where possible. This can be done by ensuring the following:

                                  • Run:ai system-level services run on dedicated CPU-only nodes.
                                  • Workloads that do not request GPU resources (e.g. Machine Learning jobs) are executed on CPU-only nodes.
                                  "},{"location":"admin/config/node-roles/#prerequisites","title":"Prerequisites","text":"

                                  To perform these tasks, make sure to install the Run:ai Administrator CLI.

                                  "},{"location":"admin/config/node-roles/#configure-node-roles","title":"Configure Node Roles","text":"

                                  The following node roles can be configured on the cluster:

                                  • System node: Reserved for Run:ai system-level services.
                                  • GPU Worker node: Dedicated for GPU-based workloads.
                                  • CPU Worker node: Used for CPU-only workloads.
                                  "},{"location":"admin/config/node-roles/#system-nodes","title":"System nodes","text":"

Run:ai system nodes run the system-level services required for Run:ai to operate. Setting the system role for a node is done via the Run:ai Administrator CLI.

                                  Recommendation

                                  To ensure high availability and prevent a single point of failure, it is recommended to configure at least three system nodes in your cluster.

                                  To set a system role for a node in your Kubernetes cluster, follow these steps:

                                  1. Run the kubectl get nodes command to list all the nodes in your cluster and identify the name of the node you want to modify.
                                  2. Run one of the following commands to set or remove a node\u2019s role:
                                    runai-adm set node-role --runai-system-worker <node-name>\nrunai-adm remove node-role --runai-system-worker <node-name>\n

                                  The runai-adm CLI will label the node and set relevant cluster configurations.

                                  The Run:ai cluster applies Kubernetes Node Affinity using node labels to manage scheduling for cluster services (system).
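To sanity-check which nodes carry Run:ai role labels, you can inspect the node labels directly. The exact label keys applied by runai-adm may differ between versions, so treat the filter below as an assumption:

# Show node labels and filter for Run:ai related keys (the grep pattern is an assumption).
kubectl get nodes --show-labels | grep -i runai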

                                  Warning

                                  Do not assign a system node role to the Kubernetes master node. This may disrupt Kubernetes functionality, particularly if the Kubernetes API Server is configured to use port 443 instead of the default 6443.

                                  "},{"location":"admin/config/node-roles/#worker-nodes","title":"Worker nodes","text":"

Run:ai worker nodes run user-submitted workloads and the system-level DaemonSets required to operate them. Worker roles can be managed via the Run:ai Administrator CLI or kubectl.

                                  "},{"location":"admin/config/node-roles/#runai-administrator-cli","title":"Run:ai Administrator CLI","text":"

To set a worker role for a node in your Kubernetes cluster via the Run:ai Administrator CLI, follow these steps:

                                  1. Use the kubectl get nodes command to list all the nodes in your cluster and identify the name of the node you want to modify.
                                  2. Run one of the following commands to set or remove a node\u2019s role:
                                     runai-adm set node-role [--gpu-worker | --cpu-worker] <node-name>\n runai-adm remove node-role [--gpu-worker | --cpu-worker] <node-name>\n

                                  The runai-adm CLI will label the node and set relevant cluster configurations.

                                  Tip

Use the --all flag to set or remove a role for all nodes.
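For example, to mark every node in the cluster as a CPU worker and later undo it (a sketch based on the flags shown above):

# Assign the CPU worker role to all nodes in the cluster.
runai-adm set node-role --cpu-worker --all

# Remove the role from all nodes if the cluster layout changes.
runai-adm remove node-role --cpu-worker --all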

                                  "},{"location":"admin/config/node-roles/#kubectl","title":"Kubectl","text":"

                                  To set a worker role for a node in your Kubernetes cluster using Kubectl, follow these steps:

1. Validate that global.nodeAffinity.restrictScheduling is set to true in the cluster\u2019s Configurations (see the sketch after these steps).
                                  2. Use the kubectl get nodes command to list all the nodes in your cluster and identify the name of the node you want to modify.
                                  3. Run one of the following commands to label the node with its role:
                                    kubectl label nodes <node-name> [node-role.kubernetes.io/runai-gpu-worker=true | node-role.kubernetes.io/runai-cpu-worker=true]\nkubectl label nodes <node-name> [node-role.kubernetes.io/runai-gpu-worker=false | node-role.kubernetes.io/runai-cpu-worker=false]\n
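If restrictScheduling is not yet enabled, it can be set on the runaiconfig resource. This is a minimal sketch that follows the same patch pattern used elsewhere in this documentation and assumes the flag lives under spec.global.nodeAffinity:

# Enable node-affinity based scheduling restrictions (field path assumed from the flag name above).
kubectl patch -n runai runaiconfigs.run.ai/runai --type='merge' \
  --patch '{"spec":{"global":{"nodeAffinity":{"restrictScheduling":true}}}}'

# Confirm the value was applied.
kubectl get runaiconfigs.run.ai/runai -n runai -o jsonpath='{.spec.global.nodeAffinity.restrictScheduling}'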
                                  "},{"location":"admin/config/notifications/","title":"Notifications System","text":""},{"location":"admin/config/notifications/#email-notifications-for-data-scientists","title":"Email Notifications for Data Scientists","text":"

                                  Managing numerous data science workloads requires monitoring various stages, including submission, scheduling, initialization, execution, and completion. Additionally, handling suspensions and failures is crucial for ensuring timely workload completion. Email Notifications address this need by sending alerts for critical workload life cycle changes. This empowers data scientists to take necessary actions and prevent delays.

                                  "},{"location":"admin/config/notifications/#setting-up-email-notifications","title":"Setting Up Email Notifications","text":"

                                  Important

The system administrator needs to enable and set up email notifications so that users are kept informed about different system statuses.

                                  To enable email notifications for the system:

                                  1. Press General settings, then select Notifications.

                                    Note

                                    For SaaS deployments, use the Enable email notifications toggle.

2. In the SMTP Host field, enter the SMTP server address, and in the SMTP port field, enter the port number.

3. Select an Authentication type, either Plain or Login, and enter a username and password to be used for authentication.
                                  4. Enter the From email address and the Display name.
                                  5. Press Verify to ensure that the email configuration is working.
                                  6. Press Save when complete.
                                  "},{"location":"admin/config/notifications/#system-notifications","title":"System Notifications","text":"

Administrators can set system-wide notifications for all users in order to alert them of important information. System notifications allow administrators to update users about events that may be occurring within the Run:ai platform. The system notification appears at each login or, for users who are already logged in, after the message has changed.

                                  To configure system notifications:

                                  1. Press General settings, then select Notifications.
                                  2. In the System notification pane, press +MESSAGE.
                                  3. Enter your message in the text box. Use the formatting tool bar to add special formats to your message text.
                                  4. Enable the \"Don't show this again\" option to allow users to opt out of seeing the message multiple times.
                                  5. When complete, press Save & Publish.
                                  "},{"location":"admin/config/org-cert/","title":"Working with a Local Certificate Authority","text":"

                                  Run:ai can be installed in an isolated network. In this air-gapped configuration, the organization will not be using an established root certificate authority. Instead, the organization creates a local certificate which serves as the root certificate for the organization. The certificate is installed in all browsers within the organization.

                                  In the context of Run:ai, the cluster and control-plane need to be aware of this certificate for consumers to be able to connect to the system.

                                  "},{"location":"admin/config/org-cert/#preparation","title":"Preparation","text":"

                                  You will need to have the public key of the local certificate authority.

                                  "},{"location":"admin/config/org-cert/#control-plane-installation","title":"Control-Plane Installation","text":"
                                  • Create the runai-backend namespace if it does not exist.
                                  • Add the public key to the runai-backend namespace:

                                    kubectl -n runai-backend create secret generic runai-ca-cert \\ \n    --from-file=runai-ca.pem=<ca_bundle_path>\n

• As part of the installation instructions, you need to create a secret for runai-backend-tls. Use a certificate signed by the local certificate authority for this secret.

• When installing the control plane, add the following flag to the helm command: --set global.customCA.enabled=true
                                  "},{"location":"admin/config/org-cert/#cluster-installation","title":"Cluster Installation","text":"
                                  • Create the runai namespace if it does not exist.
                                  • Add the public key to the runai namespace:
                                    kubectl -n runai create secret generic runai-ca-cert \\\n    --from-file=runai-ca.pem=<ca_bundle_path>\n
• If you are using OpenShift, add the public key to the openshift-monitoring namespace:
                                    kubectl -n openshift-monitoring create secret generic runai-ca-cert \\\n    --from-file=runai-ca.pem=<ca_bundle_path>\n
• When installing the Run:ai operator, add the following flag to the helm command: --set global.customCA.enabled=true
                                  "},{"location":"admin/config/overview/","title":"Run:ai Configuration Articles","text":"

                                  This section provides a list of installation-related articles dealing with a wide range of subjects:

• Designating Specific Role Nodes: Set one or more designated Run:ai system nodes or limit Run:ai monitoring and scheduling to specific nodes in the cluster.
• Create and Troubleshoot Clusters: Create new clusters, view properties and status, and troubleshoot cluster connectivity related issues.
• Set Default Scheduler: Set the default scheduler for a specific namespace.
• Review Kubernetes Access provided to Run:ai: In restrictive Kubernetes environments such as when using OpenShift, understand and control what Kubernetes roles are provided to Run:ai.
• External access to Containers: Understand the available options for Researchers to access containers from the outside.
• Install the Run:ai Administrator Command-line Interface: The Administrator command-line is useful in a variety of flows such as cluster upgrade, node setup, etc.
• Set Node affinity with cloud node pools: Set node affinity when using a cloud provider for your cluster.
• Local Certificate Authority: For self-hosted Run:ai environments, specifically air-gapped installations, set up a local certificate authority to allow customers to safely connect to Run:ai.
• Backup & Restore: For self-hosted Run:ai environments, set up a scheduled backup of Run:ai data.
• High Availability: Configure Run:ai such that it will continue to provide service even if parts of the system are down.
• Scaling: Scale the Run:ai cluster and the Run:ai control-plane to withstand large transaction loads.
• Emails and system notification: Configure e-mail notifications.

"},{"location":"admin/config/secure-cluster/","title":"Secure your cluster","text":"

                                  This article details the security considerations for deploying Run:ai. It is intended to help administrators and security officers understand the specific permissions required by Run:ai.

                                  "},{"location":"admin/config/secure-cluster/#access-to-the-kubernetes-cluster","title":"Access to the Kubernetes cluster","text":"

Run:ai integrates with Kubernetes clusters and requires specific permissions to operate successfully. These permissions are controlled with configuration flags that dictate how Run:ai interacts with cluster resources. Prior to installation, security teams can review the permissions and ensure they align with their organization\u2019s policies.

                                  "},{"location":"admin/config/secure-cluster/#permissions-and-their-related-use-case","title":"Permissions and their related use-case","text":"

                                  Run:ai provides various security-related permissions that can be customized to fit specific organizational needs. Below are brief descriptions of the key use cases for these customizations:

• Automatic Namespace creation: Controls whether Run:ai automatically creates Kubernetes namespaces when new projects are created. Useful in environments where namespace creation must be strictly managed.
• Automatic user assignment: Decides if users are automatically assigned to projects within Run:ai. Helps manage user access more tightly in certain compliance-driven environments.
• Secret propagation: Determines whether Run:ai should propagate secrets across the cluster. Relevant for organizations with specific security protocols for managing sensitive data.
• Disabling Kubernetes limit range: Chooses whether to disable the Kubernetes Limit Range feature. May be adjusted in environments with specific resource management needs.

                                  Note

These security customizations allow organizations to tailor Run:ai to their specific needs. Settings should be changed cautiously and only when necessary to meet particular security, compliance, or operational requirements.

                                  "},{"location":"admin/config/secure-cluster/#secure-installation","title":"Secure installation","text":"

                                  Many organizations enforce IT compliance rules for Kubernetes, with strict access control for installing and running workloads. OpenShift uses Security Context Constraints (SCC) for this purpose. Run:ai fully supports SCC, ensuring integration with OpenShift's security requirements.

                                  "},{"location":"admin/config/secure-cluster/#security-vulnerabilities","title":"Security vulnerabilities","text":"

                                  The platform is actively monitored for security vulnerabilities, with regular scans conducted to identify and address potential issues. Necessary fixes are applied to ensure that the software remains secure and resilient against emerging threats, providing a safe and reliable experience.

                                  "},{"location":"admin/config/shared-storage/","title":"Shared Storage","text":"

Shared storage is a critical component in AI and machine learning workflows, particularly in scenarios involving distributed training and shared datasets. In AI and ML environments, data must be readily accessible across multiple nodes, especially when training large models or working with vast datasets. Shared storage enables seamless access to data, ensuring that all nodes in a distributed training setup can read and write to the same datasets simultaneously. This setup not only enhances efficiency but is also crucial for maintaining consistency and speed in high-performance computing environments.

                                  While Run:ai Platform supports a variety of remote data sources, such as Git and S3, it is often more efficient to keep data close to the compute resources. This proximity is typically achieved through the use of shared storage, accessible to multiple nodes in your Kubernetes cluster.

                                  "},{"location":"admin/config/shared-storage/#shared-storage","title":"Shared storage","text":"

                                  When implementing shared storage in Kubernetes, there are two primary approaches:

                                  • Utilizing the Kubernetes Storage Classes of your storage provider; or
                                  • Using a direct NFS (Network File System) mount

Storage Classes are the recommended option.

                                  Run:ai Data Sources support both direct NFS mount and Kubernetes Storage Classes.

                                  "},{"location":"admin/config/shared-storage/#kubernetes-storage-classes","title":"Kubernetes storage classes","text":"

Storage classes in Kubernetes define how storage is provisioned and managed. This allows you to select storage types optimized for AI workloads. For example, you can choose storage with high IOPS (Input/Output Operations Per Second) for rapid data access during intensive training sessions, or tiered storage options to balance cost and performance based on your organization\u2019s requirements. This approach supports dynamic provisioning, enabling storage to be allocated on demand as required by your applications.

Run:ai data sources such as Persistent Volume Claims (PVC) and Data Volumes leverage storage classes to manage and allocate storage efficiently. This ensures that the most suitable storage option is always accessible, contributing to the efficiency and performance of AI workloads.

                                  Note

Run:ai lists all available storage classes in the Kubernetes cluster, making it easy for users to select the appropriate storage. Additionally, policies can be set to restrict or enforce the use of specific storage classes, to help maintain compliance with organizational standards and optimize resource utilization.

                                  Kubernetes 1.23 (old)

When using Kubernetes 1.23, a Data Source of PVC type does not work with a Storage Class whose volumeBindingMode property is set to WaitForFirstConsumer.
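To check which storage classes exist in the cluster and how their volumeBindingMode is set (relevant to the Kubernetes 1.23 limitation above), a quick query such as the following can be used:

# List storage classes with their provisioner and volume binding mode.
kubectl get storageclass -o custom-columns=NAME:.metadata.name,PROVISIONER:.provisioner,BINDINGMODE:.volumeBindingMode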

                                  "},{"location":"admin/config/shared-storage/#direct-nfs-mount","title":"Direct NFS mount","text":"

                                  Direct NFS allows you to mount a shared file system directly across multiple nodes in your Kubernetes cluster. This method provides a straightforward way to share data among nodes and is often used for simple setups or when a dedicated NFS server is available.

However, using NFS can present challenges related to security and control. Direct NFS setups might lack the fine-grained control and security features available with storage classes.
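For reference, below is a minimal sketch of a statically provisioned NFS PersistentVolume and a matching claim. The server address, export path, and capacity are placeholders, and the claim should be created in the namespace of the project that will mount it:

# Create a PersistentVolume backed by an existing NFS export and a claim that binds to it.
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: PersistentVolume
metadata:
  name: shared-datasets-pv
spec:
  capacity:
    storage: 100Gi
  accessModes:
    - ReadWriteMany
  nfs:
    server: nfs-server.example.com   # placeholder NFS server
    path: /exports/datasets          # placeholder export path
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: shared-datasets-pvc
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: ""
  volumeName: shared-datasets-pv
  resources:
    requests:
      storage: 100Gi
EOF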

                                  "},{"location":"admin/config/workload-ownership-protection/","title":"Workload Deletion Protection","text":""},{"location":"admin/config/workload-ownership-protection/#workload-deletion-protection","title":"Workload Deletion Protection","text":"

Workload deletion protection in Run:ai ensures that only the user who created a workload can delete or modify it. This feature is designed to safeguard important jobs and configurations from accidental or unauthorized modifications by users who did not originally create the workload.

                                  By enforcing ownership rules, Run:ai helps maintain the integrity and security of your machine learning operations. This additional layer of security ensures that only users with the appropriate permissions can delete and suspend workloads.

                                  The protection feature is implemented at the cluster level.

                                  To enable deletion protection run the following command:

                                  kubectl patch -n runai runaiconfigs.run.ai/runai --type='merge' --patch '{\"spec\":{\"global\":{\"enableWorkloadOwnershipProtection\":true}}}'\n
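To confirm the setting took effect, the same resource can be queried; this quick check assumes the field path used in the patch above:

# Print the current value of the workload ownership protection flag.
kubectl get runaiconfigs.run.ai/runai -n runai -o jsonpath='{.spec.global.enableWorkloadOwnershipProtection}'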
                                  "},{"location":"admin/maintenance/alert-monitoring/","title":"System Monitoring","text":"

                                  This article explains how to configure Run:ai to generate health alerts and to connect these alerts to alert-management systems within your organization. Alerts are generated for Run:ai clusters.

                                  "},{"location":"admin/maintenance/alert-monitoring/#alert-infrastructure","title":"Alert infrastructure","text":"

Run:ai uses Prometheus for externalizing metrics and providing visibility to end-users. The Run:ai cluster installation includes Prometheus, or can connect to an existing Prometheus instance used in your organization. The alerts are based on the Prometheus AlertManager, which is enabled by default once installed.

                                  This document explains how to:

                                  • Configure alert destinations - triggered alerts send data to specified destinations
                                  • Understand the out-of-the-box cluster alerts, provided by Run:ai
                                  • Add additional custom alerts
                                  "},{"location":"admin/maintenance/alert-monitoring/#prerequisites","title":"Prerequisites","text":"
                                  • A Kubernetes cluster with the necessary permissions
• A running Run:ai environment, including the Prometheus Operator
                                  • kubectl command-line tool installed and configured to interact with the cluster
                                  "},{"location":"admin/maintenance/alert-monitoring/#set-up","title":"Set-up","text":"

                                  Use the steps below to set up monitoring alerts.

                                  "},{"location":"admin/maintenance/alert-monitoring/#validating-prometheus-operator-installed","title":"Validating Prometheus operator installed","text":"
1. Verify that the Prometheus Operator deployment is running. Copy the following command and paste it into a terminal with access to the Kubernetes cluster: kubectl get deployment kube-prometheus-stack-operator -n monitoring. The output indicates the deployment's status, including the number of replicas and their current state.
2. Verify that Prometheus instances are running. Copy the following command and paste it into your terminal: kubectl get prometheus -n runai. The Prometheus instance(s) are listed along with their status.
                                  "},{"location":"admin/maintenance/alert-monitoring/#enabling-prometheus-alertmanager","title":"Enabling Prometheus AlertManager","text":"

In each of the steps in this section, copy the content of the code snippet to a new YAML file (e.g., step1.yaml) and apply it to the cluster with the following command:

kubectl apply -f step1.yaml

• Copy the following snippet to a YAML file and apply it to create the AlertManager CustomResource and enable AlertManager:

                                  apiVersion: monitoring.coreos.com/v1  \nkind: Alertmanager  \nmetadata:  \n   name: runai  \n   namespace: runai  \nspec:  \n   replicas: 1  \n   alertmanagerConfigSelector:  \n      matchLabels:\n         alertmanagerConfig: runai \n
                                  • Copy the following command to your terminal to validate that the AlertManager instance has started: kubectl get alertmanager -n runai
                                  • Copy the following command to your terminal to validate that the Prometheus operator has created a Service for AlertManager: kubectl get svc alertmanager-operated -n runai
                                  "},{"location":"admin/maintenance/alert-monitoring/#configuring-prometheus-to-send-alerts","title":"Configuring Prometheus to send alerts","text":"
                                  1. Open the terminal on your local machine or another machine that has access to your Kubernetes cluster
                                  2. Copy and paste the following command in your terminal to edit the Prometheus configuration for the runai Namespace:

                                    kubectl edit prometheus runai -n runai\n
                                    This command opens the Prometheus configuration file in your default text editor (usually vi or nano).

3. Add the following alerting section to the configuration file, under the resource's spec:

                                    alerting:  \n   alertmanagers:  \n      - namespace: runai  \n        name: alertmanager-operated  \n        port: web\n

                                  4. Save the changes and exit the text editor.

                                  Note

                                  To save changes using vi, type :wq and press Enter. The changes are applied to the Prometheus configuration in the cluster.
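As a non-interactive alternative to editing the resource in a text editor, the same alerting block can be applied with a merge patch. This sketch assumes the Prometheus custom resource is named runai in the runai namespace, as above:

# Point the Prometheus custom resource at the AlertManager service created earlier.
kubectl patch prometheus runai -n runai --type merge -p \
  '{"spec":{"alerting":{"alertmanagers":[{"namespace":"runai","name":"alertmanager-operated","port":"web"}]}}}'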

                                  "},{"location":"admin/maintenance/alert-monitoring/#alert-destinations","title":"Alert destinations","text":"

                                  Set out below are the various alert destinations.

                                  "},{"location":"admin/maintenance/alert-monitoring/#configuring-alertmanager-for-custom-email-alerts","title":"Configuring AlertManager for custom email alerts","text":"

                                  In each step, copy the contents of the code snippets to a new file and apply it to the cluster using kubectl apply -f.

Add your SMTP password as a secret:

                                  apiVersion: v1  \nkind: Secret  \nmetadata:  \n   name: alertmanager-smtp-password  \n   namespace: runai  \nstringData:\n   password: \"your_smtp_password\"\n

Replace the relevant SMTP details with your own, then apply the AlertmanagerConfig using kubectl apply:

apiVersion: monitoring.coreos.com/v1alpha1\nkind: AlertmanagerConfig\nmetadata:\n  name: runai\n  namespace: runai\n  labels:\n    alertmanagerConfig: runai\nspec:\n  route:\n    continue: true\n    groupBy:\n    - alertname\n    groupWait: 30s\n    groupInterval: 5m\n    repeatInterval: 1h\n    matchers:\n    - matchType: =~\n      name: alertname\n      value: Runai.*\n    receiver: email\n  receivers:\n  - name: 'email'\n    emailConfigs:\n    - to: '<destination_email_address>'\n      from: '<from_email_address>'\n      smarthost: 'smtp.gmail.com:587'\n      authUsername: '<smtp_server_user_name>'\n      authPassword:\n        name: alertmanager-smtp-password\n        key: password\n

Once applied, the configuration is automatically reloaded.

                                  "},{"location":"admin/maintenance/alert-monitoring/#third-party-alert-destinations","title":"Third-party alert destinations","text":"

                                  Prometheus AlertManager provides a structured way to connect to alert-management systems. There are built-in plugins for popular systems such as PagerDuty and OpsGenie, including a generic Webhook.

                                  "},{"location":"admin/maintenance/alert-monitoring/#example-integrating-runai-with-a-webhook","title":"Example: Integrating Run:ai with a Webhook","text":"
                                  1. Use webhook.site to get a unique URL.
                                  2. Use the upgrade cluster instructions to modify the values file: Edit the values file to add the following, and replace <WEB-HOOK-URL> with the URL from webhook.site.

kube-prometheus-stack:  \n  ...  \n  alertmanager:  \n    enabled: true  \n    config:  \n      global:  \n        resolve_timeout: 5m  \n      receivers:  \n      - name: \"null\"  \n      - name: webhook-notifications  \n        webhook_configs:  \n          - url: <WEB-HOOK-URL>  \n            send_resolved: true  \n      route:  \n        group_by:  \n        - alertname  \n        group_interval: 5m  \n        group_wait: 30s  \n        receiver: 'null'  \n        repeat_interval: 10m  \n        routes:  \n        - receiver: webhook-notifications\n
3. Verify that you are receiving alerts at webhook.site, in the left pane.

                                  "},{"location":"admin/maintenance/alert-monitoring/#built-in-alerts","title":"Built-in alerts","text":"

A Run:ai cluster comes with several built-in alerts. Each alert notifies on a specific function of a Run:ai entity. There is also a single, inclusive alert: Run:ai Critical Problems, which aggregates all component-based alerts into a single cluster health test.

                                  Runai agent cluster info push rate low

Meaning The cluster-sync Pod in the runai namespace might not be functioning properly Impact Possible impact - no info/partial info from the cluster is being synced back to the control-plane Severity Critical Diagnosis kubectl get pod -n runai to see if the cluster-sync pod is running Troubleshooting/Mitigation To diagnose issues with the cluster-sync pod, follow these steps: Paste the following command to your terminal, to receive detailed information about the cluster-sync deployment: kubectl describe deployment cluster-sync -n runai Check the Logs: Use the following command to view the logs of the cluster-sync deployment: kubectl logs deployment/cluster-sync -n runai Analyze the Logs and Pod Details: From the information provided by the logs and the deployment details, attempt to identify the reason why the cluster-sync pod is not functioning correctly Check Connectivity: Ensure there is a stable network connection between the cluster and the Run:ai Control Plane. A connectivity issue may be the root cause of the problem. Contact Support: If the network connection is stable and you are still unable to resolve the issue, contact Run:ai support for further assistance

                                  Runai cluster sync handling rate low

Meaning The cluster-sync Pod in the runai namespace might be functioning slowly Impact Possible impact - info from the cluster is being synced back to the control-plane at a slow rate Severity Warning Diagnosis kubectl logs deployment/cluster-sync -n runai to see if the cluster-sync pod is running properly Troubleshooting/Mitigation To diagnose issues with the cluster-sync pod, follow these steps: Check the Logs: Use the following command to view the logs of the cluster-sync deployment: kubectl logs deployment/cluster-sync -n runai Analyze the Logs and Pod Details: From the information provided by the logs and the deployment details, attempt to identify the reason why the cluster-sync pod is not functioning correctly Check Connectivity: Ensure there is a stable network connection between the cluster and the Run:ai Control Plane. A connectivity issue may be the root cause of the problem. Contact Support: If the network connection is stable and you are still unable to resolve the issue, contact Run:ai support for further assistance

                                  Runai agent pull rate low

Meaning The runai-agent pod may be too loaded, is slow in processing data (possible in very big clusters), or the runai-agent pod itself in the runai namespace may not be functioning properly. Impact Possible impact - no info/partial info from the control-plane is being synced in the cluster Severity Critical Diagnosis Run: kubectl get pod -n runai and see if the runai-agent pod is running. Troubleshooting/Mitigation To diagnose issues with the runai-agent pod, follow these steps: Describe the Deployment: Run the following command to get detailed information about the runai-agent deployment: kubectl describe deployment runai-agent -n runai Check the Logs: Use the following command to view the logs of the runai-agent deployment: kubectl logs deployment/runai-agent -n runai Analyze the Logs and Pod Details: From the information provided by the logs and the deployment details, attempt to identify the reason why the runai-agent pod is not functioning correctly. There may be a connectivity issue with the control plane. Check Connectivity: Ensure there is a stable network connection between the runai-agent and the control plane. A connectivity issue may be the root cause of the problem. Consider Cluster Load: If the runai-agent appears to be functioning properly but the cluster is very large and heavily loaded, it may take more time for the agent to process data from the control plane. Adjust Alert Threshold: If the cluster load is causing the alert to fire, you can adjust the threshold at which the alert triggers. The default value is 0.05. You can try changing it to a lower value (e.g., 0.045 or 0.04). To edit the value, paste the following in your terminal: kubectl edit runaiconfig -n runai. In the editor, navigate to spec -> prometheus -> agentPullPushRateMinForAlert. If the agentPullPushRateMinForAlert value does not exist, add it under spec -> prometheus.

                                  Runai container memory usage critical

                                  Meaning Runai container is using more than 90% of its Memory limit Impact The container might run out of memory and crash. Severity Critical Diagnosis Calculate the memory usage, this is performed by pasting the following to your terminal: container_memory_usage_bytes{namespace=~\"runai|runai-backend\"} Troubleshooting/Mitigation Add more memory resources to the container. If the issue persists, contact Run:ai

                                  Runai container memory usage warning

                                  Meaning Runai container is using more than 80% of its memory limit Impact The container might run out of memory and crash Severity Warning Diagnosis Calculate the memory usage, this can be done by pasting the following to your terminal: container_memory_usage_bytes{namespace=~\"runai|runai-backend\"} Troubleshooting/Mitigation Add more memory resources to the container. If the issue persists, contact Run:ai

                                  Runai container restarting

Meaning Runai container has restarted more than twice in the last 10 min Impact The container might become unavailable and impact the Run:ai system Severity Warning Diagnosis To diagnose the issue and identify the problematic pods, paste this into your terminal: kubectl get pods -n runai kubectl get pods -n runai-backend. One or more of the pods have a restart count >= 2. Troubleshooting/Mitigation Paste this into your terminal: kubectl logs -n NAMESPACE POD_NAME. Replace NAMESPACE and POD_NAME with the relevant pod information from the previous step. Check the logs for any standout issues and verify that the container has sufficient resources. If you need further assistance, contact Run:ai

                                  Runai CPU usage warning

                                  Meaning runai container is using more than 80% of its CPU limit Impact This might cause slowness in the operation of certain Run:ai features. Severity Warning Diagnosis Paste the following query to your terminal in order to calculate the CPU usage: rate(container_cpu_usage_seconds_total{namespace=~\"runai|runai-backend\"}[2m]) Troubleshooting/Mitigation Add more CPU resources to the container. If the issue persists, please contact Run:ai.

                                  Runai critical problem

                                  Meaning One of the critical Run:ai alerts is currently active Impact Impact is based on the active alert Severity Critical Diagnosis Check Run:ai alerts in Prometheus to identify any active critical alerts

                                  Unknown state alert for a node

                                  Meaning The Kubernetes node hosting GPU workloads is in an unknown state, and its health and readiness cannot be determined. Impact This may interrupt GPU workload scheduling and execution. Severity Critical - Node is either unschedulable or has unknown status. The node is in one of the following states: Ready=Unknown: The control plane cannot communicate with the node. Ready=False: The node is not healthy. Unschedulable=True: The node is marked as unschedulable. Diagnosis Check the node's status using kubectl describe node, verify Kubernetes API server connectivity, and inspect system logs for GPU-specific or node-level errors.

                                  Low Memory Node Alert

                                  Meaning The Kubernetes node hosting GPU workloads has insufficient memory to support current or upcoming workloads. Impact GPU workloads may fail to schedule, experience degraded performance, or crash due to memory shortages, disrupting dependent applications. Severity Critical - Node is using more than 90% of its memory. Warning - Node is using more than 80% of its memory. Diagnosis Use kubectl top node to assess memory usage, identify memory-intensive pods, consider resizing the node or optimizing memory usage in affected pods.

                                  Runai daemonSet rollout stuck / Runai DaemonSet unavailable on nodes

Meaning There are currently 0 available pods for the runai daemonset on the relevant node Impact No support for fractional GPU workloads Severity Critical Diagnosis Paste the following command to your terminal: kubectl get daemonset -n runai-backend In the result of this command, identify the daemonset(s) that don\u2019t have any running pods Troubleshooting/Mitigation Paste the following command to your terminal, where daemonsetX is the problematic daemonset from the previous step: kubectl describe daemonsetX -n runai The next step is to look for the specific error which prevents it from creating pods. Possible reasons might be: Node Resource Constraints: The nodes in the cluster may lack sufficient resources (CPU, memory, etc.) to accommodate new pods from the daemonset. Node Selector or Affinity Rules: The daemonset may have node selector or affinity rules that do not match any nodes currently available in the cluster, thus preventing pod creation.

                                  Runai deployment insufficient replicas / Runai deployment no available replicas /RunaiDeploymentUnavailableReplicas

Meaning Runai deployment has one or more unavailable pods Impact When this happens, there may be scale issues. Additionally, new versions cannot be deployed, potentially resulting in missing features. Severity Critical Diagnosis Paste the following commands to your terminal, in order to get the status of the deployments in the runai and runai-backend namespaces: kubectl get deployment -n runai kubectl get deployment -n runai-backend. Identify any deployments that have missing pods. Look for discrepancies in the DESIRED and AVAILABLE columns. If the number of AVAILABLE pods is less than the DESIRED pods, it indicates that there are missing pods. Troubleshooting/Mitigation Paste the following commands to your terminal, to receive detailed information about the problematic deployment: kubectl describe deployment <DEPLOYMENT_NAME> -n runai kubectl describe deployment <DEPLOYMENT_NAME> -n runai-backend Paste the following commands to your terminal, to check the replicaset details associated with the deployment: kubectl describe replicaset <REPLICASET_NAME> -n runai kubectl describe replicaset <REPLICASET_NAME> -n runai-backend Paste the following commands to your terminal to retrieve the logs for the deployment to identify any errors or issues: kubectl logs deployment/<DEPLOYMENT_NAME> -n runai kubectl logs deployment/<DEPLOYMENT_NAME> -n runai-backend From the logs and the detailed information provided by the describe commands, analyze the reasons why the deployment is unable to create pods. Look for common issues such as: Resource constraints (CPU, memory) Misconfigured deployment settings or replicasets Node selector or affinity rules preventing pod scheduling. If the issue persists, contact Run:ai.

                                  Runai project controller reconcile failure

Meaning The project-controller in the runai namespace had errors while reconciling projects Impact Some projects might not be in the \u201cReady\u201d state. This means that they are not fully operational and may not have all the necessary components running or configured correctly. Severity Critical Diagnosis Retrieve the logs for the project-controller deployment by pasting the following command in your terminal: kubectl logs deployment/project-controller -n runai Carefully examine the logs for any errors or warning messages. These logs help you understand what might be going wrong with the project controller. Troubleshooting/Mitigation Once errors in the log have been identified, follow these steps to mitigate the issue: The error messages in the logs should provide detailed information about the problem. Read through them to understand the nature of the issue. If the logs indicate which project failed to reconcile, you can further investigate by checking the status of that specific project. Run the following command, replacing <PROJECT_NAME> with the name of the problematic project: kubectl get project <PROJECT_NAME> -o yaml Review the status section in the YAML output. This section describes the current state of the project and provides insights into what might be causing the failure. If the issue persists, contact Run:ai.

                                  Runai StatefulSet insufficient replicas / Runai StatefulSet no available replicas

Meaning Runai statefulset has no available pods Impact Absence of metrics and database unavailability Severity Critical Diagnosis To diagnose the issue, follow these steps: Check the status of the stateful sets in the runai-backend namespace by running the following command: kubectl get statefulset -n runai-backend Identify any stateful sets that have no running pods. These are the ones that might be causing the problem. Troubleshooting/Mitigation Once you've identified the problematic stateful sets, follow these steps to mitigate the issue: Describe the stateful set to get detailed information on why it cannot create pods. Replace X with the name of the stateful set: kubectl describe statefulset X -n runai-backend Review the description output to understand the root cause of the issue. Look for events or error messages that explain why the pods are not being created. If you're unable to resolve the issue based on the information gathered, contact Run:ai support for further assistance."},{"location":"admin/maintenance/alert-monitoring/#adding-a-custom-alert","title":"Adding a custom alert","text":"

                                  You can add additional alerts from Run:ai. Alerts are triggered by using the Prometheus query language with any Run:ai metric.

                                  To create an alert, follow these steps using Prometheus query language with Run:ai Metrics:

                                  • Modify Values File: Use the upgrade cluster instructions to modify the values file.
                                  • Add Alert Structure: Incorporate alerts according to the structure outlined below. Replace placeholders <ALERT-NAME>, <ALERT-SUMMARY-TEXT>, <PROMQL-EXPRESSION>, <optional: duration s/m/h>, and <critical/warning> with appropriate values for your alert, as described below.

                                  kube-prometheus-stack:  \n   additionalPrometheusRulesMap:  \n     custom-runai:  \n       groups:  \n       - name: custom-runai-rules  \n         rules:  \n         - alert: <ALERT-NAME>  \n           annotations:  \n             summary: <ALERT-SUMMARY-TEXT>  \n           expr:  <PROMQL-EXPRESSION>  \n           for: <optional: duration s/m/h>  \n           labels:  \n             severity: <critical/warning>\n
• <ALERT-NAME>: Choose a descriptive name for your alert, such as HighCPUUsage or LowMemory.
• <ALERT-SUMMARY-TEXT>: Provide a brief summary of what the alert signifies, for example, High CPU usage detected or Memory usage below threshold.
• <PROMQL-EXPRESSION>: Construct a Prometheus query (PromQL) that defines the conditions under which the alert should trigger. This query should evaluate to a boolean value (1 for alert, 0 for no alert).
• <optional: duration s/m/h>: Optionally, specify a duration in seconds (s), minutes (m), or hours (h) that the alert condition should persist before triggering an alert. If not specified, the alert triggers as soon as the condition is met.
• <critical/warning>: Assign a severity level to the alert, indicating its importance. Choose critical for severe issues requiring immediate attention, or warning for less critical issues that still need monitoring.

                                  You can find an example in the Prometheus documentation.
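Below is a filled-in illustration of this structure, written as a snippet appended to the Helm values file used for the cluster upgrade. The alert name, summary, 2 GiB threshold, 10-minute duration, and the values file name are all illustrative; the expression reuses the container_memory_usage_bytes metric referenced by the built-in alerts above:

# Append an example custom alert to the cluster values file (file name is a placeholder).
cat <<'EOF' >> runai-cluster-values.yaml
kube-prometheus-stack:
  additionalPrometheusRulesMap:
    custom-runai:
      groups:
      - name: custom-runai-rules
        rules:
        - alert: RunaiBackendContainerHighMemory
          annotations:
            summary: A container in the runai-backend namespace is using more than 2GiB of memory
          expr: container_memory_usage_bytes{namespace="runai-backend"} > 2147483648
          for: 10m
          labels:
            severity: warning
EOF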

                                  "},{"location":"admin/maintenance/audit-log/","title":"Audit Log","text":"

This article provides details about Run:ai\u2019s audit log. The Run:ai control plane provides the audit log API and the event history table in the Run:ai UI. Both reflect the same information regarding changes to business objects: clusters, projects, assets, etc.

                                  "},{"location":"admin/maintenance/audit-log/#events-history-table","title":"Events history table","text":"

                                  The Events history table can be found under Event history in the Run:ai UI.

                                  The Event history table consists of the following columns:

• Subject: The name of the subject.
• Subject type: The user or application assigned with the role.
• Source IP: The IP address of the subject.
• Date & time: The exact timestamp at which the event occurred. Format dd/mm/yyyy for date and hh:mm am/pm for time.
• Event: The type of the event. Possible values: Create, Update, Delete, Login.
• Event ID: Internal event ID, can be used for support purposes.
• Status: The outcome of the logged operation. Possible values: Succeeded, Failed.
• Entity type: The type of the logged business object.
• Entity name: The name of the logged business object.
• Entity ID: The system's internal ID of the logged business object.
• URL: The endpoint or address that was accessed during the logged event.
• HTTP Method: The HTTP operation method used for the request. Possible values include standard HTTP methods such as GET, POST, PUT, DELETE, indicating what kind of action was performed on the specified URL.

"},{"location":"admin/maintenance/audit-log/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
                                  • Download table - Click MORE and then Click Download as CSV or Download as JSON
                                  "},{"location":"admin/maintenance/audit-log/#using-the-event-history-date-selector","title":"Using the event history date selector","text":"

                                  The Event history table saves events for the last 90 days. However, the table itself presents up to the last 30 days of information due to the potentially very high number of operations that might be logged during this period.

                                  To view older events, or to refine your search for more specific results or fewer results, use the time selector and change the period you search for. You can also refine your search by clicking and using ADD FILTER accordingly.

                                  "},{"location":"admin/maintenance/audit-log/#using-api","title":"Using API","text":"

                                  Go to the Audit log API reference to view the available actions. Since the amount of data is not trivial, the API is based on paging. It retrieves a specified number of items for each API call. You can get more data by using subsequent calls.
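As a rough sketch of paged retrieval with an API token, a request might look like the following. The host, endpoint path, and paging parameter names are placeholders and should be taken from the Audit log API reference:

# Hypothetical example of fetching one page of audit events (path and parameters are assumptions).
curl -s -H "Authorization: Bearer $RUNAI_API_TOKEN" \
  "https://<control-plane-url>/api/v1/audit?offset=0&limit=100"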

                                  "},{"location":"admin/maintenance/audit-log/#limitations","title":"Limitations","text":"

                                  Submissions of workloads are not audited. As a result, the system does not track or log details of workload submissions, such as timestamps or user activity.

                                  "},{"location":"admin/maintenance/node-downtime/","title":"Node Maintenance","text":"

                                  This article provides detailed instructions on how to manage both planned and unplanned node downtime in a Kubernetes cluster that is running Run:ai. It covers all the steps to maintain service continuity and ensure the proper handling of workloads during these events.

                                  "},{"location":"admin/maintenance/node-downtime/#prerequisites","title":"Prerequisites","text":"
                                  • Access to Kubernetes cluster Administrative access to the Kubernetes cluster, including permissions to run kubectl commands
                                  • Basic knowledge of Kubernetes Familiarity with Kubernetes concepts such as nodes, taints, and workloads
                                  • Run:ai installation The Run:ai software installed and configured within your Kubernetes cluster
                                  • Node naming conventions Know the names of the nodes within your cluster, as these are required when executing the commands
                                  "},{"location":"admin/maintenance/node-downtime/#node-types","title":"Node types","text":"

                                  This article distinguishes between two types of nodes within a Run:ai installation:

                                  • Worker nodes. Nodes on which AI practitioners can submit and run workloads
                                  • Run:ai system nodes. Nodes on which the Run:ai software runs, managing the cluster's operations
                                  "},{"location":"admin/maintenance/node-downtime/#worker-nodes","title":"Worker nodes","text":"

                                  Worker Nodes are responsible for running workloads. When a worker node goes down, either due to planned maintenance or unexpected failure, workloads ideally migrate to other available nodes or wait in the queue to be executed when possible.

                                  "},{"location":"admin/maintenance/node-downtime/#training-vs-interactive-workloads","title":"Training vs. Interactive workloads","text":"

                                  The following workload types can run on worker nodes:

                                  • Training workloads. These are long-running processes that, in case of node downtime, can automatically move to another node.

                                  • Interactive workloads. These are short-lived, interactive processes that require manual intervention to be relocated to another node.

                                  Note

                                  While training workloads can be automatically migrated, it is recommended to plan maintenance and manage this process manually for a faster response, as it may take time for Kubernetes to detect a node failure.

                                  "},{"location":"admin/maintenance/node-downtime/#planned-maintenance","title":"Planned maintenance","text":"

                                  Before stopping a worker node for maintenance, perform the following steps:

                                  1. Prevent new workloads on the node To stop the Kubernetes Scheduler from assigning new workloads to the node and to safely remove all existing workloads, copy the following command to your terminal:

                                    kubectl taint nodes <node-name> runai=drain:NoExecute\n

                                    Explanation:

                                    • <node-name> Replace this placeholder with the actual name of the node you want to drain
                                    • kubectl taint nodes This command is used to add a taint to the node, which prevents any new pods from being scheduled on it
                                    • runai=drain:NoExecute This specific taint ensures that all existing pods on the node are evicted and rescheduled on other available nodes, if possible.

                                    Result: The node stops accepting new workloads, and existing workloads either migrate to other nodes or are placed in a queue for later execution.

                                  2. Shut down and perform maintenance After draining the node, you can safely shut it down and perform the necessary maintenance tasks.

                                  3. Restart the node Once maintenance is complete and the node is back online, remove the taint to allow the node to resume normal operations. Copy the following command to your terminal:

                                    kubectl taint nodes <node-name> runai=drain:NoExecute-\n

                                    Explanation:

                                    • runai=drain:NoExecute- The - at the end of the command indicates the removal of the taint. This allows the node to start accepting new workloads again.

                                    Result: The node rejoins the cluster's pool of available resources, and workloads can be scheduled on it as usual
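
                                  To double-check the taint state after step 1 or step 3, you can inspect the node's taints. This is an optional verification, not part of the official procedure:

                                    kubectl describe node <node-name> | grep Taints\n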

                                  "},{"location":"admin/maintenance/node-downtime/#unplanned-downtime","title":"Unplanned downtime","text":"

                                  In the event of unplanned downtime:

                                  1. Automatic Restart If a node fails but immediately restarts, all services and workloads automatically resume.
                                  2. Extended Downtime If the node remains down for an extended period, drain the node to migrate workloads to other nodes. Copy the following command to your terminal:

                                    kubectl taint nodes <node-name> runai=drain:NoExecute\n

                                    Explanation: The command works the same as in the planned maintenance section, ensuring that no workloads remain scheduled on the node while it is down.

                                  3. Reintegrate the Node Once the node is back online, remove the taint to allow it to rejoin the cluster's operations. Copy the following command to your terminal:

                                    kubectl taint nodes <node-name> runai=drain:NoExecute-\n
                                    Result: This action reintegrates the node into the cluster, allowing it to accept new workloads.

                                  4. Permanent Shutdown If the node is to be permanently decommissioned, remove it from Kubernetes with the following command:

                                    kubectl delete node <node-name>\n
                                    Explanation:

                                    • kubectl delete node This command completely removes the node from the cluster
                                    • <node-name> Replace this placeholder with the actual name of the node

                                    Result: The node is no longer part of the Kubernetes cluster. If you plan to bring the node back later, it must be rejoined to the cluster using the steps outlined in the next section.
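
                                  Before permanently removing a node, it can be useful to confirm which pods are still assigned to it. This optional check uses a standard kubectl field selector:

                                    kubectl get pods --all-namespaces --field-selector spec.nodeName=<node-name>\n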

                                  "},{"location":"admin/maintenance/node-downtime/#runai-system-nodes","title":"Run:ai System nodes","text":"

                                  In a production environment, the services responsible for scheduling, submitting and managing Run:ai workloads operate on one or more Run:ai system nodes. It is recommended to have more than one system node to ensure high availability. If one system node goes down, another can take over, maintaining continuity. If a second system node does not exist, you must designate another node in the cluster as a temporary Run:ai system node to maintain operations.

                                  The protocols for handling planned maintenance and unplanned downtime are identical to those for worker nodes. Refer to the above section for detailed instructions.

                                  "},{"location":"admin/maintenance/node-downtime/#rejoining-a-node-into-the-kubernetes-cluster","title":"Rejoining a node into the Kubernetes cluster","text":"

                                  To rejoin a node to the Kubernetes cluster, follow these steps:

                                  1. Generate a join command on the master node On the master node, copy the following command to your terminal:

                                    kubeadm token create --print-join-command\n

                                    Explanation:

                                    • kubeadm token create This command generates a token that can be used to join a node to the Kubernetes cluster.
                                    • --print-join-command This option outputs the full command that needs to be run on the worker node to rejoin it to the cluster.

                                    Result: The command outputs a kubeadm join command.

                                  2. Run the Join Command on the Worker Node Copy the kubeadm join command generated from the previous step and run it on the worker node that needs to rejoin the cluster.

                                    Explanation:

                                    • The kubeadm join command re-enrolls the node into the cluster, allowing it to start participating in the cluster's workload scheduling.
                                  3. Verify Node Rejoining Verify that the node has successfully rejoined the cluster by running:

                                    kubectl get nodes\n

                                    Explanation:

                                    This command lists all nodes currently part of the Kubernetes cluster, along with their status

                                    Result: The rejoined node should appear in the list with a status of Ready

                                  4. Re-label Nodes Once the node is back online, ensure it is labeled according to its role within the cluster, as shown in the example below.
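
                                  For example, if the node previously served as a Run:ai system node, its system-node label may need to be restored. The label below is an illustration only; apply whichever labels match your cluster's node roles (see Set Node Roles):

                                  kubectl label node <node-name> node-role.kubernetes.io/runai-system=true\n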

                                  "},{"location":"admin/maintenance/overview/","title":"Monitoring and maintenance Overview","text":"

                                  Deploying Run:ai in mission-critical environments requires proper monitoring and maintenance of resources to ensure workloads run and are deployed as expected.

                                  Details on how to monitor different parts of the physical resources in your Kubernetes system, including clusters and nodes, can be found in the monitoring and maintenance section. Adjacent configuration and troubleshooting sections also cover high availability, restoring and securing clusters, collecting logs, and reviewing audit logs to meet compliance requirements.

                                  In addition to monitoring Run:ai resources, it is also highly recommended to monitor the Kubernetes environment that Run:ai runs on, which manages the containerized applications. In particular, focus on three main layers:

                                  "},{"location":"admin/maintenance/overview/#runai-control-plane-and-cluster-services","title":"Run:ai Control Plane and cluster services","text":"

                                  This is the highest layer and includes the Run:ai services, which run as pods in containers managed by Kubernetes.

                                  "},{"location":"admin/maintenance/overview/#kubernetes-cluster","title":"Kubernetes cluster","text":"

                                  This layer includes the main Kubernetes system that runs and manages Run:ai components. Important elements to monitor include:

                                  • The health of the cluster and nodes (machines in the cluster).
                                  • The status of key Kubernetes services, such as the API server. For detailed information on managing clusters, see the official Kubernetes documentation.
                                  "},{"location":"admin/maintenance/overview/#host-infrastructure","title":"Host infrastructure","text":"

                                  This is the base layer, representing the actual machines (virtual or physical) that make up the cluster. IT teams need to handle:

                                  • Managing CPU, memory, and storage
                                  • Keeping the operating system updated
                                  • Setting up the network and balancing the load

                                  Run:ai does not require any special configurations at this level.

                                  The articles below explain how to monitor these layers, maintain system security and compliance, and ensure the reliable operation of Run:ai in critical environments.

                                  "},{"location":"admin/researcher-setup/cli-install/","title":"Install the Run:ai V1 Command-line Interface","text":"

                                  The Run:ai Command-line Interface (CLI) is one of the ways for a Researcher to submit deep learning workloads, acquire GPU-based containers, list jobs, and more.

                                  The instructions below will guide you through the process of installing the CLI. The Run:ai CLI runs on Mac, Linux and Windows.

                                  "},{"location":"admin/researcher-setup/cli-install/#researcher-authentication","title":"Researcher Authentication","text":"

                                  When enabled, Researcher authentication requires additional setup when installing the CLI. To configure authentication see Setup Project-based Researcher Access Control. Use the modified Kubernetes configuration file described in the article.

                                  "},{"location":"admin/researcher-setup/cli-install/#prerequisites","title":"Prerequisites","text":"
                                  • When installing the command-line interface, it is worth considering future upgrades:
                                    • Install the CLI on a dedicated Jumpbox machine. Researchers will connect to the Jumpbox from which they can submit Run:ai commands
                                    • Install the CLI on a shared directory that is mounted on Researchers' machines.
                                  • A Kubernetes configuration file.
                                  "},{"location":"admin/researcher-setup/cli-install/#setup","title":"Setup","text":""},{"location":"admin/researcher-setup/cli-install/#kubernetes-configuration","title":"Kubernetes Configuration","text":"
                                  • In the Researcher's root folder, create a directory .kube. Copy the Kubernetes configuration file into the directory. Each Researcher should have a separate copy of the configuration file. The Researcher should have write access to the configuration file as it stores user defaults.
                                  • If you choose to locate the file at a different location than ~/.kube/config, you must create a shell variable to point to the configuration file as follows:
                                  export KUBECONFIG=<Kubernetes-config-file>\n
                                  • Test the connection by running:
                                  kubectl get nodes\n
                                  "},{"location":"admin/researcher-setup/cli-install/#install-runai-cli","title":"Install Run:ai CLI","text":"
                                  • Go to the Run:ai user interface. On the top right select Researcher Command Line Interface.
                                  • Select Mac, Linux or Windows.
                                  • Download directly using the button or copy the file to run it on a remote machine
                                  Mac or Linux / Windows

                                  Run:

                                  chmod +x runai\nsudo mv runai /usr/local/bin/runai\n

                                  Rename the downloaded file to have a .exe extension and move the file to a folder that is a part of the PATH.

                                  Note

                                  An alternative way of downloading the CLI is provided under the CLI Troubleshooting section.

                                  To verify the installation run:

                                  runai list jobs\n
                                  "},{"location":"admin/researcher-setup/cli-install/#install-command-auto-completion","title":"Install Command Auto-Completion","text":"

                                  It is possible to configure your Linux/Mac shell to complete Run:ai CLI commands. This feature works on bash and zsh shells only.

                                  Zsh / Bash

                                  Edit the file ~/.zshrc. Add the lines:

                                  autoload -U compinit; compinit -i\nsource <(runai completion zsh)\n

                                  Install the bash-completion package:

                                  • Mac: brew install bash-completion
                                  • Ubuntu/Debian: sudo apt-get install bash-completion
                                  • Fedora/Centos: sudo yum install bash-completion

                                  Edit the file ~/.bashrc. Add the lines:

                                  [[ -r \"/usr/local/etc/profile.d/bash_completion.sh\" ]] && . \"/usr/local/etc/profile.d/bash_completion.sh\"\nsource <(runai completion bash)\n
                                  "},{"location":"admin/researcher-setup/cli-install/#troubleshoot-the-cli-installation","title":"Troubleshoot the CLI Installation","text":"

                                  See Troubleshooting a CLI installation

                                  "},{"location":"admin/researcher-setup/cli-install/#update-the-runai-cli","title":"Update the Run:ai CLI","text":"

                                  To update the CLI to the latest version perform the same install process again.

                                  "},{"location":"admin/researcher-setup/cli-install/#delete-the-runai-cli","title":"Delete the Run:ai CLI","text":"

                                  If you have installed using the default path, run:

                                  sudo rm /usr/local/bin/runai\n
                                  "},{"location":"admin/researcher-setup/docker-to-runai/","title":"From Docker to Run:ai","text":""},{"location":"admin/researcher-setup/docker-to-runai/#dockers-images-and-kubernetes","title":"Dockers, Images, and Kubernetes","text":"

                                  Researchers are typically proficient in working with Docker. Docker provides an isolation layer above the operating system that allows you to bundle the operating system and a deep learning environment together and package them within a single file. The file is called a Docker image.

                                  You create a container by starting a docker image on a machine.

                                  Run:ai is based on Kubernetes. At its core, Kubernetes is orchestration software that sits above Docker: among other things, it allows location abstraction as to where the actual container is running. This calls for some adaptation of the Researcher's workflow, as follows.

                                  "},{"location":"admin/researcher-setup/docker-to-runai/#image-repository","title":"Image Repository","text":"

                                  If your Kubernetes cluster contains a single GPU node (machine), then your image can reside on the node itself (in which case, when submitting workloads with runai submit, the Researcher must use the --local-image flag).

                                  If your Kubernetes cluster contains more than a single node, then, to enable location abstraction, the image can no longer reside on the node itself. It must be relocated to an image repository. There are quite a few repository-as-a-service offerings, most notably Docker Hub. Alternatively, the organization can install a private repository on-premises.

                                  Day-to-day work with the image located remotely is almost identical to local work. The image name now contains its location. For example, nvcr.io/nvidia/pytorch:19.12-py_3 is a PyTorch image that is located in nvcr.io. This is the Nvidia image repository as found on the web.

                                  "},{"location":"admin/researcher-setup/docker-to-runai/#data","title":"Data","text":"

                                  Deep learning is about data. It can be your code, the training data, saved checkpoints, etc.

                                  If your Kubernetes cluster contains a single GPU node (machine), then your data can reside on the node itself.

                                  If your Kubernetes cluster contains more than a single node, then, to enable location abstraction, the data must sit outside the machine, typically on network storage. The storage must be uniformly mapped to your container when it starts (using the -v flag).

                                  "},{"location":"admin/researcher-setup/docker-to-runai/#working-with-containers","title":"Working with Containers","text":"

                                  Starting a container using docker usually involves a single command-line with multiple flags. A typical example:

                                  docker run --runtime=nvidia --shm-size 16G -it --rm -e HOSTNAME='hostname' \\\n    -v /raid/public/my_datasets:/root/dataset:ro   -i  nvcr.io/nvidia/pytorch:19.12-py3\n

                                  The docker command docker run should be replaced with a Run:ai command runai submit. The flags are usually the same but some adaptation is required. A complete list of flags can be found here: runai submit.
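
                                  As a rough, hedged equivalent of the docker run example above (the job name my-job is arbitrary, and the flag names follow the legacy CLI and may differ between versions; consult the runai submit reference for the authoritative list):

                                  runai submit my-job --interactive --attach \\\n    -i nvcr.io/nvidia/pytorch:19.12-py3 \\\n    -g 1 \\\n    -v /raid/public/my_datasets:/root/dataset\n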

                                  There are similar commands to get a shell into the container (runai bash), get the container logs (runai logs), and more. For a complete list see the Run:ai CLI reference.

                                  "},{"location":"admin/researcher-setup/docker-to-runai/#schedule-an-onboarding-session","title":"Schedule an Onboarding Session","text":"

                                  It is highly recommended to schedule an onboarding session for Researchers with a Run:ai customer success professional. Run:ai can help with the above transition, but adding to that, we at Run:ai have also acquired a large body of knowledge on data science best practices which can help streamline Researchers' work as well as save money for the organization.

                                  "},{"location":"admin/researcher-setup/new-cli-install/","title":"Installing the V2 Command-line interface","text":"

                                  This article explains the procedure for installing and configuring the new researcher Command Line Interface (CLI).

                                  Important

                                  This document refers to the new CLI which only works with clusters of version 2.18 and up. The installation instructions for the older CLI are here.

                                  "},{"location":"admin/researcher-setup/new-cli-install/#enabling-the-v2-cli","title":"Enabling the V2 CLI","text":"

                                  Under General settings \u2192 Workloads, enable the flag Improved command line interface

                                  "},{"location":"admin/researcher-setup/new-cli-install/#installing-the-cli","title":"Installing the CLI","text":"
                                  1. Click the Help (?) icon in the top right corner
                                  2. Select Researcher Command Line Interface
                                  3. Select the cluster you want the CLI to communicate with
                                  4. Select your computer\u2019s operating system
                                  5. Copy the installer command and run it in the terminal
                                  6. Follow the installation process instructions
                                  7. Click Enter to use the default values (recommended)
                                  "},{"location":"admin/researcher-setup/new-cli-install/#testing-the-installation","title":"Testing the installation","text":"

                                  To verify the CLI client was installed properly

                                  1. Open the terminal
                                  2. Run the command runai version
                                  "},{"location":"admin/researcher-setup/new-cli-install/#configuring-the-cli","title":"Configuring the CLI","text":"

                                  Follow the steps below to configure the CLI.

                                  "},{"location":"admin/researcher-setup/new-cli-install/#authenticating-the-cli","title":"Authenticating the CLI","text":"

                                  After installation, sign in to the Run:ai platform to authenticate the CLI:

                                  1. Open the terminal on your local machine.
                                  2. Run runai login.
                                  3. Enter your username and password on the Run:ai platform's sign-in page.
                                  4. Return to the terminal window to use the CLI.
                                  "},{"location":"admin/researcher-setup/new-cli-install/#setting-the-default-cluster","title":"Setting the default cluster","text":"

                                  If only one cluster is connected to the account, it is set as the default cluster when you first sign in. If there are multiple clusters, you must follow the steps below to set your preferred cluster for workload submission:

                                  1. Open the terminal on your local machine.
                                  2. Run runai cluster list to find the required cluster name.
                                  3. Run the following command runai cluster set <CLUSTER_NAME>
                                  "},{"location":"admin/researcher-setup/new-cli-install/#setting-a-default-project","title":"Setting a default project","text":"

                                  Set a default working project, to easily submit workloads without mentioning the project name in every command.

                                  1. Run the following command on the terminal: runai project set <PROJECT_NAME>
                                  2. If successful, the following message is returned: project <PROJECT_NAME> configured successfully
                                  3. To see the current configuration run: runai config generate --json
                                  "},{"location":"admin/researcher-setup/new-cli-install/#installing-command-auto-completion","title":"Installing command auto-completion","text":"

                                  Auto-completion completes the command syntax automatically for ease of use and is installed automatically with the CLI. The shells below require manual installation:

                                  Zsh / Bash / Windows
                                  1. Edit the file ~/.zshrc
                                  2. Add the following code:
                                  autoload -U compinit; compinit -i\nsource <(runai completion zsh)\n
                                  1. Install the bash-completion package
                                  2. Choose your operating system: Mac: brew install bash-completion; Ubuntu/Debian: sudo apt-get install bash-completion; Fedora/CentOS: sudo yum install bash-completion
                                  3. Edit the file ~/.bashrc and add the following lines:
                                  [[ $PS1 && -f /usr/share/bash-completion/bash_completion ]] && . /usr/share/bash-completion/bash_completion\nsource <(runai completion bash)\n

                                  Add the following code to the PowerShell profile:

                                  runai.exe completion powershell | Out-String | Invoke-Expression\nSet-PSReadLineKeyHandler -Key Tab -Function MenuComplete\n
                                  For more completion mode options, see PowerShell completions.

                                  "},{"location":"admin/researcher-setup/researcher-setup-intro/","title":"Researcher Setup Overview","text":"

                                  Following is a step-by-step guide for getting a new Researcher up to speed with Run:ai and Kubernetes.

                                  "},{"location":"admin/researcher-setup/researcher-setup-intro/#change-of-paradigms-from-docker-to-kubernetes","title":"Change of Paradigms: from Docker to Kubernetes","text":"

                                  As part of Run:ai, the organization is typically moving from Docker-based workflows to Kubernetes. This document is an attempt to help the Researcher with this paradigm shift. It explains the basic concepts and provides links for further information about the Run:ai CLI.

                                  "},{"location":"admin/researcher-setup/researcher-setup-intro/#setup-the-runai-command-line-interface","title":"Setup the Run:ai Command-Line Interface","text":"

                                  Run:ai CLI needs to be installed on the Researcher's machine. This document provides step by step instructions.

                                  "},{"location":"admin/researcher-setup/researcher-setup-intro/#provide-the-researcher-with-a-gpu-quota","title":"Provide the Researcher with a GPU Quota","text":"

                                  To submit workloads with Run:ai, the Researcher must be provided with a Project that contains a GPU quota. Please see Working with Projects document on how to create Projects and set a quota.

                                  "},{"location":"admin/researcher-setup/researcher-setup-intro/#provide-access-to-the-runai-user-interface","title":"Provide access to the Run:ai User Interface","text":"

                                  See Setting up users for further information on how to provide access to users.

                                  "},{"location":"admin/researcher-setup/researcher-setup-intro/#schedule-an-onboarding-session","title":"Schedule an Onboarding Session","text":"

                                  It is highly recommended to schedule an onboarding session for Researchers with a Run:ai customer success professional. Run:ai can help with the above transition, but adding to that, we at Run:ai have also acquired a large body of knowledge on data science best practices which can help streamline the Researchers' work as well as save money for the organization.

                                  "},{"location":"admin/runai-setup/installation-types/","title":"Installation Types","text":"

                                  Run:ai consists of two components:

                                  • The Run:ai Cluster. One or more data-science GPU clusters hosted by the customer (on-prem or cloud).
                                  • The Run:ai Control plane. A single entity that monitors clusters, sets priorities, and business policies.

                                  There are two main installation options:

                                  Installation Type Description Classic (SaaS) Run:ai is installed on the customer's data science GPU clusters. The cluster connects to the Run:ai control plane on the cloud (https://<tenant-name>.run.ai). With this installation, the cluster requires an outbound connection to the Run:ai cloud. Self-hosted The Run:ai control plane is also installed in the customer's data center

                                  The self-hosted option is for organizations that cannot use a SaaS solution due to data leakage concerns. The self-hosted installation is priced differently. For further information please talk to Run:ai sales.

                                  "},{"location":"admin/runai-setup/installation-types/#self-hosted-installation","title":"Self-hosted Installation","text":"

                                  Run:ai self-hosting comes with two variants:

                                  Self-hosting Type Description Connected The organization can freely download from the internet (though upload is not allowed) Air-gapped The organization has no connection to the internet"},{"location":"admin/runai-setup/installation-types/#self-hosting-with-kubernetes-vs-openshift","title":"Self-hosting with Kubernetes vs OpenShift","text":"

                                  Kubernetes has many Certified Kubernetes Providers. Run:ai has been certified with several of them (see the Kubernetes distribution section). The OpenShift installation is different from the rest. As such, the Run:ai self-hosted installation instructions are divided into two separate sections:

                                  • OpenShift-based installation. See Run:ai OpenShift installation.
                                  • Kubernetes-based installation. See Run:ai Kubernetes installation.
                                  "},{"location":"admin/runai-setup/installation-types/#secure-installation","title":"Secure Installation","text":"

                                  In many organizations, Kubernetes is governed by IT compliance rules. In this scenario, there are strict access control rules during the installation and running of workloads:

                                  • OpenShift is secured using Security Context Constraints (SCC). The Run:ai installation supports SCC.
                                  • Run:ai provides limited support for Kubernetes Pod Security Admission (PSA). For more information see Kubernetes prerequisites.
                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-delete/","title":"Cluster Uninstall","text":"

                                  This article explains how to uninstall Run:ai Cluster installation from the Kubernetes cluster.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-delete/#unistall-runai-cluster","title":"Unistall Run:ai cluster","text":"

                                  Uninstalling the Run:ai cluster from the Kubernetes cluster does not delete existing projects, departments, or workloads submitted by users.

                                  To uninstall the Run:ai cluster, run the following helm command in your terminal:

                                  helm uninstall runai-cluster -n runai\n
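
                                  To confirm that the release was removed, you can list the Helm releases remaining in the runai namespace (optional check):

                                  helm list -n runai\n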

                                  To delete the Run:ai cluster from the Run:ai Platform, see Removing a cluster.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-install/","title":"Cluster Install","text":"

                                  This article explains the steps required to install the Run:ai cluster on a Kubernetes cluster using Helm.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#before-installation","title":"Before installation","text":"

                                  There are a number of matters to consider prior to installing using Helm.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#system-and-network-requirements","title":"System and network requirements","text":"

                                  Before installing the Run:ai cluster, validate that the system requirements and network requirements are met.

                                  Once all the requirements are met, it is highly recommended to use the Run:ai cluster preinstall diagnostics tool to:

                                  • Test the requirements above, in addition to failure points related to Kubernetes, NVIDIA, storage, and networking
                                  • Look at additional components installed and analyze their relevance to a successful installation

                                  To run the preinstall diagnostics tool, download the latest version, and run:

                                  SaaS / Self-hosted / Airgap
                                  • On EKS deployments, run aws configure prior to execution
                                  chmod +x ./preinstall-diagnostics-<platform> && \\\n./preinstall-diagnostics-<platform> \\\n  --domain ${COMPANY_NAME}.run.ai \\\n  --cluster-domain ${CLUSTER_FQDN}\n
                                  chmod +x ./preinstall-diagnostics-<platform> && \\ \n./preinstall-diagnostics-<platform> \\\n  --domain ${CONTROL_PLANE_FQDN} \\\n  --cluster-domain ${CLUSTER_FQDN} \\\n#if the diagnostics image is hosted in a private registry\n  --image-pull-secret ${IMAGE_PULL_SECRET_NAME} \\\n  --image ${PRIVATE_REGISTRY_IMAGE_URL}    \n

                                  In an air-gapped deployment, the diagnostics image is saved, pushed, and pulled manually from the organization's registry.

                                  #Save the image locally\ndocker save --output preinstall-diagnostics.tar gcr.io/run-ai-lab/preinstall-diagnostics:${VERSION}\n#On a machine with access to the organization's registry, load the image, then tag and push it\ndocker load --input preinstall-diagnostics.tar\ndocker tag gcr.io/run-ai-lab/preinstall-diagnostics:${VERSION} ${CLIENT_IMAGE_AND_TAG}\ndocker push ${CLIENT_IMAGE_AND_TAG}\n

                                  Run the binary with the --image parameter to modify the diagnostics image to be used:

                                  chmod +x ./preinstall-diagnostics-darwin-arm64 && \\\n./preinstall-diagnostics-darwin-arm64 \\\n  --domain ${CONTROL_PLANE_FQDN} \\\n  --cluster-domain ${CLUSTER_FQDN} \\\n  --image-pull-secret ${IMAGE_PULL_SECRET_NAME} \\\n  --image ${PRIVATE_REGISTRY_IMAGE_URL}    \n

                                  For more information see preinstall diagnostics.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#helm","title":"Helm","text":"

                                  Run:ai cluster requires Helm 3.14 or above. To install Helm, see Helm Install.
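
                                  To check which Helm version is currently installed on your machine (a quick sanity check):

                                  helm version --short\n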

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#permissions","title":"Permissions","text":"

                                  A Kubernetes user with the cluster-admin role is required to ensure a successful installation, for more information see Using RBAC authorization.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#runai-namespace","title":"Run:ai namespace","text":"

                                  Run:ai cluster must be installed in a namespace named runai. Create the namespace by running:

                                  kubectl create ns runai\n
                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#tls-certificates","title":"TLS certificates","text":"

                                  TLS private and public keys are required for HTTPS access to the cluster. Create a Kubernetes Secret named runai-cluster-domain-tls-secret in the runai namespace containing the private and public keys for the cluster\u2019s Fully Qualified Domain Name (FQDN), by running the following:

                                  kubectl create secret tls runai-cluster-domain-tls-secret -n runai \\\n    --cert /path/to/fullchain.pem  \\ # Replace /path/to/fullchain.pem with the actual path to your TLS certificate\n    --key /path/to/private.pem # Replace /path/to/private.pem with the actual path to your private key\n
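
                                  To verify that the secret was created in the runai namespace and is of type kubernetes.io/tls (optional check):

                                  kubectl get secret runai-cluster-domain-tls-secret -n runai\n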
                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#installation","title":"Installation","text":"

                                  Follow these instructions to install using Helm.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#adding-a-new-cluster","title":"Adding a new cluster","text":"

                                  Follow the steps below to add a new cluster.

                                  Note

                                  When adding a cluster for the first time, the New Cluster form automatically opens when you log in to the Run:ai platform. Other actions are prevented until the cluster is created.

                                  If this is your first cluster and you have completed the New Cluster form, start at step 3. Otherwise, start at step 1.

                                  1. In the Run:ai platform, go to Resources
                                  2. Click +NEW CLUSTER
                                  3. Enter a unique name for your cluster
                                  4. Optional: Choose the Run:ai cluster version (latest, by default)
                                  5. Enter the Cluster URL. For more information see Domain Name Requirement
                                  6. Click Continue
                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#installing-runai-cluster","title":"Installing Run:ai cluster","text":"

                                  This section presents the Run:ai cluster installation steps.

                                  1. Follow the installation instructions and run the commands provided on your Kubernetes cluster.
                                  2. Click DONE

                                  The cluster is displayed in the table with the status Waiting to connect. Once installation is complete, the cluster status changes to Connected.

                                  Note

                                  To customize the installation based on your environment, see Customize cluster installation.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#troubleshooting","title":"Troubleshooting","text":"

                                  If you encounter an issue with the installation, try the troubleshooting scenario below.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#installation_1","title":"Installation","text":"

                                  If the Run:ai cluster installation failed, check the installation logs to identify the issue. Run the following script to print the installation logs:

                                  curl -fsSL https://raw.githubusercontent.com/run-ai/public/main/installation/get-installation-logs.sh | bash\n
                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-install/#cluster-status","title":"Cluster status","text":"

                                  If the Run:ai cluster installation completed but the cluster status did not change to Connected, check the cluster troubleshooting scenarios

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/","title":"System Requirements","text":"

                                  The Run:ai Cluster is a Kubernetes application.

                                  This article explains the required hardware and software system requirements for the Run:ai cluster.

                                  Set out below are the system requirements for the Run:ai cluster.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#hardware-requirements","title":"Hardware Requirements","text":"

                                  The following hardware requirements are for the Kubernetes cluster nodes. By default, all Run:ai cluster services run on all available nodes. For production deployments, you may want to Set Node Roles to separate system nodes from worker nodes, reduce downtime, and save CPU cycles on expensive GPU machines.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#runai-cluster-system-nodes","title":"Run:ai Cluster - system nodes","text":"

                                  This configuration is the minimum requirement you need to install and use Run:ai Cluster.

                                  Component Required Capacity CPU 10 cores Memory 20GB Disk space 50GB"},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#runai-cluster-worker-nodes","title":"Run:ai Cluster - Worker nodes","text":"

                                  The Run:ai Cluster supports both x86 CPUs and NVIDIA x86 GPUs. For the list of supported GPU models, see Supported NVIDIA Data Center GPUs and Systems.

                                  The following configuration represents the minimum hardware requirements for installing and operating the Run:ai cluster on worker nodes. Each node must meet these specifications:

                                  Component Required Capacity CPU 2 cores Memory 4GB"},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#shared-storage","title":"Shared storage","text":"

                                  Run:ai workloads must be able to access data from any worker node in a uniform way, to access training data and code as well as save checkpoints, weights, and other machine-learning-related artifacts.

                                  Typical protocols are Network File System (NFS) or Network-attached storage (NAS). Run:ai Cluster supports both; for more information see Shared storage.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#software-requirements","title":"Software requirements","text":"

                                  The following software requirements must be fulfilled on the Kubernetes cluster.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#operating-system","title":"Operating system","text":"
                                  • Any Linux operating system supported by both Kubernetes and NVIDIA GPU Operator
                                  • Run:ai cluster on Google Kubernetes Engine (GKE) supports both Ubuntu and Container Optimized OS (COS). COS is supported only with NVIDIA GPU Operator 24.6 or newer, and Run:ai cluster version 2.19 or newer.
                                  • Internal tests are being performed on Ubuntu 22.04 and CoreOS for OpenShift.
                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#kubernetes-distribution","title":"Kubernetes distribution","text":"

                                  Run:ai Cluster requires Kubernetes. The following Kubernetes distributions are supported:

                                  • Vanilla Kubernetes
                                  • OpenShift Container Platform (OCP)
                                  • NVIDIA Base Command Manager (BCM)
                                  • Elastic Kubernetes Service (EKS)
                                  • Google Kubernetes Engine (GKE)
                                  • Azure Kubernetes Service (AKS)
                                  • Oracle Kubernetes Engine (OKE)
                                  • Rancher Kubernetes Engine (RKE1)
                                  • Rancher Kubernetes Engine 2 (RKE2)

                                  Important

                                  The latest release of the Run:ai cluster supports Kubernetes 1.29 to 1.32 and OpenShift 4.14 to 4.17

                                  For existing Kubernetes clusters, see the following Kubernetes version support matrix for the latest Run:ai cluster releases:

                                  Run:ai version Supported Kubernetes versions Supported OpenShift versions v2.13 1.23 to 1.28 4.10 to 4.13 v2.16 1.26 to 1.28 4.11 to 4.14 v2.17 1.27 to 1.29 4.12 to 4.15 v2.18 1.28 to 1.30 4.12 to 4.16 v2.19 1.28 to 1.31 4.12 to 4.17 v2.20 (latest) 1.29 to 1.32 4.14 to 4.17

                                  For information on supported versions of managed Kubernetes, it's important to consult the release notes provided by your Kubernetes service provider. There, you can confirm the specific version of the underlying Kubernetes platform supported by the provider, ensuring compatibility with Run:ai. For an up-to-date end-of-life statement see Kubernetes Release History or OpenShift Container Platform Life Cycle Policy

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#kubernetes-pod-security-admission","title":"Kubernetes Pod Security Admission","text":"

                                  Run:ai v2.15 and above supports restricted policy for Pod Security Admission (PSA) on OpenShift only. Other Kubernetes distributions are only supported with privileged policy.

                                  For Run:ai on OpenShift to run with PSA restricted policy:

                                  • Label the runai namespace as described in Pod Security Admission with the following labels:
                                  pod-security.kubernetes.io/audit=privileged\npod-security.kubernetes.io/enforce=privileged\npod-security.kubernetes.io/warn=privileged\n
                                  • The workloads submitted through Run:ai should comply with the restrictions of the PSA restricted policy. This can be enforced using Policies.
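                                  As an example, the labels above can be applied to the runai namespace with a single command. This is a sketch; adjust the namespace name if yours differs:
                                  kubectl label ns runai pod-security.kubernetes.io/audit=privileged pod-security.kubernetes.io/enforce=privileged pod-security.kubernetes.io/warn=privileged\n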
                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#kubernetes-ingress-controller","title":"Kubernetes Ingress Controller","text":"

                                  Run:ai cluster requires Kubernetes Ingress Controller to be installed on the Kubernetes cluster.

                                  • OpenShift, RKE, and RKE2 come with a pre-installed ingress controller.
                                  • Internal tests are being performed on NGINX, Rancher NGINX, OpenShift Router, and Istio.
                                  • Make sure that a default ingress controller is set.
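
                                  To check which ingress class, if any, is currently marked as the cluster default, and to mark one as default, you can use the commands below. The class name nginx is an assumption; use the class name of your controller:

                                  kubectl get ingressclass\nkubectl annotate ingressclass nginx ingressclass.kubernetes.io/is-default-class=true\n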

                                  There are many ways to install and configure different ingress controllers. A simple example of installing and configuring the NGINX ingress controller using Helm:

                                  Vanilla Kubernetes / Managed Kubernetes (EKS, GKE, AKS) / Oracle Kubernetes Engine (OKE)

                                  Run the following commands:

                                  helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx\nhelm repo update\nhelm upgrade -i nginx-ingress ingress-nginx/ingress-nginx \\\n    --namespace nginx-ingress --create-namespace \\\n    --set controller.kind=DaemonSet \\\n    --set controller.service.externalIPs=\"{<INTERNAL-IP>,<EXTERNAL-IP>}\" # Replace <INTERNAL-IP> and <EXTERNAL-IP> with the internal and external IP addresses of one of the nodes\n

                                  Run the following commands:

                                  helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx\nhelm repo update\nhelm install nginx-ingress ingress-nginx/ingress-nginx \\\n    --namespace nginx-ingress --create-namespace\n

                                  Run the following commands:

                                  helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx\nhelm repo update\nhelm install nginx-ingress ingress-nginx/ingress-nginx \\\n    --namespace ingress-nginx --create-namespace \\\n    --set controller.service.annotations.oci.oraclecloud.com/load-balancer-type=nlb \\\n    --set controller.service.annotations.oci-network-load-balancer.oraclecloud.com/is-preserve-source=True \\\n    --set controller.service.annotations.oci-network-load-balancer.oraclecloud.com/security-list-management-mode=None \\\n    --set controller.service.externalTrafficPolicy=Local \\\n    --set controller.service.annotations.oci-network-load-balancer.oraclecloud.com/subnet=<SUBNET-ID> # Replace <SUBNET-ID> with the subnet ID of one of your cluster\n
                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#nvidia-gpu-operator","title":"NVIDIA GPU Operator","text":"

                                  Run:ai Cluster requires the NVIDIA GPU Operator to be installed on the Kubernetes cluster, and supports versions 22.9 to 24.6.

                                  See Installing the NVIDIA GPU Operator, followed by the notes below:

                                  • Use the default gpu-operator namespace. Otherwise, you must specify the target namespace using the flag runai-operator.config.nvidiaDcgmExporter.namespace as described in customized cluster installation.
                                  • NVIDIA drivers may already be installed on the nodes. In such cases, use the NVIDIA GPU Operator flag --set driver.enabled=false. DGX OS is one such example, as it comes bundled with NVIDIA Drivers.
                                  • For distribution-specific additional instructions see below:
                                  OpenShift Container Platform (OCP)

                                  The Node Feature Discovery (NFD) Operator is a prerequisite for the NVIDIA GPU Operator in OpenShift. Install the NFD Operator using the Red Hat OperatorHub catalog in the OpenShift Container Platform web console. For more information see Installing the Node Feature Discovery (NFD) Operator

                                  Elastic Kubernetes Service (EKS)
                                  • When setting-up the cluster, do not install the NVIDIA device plug-in (we want the NVIDIA GPU Operator to install it instead).
                                  • When using the eksctl tool to create a cluster, use the flag --install-nvidia-plugin=false to disable the installation.

                                  For GPU nodes, EKS uses an AMI which already contains the NVIDIA drivers. As such, you must use the GPU Operator flags: --set driver.enabled=false
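
                                  A minimal sketch of installing the GPU Operator with that flag via Helm. The repository URL and chart name follow NVIDIA's public Helm repository; verify them against the NVIDIA GPU Operator documentation:

                                  helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update\nhelm install gpu-operator nvidia/gpu-operator \\\n    -n gpu-operator --create-namespace \\\n    --set driver.enabled=false\n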

                                  Google Kubernetes Engine (GKE)

                                  Before installing the GPU Operator, create the gpu-operator namespace by running

                                  kubectl create ns gpu-operator\n

                                  Then create the following file:

                                  resourcequota.yaml
                                  apiVersion: v1\nkind: ResourceQuota\nmetadata:\n  name: gcp-critical-pods\n  namespace: gpu-operator\nspec:\n  scopeSelector:\n    matchExpressions:\n    - operator: In\n      scopeName: PriorityClass\n      values:\n      - system-node-critical\n      - system-cluster-critical\n

                                  And then run:

                                  kubectl apply -f resourcequota.yaml\n
                                  Rancher Kubernetes Engine 2 (RKE2)

                                  Make sure to specify the CONTAINERD_CONFIG option exactly as outlined in the documentation and custom configuration guide, using the path /var/lib/rancher/rke2/agent/etc/containerd/config.toml.tmpl. Do not create the file manually if it does not already exist. The GPU Operator will handle this configuration during deployment.

                                  Oracle Kubernetes Engine (OKE)
                                  • During cluster setup, create a nodepool, and set initial_node_labels to include oci.oraclecloud.com/disable-gpu-device-plugin=true which disables the NVIDIA GPU device plugin.
                                  • For GPU nodes, OKE defaults to Oracle Linux, which is incompatible with NVIDIA drivers. To resolve this, use a custom Ubuntu image instead.

                                  For troubleshooting information, see the NVIDIA GPU Operator Troubleshooting Guide.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#prometheus","title":"Prometheus","text":"

                                  Run:ai Cluster requires Prometheus to be installed on the Kubernetes cluster.

                                  • OpenShift comes pre-installed with Prometheus
                                  • For RKE2 see Enable Monitoring instructions to install Prometheus

                                  There are many ways to install Prometheus. As a simple example, to install the community Kube-Prometheus Stack using Helm, run the following commands:

                                  helm repo add prometheus-community https://prometheus-community.github.io/helm-charts\nhelm repo update\nhelm install prometheus prometheus-community/kube-prometheus-stack \\\n    -n monitoring --create-namespace --set grafana.enabled=false\n
                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#optional-software-requirements","title":"Optional software requirements","text":"

                                  Optional Run:ai capabilities, such as Distributed Training and Inference, require additional Kubernetes applications (frameworks) to be installed on the cluster.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#distributed-training","title":"Distributed training","text":"

                                  Distributed training enables training of AI models over multiple nodes. This requires installing a distributed training framework on the cluster. The following frameworks are supported:

                                  • TensorFlow
                                  • PyTorch
                                  • XGBoost
                                  • MPI v2

                                  There are several ways to install each framework. A simple installation method is the Kubeflow Training Operator, which includes TensorFlow, PyTorch, and XGBoost.

                                  It is recommended to use Kubeflow Training Operator v1.8.1 and MPI Operator v0.6.0 or later for compatibility with advanced workload capabilities, such as Stopping a workload and Scheduling rules.

                                  • To install the Kubeflow Training Operator for TensorFlow, PyTorch and XGBoost frameworks, run the following command:
                                  kubectl apply -k \"github.com/kubeflow/training-operator.git/manifests/overlays/standalone?ref=v1.8.1\"\n
                                  • To install the MPI Operator for MPI v2, run the following command:
                                  kubectl apply --server-side -f https://raw.githubusercontent.com/kubeflow/mpi-operator/v0.6.0/deploy/v2beta1/mpi-operator.yaml\n

                                  Note

                                  If you require both the MPI Operator and Kubeflow Training Operator, follow the steps below:

                                  • Install the Kubeflow Training Operator as described above.
                                  • Disable and delete MPI v1 in the Kubeflow Training Operator by running:
                                  kubectl patch deployment training-operator -n kubeflow --type='json' -p='[{\"op\": \"add\", \"path\": \"/spec/template/spec/containers/0/args\", \"value\": [\"--enable-scheme=tfjob\", \"--enable-scheme=pytorchjob\", \"--enable-scheme=xgboostjob\"]}]'\nkubectl delete crd mpijobs.kubeflow.org\n
                                  • Install the MPI Operator as described above.
                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#inference","title":"Inference","text":"

                                  Inference enables serving of AI models. This requires the Knative Serving framework to be installed on the cluster and supports Knative versions 1.11 to 1.16.
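                                  As a minimal sketch only (assuming the upstream YAML-based install of Knative Serving with the Kourier networking layer, and using an example version within the supported range; the Installing Knative instructions referenced below remain the authoritative procedure):

                                  # Install Knative Serving CRDs and core components (example version 1.16.0)
                                  kubectl apply -f https://github.com/knative/serving/releases/download/knative-v1.16.0/serving-crds.yaml
                                  kubectl apply -f https://github.com/knative/serving/releases/download/knative-v1.16.0/serving-core.yaml
                                  # Install a networking layer (Kourier is one option)
                                  kubectl apply -f https://github.com/knative/net-kourier/releases/download/knative-v1.16.0/kourier.yaml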

                                  Follow the Installing Knative instructions. After installation, configure Knative to use the Run:ai scheduler and features, by running:

                                  kubectl patch configmap/config-autoscaler \\\n  --namespace knative-serving \\\n  --type merge \\\n  --patch '{\"data\":{\"enable-scale-to-zero\":\"true\"}}' && \\\nkubectl patch configmap/config-features \\\n  --namespace knative-serving \\\n  --type merge \\\n  --patch '{\"data\":{\"kubernetes.podspec-schedulername\":\"enabled\",\"kubernetes.podspec-affinity\":\"enabled\",\"kubernetes.podspec-tolerations\":\"enabled\",\"kubernetes.podspec-volumes-emptydir\":\"enabled\",\"kubernetes.podspec-securitycontext\":\"enabled\",\"kubernetes.containerspec-addcapabilities\":\"enabled\",\"kubernetes.podspec-persistent-volume-claim\":\"enabled\",\"kubernetes.podspec-persistent-volume-write\":\"enabled\",\"multi-container\":\"enabled\",\"kubernetes.podspec-init-containers\":\"enabled\"}}'\n
                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#knative-autoscaling","title":"Knative Autoscaling","text":"

                                  Run:ai allows autoscaling a deployment according to the following metrics:

                                  • Latency (milliseconds)
                                  • Throughput (requests/sec)
                                  • Concurrency (requests)

                                  Using a custom metric (for example, Latency) requires installing the Kubernetes Horizontal Pod Autoscaler (HPA). Use the following command to install it. Make sure to replace {VERSION} in the command below with a supported Knative version.

                                  kubectl apply -f https://github.com/knative/serving/releases/download/knative-{VERSION}/serving-hpa.yaml\n
                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#domain-name-requirement","title":"Domain Name Requirement","text":"

                                  The following requirement applies to the cluster's domain name.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-prerequisites/#fully-qualified-domain-name-fqdn","title":"Fully Qualified Domain Name (FQDN)","text":"

                                  You must have a Fully Qualified Domain Name (FQDN) to install the Run:ai cluster (for example, runai.mycorp.local). This cannot be an IP address. The domain name needs to be accessible inside the organization only. You also need a TLS certificate (private and public keys) for HTTPS access.
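                                  For example, assuming the certificate and key are available as PEM files, the TLS secret referenced by the cluster installation can be created as follows (the default secret name shown here is taken from the customization values table, and the runai namespace is assumed):

                                  kubectl create namespace runai   # if it does not already exist
                                  kubectl create secret tls runai-cluster-domain-tls-secret -n runai \
                                      --cert /path/to/fullchain.pem \
                                      --key /path/to/private.pem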

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-setup-intro/","title":"SaaS Cluster Setup Introduction","text":"

                                  This section is a step-by-step guide for setting up a Run:ai cluster.

                                  • A Run:ai cluster is a Kubernetes application installed on top of a Kubernetes cluster.
                                  • A Run:ai cluster connects to the Run:ai control plane on the cloud. The control plane provides a control point as well as a monitoring and control user interface for Administrators and Researchers.
                                  • A customer may have multiple Run:ai Clusters, all connecting to a single control plane.

                                  For additional details, see the Run:ai system components.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-setup-intro/#documents","title":"Documents","text":"
                                  • Review Run:ai cluster System Requirements and Network Requirements.
                                  • Cluster Install step-by-step guide.
                                  • Look for troubleshooting tips if required.
                                  • Cluster Upgrade and Cluster Uninstall instructions.
                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-setup-intro/#customization","title":"Customization","text":"

                                  For a list of optional customizations, see Customize Installation.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-setup-intro/#additional-configuration","title":"Additional Configuration","text":"

                                  For a list of advanced configuration scenarios, such as configuring researcher authentication, Single sign-on, limiting the installation to specific nodes, and more, see the Configuration Articles section.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-setup-intro/#next-steps","title":"Next Steps","text":"

                                  After setting up the cluster, you may want to start setting up Researchers. See: Researcher Setup.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-upgrade/","title":"Cluster Upgrade","text":"

                                  This article explains how to upgrade the Run:ai cluster version.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-upgrade/#before-upgrade","title":"Before upgrade","text":"

                                  There are a number of matters to consider prior to upgrading the Run:ai cluster version.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-upgrade/#system-and-network-requirements","title":"System and network requirements","text":"

                                  Before upgrading the Run:ai cluster, validate that the latest system requirements and network requirements are met, as they can change from time to time.

                                  Important

                                  It is highly recommended to upgrade the Kubernetes version together with the Run:ai cluster version, to ensure compatibility with the latest supported version of your Kubernetes distribution.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-upgrade/#helm","title":"Helm","text":"

                                  The latest releases of the Run:ai cluster require Helm 3.14 or above.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-upgrade/#upgrade","title":"Upgrade","text":"

                                  Follow the instructions to upgrade using Helm. The Helm commands to upgrade the Run:ai cluster version may differ between versions. The steps below describe how to get the instructions from the Run:ai UI.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-upgrade/#getting-the-installation-instructions","title":"Getting the installation instructions","text":"

                                  Follow the setup steps below to get the installation instructions for upgrading the Run:ai cluster.

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-upgrade/#setup","title":"Setup","text":"
                                  1. In the Run:ai UI, go to Clusters
                                  2. Select the cluster you want to upgrade
                                  3. Click INSTALLATION INSTRUCTIONS
                                  4. Optional: Select the Run:ai cluster version (latest, by default)
                                  5. Click CONTINUE
                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-upgrade/#installation-instructions","title":"Installation instructions","text":"
                                  1. Follow the installation instructions and run the Helm commands provided on your Kubernetes cluster (see the additional instructions below when upgrading to v2.13, and the troubleshooting section below if the installation fails)
                                  2. Click DONE
                                  3. Once installation is complete, validate that the cluster is Connected and listed with the new cluster version (see the cluster troubleshooting scenarios). The cluster is now upgraded to the selected version.

                                  Note

                                  To upgrade to a specific version, modify the --version flag by specifying the desired <version-number>. You can find all available versions by using the helm search repo command.
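                                  A hedged example, assuming the runai Helm repository alias used elsewhere in these instructions:

                                  helm search repo runai/runai-cluster -l    # list all available chart versions
                                  # then re-run the Helm command provided by the UI, changing only the version flag:
                                  #   --version <version-number>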

                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-upgrade/#upgrade-to-runai-cluster-version-213-old-release","title":"Upgrade to Run:ai cluster version 2.13 (old release)","text":"

                                  Run:ai cluster version 2.13 (old release) does not support migration of the configured Helm values. If you have customized configurations you want to migrate, follow the additional steps below:

                                  1. Download the Run:ai Helm values file by running the command provided in your terminal
                                  2. Run the following command to save existing cluster Helm values into old-values.yaml
                                  helm get values runai-cluster -n runai > old-values.yaml\n
                                  3. Identify the configured custom values that you want to migrate
                                  4. Manually merge the values from old-values.yaml into the new values file and apply it, as sketched below
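                                  A hedged sketch of applying the merged file (the file name merged-values.yaml is illustrative; keep the remaining flags from the UI-provided command):

                                  helm upgrade -i runai-cluster runai/runai-cluster -n runai \
                                      --version <version-number> \
                                      -f merged-values.yaml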
                                  "},{"location":"admin/runai-setup/cluster-setup/cluster-upgrade/#troubleshooting","title":"Troubleshooting","text":"

                                  If you encounter an issue with the cluster upgrade, use the troubleshooting scenario below.

                                  Installation fails

                                  If the Run:ai cluster upgrade fails, check the installation logs to identify the issue.

                                  Run the following script to print the installation logs:

                                  curl -fsSL https://raw.githubusercontent.com/run-ai/public/main/installation/get-installation-logs.sh\n
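                                  Note that the command above only downloads the script to standard output. Assuming it is intended to be executed directly (an assumption, not something stated here), it can be piped to a shell:

                                  curl -fsSL https://raw.githubusercontent.com/run-ai/public/main/installation/get-installation-logs.sh | bash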
                                  Cluster status

                                  If the Run:ai cluster upgrade completes, but the cluster status does not show as Connected, refer to the cluster troubleshooting scenarios.

                                  "},{"location":"admin/runai-setup/cluster-setup/customize-cluster-install/","title":"Customize Installation","text":"

                                  This article explains the available configurations for customizing the Run:ai cluster installation.

                                  "},{"location":"admin/runai-setup/cluster-setup/customize-cluster-install/#helm-chart-values","title":"Helm chart values","text":"

                                  The Run:ai cluster installation can be customized to support your environment via Helm values files or Helm install flags.
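                                  As a hedged sketch, using the global.image.registry key from the Values table below (the file name custom-values.yaml is illustrative):

                                  cat > custom-values.yaml <<'EOF'
                                  global:
                                    image:
                                      registry: registry.mycompany.local
                                  EOF

                                  # pass the values file to the cluster installation command provided by the Run:ai UI
                                  helm upgrade -i runai-cluster runai/runai-cluster -n runai -f custom-values.yaml
                                  # or, equivalently, use an install flag:
                                  #   --set global.image.registry=registry.mycompany.local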

                                  These configurations are saved in the runaiconfig Kubernetes object and can be edited post-installation as needed. For more information, see Advanced Cluster Configurations.
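                                  For instance, assuming the default object name runai in the runai namespace (an assumption based on the Advanced Cluster Configurations article), the saved configuration can be inspected or edited post-installation with:

                                  kubectl get runaiconfig runai -n runai -o yaml   # view the current configuration
                                  kubectl edit runaiconfig runai -n runai          # edit it in place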

                                  "},{"location":"admin/runai-setup/cluster-setup/customize-cluster-install/#values","title":"Values","text":"

                                  The following table lists the available Helm chart values that can be configured to customize the Run:ai cluster installation.

                                  Key Description Default global.image.registry (string) Global Docker image registry Default: \"\" global.additionalImagePullSecrets (list) List of image pull secrets references Default: [] spec.researcherService.ingress.tlsSecret (string) Existing secret key where cluster TLS Certificates are stored (non-OpenShift) Default: runai-cluster-domain-tls-secret spec.researcherService.route.tlsSecret (string) Existing secret key where cluster TLS Certificates are stored (OpenShift only) Default: \"\" spec.prometheus.spec.image (string) Due to a known issue In the Prometheus Helm chart, the imageRegistry setting is ignored. To pull the image from a different registry, you can manually specify the Prometheus image reference. Default: quay.io/prometheus/prometheus spec.prometheus.spec.imagePullSecrets (string) List of image pull secrets references in the runai namespace to use for pulling Prometheus images (relevant for air-gapped installations). Default: []"},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/","title":"Install using Base Command Manager","text":"

                                  This article explains the steps required to install the Run:ai cluster on a DGX Kubernetes Cluster using NVIDIA Base Command Manager (BCM).

                                  "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#runai-installer","title":"Run:ai Installer","text":"

                                  The Run:ai Installer is a User Interface (UI) wizard that simplifies the deployment of the Run:ai cluster on DGX. The Run:ai Installer can be installed via the BCM cluster wizard on cluster creation.

                                  Note

                                  For advanced configuration and custom deployment options, refer to the Install using Helm.

                                  "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#before-installation","title":"Before installation","text":"

                                  There are a number of matters to consider prior to installing using the Run:ai Installer.

                                  "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#application-secret-key","title":"Application secret key","text":"

                                  An Application secret key is required to connect the cluster to the Run:ai Platform. To get the Application secret key, a new cluster must be added.

                                  1. Follow the Adding a new cluster setup instructions. Do not follow the Installation instructions.
                                  2. Once the cluster instructions are displayed, find the controlPlane.clientSecret flag in the displayed Helm command, then copy and save its value.

                                  Note

                                  For DGX Bundle customers installing their first Run:ai cluster, the Application secret key will be provided by the Run:ai Support team.

                                  "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#system-and-network-requirements","title":"System and network requirements","text":"

                                  Before installing the Run:ai cluster on a DGX system using BCM, ensure that your system meets the System requirements and Network requirements.

                                  The BCM cluster wizard deploys essential Software Requirements, such as the Kubernetes Ingress Controller, NVIDIA GPU Operator, and Prometheus, as part of the Run:ai Installer deployment. Additional optional software requirements, for Distributed training and Inference, require manual setup.

                                  "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#tenant-name","title":"Tenant Name","text":"

                                  Your tenant name is predefined and supplied by Run:ai. Each customer is provided with a unique, dedicated URL in the format <tenant-name>.run.ai which includes the required tenant name.

                                  "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#tls-certificate","title":"TLS certificate","text":"

                                  A TLS private and public key pair for the cluster\u2019s Fully Qualified Domain Name (FQDN) is required for HTTPS access to the cluster.

                                  Important

                                  The TLS certificate must be trusted. Self-signed certificates are not supported.

                                  "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#installation","title":"Installation","text":"

                                  Follow these instructions to install using BCM.

                                  "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#installing-a-cluster","title":"Installing a cluster","text":"

                                  The cluster installer is available via the locally installed BCM landing page:

                                  1. Go to the locally installed BCM landing page and select the Run:ai tile, or go directly to http://<BCM-CLUSTER-IP>:30080/runai-installer (HTTP only)
                                  2. Click VERIFY to check that the System Requirements are met.
                                  3. After verification completes successfully, click CONTINUE.
                                  4. Enter the cluster information and click CONTINUE.
                                  5. The Run:ai installation will start and should complete within a few minutes.
                                  6. Once the message Run:ai was installed successfully! is displayed, click START USING RUN:AI to launch the tenant's login page in a new browser tab.
                                  "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#troubleshooting","title":"Troubleshooting","text":"

                                  If you encounter an issue with the installation, try the troubleshooting scenario below.

                                  "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#runai-installer_1","title":"Run:ai Installer","text":"

                                  The Run:ai Installer is a pod in Kubernetes. The pod is responsible for the installation preparation and prerequisite gathering phase. In case of an error during the prerequisites verification, run the following commands to print the logs:

                                  kubectl get pods -n runai | grep 'cluster-installer' # Find the cluster installer pod's name\nkubectl logs <POD-NAME> -n runai # Print the cluster installer pod logs\n
                                  "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#installation_1","title":"Installation","text":"

                                  If the Run:ai cluster installation failed, check the installation logs to identify the issue. Run the following script to print the installation logs:

                                  curl -fsSL https://raw.githubusercontent.com/run-ai/public/main/installation/get-installation-logs.sh\n
                                  "},{"location":"admin/runai-setup/cluster-setup/dgx-bundle/#cluster-status","title":"Cluster status","text":"

                                  If the Run:ai cluster installation completed but the cluster status does not change to Connected, check the cluster troubleshooting scenarios.

                                  "},{"location":"admin/runai-setup/cluster-setup/network-req/","title":"Network Requirements","text":"

                                  The following network requirements are for the Run:ai cluster installation and usage.

                                  "},{"location":"admin/runai-setup/cluster-setup/network-req/#external-access","title":"External access","text":"

                                  Set out below are the domains to whitelist and ports to open for installation, upgrade, and usage of the application and its management.

                                  Ensure the inbound and outbound rules are correctly applied to your firewall.

                                  "},{"location":"admin/runai-setup/cluster-setup/network-req/#inbound-rules","title":"Inbound rules","text":"

                                  To allow your organization\u2019s Run:ai users to interact with the cluster using the Run:ai Command-line interface, or access specific UI features, certain inbound ports need to be open.

                                  Name Description Source Destination Port Run:ai cluster Run:ai cluster HTTPS entrypoint 0.0.0.0 all k8s nodes 443"},{"location":"admin/runai-setup/cluster-setup/network-req/#outbound-rules","title":"Outbound rules","text":"

                                  For the Run:ai cluster installation and usage, certain outbound ports must be open.

                                  Name Description Source Destination Port Run:ai Platform Run:ai cloud instance Run:ai system nodes app.run.ai 443 Grafana Run:ai cloud metrics store Run:ai system nodes prometheus-us-central1.grafana.net and runailabs.com 443 Google Container Registry Run:ai image repository All K8S nodes gcr.io/run-ai-prod 443 JFrog Artifactory Run:ai Helm repository Helm client machine runai.jfrog.io 443

                                  The Run:ai installation has software requirements that require additional components to be installed on the cluster. This article includes simple installation examples which can be used optionally and require the following cluster outbound ports to be open:

                                  Name Description Source Destination Port Kubernetes Registry Ingress Nginx image repository All K8S nodes registry.k8s.io 443 Google Container Registry GPU Operator, and Knative image repository All K8S nodes gcr.io 443 Red Hat Container Registry Prometheus Operator image repository All K8S nodes quay.io 443 Docker Hub Registry Training Operator image repository All K8S nodes docker.io 443

                                  Note

                                  If you are using an HTTP proxy, contact Run:ai support for further instructions.

                                  "},{"location":"admin/runai-setup/cluster-setup/network-req/#internal-network","title":"Internal network","text":"

                                  Ensure that all Kubernetes nodes can communicate with each other across all necessary ports. Kubernetes assumes full interconnectivity between nodes, so you must configure your network to allow this seamless communication. Specific port requirements may vary depending on your network setup.

                                  "},{"location":"admin/runai-setup/cluster-setup/project-management/","title":"Manually Create Projects","text":""},{"location":"admin/runai-setup/cluster-setup/project-management/#manual-creation-of-namespaces-for-projects","title":"Manual Creation of Namespaces for Projects","text":""},{"location":"admin/runai-setup/cluster-setup/project-management/#introduction","title":"Introduction","text":"

                                  The Administrator creates Run:ai Projects via the Run:ai user interface. When enabling Researcher Authentication you also assign users to Projects.

                                  Run:ai Projects are implemented as Kubernetes namespaces. When creating a new Run:ai Project, Run:ai does the following automatically:

                                  1. Creates a namespace by the name of runai-<PROJECT-NAME>.
                                  2. Labels the namespace as managed by Run:ai.
                                  3. Provides access to the namespace for Run:ai services.
                                  4. Associates users with the namespace.

                                  This process may need to be altered if:

                                  • Researchers already have existing Kubernetes namespaces
                                  • The organization's Kubernetes namespace naming convention does not allow the runai- prefix.
                                  • The organization's policy does not allow the automatic creation of namespaces.
                                  "},{"location":"admin/runai-setup/cluster-setup/project-management/#process","title":"Process","text":"

                                  Run:ai allows the association of a Run:ai Project with any existing Kubernetes namespace:

                                  • Disable namespace creation by setting the cluster flag createNamespaces to false. For more information see Advanced Cluster Configuration
                                  • Using the Run:ai User Interface, create a new Project <PROJECT-NAME>. A namespace will not be created.
                                  • Associate an existing namespace <NAMESPACE> with the Run:ai project by running:
                                  kubectl label ns <NAMESPACE>  runai/queue=<PROJECT_NAME>\n

                                  Caution

                                  Setting the createNamespaces flag to false moves the responsibility of creating namespaces to match Run:ai Projects to the administrator.

                                  "},{"location":"admin/runai-setup/self-hosted/overview/","title":"Self Hosted Run:ai Installation","text":"

                                  The self-hosted option is for organizations that cannot use a SaaS solution due to data leakage concerns.

                                  Run:ai self-hosting comes with two variants:

                                  Self-hosting Type Description Connected The organization can freely download from the internet (though upload is not allowed) Air-gapped The organization has no connection to the internet

                                  The self-hosted installation is priced differently. For further information please talk to Run:ai sales.

                                  "},{"location":"admin/runai-setup/self-hosted/overview/#self-hosting-with-kubernetes-vs-openshift","title":"Self-hosting with Kubernetes vs OpenShift","text":"

                                  Run:ai has been certified with a specified set of Kubernetes distributions. The OpenShift installation is different from the rest. As such, the Run:ai self-hosted installation instructions are divided into two separate sections:

                                  • OpenShift-based installation. See Run:ai OpenShift installation. The Run:ai operator for OpenShift is certified by Red Hat.
                                  • Kubernetes-based installation. See Run:ai Kubernetes installation.
                                  "},{"location":"admin/runai-setup/self-hosted/k8s/additional-clusters/","title":"Installing additional Clusters","text":"

                                  The first Run:ai cluster is typically installed on the same Kubernetes cluster as the Run:ai control plane. Run:ai supports multiple clusters per single control plane. This document is about installing additional clusters on different Kubernetes clusters.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/additional-clusters/#installation","title":"Installation","text":"

                                  Follow the Run:ai SaaS installation network instructions as described in Domain name requirement. Specifically:

                                  1. Install Run:ai prerequisites. Including ingress controller and Prometheus.
                                  2. The Cluster should have a dedicated URL with a trusted certificate.
                                  3. Create a secret in the Run:ai namespace containing the details of a trusted certificate.
                                  4. Run the helm command as instructed.
                                  "},{"location":"admin/runai-setup/self-hosted/k8s/backend/","title":"Install the Run:ai Control Plane","text":""},{"location":"admin/runai-setup/self-hosted/k8s/backend/#prerequisites-and-preparations","title":"Prerequisites and preparations","text":"

                                  Make sure you have followed the Control Plane prerequisites and preparations.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/backend/#helm-install","title":"Helm install","text":"

                                  Run the helm command below:

                                  ConnectedAirgapped
                                  helm repo add runai-backend https://runai.jfrog.io/artifactory/cp-charts-prod\nhelm repo update\nhelm upgrade -i runai-backend -n runai-backend runai-backend/control-plane --version \"~2.20.0\" \\\n    --set global.domain=<DOMAIN>  # (1)\n
                                  1. Domain name described here.

                                  Info

                                  To install a specific version, add --version <version> to the install command. You can find available versions by running helm search repo -l runai-backend.

                                  helm upgrade -i runai-backend control-plane-<VERSION>.tgz  \\ # (1)\n    --set global.domain=<DOMAIN>  \\ # (2)\n    --set global.customCA.enabled=true \\  # (3)\n    -n runai-backend -f custom-env.yaml  # (4)\n
                                  1. Replace <VERSION> with the Run:ai control plane version.
                                  2. Domain name described here.
                                  3. See the Local Certificate Authority instructions below
                                  4. custom-env.yaml should have been created by the prepare installation script in the previous section.

                                  Tip

                                  Use the --dry-run flag to gain an understanding of what is being installed before the actual installation.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/backend/#additional-runai-configurations-optional","title":"Additional Run:ai configurations (optional)","text":"

                                  There may be cases where you need to set additional properties. To apply the changes, run helm upgrade and use --set to set specific configurations, then restart the relevant Run:ai pods so they can fetch the new configurations.
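                                  A hedged example, using one key from the table below (the value shown is illustrative; restarting every deployment in the namespace is a broad way of making sure the pods pick up the change):

                                  helm upgrade -i runai-backend runai-backend/control-plane -n runai-backend \
                                      --reuse-values \
                                      --set global.ingress.ingressClass=nginx

                                  kubectl rollout restart deployment -n runai-backend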

                                  Key Change Description global.ingress.ingressClass Ingress class Run:ai default is using NGINX. If your cluster has a different ingress controller, you can configure the ingress class to be created by Run:ai global.ingress.tlsSecretName TLS secret name Run:ai requires the creation of a secret with domain certificate. If the runai-backend namespace already had such a secret, you can set the secret name here <component> resources: limits: cpu: 500m memory: 512Mi requests: cpu: 250m memory: 256Mi Pod request and limits Set Run:ai and 3rd party services' resources disableIstioSidecarInjection.enabled Disable Istio sidecar injection Disable the automatic injection of Istio sidecars across the entire Run:ai Control Plane services. global.affinity System nodes Sets the system nodes where the Run:ai control plane services are scheduled."},{"location":"admin/runai-setup/self-hosted/k8s/backend/#additional-3rd-party-configurations-optional","title":"Additional 3rd party configurations (optional)","text":"

                                  The Run:ai Control Plane chart includes multiple sub-charts of 3rd party components:

                                  • PostgreSQL - Data store
                                  • Thanos - Metrics Store
                                  • Keycloakx - Identity & Access Management
                                  • Grafana - Analytics Dashboard
                                  • Redis - Caching (Disabled, by default)

                                  Tip

                                  Click on any component to view its chart values and configurations.

                                  If you have opted to connect to an external PostgreSQL database, refer to the additional configurations table below. Adjust the following parameters based on your connection details:

                                  1. Disable PostgreSQL deployment - postgresql.enabled
                                  2. Run:ai connection details - global.postgresql.auth
                                  3. Grafana connection details - grafana.dbUser, grafana.dbPassword
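                                  Putting these together, a hedged sketch of the corresponding Helm flags (placeholders follow the External Postgres database preparation section; verify the exact keys against the tables below):

                                  helm upgrade -i runai-backend runai-backend/control-plane -n runai-backend \
                                      --reuse-values \
                                      --set postgresql.enabled=false \
                                      --set global.postgresql.auth.host=<POSTGRESQL_HOST> \
                                      --set global.postgresql.auth.port=5432 \
                                      --set global.postgresql.auth.username=<ROLE_NAME> \
                                      --set global.postgresql.auth.password=<ROLE_PASSWORD> \
                                      --set grafana.dbUser=grafana \
                                      --set grafana.dbPassword=<GRAFANA_PASSWORD>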
                                  "},{"location":"admin/runai-setup/self-hosted/k8s/backend/#postgresql","title":"PostgreSQL","text":"Key Change Description postgresql.enabled PostgreSQL installation If set to false the PostgreSQL will not be installed global.postgresql.auth.host PostgreSQL host Hostname or IP address of the PostgreSQL server global.postgresql.auth.port PostgreSQL port Port number on which PostgreSQL is running global.postgresql.auth.username PostgreSQL username Username for connecting to PostgreSQL global.postgresql.auth.password PostgreSQL password Password for the PostgreSQL user specified by global.postgresql.auth.username global.postgresql.auth.postgresPassword PostgreSQL default admin password Password for the built-in PostgreSQL superuser (postgres) global.postgresql.auth.existingSecret Postgres Credentials (secret) Existing secret name with authentication credentials global.postgresql.auth.dbSslMode Postgres connection SSL mode Set the SSL mode, see list in Protection Provided in Different Modes, prefer mode is not supported postgresql.primary.initdb.password PostgreSQL default admin password Set the same password as in global.postgresql.auth.postgresPassword (if changed) postgresql.primary.persistence.storageClass Storage class The installation to work with a specific storage class rather than the default one"},{"location":"admin/runai-setup/self-hosted/k8s/backend/#thanos","title":"Thanos","text":"Key Change Description thanos.receive.persistence.storageClass Storage class The installation to work with a specific storage class rather than the default one"},{"location":"admin/runai-setup/self-hosted/k8s/backend/#keycloakx","title":"Keycloakx","text":"Key Change Description keycloakx.adminUser User name of the internal identity provider administrator This user is the administrator of Keycloak keycloakx.adminPassword Password of the internal identity provider administrator This password is for the administrator of Keycloak keycloakx.existingSecret Keycloakx Credentials (secret) Existing secret name with authentication credentials global.keycloakx.host KeyCloak (Run:ai internal identity provider) host path Override the DNS for Keycloak. This can be used to access Keycloak from outside the Run:ai Control Plane cluster via ingress

                                  The keycloakx.adminUser can only be set during the initial installation. The admin password, however, can also be changed later through the Keycloak UI, but you must also update the keycloakx.adminPassword value in the Helm chart using helm upgrade. Failing to update the Helm values after changing the password can lead to control plane services encountering errors.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/backend/#grafana","title":"Grafana","text":"Key Change Description grafana.db.existingSecret Grafana database connection credentials (secret) Existing secret name with authentication credentials grafana.dbUser Grafana database username Username for accessing the Grafana database grafana.dbPassword Grafana database password Password for the Grafana database user grafana.admin.existingSecret Grafana admin default credentials (secret) Existing secret name with authentication credentials grafana.adminUser Grafana username Override the Run:ai default user name for accessing Grafana grafana.adminPassword Grafana password Override the Run:ai default password for accessing Grafana"},{"location":"admin/runai-setup/self-hosted/k8s/backend/#redis","title":"Redis","text":"Key Change Description redisCache.auth.password Redis (Runai internal cache mechanism) applicative password Override the default password redisCache.auth.existingSecret Redis credentials (secret) Existing secret name with authentication credentials"},{"location":"admin/runai-setup/self-hosted/k8s/backend/#next-steps","title":"Next Steps","text":""},{"location":"admin/runai-setup/self-hosted/k8s/backend/#connect-to-runai-user-interface","title":"Connect to Run:ai User interface","text":"

                                  Go to: runai.<domain>. Log in using the default credentials: User: test@run.ai, Password: Abcd!234. Go to the Users area and change the password.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/backend/#enable-forgot-password-optional","title":"Enable Forgot Password (optional)","text":"

                                  To support the Forgot password functionality, follow the steps below.

                                  • Go to runai.<domain>/auth and Log in.
                                  • Under Realm settings, select the Login tab and enable the Forgot password feature.
                                  • Under the Email tab, define an SMTP server, as explained here
                                  "},{"location":"admin/runai-setup/self-hosted/k8s/backend/#install-runai-cluster","title":"Install Run:ai Cluster","text":"

                                  Continue with installing a Run:ai Cluster.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/cluster/","title":"Self Hosted installation over Kubernetes - Cluster Setup","text":""},{"location":"admin/runai-setup/self-hosted/k8s/cluster/#prerequisites","title":"Prerequisites","text":"

                                  Install prerequisites as per System Requirements document.

                                  Note

                                  For self-hosted deployments, Kubernetes Ingress Controller and Cluster Fully Qualified Domain Name (FQDN) requirements are only necessary when the Run:ai Control Plane and Run:ai Cluster reside on separate Kubernetes clusters.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/cluster/#install-cluster","title":"Install Cluster","text":"ConnectedAirgapped

                                  Perform the cluster installation instructions explained here.

                                  Perform the cluster installation instructions explained here.

                                  On the second tab of the cluster wizard, when copying the helm command for installation, you will need to use the pre-provided installation file instead of using helm repositories. As such:

                                  • Do not add the helm repository and do not run helm repo update.
                                  • Instead, edit the helm upgrade command.
                                    • Replace runai/runai-cluster with runai-cluster-<version>.tgz.
                                    • Add --set global.image.registry=<Docker Registry address> where the registry address is as entered in the preparation section

                                  The command should look like the following:

                                  helm upgrade -i runai-cluster runai-cluster-<version>.tgz \\\n    --set controlPlane.url=... \\\n    --set controlPlane.clientSecret=... \\\n    --set cluster.uid=... \\\n    --set cluster.url=... --create-namespace \\\n    --set global.image.registry=registry.mycompany.local\n

                                  Tip

                                  Use the --dry-run flag to gain an understanding of what is being installed before the actual installation. For more details see Understanding cluster access roles.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/cluster/#optional-customize-installation","title":"(Optional) Customize Installation","text":"

                                  To customize specific aspects of the cluster installation see customize cluster installation.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/next-steps/","title":"Next Steps","text":"
                                  • Create additional Users.
                                  • Set up Project-based Researcher Access Control.
                                  • Set up Researchers to work with the Run:ai Command-line interface (CLI). See Installing the Run:ai Command-line Interface on how to install the CLI for users.
                                  • Review advanced setup and maintenance scenarios.
                                  "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/","title":"Preparing for a Run:ai Kubernetes installation","text":"

                                  The following section provides IT with the information needed to prepare for a Run:ai installation.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#prerequisites","title":"Prerequisites","text":"

                                  Follow the prerequisites as explained in Self-Hosted installation over Kubernetes.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#software-artifacts","title":"Software artifacts","text":"ConnectedAirgapped

                                  You should receive a file: runai-reg-creds.yaml from Run:ai Customer Support. The file provides access to the Run:ai Container registry.

                                  SSH into a node with kubectl access to the cluster and Docker installed. Run the following to enable image download from the Run:ai Container Registry on Google cloud:

                                  kubectl create namespace runai-backend\nkubectl apply -f runai-reg-creds.yaml\n

                                  You should receive a single file, runai-air-gapped-<VERSION>.tar.gz, from Run:ai Customer Support.

                                  SSH into a node with kubectl access to the cluster and Docker installed.

                                  Run:ai assumes the existence of a Docker registry for images, most likely installed within the organization. The installation requires the network address and port of the registry (referenced below as <REGISTRY_URL>).

                                  To extract Run:ai files, replace <VERSION> in the command below and run:

                                  tar xvf runai-airgapped-package-<VERSION>.tar.gz\n\nkubectl create namespace runai-backend\n

                                  Upload images

                                  Upload images to a local Docker Registry. Set the Docker Registry address in the form of NAME:PORT (do not add https):

                                  export REGISTRY_URL=<Docker Registry address>\n

                                  Run the following script (you must have dockerd installed and at least 20GB of free disk space):

                                  sudo -E ./setup.sh\n

                                  If Docker is configured to run as non-root then sudo is not required.

                                  The script should create a file named custom-env.yaml which will be used by the control-plane installation.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#private-docker-registry-optional","title":"Private Docker Registry (optional)","text":"

                                  To access the organization's Docker registry, you must set the registry's credentials (imagePullSecret).

                                  Create the secret named runai-reg-creds based on your existing credentials. For more information, see Pull an Image from a Private Registry.
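                                  A hedged example using kubectl (the runai-backend namespace is assumed here, matching the namespace created earlier; adjust the server, username, and password placeholders to your registry):

                                  kubectl create secret docker-registry runai-reg-creds \
                                      -n runai-backend \
                                      --docker-server=<REGISTRY_URL> \
                                      --docker-username=<REGISTRY_USER> \
                                      --docker-password=<REGISTRY_PASSWORD>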

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#configure-your-environment","title":"Configure your environment","text":""},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#domain-certificate","title":"Domain Certificate","text":"

                                  The Run:ai control plane requires a domain name (FQDN). You must supply a domain name as well as a trusted certificate for that domain.

                                  • When installing the first Run:ai cluster on the same Kubernetes cluster as the control plane, the Run:ai cluster URL will be the same as the control-plane URL.
                                  • When installing the Run:ai cluster on a separate Kubernetes cluster, follow the Run:ai Domain name requirement.
                                  • If your network is air-gapped, you will need to provide the Run:ai control-plane and cluster with information about the local certificate authority.

                                  You must provide the domain's private key and crt as a Kubernetes secret in the runai-backend namespace. Run:

                                  kubectl create secret tls runai-backend-tls -n runai-backend \\\n    --cert /path/to/fullchain.pem --key /path/to/private.pem\n
                                  "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#local-certificate-authority-air-gapped-only","title":"Local Certificate Authority (air-gapped only)","text":"

                                  In air-gapped environments, you must prepare the public key of your local certificate authority as described here. It will need to be installed in Kubernetes for the installation to succeed.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#mark-runai-system-workers-optional","title":"Mark Run:ai system workers (optional)","text":"

                                  You can optionally set the Run:ai control plane to run on specific nodes. Kubernetes will attempt to schedule Run:ai pods to these nodes. If these nodes lack resources, the Run:ai pods will be scheduled onto other, non-labeled nodes.

                                  To set system worker nodes run:

                                  kubectl label node <NODE-NAME> node-role.kubernetes.io/runai-system=true\n
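                                  To verify which nodes carry the label, you can run:

                                  kubectl get nodes -l node-role.kubernetes.io/runai-system=true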

                                  Warning

                                  Do not select the Kubernetes master as a runai-system node. This may cause Kubernetes to stop working (specifically if Kubernetes API Server is configured on 443 instead of the default 6443).

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#external-postgres-database-optional","title":"External Postgres database (optional)","text":"

                                  If you have opted to use an external PostgreSQL database, you need to perform initial setup to ensure successful installation. Follow these steps:

                                  1. Create a SQL script file, edit the parameters below, and save it locally:

                                    • Replace <DATABASE_NAME> with a dedicated database name for Run:ai in your PostgreSQL database.
                                    • Replace <ROLE_NAME> with a dedicated role name (user) for the Run:ai database.
                                    • Replace <ROLE_PASSWORD> with a password for the new PostgreSQL role.
                                    • Replace <GRAFANA_PASSWORD> with the password to be set for Grafana integration.
                                    -- Create a new database for runai\nCREATE DATABASE <DATABASE_NAME>; \n\n-- Create the role with login and password\nCREATE ROLE <ROLE_NAME>  WITH LOGIN PASSWORD '<ROLE_PASSWORD>'; \n\n-- Grant all privileges on the database to the role\nGRANT ALL PRIVILEGES ON DATABASE <DATABASE_NAME> TO <ROLE_NAME>; \n\n-- Connect to the newly created database\n\\c <DATABASE_NAME> \n\n-- grafana\nCREATE ROLE grafana WITH LOGIN PASSWORD '<GRAFANA_PASSWORD>'; \nCREATE SCHEMA grafana authorization grafana;\nALTER USER grafana set search_path='grafana';\n-- Exit psql\n\\q\n
                                  2. Run the following command on a machine where PostgreSQL client (pgsql) is installed:

                                    psql --host <POSTGRESQL_HOST> \\ # (1)\n--user <POSTGRESQL_USER> \\ # (2)\n--port <POSTGRESQL_PORT> \\ # (3)\n--dbname <POSTGRESQL_DB> \\ # (4)\n-a -f <SQL_FILE> \\ # (5)\n
                                    1. Replace <POSTGRESQL_HOST> with the PostgreSQL ip address or hostname.
                                    2. Replace <POSTGRESQL_USER> with the PostgreSQL username.
                                    3. Replace <POSTGRESQL_PORT> with the port number where PostgreSQL is running.
                                    4. Replace <POSTGRESQL_DB> with the name of your PostgreSQL database.
                                    5. Replace <SQL_FILE> with the path to the SQL script created in the previous step.
                                  "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#additional-permissions","title":"Additional permissions","text":"

                                  As part of the installation, you will be required to install the Run:ai Control Plane and Cluster Helm Charts. The Helm Charts require Kubernetes administrator permissions. You can review the exact permissions provided by using the --dry-run on both helm charts.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#validate-prerequisites","title":"Validate Prerequisites","text":"

                                  Once you believe that the Run:ai prerequisites and preparations are met, we highly recommend installing and running the Run:ai pre-install diagnostics script. The tool:

                                  • Tests the below requirements as well as additional failure points related to Kubernetes, NVIDIA, storage, and networking.
                                  • Looks at additional installed components and analyzes their relevance to a successful Run:ai installation.

                                  To use the script, download the latest version and run:

                                  chmod +x preinstall-diagnostics-<platform>\n./preinstall-diagnostics-<platform> --domain <dns-entry>\n

                                  If the script fails, or if the script succeeds but the Kubernetes system contains components other than Run:ai, locate the file runai-preinstall-diagnostics.txt in the current directory and send it to Run:ai technical support.

                                  For more information on the script including additional command-line flags, see here.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/preparations/#next-steps","title":"Next steps","text":"

                                  Continue with installing the Run:ai Control Plane.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/","title":"Self-Hosted installation over Kubernetes - Prerequisites","text":"

                                  Before proceeding with this document, please review the installation types documentation to understand the difference between air-gapped and connected installations.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#runai-components","title":"Run:ai Components","text":"

                                  As part of the installation process you will install:

                                  • A control-plane managing cluster
                                  • One or more clusters

                                  Both the control plane and clusters require Kubernetes. Typically the control plane and first cluster are installed on the same Kubernetes cluster but this is not a must.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#installer-machine","title":"Installer machine","text":"

                                  The machine running the installation script (typically the Kubernetes master) must have:

                                  • At least 50GB of free space.
                                  • Docker installed.
                                  "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#helm","title":"Helm","text":"

                                  Run:ai requires Helm 3.14 or later. To install Helm, see Installing Helm. If you are installing an air-gapped version of Run:ai, the Run:ai tar file contains the helm binary.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#cluster-hardware-requirements","title":"Cluster hardware requirements","text":"

                                  The Run:ai control plane services require the following resources:

                                  Component Required Capacity CPU 10 cores Memory 12GB Disk space 110GB

                                  If the Run:ai cluster is planned to be installed on the same Kubernetes cluster as the Run:ai control plane, ensure that the control plane requirements above are counted in addition to the Run:ai cluster hardware requirements.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#arm-limitation","title":"ARM Limitation","text":"

                                  The control plane does not support CPU nodes with ARM64 architecture. To schedule the Run:ai control plane services on supported nodes, use the global.affinity configuration parameter as detailed in Additional Run:ai configurations.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#runai-software-requirements","title":"Run:ai software requirements","text":""},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#cluster-nodes","title":"Cluster Nodes","text":"

                                  See Run:ai Cluster prerequisites operating system requirements.

                                  Nodes are required to be synchronized by time using NTP (Network Time Protocol) for proper system functionality.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#kubernetes","title":"Kubernetes","text":"

                                  See Run:ai Cluster prerequisites Kubernetes distribution requirements.

                                  The Run:ai control plane operating system prerequisites are identical.

                                  The Run:ai control-plane requires a default storage class to create persistent volume claims for Run:ai storage. The storage class, as per Kubernetes standards, controls the reclaim behavior: whether the Run:ai persistent data is saved or deleted when the Run:ai control plane is deleted.

                                  Note

                                  For a simple (nonproduction) storage class example see Kubernetes Local Storage Class. The storage class will set the directory /opt/local-path-provisioner to be used across all nodes as the path for provisioning persistent volumes.
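                                  Assuming the Kubernetes Local Storage Class referenced above is Rancher's local-path provisioner (an assumption; follow the linked instructions for the authoritative steps), a minimal sketch of installing it is:

                                  # consider pinning a released version tag instead of master
                                  kubectl apply -f https://raw.githubusercontent.com/rancher/local-path-provisioner/master/deploy/local-path-storage.yaml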

                                  Then set the new storage class as default:

                                  kubectl patch storageclass local-path -p '{\"metadata\": {\"annotations\":{\"storageclass.kubernetes.io/is-default-class\":\"true\"}}}'\n
                                  "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#install-prerequisites","title":"Install prerequisites","text":""},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#ingress-controller","title":"Ingress Controller","text":"

                                  The Run:ai control plane installation assumes an existing installation of NGINX as the ingress controller. You can follow the Run:ai Cluster prerequisites Kubernetes ingress controller installation.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#nvidia-gpu-operator","title":"NVIDIA GPU Operator","text":"

                                  See Run:ai Cluster prerequisites NVIDIA GPU operator requirements.

                                  The Run:ai control plane, when installed without a Run:ai cluster, does not require the NVIDIA prerequisites.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#prometheus","title":"Prometheus","text":"

                                  See Run:ai Cluster prerequisites Prometheus requirements.

                                  The Run:ai control plane, when installed without a Run:ai cluster, does not require the Prometheus prerequisites.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#inference-optional","title":"Inference (optional)","text":"

                                  See Run:ai Cluster prerequisites Inference requirements.

                                  The Run:ai control plane, when installed without a Run:ai cluster, does not require the Inference prerequisites.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#external-postgres-database-optional","title":"External Postgres database (optional)","text":"

                                  The Run:ai control plane installation includes a default PostgreSQL database. However, you may opt to use an existing PostgreSQL database if you have specific requirements or preferences. Please ensure that your PostgreSQL database is version 16 or higher.
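
                                   To verify the version of an existing PostgreSQL server, you can run a query such as the following (a hedged example; <POSTGRESQL_HOST> and <POSTGRESQL_USER> are placeholders for your connection details):

                                   psql --host <POSTGRESQL_HOST> --user <POSTGRESQL_USER> -c \"SHOW server_version;\"\n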

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/prerequisites/#next-steps","title":"Next steps","text":"

                                   Continue to Preparing for a Run:ai Kubernetes Installation.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/project-management/","title":"Self Hosted installation over Kubernetes - Create Projects","text":""},{"location":"admin/runai-setup/self-hosted/k8s/project-management/#introduction","title":"Introduction","text":"

                                  The Administrator creates Run:ai Projects via the Run:ai user interface. When enabling Researcher Authentication you also assign users to Projects.

                                  Run:ai Projects are implemented as Kubernetes namespaces. When creating a new Run:ai Project, Run:ai does the following automatically:

                                  1. Creates a namespace by the name of runai-<PROJECT-NAME>.
                                  2. Labels the namespace as managed by Run:ai.
                                  3. Provides access to the namespace for Run:ai services.
                                  4. Associates users with the namespace.

                                   This process may need to be altered if:

                                  • Researchers already have existing Kubernetes namespaces
                                  • The organization's Kubernetes namespace naming convention does not allow the runai- prefix.
                                  • The organization's policy does not allow the automatic creation of namespaces.
                                  "},{"location":"admin/runai-setup/self-hosted/k8s/project-management/#process","title":"Process","text":"

                                  Run:ai allows the association of a Run:ai Project with any existing Kubernetes namespace:

                                   • When setting up a Run:ai cluster, disable namespace creation by setting the cluster flag createNamespaces to false.
                                   • Using the Run:ai User Interface, create a new Project <PROJECT-NAME>. A namespace will not be created.
                                   • Associate an existing namespace <NAMESPACE> with the Run:ai project by running:
                                  kubectl label ns <NAMESPACE>  runai/queue=<PROJECT_NAME>\n
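
                                   To verify that the association took effect, you can inspect the namespace labels (a simple check, not part of the official procedure):

                                   kubectl get ns <NAMESPACE> --show-labels\n

                                   The output should include the runai/queue=<PROJECT_NAME> label.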

                                  Caution

                                  Setting the createNamespaces flag to false moves the responsibility of creating namespaces to match Run:ai Projects to the administrator.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/uninstall/","title":"Uninstall Run:ai","text":""},{"location":"admin/runai-setup/self-hosted/k8s/uninstall/#uninstall-a-runai-cluster","title":"Uninstall a Run:ai Cluster","text":"

                                   To uninstall the cluster, see cluster delete.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/uninstall/#uninstall-the-runai-control-plane","title":"Uninstall the Run:ai Control Plane","text":"

                                  To delete the control plane, run:

                                  helm uninstall runai-backend -n runai-backend\n
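
                                   After the uninstall completes, you can confirm that the release was removed (a minimal check). Note that, depending on the reclaim policy of your storage class, the persistent volumes holding Run:ai data may or may not be deleted:

                                   helm list -n runai-backend\nkubectl get pvc -n runai-backend\n
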
                                  "},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/","title":"Upgrade Run:ai","text":""},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#preparations","title":"Preparations","text":""},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#helm","title":"Helm","text":"

                                   Run:ai requires Helm 3.14 or later. Before you continue, validate your installed Helm client version. To install or upgrade Helm, see Installing Helm. If you are installing an air-gapped version of Run:ai, the Run:ai tar file contains the Helm binary.
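
                                   For example, you can check the installed client version with:

                                   helm version --short\n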

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#software-files","title":"Software files","text":"ConnectedAirgapped

                                  Run the helm command below:

                                  helm repo add runai-backend https://runai.jfrog.io/artifactory/cp-charts-prod\nhelm repo update\n
                                  • Ask for a tar file runai-air-gapped-<NEW-VERSION>.tar.gz from Run:ai customer support. The file contains the new version you want to upgrade to. <NEW-VERSION> is the updated version of the Run:ai control plane.
                                  • Upload the images as described here.
                                  "},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#before-upgrade","title":"Before upgrade","text":"

                                  Before proceeding with the upgrade, it's crucial to apply the specific prerequisites associated with your current version of Run:ai and every version in between up to the version you are upgrading to.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#upgrade-from-version-29","title":"Upgrade from version 2.9","text":"

                                   Three significant changes to the control-plane installation have happened with version 2.12: PVC ownership, Ingress, and installation customization.

                                  "},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#pvc-ownership","title":"PVC ownership","text":"

                                   Run:ai will no longer directly create the PVCs that store Run:ai data (metrics and database). Instead, going forward:

                                   • Run:ai requires a Kubernetes storage class to be installed.
                                   • The PVCs are created by the Kubernetes StatefulSets.

                                  The storage class, as per Kubernetes standards, controls the reclaim behavior: whether the data is saved or deleted when the Run:ai control plane is deleted.

                                  To remove the ownership in an older installation, run:

                                  kubectl patch pvc -n runai-backend pvc-thanos-receive  -p '{\"metadata\": {\"annotations\":{\"helm.sh/resource-policy\": \"keep\"}}}'\nkubectl patch pvc -n runai-backend pvc-postgresql  -p '{\"metadata\": {\"annotations\":{\"helm.sh/resource-policy\": \"keep\"}}}'\n
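
                                   Before patching, you may want to confirm that both PVCs exist under the expected names (a quick check):

                                   kubectl get pvc -n runai-backend\n
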
                                  "},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#ingress","title":"Ingress","text":"

                                   Delete the ingress object, which will be recreated by the control plane upgrade:

                                  kubectl delete ing -n runai-backend runai-backend-ingress\n
                                  "},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#installation-customization","title":"Installation customization","text":"

                                   The Run:ai control-plane installation has been rewritten and is no longer using a backend values file. Instead, to customize the installation, use standard --set flags. If you have previously customized the installation, you must now extract these customizations and add them as --set flags to the helm installation:

                                   • Find previous customizations to the control plane, if such exist. Run:ai provides a utility for that at https://raw.githubusercontent.com/run-ai/docs/v2.13/install/backend/cp-helm-vals-diff.sh. For information on how to use this utility, please contact Run:ai customer support.
                                  • Search for the customizations you found in the optional configurations table and add them in the new format.
                                  "},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#upgrade-control-plane","title":"Upgrade Control Plane","text":""},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#upgrade-from-version-213-or-later","title":"Upgrade from version 2.13, or later","text":"ConnectedAirgapped
                                  helm get values runai-backend -n runai-backend > runai_control_plane_values.yaml\nhelm upgrade runai-backend -n runai-backend runai-backend/control-plane --version \"~2.20.0\" -f runai_control_plane_values.yaml --reset-then-reuse-values\n
                                  helm get values runai-backend -n runai-backend > runai_control_plane_values.yaml\nhelm upgrade runai-backend control-plane-<NEW-VERSION>.tgz -n runai-backend  -f runai_control_plane_values.yaml --reset-then-reuse-values\n
                                  "},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#upgrade-from-version-29_1","title":"Upgrade from version 2.9","text":"
                                  • Create a tls secret as described in the control plane installation.
                                  • Upgrade the control plane as described in the control plane installation. During the upgrade, you must tell the installation not to create the two PVCs:
                                  ConnectedAirgapped
                                   helm upgrade -i runai-backend -n runai-backend runai-backend/control-plane --version \"~2.20.0\" \\\n--set global.domain=<DOMAIN> \\\n--set postgresql.primary.persistence.existingClaim=pvc-postgresql \\\n--set thanos.receive.persistence.existingClaim=pvc-thanos-receive\n

                                  Note

                                  The helm repository name has changed from runai-backend/runai-backend to runai-backend/control-plane.

                                   helm upgrade -i runai-backend control-plane-<NEW-VERSION>.tgz -n runai-backend \\\n--set global.domain=<DOMAIN> \\\n--set postgresql.primary.persistence.existingClaim=pvc-postgresql \\\n--set thanos.receive.persistence.existingClaim=pvc-thanos-receive\n
                                  "},{"location":"admin/runai-setup/self-hosted/k8s/upgrade/#upgrade-cluster","title":"Upgrade Cluster","text":"

                                  To upgrade the cluster follow the instructions here.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/additional-clusters/","title":"Installing additional clusters","text":"

                                  The first Run:ai cluster is typically installed on the same OpenShift cluster as the Run:ai control plane. Run:ai supports multiple clusters per single control plane. This document is about installing additional clusters on different OpenShift clusters.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/additional-clusters/#additional-cluster-installation","title":"Additional cluster installation","text":"

                                  Create a new cluster, then:

                                   • Select the target platform OpenShift.
                                   • Select a Cluster location Remote to Control Plane.
                                   • You must enter a specific cluster URL with the format https://runai.apps.<BASE_DOMAIN>. To get the base domain, run oc get dns cluster -oyaml | grep baseDomain
                                  • Ignore the instructions for creating a secret.
                                  "},{"location":"admin/runai-setup/self-hosted/ocp/backend/","title":"Install the Run:ai Control Plane","text":""},{"location":"admin/runai-setup/self-hosted/ocp/backend/#prerequisites-and-preparations","title":"Prerequisites and preparations","text":"

                                  Make sure you have followed the Control Plane prerequisites and preparations.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/backend/#helm-install","title":"Helm Install","text":"

                                  Run the helm command below:

                                  ConnectedAirgapped
                                  helm repo add runai-backend https://runai.jfrog.io/artifactory/cp-charts-prod\nhelm repo update\nhelm upgrade -i runai-backend -n runai-backend runai-backend/control-plane --version \"~2.20.0\" \\\n    --set global.domain=runai.apps.<OPENSHIFT-CLUSTER-DOMAIN> \\ # (1)\n    --set global.config.kubernetesDistribution=openshift\n
                                  1. The subdomain configured for the OpenShift cluster.

                                  Info

                                  To install a specific version, add --version <version> to the install command. You can find available versions by running helm search repo -l runai-backend.

                                  helm upgrade -i runai-backend  ./control-plane-<version>.tgz -n runai-backend \\\n    --set global.domain=runai.apps.<OPENSHIFT-CLUSTER-DOMAIN> \\ # (1)\n    --set global.config.kubernetesDistribution=openshift \\\n    --set global.customCA.enabled=true \\ # (2)\n    -f custom-env.yaml  # (3)\n
                                  1. The domain configured for the OpenShift cluster. To find out the OpenShift cluster domain, run oc get routes -A
                                  2. See the Local Certificate Authority instructions below
                                  3. custom-env.yaml should have been created by the prepare installation script in the previous section.

                                  (replace <version> with the control plane version)

                                  Tip

                                  Use the --dry-run flag to gain an understanding of what is being installed before the actual installation.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/backend/#additional-runai-configurations-optional","title":"Additional Run:ai configurations (optional)","text":"

                                   There may be cases where you need to set additional properties. To apply the changes, run helm upgrade and use --set to set specific configurations, then restart the relevant Run:ai pods so they can fetch the new configurations.
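
                                   One way to restart an affected service after changing a configuration is to roll its deployment (a hedged example; <DEPLOYMENT-NAME> is a placeholder for the relevant Run:ai control plane deployment):

                                   kubectl rollout restart deployment <DEPLOYMENT-NAME> -n runai-backend\n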

                                   • <component> resources (limits: cpu: 500m, memory: 512Mi; requests: cpu: 250m, memory: 256Mi): Pod requests and limits. Set Run:ai and 3rd party services' resources.
                                   • disableIstioSidecarInjection.enabled: Disable Istio sidecar injection. Disables the automatic injection of Istio sidecars across all Run:ai Control Plane services.
                                   "},{"location":"admin/runai-setup/self-hosted/ocp/backend/#additional-3rd-party-configurations-optional","title":"Additional 3rd party configurations (optional)","text":"

                                   The Run:ai Control Plane chart includes multiple sub-charts of 3rd party components:

                                  • PostgreSQL - Data store
                                  • Keycloakx - Identity & Access Management
                                  • Grafana - Analytics Dashboard
                                  • Redis - Caching (Disabled, by default)

                                  Tip

                                   Click on any component to view its chart values and configurations.

                                  If you have opted to connect to an external PostgreSQL database, refer to the additional configurations table below. Adjust the following parameters based on your connection details:

                                  1. Disable PostgreSQL deployment - postgresql.enabled
                                  2. Run:ai connection details - global.postgresql.auth
                                  3. Grafana connection details - grafana.dbUser, grafana.dbPassword
                                  "},{"location":"admin/runai-setup/self-hosted/ocp/backend/#postgresql","title":"PostgreSQL","text":"Key Change Description postgresql.enabled PostgreSQL installation If set to false the PostgreSQL will not be installed global.postgresql.auth.host PostgreSQL host Hostname or IP address of the PostgreSQL server global.postgresql.auth.port PostgreSQL port Port number on which PostgreSQL is running global.postgresql.auth.username PostgreSQL username Username for connecting to PostgreSQL global.postgresql.auth.password PostgreSQL password Password for the PostgreSQL user specified by global.postgresql.auth.username global.postgresql.auth.postgresPassword PostgreSQL default admin password Password for the built-in PostgreSQL superuser (postgres) global.postgresql.auth.existingSecret Postgres Credentials (secret) Existing secret name with authentication credentials global.postgresql.auth.dbSslMode Postgres connection SSL mode Set the SSL mode, see list in Protection Provided in Different Modes, prefer mode is not supported postgresql.primary.initdb.password PostgreSQL default admin password Set the same password as in global.postgresql.auth.postgresPassword (if changed) postgresql.primary.persistence.storageClass Storage class The installation to work with a specific storage class rather than the default one"},{"location":"admin/runai-setup/self-hosted/ocp/backend/#keycloakx","title":"Keycloakx","text":"Key Change Description keycloakx.adminUser User name of the internal identity provider administrator This user is the administrator of Keycloak keycloakx.adminPassword Password of the internal identity provider administrator This password is for the administrator of Keycloak keycloakx.existingSecret Keycloakx credentials (secret) Existing secret name with authentication credentials global.keycloakx.host KeyCloak (Run:ai internal identity provider) host path Override the DNS for Keycloak. This can be used to access Keycloak from outside the Run:ai Control Plane cluster via ingress

                                  The keycloakx.adminUser can only be set during the initial installation. The admin password, however, can also be changed later through the Keycloak UI, but you must also update the keycloakx.adminPassword value in the Helm chart using helm upgrade. Failing to update the Helm values after changing the password can lead to control plane services encountering errors.
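
                                   For example, after changing the password in the Keycloak UI, the Helm value could be updated with a command along these lines (a sketch; <NEW_PASSWORD> is a placeholder):

                                   helm upgrade runai-backend -n runai-backend runai-backend/control-plane --reuse-values --set keycloakx.adminPassword=<NEW_PASSWORD>\n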

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/backend/#grafana","title":"Grafana","text":"Key Change Description grafana.db.existingSecret Grafana database connection credentials (secret) Existing secret name with authentication credentials grafana.dbUser Grafana database username Username for accessing the Grafana database grafana.dbPassword Grafana database password Password for the Grafana database user grafana.admin.existingSecret Grafana admin default credentials (secret) Existing secret name with authentication credentials grafana.adminUser Grafana username Override the Run:ai default user name for accessing Grafana grafana.adminPassword Grafana password Override the Run:ai default password for accessing Grafana"},{"location":"admin/runai-setup/self-hosted/ocp/backend/#redis","title":"Redis","text":"Key Change Description redisCache.auth.password Redis (Runai internal cache mechanism) applicative password Override the default password redisCache.auth.existingSecret Redis credentials (secret) Existing secret name with authentication credentials"},{"location":"admin/runai-setup/self-hosted/ocp/backend/#next-steps","title":"Next steps","text":""},{"location":"admin/runai-setup/self-hosted/ocp/backend/#connect-to-runai-user-interface","title":"Connect to Run:ai user interface","text":"
                                  • Run: oc get routes -n runai-backend to find the Run:ai Administration User Interface URL.
                                  • Log in using the default credentials: User: test@run.ai, Password: Abcd!234.
                                  • Go to the Users area and change the password.
                                  "},{"location":"admin/runai-setup/self-hosted/ocp/backend/#enable-forgot-password-optional","title":"Enable Forgot Password (optional)","text":"

                                  To support the Forgot password functionality, follow the steps below.

                                  • Go to runai.<openshift-cluster-domain>/auth and Log in.
                                  • Under Realm settings, select the Login tab and enable the Forgot password feature.
                                  • Under the Email tab, define an SMTP server, as explained here
                                  "},{"location":"admin/runai-setup/self-hosted/ocp/backend/#install-runai-cluster","title":"Install Run:ai Cluster","text":"

                                  Continue with installing a Run:ai Cluster.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/cluster/","title":"Self-Hosted installation over OpenShift - Cluster Setup","text":""},{"location":"admin/runai-setup/self-hosted/ocp/cluster/#prerequisites","title":"Prerequisites","text":"

                                  Install prerequisites as per System Requirements document.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/cluster/#create-openshift-projects","title":"Create OpenShift Projects","text":"

                                  Run:ai cluster installation uses several namespaces (or projects in OpenShift terminology). Run the following:

                                  oc new-project runai\noc new-project runai-reservation\noc new-project runai-scale-adjust\n

                                  The last namespace (runai-scale-adjust) is only required if the cluster is a cloud cluster and is configured for auto-scaling.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/cluster/#cluster-installation","title":"Cluster Installation","text":"ConnectedAirgapped

                                  Perform the cluster installation instructions explained in Cluster install. When creating a new cluster, select the OpenShift target platform.

                                  Info

                                  To install a specific version, add --version <version> to the install command. You can find available versions by running helm search repo -l runai-cluster.

                                  Perform the cluster installation instructions explained in Cluster install. When creating a new cluster, select the OpenShift target platform.

                                  On the second tab of the cluster wizard, when copying the helm command for installation, you will need to use the pre-provided installation file instead of using helm repositories. As such:

                                  • Do not add the helm repository and do not run helm repo update.
                                  • Instead, edit the helm upgrade command.
                                    • Replace runai/runai-cluster with runai-cluster-<version>.tgz.
                                    • Add --set global.image.registry=<Docker Registry address> where the registry address is as entered in the preparation section
                                    • Add --set global.customCA.enabled=true and perform the instructions for local certificate authority.

                                  The command should look like the following:

                                  helm upgrade -i runai-cluster runai-cluster-<version>.tgz \\\n    --set controlPlane.url=... \\\n    --set controlPlane.clientSecret=... \\\n    --set cluster.uid=... \\\n    --set cluster.url=... --create-namespace \\\n    --set global.image.registry=registry.mycompany.local \\\n    --set global.customCA.enabled=true\n

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/cluster/#optional-customize-installation","title":"(Optional) Customize Installation","text":"

                                  To customize specific aspects of the cluster installation see customize cluster installation.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/cluster/#next-steps","title":"Next Steps","text":"

                                  Continue to create Run:ai Projects.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/next-steps/","title":"Next Steps","text":"
                                  • Create additional Run:ai Users.
                                  • Set up Project-based Researcher Access Control.
                                  • Set up Researchers to work with the Run:ai Command-line interface (CLI). See Installing the Run:ai Command-line Interface on how to install the CLI for users.
                                   • Review advanced setup and maintenance scenarios.
                                  "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/","title":"Preparing for a Run:ai OpenShift installation","text":"

                                  The following section provides IT with the information needed to prepare for a Run:ai installation.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#prerequisites","title":"Prerequisites","text":"

                                  See the Prerequisites section above.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#software-artifacts","title":"Software artifacts","text":"ConnectedAirgapped

                                  You should receive a file: runai-reg-creds.yaml from Run:ai Customer Support. The file provides access to the Run:ai Container registry.

                                   SSH into a node that has oc access to the cluster (oc is the OpenShift command-line tool) and Docker installed.

                                  Run the following to enable image download from the Run:ai Container Registry on Google cloud:

                                  oc apply -f runai-reg-creds.yaml -n runai-backend\n

                                  You should receive a single file runai-<version>.tar from Run:ai customer support

                                   Run:ai assumes the existence of a Docker registry for images, most likely installed within the organization. The installation requires the network address and port for the registry (referenced below as <REGISTRY_URL>).

                                   SSH into a node that has oc access to the cluster (oc is the OpenShift command-line tool) and Docker installed.

                                  To extract Run:ai files, replace <VERSION> in the command below and run:

                                  tar xvf runai-airgapped-package-<VERSION>.tar.gz\n
                                  Upload images

                                  Upload images to a local Docker Registry. Set the Docker Registry address in the form of NAME:PORT (do not add https):

                                  export REGISTRY_URL=<Docker Registry address>\n
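
                                   If your registry requires authentication, you may need to log in before running the script (an assumption; skip this if your registry allows anonymous pushes):

                                   docker login $REGISTRY_URL\n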

                                  Run the following script (you must have at least 20GB of free disk space to run):

                                  ./setup.sh\n

                                  (If docker is configured to run as non-root then sudo is not required).

                                  The script should create a file named custom-env.yaml which will be used by the control-plane installation.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#private-docker-registry-optional","title":"Private Docker Registry (optional)","text":"

                                   To access the organization's Docker registry, you must set the registry's credentials (imagePullSecret).

                                  Create the secret named runai-reg-creds in the runai-backend namespace based on your existing credentials. The configuration will be copied over to the runai namespace at cluster install. For more information, see Allowing pods to reference images from other secured registries.
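
                                   A typical way to create such a secret is with oc create secret docker-registry (a hedged example; the server, user, and password values are placeholders for your registry details):

                                   oc create secret docker-registry runai-reg-creds -n runai-backend \\\n    --docker-server=<REGISTRY_URL> \\\n    --docker-username=<USER> \\\n    --docker-password=<PASSWORD>\n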

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#configure-your-environment","title":"Configure your environment","text":""},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#create-openshift-project","title":"Create OpenShift project","text":"

                                   The Run:ai control plane uses a namespace (or project in OpenShift terminology) named runai-backend. You must create it before installing:

                                  oc new-project runai-backend\n
                                  "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#local-certificate-authority-air-gapped-only","title":"Local Certificate Authority (air-gapped only)","text":"

                                  In Air-gapped environments, you must prepare the public key of your local certificate authority as described here. It will need to be installed in Kubernetes for the installation to succeed.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#mark-runai-system-workers-optional","title":"Mark Run:ai system workers (optional)","text":"

                                   You can optionally set the Run:ai control plane to run on specific nodes. Kubernetes will attempt to schedule Run:ai pods to these nodes. If these nodes lack resources, the Run:ai pods will be scheduled to other, non-labeled nodes.

                                  To set system worker nodes run:

                                  kubectl label node <NODE-NAME> node-role.kubernetes.io/runai-system=true\n
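
                                   To confirm which nodes carry the label, you can run:

                                   kubectl get nodes -l node-role.kubernetes.io/runai-system=true\n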

                                  Warning

                                  Do not select the Kubernetes master as a runai-system node. This may cause Kubernetes to stop working (specifically if Kubernetes API Server is configured on 443 instead of the default 6443).

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#external-postgres-database-optional","title":"External Postgres database (optional)","text":"

                                  If you have opted to use an external PostgreSQL database, you need to perform initial setup to ensure successful installation. Follow these steps:

                                  1. Create a SQL script file, edit the parameters below, and save it locally:

                                     • Replace <DATABASE_NAME> with a dedicated database name for Run:ai in your PostgreSQL database.
                                     • Replace <ROLE_NAME> with a dedicated role name (user) for the Run:ai database.
                                    • Replace <ROLE_PASSWORD> with a password for the new PostgreSQL role.
                                    • Replace <GRAFANA_PASSWORD> with the password to be set for Grafana integration.
                                    -- Create a new database for runai\nCREATE DATABASE <DATABASE_NAME>; \n\n-- Create the role with login and password\nCREATE ROLE <ROLE_NAME>  WITH LOGIN PASSWORD '<ROLE_PASSWORD>'; \n\n-- Grant all privileges on the database to the role\nGRANT ALL PRIVILEGES ON DATABASE <DATABASE_NAME> TO <ROLE_NAME>; \n\n-- Connect to the newly created database\n\\c <DATABASE_NAME> \n\n-- grafana\nCREATE ROLE grafana WITH LOGIN PASSWORD '<GRAFANA_PASSWORD>'; \nCREATE SCHEMA grafana authorization grafana;\nALTER USER grafana set search_path='grafana';\n-- Exit psql\n\\q\n
                                  2. Run the following command on a machine where PostgreSQL client (pgsql) is installed:

                                    psql --host <POSTGRESQL_HOST> \\ # (1)\n--user <POSTGRESQL_USER> \\ # (2)\n--port <POSTGRESQL_PORT> \\ # (3)\n--dbname <POSTGRESQL_DB> \\ # (4)\n-a -f <SQL_FILE> \\ # (5)\n
                                     1. Replace <POSTGRESQL_HOST> with the PostgreSQL IP address or hostname.
                                     2. Replace <POSTGRESQL_USER> with the PostgreSQL username.
                                     3. Replace <POSTGRESQL_PORT> with the port number where PostgreSQL is running.
                                     4. Replace <POSTGRESQL_DB> with the name of your PostgreSQL database.
                                     5. Replace <SQL_FILE> with the path to the SQL script created in the previous step.
                                  "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#additional-permissions","title":"Additional permissions","text":"

                                   As part of the installation, you will be required to install the Control plane and Cluster Helm Charts. The Helm Charts require Kubernetes administrator permissions. You can review the exact permissions provided by using the --dry-run flag on both Helm charts.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#validate-prerequisites","title":"Validate prerequisites","text":"

                                   Once you believe that the Run:ai prerequisites and preparations are met, we highly recommend installing and running the Run:ai pre-install diagnostics script. The tool:

                                  • Tests the below requirements as well as additional failure points related to Kubernetes, NVIDIA, storage, and networking.
                                  • Looks at additional components installed and analyzes their relevancy to a successful Run:ai installation.

                                   To use the script, download the latest version and run:

                                  chmod +x preinstall-diagnostics-<platform>\n./preinstall-diagnostics-<platform> \n

                                  If the script fails, or if the script succeeds but the Kubernetes system contains components other than Run:ai, locate the file runai-preinstall-diagnostics.txt in the current directory and send it to Run:ai technical support.

                                  For more information on the script including additional command-line flags, see here.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/preparations/#next-steps","title":"Next steps","text":"

                                  Continue with installing the Run:ai Control Plane.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/","title":"Self Hosted installation over OpenShift - prerequisites","text":"

                                  Before proceeding with this document, please review the installation types documentation to understand the difference between air-gapped and connected installations.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#runai-components","title":"Run:ai components","text":"

                                  As part of the installation process you will install:

                                  • A control-plane managing cluster
                                  • One or more clusters

                                   Both the control plane and clusters require Kubernetes. Typically, the control plane and the first cluster are installed on the same Kubernetes cluster, but this is not mandatory.

                                  Important

                                  In OpenShift environments, adding a cluster connecting to a remote control plane currently requires the assistance of customer support.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#installer-machine","title":"Installer machine","text":"

                                  The machine running the installation script (typically the Kubernetes master) must have:

                                  • At least 50GB of free space.
                                  • Docker installed.
                                  "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#helm","title":"Helm","text":"

                                   Run:ai requires Helm 3.14 or later. To install Helm, see Installing Helm. If you are installing an air-gapped version of Run:ai, the Run:ai tar file contains the Helm binary.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#cluster-hardware-requirements","title":"Cluster hardware requirements","text":"

                                  The Run:ai control plane services require the following resources:

                                   • CPU: 10 cores
                                   • Memory: 12GB
                                   • Disk space: 110GB

                                   If the Run:ai cluster is planned to be installed on the same Kubernetes cluster as the Run:ai control plane, note that the control plane requirements above are in addition to the Run:ai cluster hardware requirements.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#runai-software-requirements","title":"Run:ai software requirements","text":""},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#cluster-nodes","title":"Cluster Nodes","text":"

                                  Nodes are required to be synchronized by time using NTP (Network Time Protocol) for proper system functionality.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#openshift","title":"OpenShift","text":"

                                   Run:ai supports OpenShift. The supported OpenShift versions are detailed in the Kubernetes distribution requirements.

                                  • OpenShift must be configured with a trusted certificate. Run:ai installation relies on OpenShift to create certificates for subdomains.
                                   • OpenShift must have a configured identity provider (IdP).
                                  • If your network is air-gapped, you will need to provide the Run:ai control-plane and cluster with information about the local certificate authority.
                                  "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#install-prerequisites","title":"Install prerequisites","text":""},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#nvidia-gpu-operator","title":"NVIDIA GPU Operator","text":"

                                  See Run:ai Cluster prerequisites installing NVIDIA dependencies in OpenShift.

                                  The Run:ai control plane, when installed without a Run:ai cluster, does not require the NVIDIA prerequisites.

                                  Information on how to download the GPU Operator for air-gapped installation can be found in the NVIDIA GPU Operator pre-requisites.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#inference-optional","title":"Inference (optional)","text":"

                                  See Run:ai Cluster prerequisites Inference requirements.

                                  The Run:ai control plane, when installed without a Run:ai cluster, does not require the Inference prerequisites.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#external-postgresql-database-optional","title":"External PostgreSQL database (optional)","text":"

                                  The Run:ai control plane installation includes a default PostgreSQL database. However, you may opt to use an existing PostgreSQL database if you have specific requirements or preferences. Please ensure that your PostgreSQL database is version 16 or higher.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/prerequisites/#next-steps","title":"Next steps","text":"

                                   Continue to Preparing for a Run:ai OpenShift Installation.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/project-management/","title":"Self Hosted installation over OpenShift - Create Projects","text":""},{"location":"admin/runai-setup/self-hosted/ocp/project-management/#introduction","title":"Introduction","text":"

                                  The Administrator creates Run:ai Projects via the Run:ai User Interface. When enabling Researcher Authentication you also assign users to Projects.

                                  Run:ai Projects are implemented as Kubernetes namespaces. When creating a new Run:ai Project, Run:ai does the following automatically:

                                  1. Creates a namespace by the name of runai-<PROJECT-NAME>.
                                  2. Labels the namespace as managed by Run:ai.
                                  3. Provides access to the namespace for Run:ai services.
                                  4. Associates users with the namespace.

                                   This process may need to be altered if:

                                  • Researchers already have existing Kubernetes namespaces
                                  • The organization's Kubernetes namespace naming convention does not allow the runai- prefix.
                                  • The organization's policy does not allow the automatic creation of namespaces
                                  "},{"location":"admin/runai-setup/self-hosted/ocp/project-management/#process","title":"Process","text":"

                                  Run:ai allows the association of a Run:ai Project with any existing Kubernetes namespace:

                                   • When setting up a Run:ai cluster, disable namespace creation by setting the cluster flag createNamespaces to false.
                                   • Using the Run:ai User Interface, create a new Project <PROJECT-NAME>. A namespace will not be created.
                                   • Associate an existing namespace <NAMESPACE> with the Run:ai project by running:
                                  oc label ns <NAMESPACE>  runai/queue=<PROJECT_NAME>\n

                                  Caution

                                  Setting the createNamespaces flag to false moves the responsibility of creating namespaces to match Run:ai Projects to the administrator.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/uninstall/","title":"Uninstall Run:ai","text":"

                                   See the uninstall section here.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/","title":"Upgrade Run:ai","text":""},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#preparations","title":"Preparations","text":""},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#helm","title":"Helm","text":"

                                   Run:ai requires Helm 3.14 or later. Before you continue, validate your installed Helm client version. To install or upgrade Helm, see Installing Helm. If you are installing an air-gapped version of Run:ai, the Run:ai tar file contains the Helm binary.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#software-files","title":"Software files","text":"ConnectedAirgapped

                                  Run the helm command below:

                                  helm repo add runai-backend https://runai.jfrog.io/artifactory/cp-charts-prod\nhelm repo update\n
                                  • Ask for a tar file runai-air-gapped-<NEW-VERSION>.tar.gz from Run:ai customer support. The file contains the new version you want to upgrade to. <NEW-VERSION> is the updated version of the Run:ai control plane.
                                  • Upload the images as described here.
                                  "},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#before-upgrade","title":"Before upgrade","text":"

                                  Before proceeding with the upgrade, it's crucial to apply the specific prerequisites associated with your current version of Run:ai and every version in between up to the version you are upgrading to.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#upgrade-from-version-29","title":"Upgrade from version 2.9","text":"

                                  Two significant changes to the control-plane installation have happened with version 2.12: PVC ownership and installation customization.

                                  "},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#pvc-ownership","title":"PVC ownership","text":"

                                  Run:ai no longer directly creates the PVCs that store Run:ai data (metrics and database). Instead, going forward,

                                  • Run:ai requires a Kubernetes storage class to be installed.
                                  • The PVCs are created by the Kubernetes StatefulSets.

                                  The storage class, as per Kubernetes standards, controls the reclaim behavior: whether the data is saved or deleted when the Run:ai control plane is deleted.

                                  To remove the ownership in an older installation, run:

                                  kubectl patch pvc -n runai-backend pvc-postgresql  -p '{\"metadata\": {\"annotations\":{\"helm.sh/resource-policy\": \"keep\"}}}'\n
                                  "},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#installation-customization","title":"Installation customization","text":"

                                   The Run:ai control-plane installation has been rewritten and is no longer using a backend values file. Instead, to customize the installation, use standard --set flags. If you have previously customized the installation, you must now extract these customizations and add them as --set flags to the helm installation:

                                   • Find previous customizations to the control plane, if such exist. Run:ai provides a utility for that at https://raw.githubusercontent.com/run-ai/docs/v2.13/install/backend/cp-helm-vals-diff.sh. For information on how to use this utility, please contact Run:ai customer support.
                                  • Search for the customizations you found in the optional configurations table and add them in the new format.
                                  "},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#upgrade-control-plane","title":"Upgrade Control Plane","text":""},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#upgrade-from-version-213-or-later","title":"Upgrade from version 2.13, or later","text":"ConnectedAirgapped
                                  helm get values runai-backend -n runai-backend > runai_control_plane_values.yaml\nhelm upgrade runai-backend -n runai-backend runai-backend/control-plane --version \"~2.20.0\" -f runai_control_plane_values.yaml --reset-then-reuse-values\n
                                  helm get values runai-backend -n runai-backend > runai_control_plane_values.yaml\nhelm upgrade runai-backend control-plane-<NEW-VERSION>.tgz -n runai-backend  -f runai_control_plane_values.yaml --reset-then-reuse-values\n
                                  "},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#upgrade-from-version-29_1","title":"Upgrade from version 2.9","text":"ConnectedAirgapped
                                  helm upgrade -i runai-backend -n runai-backend runai-backend/control-plane --version \"~2.20.0\" \\\n--set global.domain=runai.apps.<OPENSHIFT-CLUSTER-DOMAIN> \\ #(1)\n--set global.config.kubernetesDistribution=openshift \\\n--set thanos.query.stores={thanos-grpc-port-forwarder:10901} \\\n--set postgresql.primary.persistence.existingClaim=pvc-postgresql\n
                                  1. The subdomain configured for the OpenShift cluster.

                                  Note

                                  The helm repository name has changed from runai-backend/runai-backend to runai-backend/control-plane.

                                  helm upgrade -i runai-backend  ./control-plane-<NEW-VERSION>.tgz -n runai-backend \\\n--set global.domain=runai.apps.<OPENSHIFT-CLUSTER-DOMAIN> \\ #(1)\n--set global.config.kubernetesDistribution=openshift \\\n--set thanos.query.stores={thanos-grpc-port-forwarder:10901} \\\n--set postgresql.primary.persistence.existingClaim=pvc-postgresql\n
                                  1. The subdomain configured for the OpenShift cluster.
                                  "},{"location":"admin/runai-setup/self-hosted/ocp/upgrade/#upgrade-cluster","title":"Upgrade Cluster","text":"

                                  To upgrade the cluster follow the instructions here.

                                  "},{"location":"admin/troubleshooting/diagnostics/","title":"Diagnostic Tools","text":""},{"location":"admin/troubleshooting/diagnostics/#add-verbosity-to-the-database-container","title":"Add Verbosity to the Database container","text":"

                                  Run:ai Self-hosted installation contains an internal database. To diagnose database issues, you can run the database in debug mode.

                                  In the runai-backend-values, search for postgresql. Add:

                                  postgresql:\n  image:\n    debug: true\n

                                  Re-install the Run:ai control-plane and then review the database logs by running:

                                  kubectl logs -n runai-backend runai-postgresql-0\n
                                  "},{"location":"admin/troubleshooting/diagnostics/#internal-networking-issues","title":"Internal Networking Issues","text":"

                                  Run:ai is based on Kubernetes. Kubernetes runs its own internal subnet with a separate DNS service. If you see in the logs that services have trouble connecting, the problem may reside there. You can find further information on how to debug Kubernetes DNS here. Specifically, it is useful to start a pod with networking utilities and use it for network resolution:

                                  kubectl run -i --tty netutils --image=dersimn/netutils -- bash\n
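
                                   From inside that pod, you can then test in-cluster DNS resolution, for example (a basic check of the cluster DNS service, assuming the image provides nslookup):

                                   nslookup kubernetes.default.svc.cluster.local\n
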
                                  "},{"location":"admin/troubleshooting/diagnostics/#add-verbosity-to-prometheus","title":"Add Verbosity to Prometheus","text":"

                                  Add verbosity to Prometheus by editing RunaiConfig:

                                  kubectl edit runaiconfig runai -n runai\n

                                  Add a debug log level:

                                  prometheus-operator:\n  prometheus:\n    prometheusSpec:\n      logLevel: debug\n

                                  To view logs, run:

                                  kubectl logs prometheus-runai-prometheus-operator-prometheus-0 prometheus \\\n      -n monitoring -f --tail 100\n

                                  "},{"location":"admin/troubleshooting/diagnostics/#add-verbosity-to-scheduler","title":"Add Verbosity to Scheduler","text":"

                                   To enable extended logging, run:

                                   kubectl edit runaiconfig runai -n runai\n

                                  Then under the scheduler section add:

                                  runai-scheduler:\n   args:\n     verbosity: 6\n

                                  Warning

                                  Verbose scheduler logs consume a significant amount of disk space.

                                  "},{"location":"admin/troubleshooting/logs-collection/","title":"Logs Collection","text":"

                                  This article provides instructions for IT administrators on collecting Run:ai logs for support, including prerequisites, CLI commands, and log file retrieval. It also covers enabling verbose logging for Prometheus and the Run:ai Scheduler.

                                  "},{"location":"admin/troubleshooting/logs-collection/#collect-logs-to-send-to-support","title":"Collect logs to send to support","text":"

                                  To collect Run:ai logs, follow these steps:

                                  "},{"location":"admin/troubleshooting/logs-collection/#prerequisites","title":"Prerequisites","text":"
                                  • Ensure that you have administrator-level access to the Kubernetes cluster where Run:ai is installed.
                                  • The Run:ai Administrator Command-Line Interface (CLI) must be installed.
                                  "},{"location":"admin/troubleshooting/logs-collection/#step-by-step-instructions","title":"Step-by-Step Instructions","text":"
                                  1. Run the Command from your local machine or a Bastion Host (secure server) Open a terminal on your local machine (or any machine that has network access to the Kubernetes cluster) where the Run:ai Administrator CLI is installed.
                                  2. Collect the Logs Execute the following command to collect the logs:

                                    runai-adm collect-logs\n

                                     This command gathers all relevant Run:ai logs from the system and generates a compressed file.

                                  3. Locate the Generated File After running the command, note the location of the generated compressed log file. You can retrieve and send this file to Run:ai Support for further troubleshooting.

                                  Note

                                  The tar file packages the logs of Run:ai components only. It does not include logs of researcher containers that may contain private information

                                  "},{"location":"admin/troubleshooting/logs-collection/#logs-verbosity","title":"Logs verbosity","text":"

                                   Increase log verbosity to capture more detailed information, providing deeper insights into system behavior and making it easier to identify and resolve issues.

                                  "},{"location":"admin/troubleshooting/logs-collection/#prerequisites_1","title":"Prerequisites","text":"

                                  Before you begin, ensure you have the following:

                                   • Access to the Kubernetes cluster where Run:ai is installed, including the necessary permissions to view and modify configurations.
                                   • kubectl installed and configured: the Kubernetes command-line tool must be installed and configured to interact with the cluster, with sufficient privileges to edit configurations and view logs.
                                   • Monitoring disk space: when enabling verbose logging, ensure adequate disk space to handle the increased log output, especially when enabling debug or high verbosity levels.
                                  "},{"location":"admin/troubleshooting/logs-collection/#adding-verbosity","title":"Adding verbosity","text":"Adding verbosity to Prometheus

                                  To increase the logging verbosity for Prometheus, follow these steps:

                                  1. Edit the RunaiConfig to adjust Prometheus log levels. Copy the following command to your terminal:
                                  kubectl edit runaiconfig runai -n runai\n
2. In the configuration file that opens, add or modify the following section to set the log level to debug:
                                  spec:\n    prometheus:\n        spec:\n            logLevel: debug\n
3. Save the changes. To view the Prometheus logs with the new verbosity level, run:
                                  kubectl logs -n runai prometheus-runai-0\n

This command displays the Prometheus logs at the new verbosity level, providing detailed information useful for debugging.
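If you prefer to limit the output and follow the log in real time, the standard kubectl flags can be used, for example:

# Show only the most recent 100 lines and keep streaming new entries\nkubectl logs -n runai prometheus-runai-0 --tail=100 -f\n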

                                  Adding verbosity to the scheduler

                                  To enable extended logging for the Run:ai scheduler:

                                  1. Edit the RunaiConfig to adjust scheduler verbosity:
                                  kubectl edit runaiconfig runai -n runai\n

2. Add or modify the following section under the scheduler settings:

                                  runai-scheduler:\n    args:\n        verbosity: 6\n

                                  This increases the verbosity level of the scheduler logs to provide more detailed output.
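To confirm the change took effect, you can locate the scheduler pod and follow its log. The pod name below is a placeholder; use the name returned by the first command:

# Find the scheduler pod, then follow its log\nkubectl get pods -n runai | grep scheduler\nkubectl logs -n runai <scheduler-pod-name> -f\n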

                                  Warning

                                  Enabling verbose logging can significantly increase disk space usage. Monitor your storage capacity and adjust the verbosity level as necessary.

                                  "},{"location":"admin/troubleshooting/troubleshooting/","title":"Troubleshooting Run:ai","text":""},{"location":"admin/troubleshooting/troubleshooting/#installation","title":"Installation","text":"Upgrade fails with \"Ingress already exists\"

                                  Symptom: The installation fails with error: Error: rendered manifests contain a resource that already exists. Unable to continue with install: IngressClass \"nginx\" in namespace \"\" exists

                                  Root cause: Run:ai installs NGINX, but there is an existing NGINX on the cluster.

                                  Resolution: In the Run:ai cluster YAML file, disable the installation of NGINX by setting:

                                  ingress-nginx:\n    enabled: false\n
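Alternatively, the same value can be set on the Helm command line during installation or upgrade. The release and chart names below are placeholders; use the ones from your installation instructions:

# Disable the bundled NGINX ingress controller at install/upgrade time (names are illustrative)\nhelm upgrade --install <release-name> <chart-name> -n runai --set ingress-nginx.enabled=false\n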
                                  How to get installation logs

                                  Symptom: Installation fails and you need to troubleshoot the issue.

                                  Resolution: Run the following script to obtain any relevant installation logs in case of an error.

                                  curl -fsSL https://raw.githubusercontent.com/run-ai/public/main/installation/get-installation-logs.sh | bash\n
                                  Upgrade fails with \"rendered manifests contain a resource that already exists\" error

                                  Symptom: The installation fails with error: Error: rendered manifests contain a resource that already exists. Unable to continue with install:...

                                  Root cause: The Run:ai installation is trying to create a resource that already exists, which may be due to a previous installation that was not properly removed.

                                  Resolution: Run the following script to remove all Run:ai resources and reinstall:

                                  helm template <release-name> <chart-name> --namespace <namespace> | kubectl delete -f -\n

                                  Then reinstall Run:ai.
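As a sketch, assuming the release is named runai-cluster, the chart reference is runai/runai-cluster, and the namespace is runai (replace these with the values from your original installation):

# Render the chart's manifests and delete the matching resources from the cluster\nhelm template runai-cluster runai/runai-cluster --namespace runai | kubectl delete -f -\n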

                                  Pods are failing due to certificate issues

                                  Symptom: Pods are failing with certificate issues.

                                  Root cause: The certificate provided during the Control Plane's installation is not valid.

                                  Resolution: Verify that the certificate is valid and trusted. If the certificate is valid, but is signed by a local CA, make sure you have followed the procedure for a local certificate authority.

                                  "},{"location":"admin/troubleshooting/troubleshooting/#cluster-health","title":"Cluster Health","text":"

                                  See Cluster Health Troubleshooting

                                  "},{"location":"admin/troubleshooting/troubleshooting/#dashboard-issues","title":"Dashboard Issues","text":"No Metrics are showing on Dashboard

                                  Symptom: No metrics are showing on dashboards at https://<company-name>.run.ai/dashboards/now

                                  Typical root causes:

                                  • Firewall-related issues.
                                  • Internal clock is not synced.
                                  • Prometheus pods are not running.

                                  Firewall issues

Add verbosity to Prometheus as described here. Verify that there are no errors. If there are connectivity-related errors, you may need to:

                                  • Check your firewall for outbound connections. See the required permitted URL list in Network requirements.
                                  • If you need to set up an internet proxy or certificate, please contact Run:ai customer support.

                                  Machine Clocks are not synced

                                  Run: date on cluster nodes and verify that date/time is correct. If not:

• Set the Linux time service (NTP); see the sketch after this list.
• Restart Run:ai services. Depending on the previous time gap between servers, you may need to reinstall the Run:ai cluster.
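A minimal sketch of checking and enabling time synchronization on a node that uses systemd (commands may differ on other distributions):

# Check whether the clock is synchronized and NTP is active\ntimedatectl status\n# Enable the systemd NTP service if it is off\nsudo timedatectl set-ntp true\n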

                                  Prometheus pods are not running

                                  Run: kubectl get pods -n monitoring -o wide

                                  • Verify that all pods are running.
                                  • The default Prometheus installation is not built for high availability. If a node is down, the Prometheus pod may not recover by itself unless manually deleted. Delete the pod to see it start on a different node and consider adding a second replica to Prometheus.
                                  GPU Related metrics not showing

                                  Symptom: GPU-related metrics such as GPU Nodes and Total GPUs are showing zero but other metrics, such as Cluster load are shown.

                                  Root cause: An installation issue related to the NVIDIA stack.

                                  Resolution:

Work through the NVIDIA stack to find the issue. The current NVIDIA stack is as follows:

                                  1. NVIDIA Drivers (at the OS level, on every node)
                                  2. NVIDIA Docker (extension to Docker, on every node)
3. Kubernetes Node Feature Discovery (marks node properties)
4. NVIDIA GPU Feature Discovery (marks nodes as \"having GPUs\")
                                  5. NVIDIA Device plug-in (Exposes GPUs to Kubernetes)
                                  6. NVIDIA DCGM Exporter (Exposes metrics from GPUs in Kubernetes)

                                  Run:ai requires the installation of the NVIDIA GPU Operator which installs the entire stack above. However, there are two alternative methods for using the operator:

                                  • Use the default operator values to install 1 through 6.
• If NVIDIA Drivers (#1 above) are already installed on all nodes, use the operator with a flag that disables driver installation.

For more information, see System requirements.

                                  NVIDIA GPU Operator

                                  Run: kubectl get pods -n gpu-operator | grep nvidia and verify that all pods are running.

                                  Node and GPU feature discovery

                                  Kubernetes Node feature discovery identifies and annotates nodes. NVIDIA GPU Feature Discovery identifies and annotates nodes with GPU properties. See that:

                                  • All such pods are up.
                                  • The GPU feature discovery pod is available for every node with a GPU.
• Finally, when describing nodes, verify that they show the nvidia.com/gpu resource (see the sketch after this list).
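For example, to check the discovery pods and the GPU resource advertised by a node (the node name is a placeholder):

# List the feature-discovery pods across all namespaces\nkubectl get pods -A | grep -i feature-discovery\n# Verify the node advertises the GPU resource\nkubectl describe node <node-name> | grep -i nvidia.com/gpu\n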

                                  NVIDIA Drivers

                                  • If NVIDIA drivers have been installed on the nodes themselves, ssh into each node and run nvidia-smi. Run sudo systemctl status docker and verify that docker is running. Run nvidia-docker and verify that it is installed and working. Linux software upgrades may require a node restart.
                                  • If NVIDIA drivers are installed by the Operator, verify that the NVIDIA driver daemonset has created a pod for each node and that all nodes are running. Review the logs of all such pods. A typical problem may be the driver version which is too advanced for the GPU hardware. You can set the driver version via operator flags.

                                  NVIDIA DCGM Exporter

                                  • View the logs of the DCGM exporter pod and verify that no errors are prohibiting the sending of metrics.
                                  • To validate that the dcgm-exporter exposes metrics, find one of the DCGM Exporter pods and run:
                                  kubectl port-forward <dcgm-exporter-pod-name> 9400:9400\n

                                  Then browse to http://localhost:9400/metrics and verify that the metrics have reached the DCGM exporter.
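Instead of a browser, you can query the endpoint from the terminal while the port-forward is running, for example:

# Fetch the metrics and look for a GPU utilization sample\ncurl -s http://localhost:9400/metrics | grep DCGM_FI_DEV_GPU_UTIL\n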

                                  • The next step after the DCGM Exporter is Prometheus. To validate that metrics from the DCGM Exporter reach Prometheus, run:
                                  kubectl port-forward svc/runai-cluster-kube-prometh-prometheus -n monitoring 9090:9090\n

                                  Then browse to localhost:9090. In the UI, type DCGM_FI_DEV_GPU_UTIL as the metric name, and verify that the metric has reached Prometheus.
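The same check can be scripted against the Prometheus HTTP API while the port-forward is running, for example:

# Query Prometheus for the DCGM GPU utilization metric\ncurl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL'\n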

                                  If the DCGM Exporter is running correctly and exposing metrics, but this metric does not appear in Prometheus, there may be a connectivity issue between these components.

                                  Allocation-related metrics not showing

                                  Symptom: GPU Allocation-related metrics such as Allocated GPUs are showing zero but other metrics, such as Cluster load are shown.

                                  Root cause: The origin of such metrics is the scheduler.

                                  Resolution:

                                  • Run: kubectl get pods -n runai | grep scheduler. Verify that the pod is running.
                                  • Review the scheduler logs and look for errors. If such errors exist, contact Run:ai customer support.
                                  All metrics are showing \"No Data\"

                                  Symptom: All data on all dashboards is showing the text \"No Data\".

                                  Root cause: Internal issue with metrics infrastructure.

                                  Resolution: Please contact Run:ai customer support.

                                  "},{"location":"admin/troubleshooting/troubleshooting/#authentication-issues","title":"Authentication Issues","text":"After a successful login, you are redirected to the same login page

                                  For a self-hosted installation, check Linux clock synchronization as described above. Use the Run:ai preinstall diagnostics tool to validate System and network requirements and test this automatically.

                                  Single-sign-on issues

                                  For single-sign-on issues, see the troubleshooting section in the single-sign-on configuration documents.

                                  "},{"location":"admin/troubleshooting/troubleshooting/#user-interface-submit-job-issues","title":"User Interface Submit Job Issues","text":"New Job button is grayed out

                                  Symptom: The New Job button on the top right of the Job list is grayed out.

Root Cause: This can happen due to multiple configuration issues. To identify the issue:

                                  • Open Chrome developer tools and refresh the screen.
                                  • Under Network locate a network call error. Search for the HTTP error code.

                                  Resolution for 401 HTTP Error

• Verify that the cluster certificate provided as part of the installation is valid and trusted (not self-signed).
• Verify that Researcher Authentication has been properly configured. Try running runai login from the Command-line interface. Alternatively, run: kubectl get pods -n kube-system, identify the api-server pod, and review its logs.

                                  Resolution for 403 HTTP Error

                                  Run: kubectl get pods -n runai, identify the agent pod, see that it's running, and review its logs.

                                  New Job button is not showing

                                  Symptom: The New Job button on the top right of the Job list does not show.

                                  Root Causes: (multiple)

                                  • You do not have Researcher or Research Manager permissions.
                                  • Under Settings | General, verify that Unified UI is on.
                                  Submit form is distorted

                                  Symptom: Submit form is showing vertical lines.

                                  Root Cause: The control plane does not know the cluster URL.

                                  Using the Run:ai user interface, go to the Clusters list. See that there is no cluster URL next to your cluster.

                                  Resolution: Cluster must be re-installed.

                                  Submit form does not show the list of Projects

                                  Symptom: When connected with Single-sign-on, in the Submit form, the list of Projects is empty.

                                  Root Cause: SSO is on and researcher authentication is not properly configured as such.

                                  Resolution: Verify API Server settings as described in Researcher Authentication configuration.

                                  Job form is not opening on OpenShift

Symptom: When clicking \"New Job\", the Job form does not load, and the browser's Network tab shows a 405 error.

                                  Root Cause: An installation step has been missed.

                                  Resolution: Open the Cluster list and open the cluster installation wizard again. After selecting OpenShift, you will see a patch command at the end of the instruction set. Run it.

                                  "},{"location":"admin/troubleshooting/troubleshooting/#networking-issues","title":"Networking Issues","text":"'admission controller' connectivity issue

                                  Symptoms:

                                  • Pods are failing with 'admission controller' connectivity errors.
                                  • The command-line runai submit fails with an 'admission controller' connectivity error.
                                  • Agent or cluster sync pods are crashing in self-hosted installation.

                                  Root cause: Connectivity issues between different nodes in the cluster.

                                  Resolution:

                                  • Run the preinstall diagnostics tool to validate System and network requirements and test connectivity issues.
                                  • Run: kubectl get pods -n kube-system -o wide. Verify that all networking pods are running.
                                  • Run: kubectl get nodes. Check that all nodes are ready and connected.
                                  • Run: kubectl get pods -o wide -A to see which pods are Pending or in Error and which nodes they belong to.
                                  • See if pods from different nodes have trouble communicating with each other.
• Advanced: run kubectl exec -it <pod-name> -- /bin/sh on a pod in one node and ping a pod on another node.
                                  Projects are not syncing

                                  Symptom: Create a Project on the Run:ai user interface, then run: runai list projects. The new Project does not appear.

                                  Root cause: The Run:ai agent is not syncing properly. This may be due to firewall issues.

                                  Resolution

• Run: kubectl get pods -n runai | grep agent. See that the agent is in Running state. Select the agent's full name and run: kubectl logs -n runai runai-agent-<id> (see the sketch after this list).
                                  • Verify that there are no errors. If there are connectivity-related errors you may need to check your firewall for outbound connections. See the required permitted URL list in Network requirements.
                                  • If you need to set up an internet proxy or certificate, please contact Run:ai customer support.
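A minimal sketch of the checks above, assuming the agent pod follows the typical runai-agent-<id> naming:

# Locate the agent pod and confirm it is Running\nkubectl get pods -n runai | grep agent\n# Review its log and filter for errors\nkubectl logs -n runai runai-agent-<id> | grep -i error\n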
                                  Jobs are not syncing

                                  Symptom: A Job on the cluster (runai list jobs) does not show in the Run:ai user interface Job list.

                                  Root cause: The Run:ai cluster-sync pod is not syncing properly.

                                  Resolution: Search the cluster-sync pod for errors.

                                  "},{"location":"admin/troubleshooting/troubleshooting/#job-related-issues","title":"Job-related Issues","text":"Jobs fail with ContainerCannotRun status

                                  Symptom: When running runai list jobs, your Job has a status of ContainerCannotRun.

                                  Root Cause: The issue may be caused due to an unattended upgrade of the NVIDIA driver.

To verify, run: runai describe job <job-name>, and search for the error driver/library version mismatch.

                                  Resolution: Reboot the node on which the Job attempted to run.

Going forward, we recommend blacklisting the NVIDIA driver packages in unattended-upgrades. You can do that by editing /etc/apt/apt.conf.d/50unattended-upgrades and adding nvidia-driver- to the Unattended-Upgrade::Package-Blacklist section. It should look something like this:

Unattended-Upgrade::Package-Blacklist {\n    // The following matches all packages starting with linux-\n    //  \"linux-\";\n    \"nvidia-driver-\";\n};\n
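To check which packages unattended-upgrades would act on after the change, you can run a dry run on the node (available on Ubuntu and Debian systems):

# Simulate an unattended upgrade without applying any changes\nsudo unattended-upgrade --dry-run --debug\n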
                                  "},{"location":"admin/troubleshooting/troubleshooting/#inference-issues","title":"Inference Issues","text":"New Deployment button is grayed out

                                  Symptoms:

                                  • The New workload type -> Inference button is grayed out.
                                  • Cannot create a deployment via Inference API.

                                  Root Cause: Run:ai Inference prerequisites have not been met.

                                  Resolution: Review inference prerequisites and install accordingly.

                                  Submitted workload type of inference remains in Pending state

                                  Symptom: A submitted inference is not running.

                                  Root Cause: The patch statement to add the runai-scheduler has not been performed.

                                  Workload of type inference status is \"Failed\"

                                  Symptom: Inference status is always Failed.

                                  Root Cause: (multiple)

                                  • Not enough resources in the cluster.
• Server model command is misconfigured (e.g. sleep infinity).
                                  • Server port is misconfigured.
Workload of type inference does not scale up from zero

                                  Symptom: In the Inference form, when \"Auto-scaling\" is enabled, and \"Minimum Replicas\" is set to zero, the inference cannot scale up from zero.

                                  Root Cause:

                                  • Clients are not sending requests.
                                  • Clients are not using the same port/protocol as the server model.
• Server model command is misconfigured (e.g. sleep infinity).
                                  "},{"location":"admin/troubleshooting/troubleshooting/#command-line-interface-issues","title":"Command-line interface Issues","text":"Unable to install CLI due to certificate errors

Symptom: The curl command and download button used to download the CLI are not working.

Root Cause: The cluster is not accessible from the download location.

                                  Resolution:

                                  Use an alternate method for downloading the CLI. Run:

                                  kubectl port-forward -n runai svc/researcher-service 4180\n

                                  In another shell, run:

                                  wget --content-disposition http://localhost:4180/cli/linux\n
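After the download completes, make the binary executable and place it on your PATH, for example:

# Install the downloaded CLI binary (the destination directory is a common choice; adjust as needed)\nchmod +x runai\nsudo mv runai /usr/local/bin/runai\n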

                                  When running the CLI you get an error: open .../.kube/config.lock: permission denied

                                  Symptom: When running any CLI command you get a permission denied error.

                                  Root Cause: The user running the CLI does not have read permissions to the .kube directory.

                                  Resolution: Change permissions for the directory.

                                  When running 'runai logs', the logs are delayed

                                  Symptom: Printout from the container is not immediately shown in the log.

Root Cause: By default, Python buffers stdout and stderr, so output is not flushed in real time. This may cause logs to appear minutes after being written.

Resolution: Set the environment variable PYTHONUNBUFFERED to any non-empty string, or pass -u to Python, e.g. python -u main.py.
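For example, either of the following disables Python's output buffering so that prints appear in the logs immediately:

# Option 1: set the environment variable for the process\nPYTHONUNBUFFERED=1 python main.py\n# Option 2: pass the unbuffered flag to the interpreter\npython -u main.py\n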

                                  CLI does not download properly on OpenShift

                                  Symptom: When trying to download the CLI on OpenShift, the wget statement downloads a text file named darwin or linux rather than the binary runai.

                                  Root Cause: An installation step has been missed.

                                  Resolution: Open the Cluster list and open the cluster installation wizard again. After selecting OpenShift, you will see a patch command at the end of the instruction set. Run it.

                                  "},{"location":"developer/overview-developer/","title":"Developer Documentation Overview","text":"

                                  Developers can access Run:ai through various programmatic interfaces.

                                  "},{"location":"developer/overview-developer/#api-architecture","title":"API Architecture","text":"

                                  Run:ai is composed of a single, multi-tenant control plane. Each tenant can be connected to one or more GPU clusters. See Run:ai system components for detailed information.

                                  The following programming interfaces are available:

| API | Description | Purpose |
|-----|-------------|---------|
| Run:ai REST API | Get and modify any Run:ai business object | This is the API mostly used by system developers. The API is also used by the Run:ai user interface as well as the new command-line interface |
| Cluster API (Deprecated) | Submit Workloads directly to the Cluster | A YAML-based API allowing submission of Workloads directly to the Cluster. With Run:ai 2.18, this API is replaced by the Run:ai REST API above, which is now the recommended method |
| Metrics API (deprecated) | Get cluster metrics | Get utilization metrics |

"},{"location":"developer/overview-developer/#runai-rest-api","title":"Run:ai REST API","text":"

Allows you to add, delete, modify, and list Run:ai metadata objects such as Projects, Departments, and Users. For clusters of Run:ai 2.18 and above, it also allows the submission of Workloads.

                                  The API is provided as REST and is accessible via the control plane endpoint.

                                  For more information see Run:ai REST API.

                                  Important

                                  The endpoints and fields specified in the API reference are the ones that are officially supported by Run:ai. Endpoints and fields that are not listed in the API reference are not supported.

                                  Run:ai does not recommend using API endpoints and fields marked as deprecated and will not add functionality to them. Once an API endpoint or field is marked as deprecated, Run:ai will stop supporting it after 2 major releases for self-hosted deployments, and after 6 months for SaaS deployments.

                                  For details, see the Deprecation notifications.

                                  "},{"location":"developer/overview-developer/#cluster-api-deprecated","title":"Cluster API (Deprecated)","text":"

                                  The Cluster API allows you to submit and delete Workloads directly to the cluster itself.

                                  The API is provided as Kubernetes API.

                                  Cluster API is accessible via the GPU cluster itself. As such, multiple clusters may have multiple endpoints.

                                  Important

                                  • This API is replaced by a Run:ai REST API to submit jobs, which is now the recommended method for cluster versions of 2.18 and above.
                                  • If you are looking to automate tasks with older versions of Run:ai, it's best to use the Run:ai Command-line interface which provides forward compatibility.
                                  "},{"location":"developer/overview-developer/#metrics-api","title":"Metrics API","text":"

                                  Retrieve metrics from multiple GPU clusters.

                                  See the Metrics API document.

                                  "},{"location":"developer/overview-developer/#api-authentication","title":"API Authentication","text":"

                                  See API Authentication for information on how to gain authenticated access to Run:ai APIs.

                                  "},{"location":"developer/rest-auth/","title":"API Authentication","text":"

                                  The following document explains how to authenticate with Run:ai APIs.

                                  Run:ai APIs are accessed using bearer tokens. A token can be obtained by creating an Application through the Run:ai user interface.

                                  An application contains a client ID and a client secret. With the client credentials you can obtain a token and use it within subsequent API calls.

                                  • To create applications for your organization, see Applications.
                                  • To create your own user applications, see User Applications.
                                  "},{"location":"developer/rest-auth/#request-an-api-token","title":"Request an API Token","text":"

                                  Use the client credentials created to get a temporary token to access Run:ai as follows.

                                  "},{"location":"developer/rest-auth/#example-command-to-get-an-api-token","title":"Example command to get an API token","text":"

                                  Replace <runai_url> below with:

                                  • For SaaS installations, use <tenant-name>.run.ai

                                  • For self-hosted use the Run:ai user interface URL.

                                  cURLPython
                                      curl  -X POST \\\n      'https://<runai_url>/api/v1/token' \\\n      --header 'Accept: */*' \\\n      --header 'Content-Type: application/json' \\\n      --data-raw '{\n      \"grantType\":\"client_credentials\",\n      \"clientId\":\"<CLIENT ID>\",\n      \"clientSecret\" : \"<CLIENT SECRET>\"\n    }'\n
                                      import requests\n    import json\n    reqUrl = \"https://<runai_url>/api/v1/token\"\n    headersList = {\n     \"Accept\": \"*/*\",\n     \"Content-Type\": \"application/json\"\n    }\n    payload = json.dumps({\n      \"grantType\":\"client_credentials\",\n      \"clientId\":\"<CLIENT ID>\",\n      \"clientSecret\" : \"<CLIENT SECRET>\"\n    })\n    response = requests.request(\"POST\", reqUrl, data=payload,  headers=headersList)\n    print(response.text)\n
                                  "},{"location":"developer/rest-auth/#response","title":"Response","text":"

                                  The API response will look as follows:

                                  API Response
                                  {\n  \"accessToken\": \"<TOKEN>\", \n}\n

                                  To call Run:ai REST APIs, the application must pass the retrieved accessToken as a Bearer token in the Authorization header of your HTTP request.
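For example, a minimal sketch of calling an endpoint with the token, using the clusters endpoint shown in the REST API overview for illustration:

curl 'https://<runai_url>/v1/k8s/clusters' \\\n  --header 'Accept: application/json' \\\n  --header 'Authorization: Bearer <ACCESS-TOKEN>'\n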

                                  "},{"location":"developer/user-applications/","title":"User Applications","text":"

                                  This article explains the procedure to create your own user applications.

                                  Applications are used for API integrations with Run:ai. An application contains a client ID and a client secret. With the client credentials, you can obtain a token as detailed in API authentication and use it within subsequent API calls.

                                  Note

                                  • User applications are supported in cluster version 2.20 and above.
                                  • The token obtained through user applications assumes the roles and permissions of the user.
                                  "},{"location":"developer/user-applications/#creating-applications","title":"Creating Applications","text":"

                                  To create an application:

                                  1. Click the user icon, then select Settings
                                  2. Click +APPLICATION
3. Enter the application's name
                                  4. Click CREATE
                                  5. Copy the Client ID and Client secret and store securely
                                  6. Click DONE

                                  You can create up to 20 user applications.

                                  Note

                                  The client secret is visible only at the time of creation. It cannot be recovered but can be regenerated.

                                  "},{"location":"developer/user-applications/#regenerating-client-secret","title":"Regenerating client secret","text":"

                                  To regenerate a client secret:

1. Locate the application whose client secret you want to regenerate
                                  2. Click Regenerate client secret
                                  3. Click REGENERATE
                                  4. Copy the New client secret and store it securely
                                  5. Click DONE

                                  Warning

                                  Regenerating a client secret revokes the previous one.

                                  "},{"location":"developer/user-applications/#deleting-an-application","title":"Deleting an application","text":"
                                  1. Locate the application you want to delete
                                  2. Click on the trash icon
                                  3. On the dialog, click DELETE to confirm
                                  "},{"location":"developer/user-applications/#using-api","title":"Using API","text":"

Go to the User Applications API reference to view the available actions.

                                  "},{"location":"developer/admin-rest-api/overview/","title":"Run:ai REST API","text":"

                                  The purpose of the Run:ai REST API is to provide an easy-to-use programming interface for administrative tasks.

                                  "},{"location":"developer/admin-rest-api/overview/#endpoint-url-for-api","title":"Endpoint URL for API","text":"

The domain used for Run:ai REST APIs is the same domain used to browse the Run:ai User Interface: either <company>.run.ai, app.run.ai for older tenants, or a custom URL used for self-hosted installations.

                                  "},{"location":"developer/admin-rest-api/overview/#authentication","title":"Authentication","text":"
• Create a Client Application to make API requests. Use the client application and secret to obtain a time-bound bearer token (<ACCESS-TOKEN>). For details, see Calling REST APIs.
                                  • Use the token for subsequent API calls.
                                  "},{"location":"developer/admin-rest-api/overview/#example-usage","title":"Example Usage","text":"

                                  For example, if you have an Administrator role, you can get a list of clusters by running:

                                  cURLPython
                                  curl 'https://<COMPANY-URL>/v1/k8s/clusters' \\\n--header 'Accept: application/json' \\\n--header 'Content-Type: application/json' \\\n--header 'Authorization: Bearer <ACCESS-TOKEN>'\n
import http.client\n\n# HTTPSConnection expects the host name only, without the https:// scheme\nconn = http.client.HTTPSConnection(\"<COMPANY-URL>\")\nheaders = {\n    'content-type': \"application/json\",\n    'authorization': \"Bearer <ACCESS-TOKEN>\"\n    }\nconn.request(\"GET\", \"/v1/k8s/clusters\", headers=headers)\n\nres = conn.getresponse()\ndata = res.read()\n\nprint(data.decode(\"utf-8\"))\n

                                  (replace <ACCESS-TOKEN> with the bearer token from above).

For an additional example, see the following code, which shows how to use the Run:ai REST API to create a User and a Project and assign the User to the Project.

                                  "},{"location":"developer/admin-rest-api/overview/#runai-rest-api-documentation","title":"Run:ai REST API Documentation","text":"

                                  The Run:ai REST API offers developers a robust interface for interacting with and managing Run:ai metadata objects, including Projects, Departments, Clusters, and Users.

                                  Public API documentation is available at api-docs.run.ai. For self-hosted deployments, access the documentation at https://<control-plane-url>/api/docs.

                                  View Documentation

                                  "},{"location":"developer/admin-rest-api/overview/#runai-api-policy","title":"Run:ai API Policy","text":"

                                  At Run:ai, we are dedicated to delivering stable, reliable, and well-documented APIs. Our goal is to ensure that our APIs evolve in a predictable, transparent manner, offering users a seamless experience.

                                  Run:ai follows strict API design and operational standards to ensure a consistent and high-quality experience for users.

                                  "},{"location":"developer/admin-rest-api/overview/#api-lifecycle-and-deprecation","title":"API Lifecycle and Deprecation","text":"

                                  While our goal is to maintain stable and backward-compatible APIs, there may be times when breaking changes or deprecations are necessary.

In case of breaking changes, the deprecated version of the API will be supported for two additional versions in self-hosted deployments and for six months in SaaS deployments. During this period, no new features or functionality will be added to the deprecated API. When an API or API field is deprecated, the following process is followed:

• Documentation: The deprecated API or field is clearly labeled in the documentation, with a replacement provided where applicable.
• Release Notes: Information about deprecated APIs, including those scheduled for future removal, is included in the release notes.
• Customer Notification: Customers are notified of upcoming deprecations as part of the regular release communications.

                                  "},{"location":"developer/admin-rest-api/overview/#api-removal","title":"API Removal","text":"

                                  After the defined backward compatibility period has ended, deprecated APIs or fields are removed from both the codebase and the documentation.

                                  "},{"location":"developer/cluster-api/other-resources/","title":"Support for other Kubernetes Applications","text":""},{"location":"developer/cluster-api/other-resources/#introduction","title":"Introduction","text":"

                                  Kubernetes has several built-in resources that encapsulate running Pods. These are called Kubernetes Workloads and should not be confused with Run:ai Workloads.

                                  Examples of such resources are a Deployment that manages a stateless application, or a Job that runs tasks to completion.

Run:ai natively runs Run:ai Workloads. A Run:ai workload encapsulates all the resources needed to run, creates them, and deletes them together. However, Run:ai, being an open platform, allows the scheduling of any Kubernetes Workload.

                                  "},{"location":"developer/cluster-api/other-resources/#how-to","title":"How To","text":"

                                  To run Kubernetes Workloads with Run:ai you must add the following to the YAML:

                                  • A namespace that is associated with a Run:ai Project.
                                  • A scheduler name: runai-scheduler.
• When using Fractions, use a specific syntax for the nvidia.com/gpu limit.
                                  "},{"location":"developer/cluster-api/other-resources/#example-job","title":"Example: Job","text":"job1.yaml
                                  apiVersion: batch/v1\nkind: Job # (1)\nmetadata:\n  name: job1\n  namespace: runai-team-a # (2)\nspec:\n  template:\n    spec:\n      containers:\n      - name: job1-container\n        image: runai.jfrog.io/demo/quickstart\n        resources:\n          limits:\n            nvidia.com/gpu: 1 # (4)\n      restartPolicy: Never\n      schedulerName: runai-scheduler # (3)\n
                                  1. This is a Kubernetes Job.
                                  2. Namespace: Replace runai-team-a with the name of the Run:ai namespace for the specific Project (typically runai-<Project-Name>).
                                  3. The job to be scheduled with the Run:ai scheduler.
                                  4. To run with half a GPU replace 1 with \"0.5\" (with apostrophes).

                                  To submit the Job run:

                                  kubectl apply -f job1.yaml\n

You will be able to see the Job in the Run:ai user interface, including all metrics and lists.
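To verify from the command line that the Job's pods were created and scheduled, you can run, for example (assuming the default job-name label that Kubernetes adds to a Job's pods):

# List the pods created by the Job in the Project namespace\nkubectl get pods -n runai-team-a -l job-name=job1 -o wide\n# Inspect scheduling events for the Job's pods\nkubectl describe pod -n runai-team-a -l job-name=job1\n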

                                  "},{"location":"developer/cluster-api/other-resources/#example-deployment","title":"Example: Deployment","text":"deployment1.yaml
                                  apiVersion: apps/v1\nkind: Deployment # (1)\nmetadata:\n  name: inference-1\n  namespace: runai-team-a # (2)\nspec:\n  replicas: 1\n  selector:\n    matchLabels:\n      app: inference-1\n  template:\n    metadata:\n      labels:\n        app: inference-1\n    spec:\n      containers:\n        - resources:\n            limits:\n              nvidia.com/gpu: 1 # (4)\n          image: runai/example-marian-server\n          imagePullPolicy: Always\n          name: inference-1\n          ports:\n            - containerPort: 8888\n      schedulerName: runai-scheduler # (3)\n\n---\napiVersion: v1\nkind: Service # (5)\nmetadata:\n  labels:\n    app: inference-1\n  name: inference-1\nspec:\n  type: ClusterIP\n  ports:\n    - port: 8888\n      targetPort: 8888\n  selector:\n    app: inference-1\n
                                  1. This is a Kubernetes Deployment.
                                  2. Namespace: Replace runai-team-a with the name of the Run:ai namespace for the specific Project (typically runai-<Project-Name>).
3. The workload is scheduled with the Run:ai scheduler.
                                  4. To run with half a GPU replace 1 with \"0.5\" (with apostrophes).
                                  5. This example also contains the creation of a service to connect to the deployment. It is not mandatory.

                                  To submit the Deployment run:

                                  kubectl apply -f deployment1.yaml\n
                                  "},{"location":"developer/cluster-api/other-resources/#example-submit-a-cron-job-via-yaml","title":"Example: Submit a Cron job via YAML","text":"

                                  The cron command-line utility is a job scheduler typically used to set up and maintain software environments at scheduled intervals. Run:ai now supports submitting jobs with cron using a YAML file.

                                  To submit a job using cron, run the following command:

                                  kubectl apply -f <file_name>.yaml\n

                                  The following is an example YAML file:

apiVersion: batch/v1\nkind: CronJob\nmetadata:\n  name: hello\nspec:\n  schedule: \"* * * * *\"\n  jobTemplate:\n    spec:\n      template:\n        metadata:\n          labels:\n            runai/queue: team-a          # Mandatory\n        spec:\n          schedulerName: runai-scheduler # Mandatory\n          containers:\n          - name: hello\n            image: busybox:1.28\n            imagePullPolicy: IfNotPresent\n            command:\n            - /bin/sh\n            - -c\n            - date; echo Hello from the Kubernetes cluster\n          restartPolicy: OnFailure\n          # Optional: one of build / train / inference / interactivePreemptible\n          # priorityClassName: train\n
                                  "},{"location":"developer/cluster-api/other-resources/#limitations","title":"Limitations","text":"

                                  The Run:ai command line interface provides limited support for Kubernetes Workloads.

                                  "},{"location":"developer/cluster-api/other-resources/#see-also","title":"See Also","text":"

                                  Run:ai has specific integrations with additional third-party tools such as KubeFlow, MLFlow, and more. These integrations use the same instructions as described above.

                                  "},{"location":"developer/cluster-api/reference/","title":"Reference","text":"

                                  For a full reference for the YAML API parameters see the YAML Reference document.

                                  "},{"location":"developer/cluster-api/submit-rest/","title":"Submitting Workloads via HTTP/REST","text":"

                                  You can submit Workloads via HTTP calls, using the Kubernetes REST API.

                                  "},{"location":"developer/cluster-api/submit-rest/#submit-workload-example","title":"Submit Workload Example","text":"

                                  To submit a workload via HTTP, run the following:

curl -X POST \\ # (1) \n'https://<IP>:6443/apis/run.ai/v2alpha1/namespaces/<PROJECT>/trainingworkloads' \\ \n    --header 'Content-Type: application/yaml' \\\n    --header 'Authorization: Bearer <BEARER>' \\  # (2) \n    --data-raw 'apiVersion: run.ai/v2alpha1\nkind: TrainingWorkload  # (3)\nmetadata:\n  name: job-1    \nspec:\n  gpu:\n    value: \"1\"\n  image:\n    value: runai.jfrog.io/demo/quickstart\n  name:\n    value: job-1'\n
                                  1. Replace <IP> with the Kubernetes control-plane endpoint (can be found in kubeconfig profile). Replace <PROJECT> with the name of the Run:ai namespace for the specific Project (typically runai-<Project-Name>). Replace trainingworkloads with interactiveworkloads, distributedworkloads or inferenceworkloads according to type.
                                  2. Add Bearer token. To obtain a Bearer token see API authentication.
                                  3. See Submitting a Workload via YAML for an explanation of the YAML-based workload.

                                  Run: runai list jobs to see the new Workload.

                                  "},{"location":"developer/cluster-api/submit-rest/#delete-workload-example","title":"Delete Workload Example","text":"

                                  To delete a workload run:

                                  curl -X DELETE \\ # (1) \n'https://<IP>:6443/apis/run.ai/v2alpha1/namespaces/<PROJECT>/trainingworkloads/<JOB-NAME>' \\ \n    --header 'Content-Type: application/yaml' \\\n    --header 'Authorization: Bearer <BEARER>'   # (2)\n
                                  1. Replace <IP> with the Kubernetes control-plane endpoint (can be found in kubeconfig profile). Replace <PROJECT> with the name of the Run:ai namespace for the specific Project (typically runai-<Project-Name>). Replace trainingworkloads with interactiveworkloads, distributedworkloads or inferenceworkloads according to type. Replace <JOB-NAME> with the name of the Job.
                                  2. Add Bearer token. To obtain a Bearer token see API authentication.
                                  "},{"location":"developer/cluster-api/submit-rest/#suspendstop-workload-example","title":"Suspend/Stop workload example","text":"

                                  To suspend or stop a workload run:

curl -X PATCH \\ # (1) \n'https://<IP>:6443/apis/run.ai/v2alpha1/namespaces/<PROJECT>/interactiveworkloads/<JOB-NAME>' \\\n    --header 'Content-Type: application/json' \\\n    --header 'Authorization: Bearer <TOKEN>' \\ # (2) \n    --data '{\"spec\":{\"active\": {\"value\": \"false\"}}}'\n
                                  1. Replace <IP> with the Kubernetes control-plane endpoint (can be found in kubeconfig profile). Replace <PROJECT> with the name of the Run:ai namespace for the specific Project (typically runai-<Project-Name>). Replace trainingworkloads with interactiveworkloads, distributedworkloads or inferenceworkloads according to type. Replace <JOB-NAME> with the name of the Job.
                                  2. Add Bearer token. To obtain a Bearer token see API authentication.
                                  "},{"location":"developer/cluster-api/submit-rest/#using-other-programming-languages","title":"Using other Programming Languages","text":"

                                  You can use any Kubernetes client library together with the YAML documentation above to submit workloads via other programming languages. For more information see Kubernetes client libraries.

                                  "},{"location":"developer/cluster-api/submit-rest/#python-example","title":"Python example","text":"

                                  Create the following file and run it via python:

                                  create-train.py
                                  import json\nimport requests\n\n# (1)\nurl = \"https://<IP>:6443/apis/run.ai/v2alpha1/namespaces/<PROJECT>/trainingworkloads\"\n\npayload = json.dumps({\n  \"apiVersion\": \"run.ai/v2alpha1\",\n  \"kind\": \"TrainingWorkload\",\n  \"metadata\": {\n    \"name\": \"train1\",\n    \"namespace\": \"runai-team-a\"\n  },\n  \"spec\": {\n    \"image\": {\n      \"value\": \"runai.jfrog.io/demo/quickstart\"\n    },\n    \"name\": {\n      \"value\": \"train1\"\n    },\n    \"gpu\": {\n      \"value\": \"1\"\n    }\n  }\n})\n\nheaders = {\n  'Content-Type': 'application/json',\n  'Authorization': 'Bearer <TOKEN>' #(2)\n}\n\nresponse = requests.request(\"POST\", url, headers=headers, data=payload) # (3)\n\nprint(json.dumps(json.loads(response.text), indent=4))\n
1. Replace <IP> with the Kubernetes control-plane endpoint (can be found in kubeconfig profile). Replace <PROJECT> with the name of the Run:ai namespace for the specific Project (typically runai-<Project-Name>). Replace trainingworkloads with interactiveworkloads, distributedworkloads or inferenceworkloads according to type.
                                  2. Add Bearer token. To obtain a Bearer token see API authentication.
3. If you do not have a valid certificate, you can add the flag verify=False to the requests call.
                                  "},{"location":"developer/cluster-api/submit-yaml/","title":"Submitting Workloads via YAML","text":"

                                  You can use YAML to submit Workloads directly to Run:ai. Below are examples of how to create training, interactive and inference workloads via YAML.

                                  For details on YAML parameters, see the YAML Reference.

                                  "},{"location":"developer/cluster-api/submit-yaml/#submit-workload-example","title":"Submit Workload Example","text":"

                                  Create a file named training1.yaml with the following text:

                                  training1.yaml
                                  apiVersion: run.ai/v2alpha1\nkind: TrainingWorkload # (1)\nmetadata:\n  name: job-1  # (2) \n  namespace: runai-team-a # (3)\nspec:\n  gpu:\n    value: \"1\"\n  image:\n    value: runai.jfrog.io/demo/quickstart\n  name:\n    value: job-1 # (4)\n
                                  1. This is a Training workload.
                                  2. Kubernetes object name. Mandatory, but has no functional significance.
                                  3. Namespace: Replace runai-team-a with the name of the Run:ai namespace for the specific Project (typically runai-<Project-Name>).
4. Job name as it appears in Run:ai. You can provide a name, or have one created automatically if a name prefix is configured.

                                  Change the namespace and run: kubectl apply -f training1.yaml

                                  Run: runai list jobs to see the new Workload.

                                  "},{"location":"developer/cluster-api/submit-yaml/#delete-workload-example","title":"Delete Workload Example","text":"

                                  Run: kubectl delete -f training1.yaml to delete the Workload.

                                  "},{"location":"developer/cluster-api/submit-yaml/#creating-a-yaml-syntax-from-a-cli-command","title":"Creating a YAML syntax from a CLI command","text":"

                                  An easy way to create a YAML for a workload is to generate it from the runai submit command by using the --dry-run flag. For example, run:

                                  runai submit build1 -i ubuntu -g 1 --interactive --dry-run \\\n     -- sleep infinity \n

                                  The result will be the following Kubernetes object declaration:

                                  apiVersion: run.ai/v2alpha1\nkind: InteractiveWorkload  # (1)\nmetadata:\n  creationTimestamp: null\n  labels:\n    PreviousJob: \"true\"\n  name: job-0-2022-05-02t08-50-57\n  namespace: runai-team-a\nspec:\n  command:\n    value: sleep infinity\n  gpu:\n    value: \"1\"\n  image:\n    value: ubuntu\n  imagePullPolicy:\n    value: Always\n  name:\n    value: job-0\n\n... Additional internal and status properties...\n
                                  1. This is an Interactive workload.
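As a sketch, the generated declaration can be redirected to a file and submitted with kubectl; you may want to remove the internal and status properties noted above before applying:

# Save the generated YAML, then submit it (trim internal/status fields first if present)\nrunai submit build1 -i ubuntu -g 1 --interactive --dry-run -- sleep infinity > build1.yaml\nkubectl apply -f build1.yaml\n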
                                  "},{"location":"developer/cluster-api/submit-yaml/#inference-workload-example","title":"Inference Workload Example","text":"

                                  Creating an inference workload is similar to the above two examples.

                                  apiVersion: run.ai/v2alpha1\nkind: InferenceWorkload\nmetadata:\n  name: inference1\n  namespace: runai-team-a\nspec:\n  name:\n    value: inference1\n  gpu:\n    value: \"0.5\"\n  image:\n    value: \"runai.jfrog.io/demo/example-triton-server\"\n  minScale:\n    value: 1\n  maxScale:\n    value: 2\n  metric:\n    value: concurrency # (1)\n  target:\n    value: 80  # (2)\n  ports:\n      items:\n        port1:\n          value:\n            container: 8000\n            protocol: http\n            serviceType: ServingPort\n
                                  1. Possible metrics are throughput, concurrency and latency.
                                  2. Inference requires a port to receive requests.
                                  "},{"location":"developer/cluster-api/submit-yaml/#suspendresume-interactivetraining-workload","title":"Suspend/Resume Interactive/Training Workload","text":"

                                  To suspend training:

                                  apiVersion: run.ai/v2alpha1\nkind: TrainingWorkload # \nmetadata:\n  name: job-1  #  \n  namespace: runai-team-a # \nspec:\n  gpu:\n    value: \"1\"\n  active:\n    value: false\n  image:\n    value: runai.jfrog.io/demo/quickstart\n  name:\n    value: job-1 # \n
                                  In order to suspend the workload, set active to false. To resume the workload, either set active to true or remove it entirely.

                                  "},{"location":"developer/cluster-api/submit-yaml/#see-also","title":"See Also","text":"
                                  • To understand how to connect to the inference workload, see Inference Quickstart.
                                  • To learn more about Inference and Run:ai see Inference overview.
                                  "},{"location":"developer/cluster-api/workload-overview-dev/","title":"Cluster API (Deprecated)","text":"

                                  The Run:ai Cluster API allows the submission of Workloads via YAML, directly to Kubernetes.

                                  Important

With Run:ai 2.18 clusters, you can now submit Workloads via the Run:ai REST API. We recommend using this API if your cluster is of version 2.18 or above.

                                  "},{"location":"developer/cluster-api/workload-overview-dev/#workloads","title":"Workloads","text":"

                                  Run:ai schedules Workloads. Run:ai workloads contain:

                                  • The Kubernetes resource (Job, Deployment, etc.) that is used to launch the container inside which the data science code runs.
                                  • A set of additional resources that are required to run the Workload, for example: a service entry point that allows access to the Job, a persistent volume claim to access data on the network, and more.

                                  Run:ai supports the following Workload types:

                                  Workload Type Kubernetes Name Description Interactive InteractiveWorkload Submit an interactive workload Training TrainingWorkload Submit a training workload Distributed Training DistributedWorkload Submit a distributed training workload using TensorFlow, PyTorch or MPI Inference InferenceWorkload Submit an inference workload"},{"location":"developer/cluster-api/workload-overview-dev/#values","title":"Values","text":"

                                  A Workload will typically have a list of values, such as name, image, and resources. A full list of values is available in the runai-submit Command-line reference.

                                  You can also find the exact YAML syntax by running:

                                  kubectl explain TrainingWorkload.spec\n

                                  (and similarly for other Workload types).

                                  To get information on a specific value (e.g. node type), you can also run:

                                  kubectl explain TrainingWorkload.spec.nodeType\n

                                  Result:

                                  KIND:     TrainingWorkload\nVERSION:  run.ai/v2alpha1\n\nRESOURCE: nodeType <Object>\n\nDESCRIPTION:\n     Specifies nodes (machines) or a group of nodes on which the workload will\n     run. To use this feature, your Administrator will need to label nodes as\n     explained in the Group Nodes guide at\n     https://docs.run.ai/admin/researcher-setup/limit-to-node-group. This flag\n     can be used in conjunction with Project-based affinity. In this case, the\n     flag is used to refine the list of allowable node groups set in the\n     Project. For more information consult the Projects guide at\n     https://docs.run.ai/admin/admin-ui-setup/project-setup.\n\nFIELDS:\n   value    <string>\n
                                  "},{"location":"developer/cluster-api/workload-overview-dev/#how-to-submit","title":"How to Submit","text":"

                                  A Workload can be submitted via various channels:

                                  • The Run:ai user interface.
                                  • The Run:ai command-line interface, via the runai submit command.
                                  • The Run:ai Cluster API (see the example below).
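
                                  As an illustration of the Cluster API channel, a minimal TrainingWorkload mirroring the YAML examples above can be applied directly with kubectl. This is a sketch; the workload name, image, and the project namespace runai-team-a are illustrative:

                                  cat <<'EOF' | kubectl apply -f -
                                  apiVersion: run.ai/v2alpha1
                                  kind: TrainingWorkload
                                  metadata:
                                    name: job-1
                                    namespace: runai-team-a   # the namespace of your Run:ai Project
                                  spec:
                                    name:
                                      value: job-1
                                    gpu:
                                      value: "1"
                                    image:
                                      value: runai.jfrog.io/demo/quickstart
                                  EOF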
                                  "},{"location":"developer/cluster-api/workload-overview-dev/#policies","title":"Policies","text":"

                                  An Administrator can set Policies for Workload submission. Policies serve two purposes:

                                  1. To constrain the values a researcher can specify.
                                  2. To provide default values.

                                  For example, an administrator can:

                                  • Set a maximum of 5 GPUs per Workload.
                                  • Provide a default value of 1 GPU for each container.

                                  Each workload type has a matching kind of workload policy. For example, an InteractiveWorkload has a matching InteractivePolicy.

                                  A Policy of each type can be defined per-project. There is also a global policy that applies to any project that does not have a per-project policy.
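
                                  As a rough sketch of what such a policy object can look like (the spec field layout and rule names shown here are assumptions; run kubectl explain InteractivePolicy.spec against your cluster for the authoritative schema), a per-project InteractivePolicy that caps GPUs and sets a default could be applied as follows:

                                  cat <<'EOF' | kubectl apply -f -
                                  apiVersion: run.ai/v2alpha1
                                  kind: InteractivePolicy        # assumed kind name; each workload type has a matching policy kind
                                  metadata:
                                    name: interactive-policy
                                    namespace: runai-team-a      # per-project policy; the namespace is illustrative
                                  spec:
                                    gpu:
                                      rules:
                                        max: "5"                 # constrain: at most 5 GPUs per Workload
                                      value: "1"                 # default: 1 GPU per container
                                  EOF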

                                  For further details on policies, see Policies.

                                  "},{"location":"developer/metrics/metrics-api/","title":"Metrics and telemetry","text":""},{"location":"developer/metrics/metrics-api/#telemetry","title":"Telemetry","text":"

                                  Telemetry is a numeric measurement recorded in real time, reflecting the state of the Run:ai cluster at the moment it is emitted.

                                  "},{"location":"developer/metrics/metrics-api/#metrics","title":"Metrics","text":"

                                  Metrics are numeric measurements recorded over time that are emitted from the Run:ai cluster. Typical metrics involve utilization, allocation, time measurements and so on. Metrics are used in Run:ai dashboards as well as in the Run:ai administration user interface.

                                  The purpose of this document is to detail the structure and purpose of metrics emitted by Run:ai. This enables customers to create custom dashboards or integrate metric data into other monitoring systems.

                                  Run:ai provides metrics via the Run:ai Control-plane API. Previously, Run:ai provided metrics information via direct access to an internal metrics store. This method is deprecated but is still documented here.
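
                                  For example, a workload-level metric can be retrieved from the Control-plane API with a call along the following lines. This is a sketch: the workload ID and token are placeholders, the endpoint and metricType value follow the Control-plane API metrics endpoints referenced later in this documentation, and the time-range query parameters are assumptions; consult the Control-plane API reference for the exact request format.

                                  curl -s -H "Authorization: Bearer $RUNAI_API_TOKEN" \
                                    "https://app.run.ai/api/v1/workloads/<workloadId>/metrics?metricType=GPU_UTILIZATION&start=2024-01-01T00:00:00Z&end=2024-01-01T06:00:00Z"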

                                  "},{"location":"developer/metrics/metrics-api/#metric-and-telemetry-scopes","title":"Metric and telemetry Scopes","text":"

                                  Run:ai provides a Control-plane API which supports and aggregates metrics at various levels.

                                  Level Description Cluster A cluster is a set of Node Pools & Nodes. With Cluster metrics, metrics are aggregated at the Cluster level. Node Data is aggregated at the Node level. Node Pool Data is aggregated at the Node Pool level. Workload Data is aggregated at the Workload level. In some Workloads, e.g. with distributed workloads, these metrics aggregate data from all worker pods. Pod The basic execution unit. Project The basic organizational unit. Projects are the tool to implement resource allocation policies as well as the segregation between different initiatives. Department Departments are a grouping of projects. Supported Metrics Metric Cluster Node Pool Node Workload Pod Project Department API Cluster API Node Pool API Workload API Pod API ALLOCATED_GPU TRUE TRUE TRUE AVG_WORKLOAD_WAIT_TIME TRUE TRUE CPU_LIMIT_CORES TRUE CPU_MEMORY_LIMIT_BYTES TRUE CPU_MEMORY_REQUEST_BYTES TRUE CPU_MEMORY_USAGE_BYTES TRUE TRUE TRUE CPU_MEMORY_UTILIZATION TRUE TRUE TRUE CPU_REQUEST_CORES TRUE CPU_USAGE_CORES TRUE TRUE TRUE CPU_UTILIZATION TRUE TRUE TRUE GPU_ALLOCATION TRUE TRUE TRUE GPU_MEMORY_REQUEST_BYTES TRUE GPU_MEMORY_USAGE_BYTES TRUE TRUE GPU_MEMORY_USAGE_BYTES_PER_GPU TRUE TRUE GPU_MEMORY_UTILIZATION TRUE TRUE GPU_MEMORY_UTILIZATION_PER_GPU TRUE GPU_QUOTA TRUE TRUE TRUE TRUE GPU_UTILIZATION TRUE TRUE TRUE TRUE GPU_UTILIZATION_PER_GPU TRUE TRUE POD_COUNT TRUE RUNNING_POD_COUNT TRUE TOTAL_GPU TRUE TRUE TOTAL_GPU_NODES TRUE TRUE GPU_UTILIZATION_DISTRIBUTION TRUE TRUE UNALLOCATED_GPU TRUE TRUE CPU_QUOTA_MILLICORES TRUE TRUE CPU_MEMORY_QUOTA_MB TRUE TRUE CPU_ALLOCATION_MILLICORES TRUE TRUE CPU_MEMORY_ALLOCATION_MB TRUE TRUE"},{"location":"developer/metrics/metrics-api/#advanced-metrics","title":"Advanced Metrics","text":"

                                  NVIDIA provides extended metrics at the Pod level. These are documented here. To enable these metrics please contact Run:ai customer support.

                                  Metric Cluster Node Pool Workload Pod GPU_FP16_ENGINE_ACTIVITY_PER_GPU TRUE GPU_FP32_ENGINE_ACTIVITY_PER_GPU TRUE GPU_FP64_ENGINE_ACTIVITY_PER_GPU TRUE GPU_GRAPHICS_ENGINE_ACTIVITY_PER_GPU TRUE GPU_MEMORY_BANDWIDTH_UTILIZATION_PER_GPU TRUE GPU_NVLINK_RECEIVED_BANDWIDTH_PER_GPU TRUE GPU_NVLINK_TRANSMITTED_BANDWIDTH_PER_GPU TRUE GPU_PCIE_RECEIVED_BANDWIDTH_PER_GPU TRUE GPU_PCIE_TRANSMITTED_BANDWIDTH_PER_GPU TRUE GPU_SM_ACTIVITY_PER_GPU TRUE GPU_SM_OCCUPANCY_PER_GPU TRUE GPU_TENSOR_ACTIVITY_PER_GPU TRUE"},{"location":"developer/metrics/metrics-api/#_1","title":"Metrics via API","text":""},{"location":"developer/metrics/metrics-api/#supported-telemetry","title":"Supported telemetry","text":"telemetry Node Workload Project Department API Node API Workload API WORKLOADS_COUNT TRUE ALLOCATED_GPUS TRUE TRUE TRUE TRUE READY_GPU_NODES TRUE READY_GPUS TRUE TOTAL_GPU_NODES TRUE TOTAL_GPUS TRUE IDLE_ALLOCATED_GPUS TRUE FREE_GPUS TRUE TOTAL_CPU_CORES TRUE USED_CPU_CORES TRUE ALLOCATED_CPU_CORES TRUE TRUE TRUE TOTAL_GPU_MEMORY_BYTES TRUE USED_GPU_MEMORY_BYTES TRUE TOTAL_CPU_MEMORY_BYTES TRUE USED_CPU_MEMORY_BYTES TRUE ALLOCATED_CPU_MEMORY_BYTES TRUE TRUE TRUE GPU_QUOTA TRUE TRUE CPU_QUOTA TRUE TRUE MEMORY_QUOTA TRUE TRUE GPU_ALLOCATION_NON_PREEMPTIBLE TRUE TRUE CPU_ALLOCATION_NON_PREEMPTIBLE TRUE TRUE MEMORY_ALLOCATION_NON_PREEMPTIBLE TRUE TRUE"},{"location":"developer/metrics/metrics/","title":"Metrics API","text":""},{"location":"developer/metrics/metrics/#what-are-metrics","title":"What are Metrics","text":"

                                  Metrics are numeric measurements recorded over time that are emitted from the Run:ai cluster. Typical metrics involve utilization, allocation, time measurements and so on. Metrics are used in Run:ai dashboards as well as in the Run:ai administration user interface.

                                  The purpose of this document is to detail the structure and purpose of metrics emitted by Run:ai to enable customers to create custom dashboards or integrate metric data into other monitoring systems.

                                  Run:ai uses Prometheus for collecting and querying metrics.

                                  Warning

                                  From cluster version 2.17 and onwards, Run:ai supports metrics via the Run:ai Control-plane API. Direct metrics queries (metrics that are queried directly from Prometheus) are deprecated.

                                  "},{"location":"developer/metrics/metrics/#published-runai-metrics","title":"Published Run:ai Metrics","text":"

                                  Following is the list of published Run:ai metrics, per cluster version (make sure to pick the right cluster version in the picker at the top of the page):

                                  Metric name Labels Measurement Description runai_active_job_cpu_requested_cores {clusterId, job_name, job_uuid} CPU Cores Workload's requested CPU cores runai_active_job_memory_requested_bytes {clusterId, job_name, job_uuid} Bytes Workload's requested CPU memory runai_cluster_cpu_utilization {clusterId} 0 to 1 CPU utilization of the entire cluster runai_cluster_memory_used_bytes {clusterId} Bytes Used CPU memory of the entire cluster runai_cluster_memory_utilization {clusterId} 0 to 1 CPU memory utilization of the entire cluster runai_allocated_gpu_count_per_gpu {gpu, clusterId, node} 0/1 Is a GPU hosting a pod runai_last_gpu_utilization_time_per_gpu {gpu, clusterId, node} Unix time Last time GPU was not idle runai_requested_gpu_memory_mb_per_workload {clusterId, job_type, job_uuid, job_name, project, workload_id} MegaBytes Requested GPU memory per workload (0 if not specified by the user) runai_requested_gpus_per_workload {clusterId, workload_type, workload_id, workload_name, project} Double Number of requested GPUs per workload runai_run_time_seconds_per_workload {clusterId, workload_id, workload_name} Seconds Total run time per workload runai_wait_time_seconds_per_workload {clusterId, workload_id, workload_name} Seconds Total wait time per workload runai_node_cpu_requested_cores {clusterId, node} Double Sum of the requested CPU cores of all workloads running in a node runai_node_cpu_utilization {clusterId, node} 0 to 1 CPU utilization per node runai_node_memory_utilization {clusterId, node} 0 to 1 CPU memory utilization per node runai_node_requested_memory_bytes {clusterId, node} Bytes Sum of the requested CPU memory of all workloads running in a node runai_node_used_memory_bytes {clusterId, node} Bytes Used CPU memory per node runai_project_guaranteed_gpus {clusterId, project} Double Guaranteed GPU quota per project runai_project_info {memory_quota, cpu_quota, gpu_guaranteed_quota, clusterId, project, department} N/A Information on CPU, CPU memory, GPU quota per project runai_queue_info {memory_quota, cpu_quota, gpu_guaranteed_quota, clusterId, nodepool, queue_name, department} N/A Information on CPU, CPU memory, GPU quota per project/department per nodepool runai_cpu_limits_per_active_workload {clusterId, job_name , job_uuid} CPU Cores Workloads CPU limit (in number of cores). See link runai_job_cpu_usage {clusterId, workload_id, workload_name, project} Double Workloads CPU usage (in number of cores) runai_memory_limits_per_active_workload {clusterId, job_name, job_uuid} Bytes Workloads CPU memory limit. See link runai_active_job_memory_requested_bytes {clusterId, job_name, job_uuid} Bytes Workloads requested CPU memory. 
See link runai_job_memory_used_bytes {clusterId, workload_id, workload_name, project} Bytes Workloads used CPU memory runai_mig_mode_gpu_count {clusterId, node} Double Number of GPUs on MIG nodes (Deprecated) runai_gpu_utilization_per_gpu {clusterId, gpu, node} % GPU Utilization per GPU runai_gpu_utilization_per_node {clusterId, node} % GPU Utilization per Node runai_gpu_memory_used_mebibytes_per_gpu {clusterId, gpu, node} MiB Used GPU memory per GPU runai_gpu_memory_used_mebibytes_per_node {clusterId, node} MiB Used GPU memory per Node runai_gpu_memory_total_mebibytes_per_gpu {clusterId, gpu, node} MiB Total GPU memory per GPU runai_gpu_memory_total_mebibytes_per_node {clusterId, node} MiB Total GPU memory per Node runai_gpu_count_per_node {clusterId, node, modelName, ready, schedulable} Number Number of GPUs per Node runai_allocated_gpu_count_per_workload {clusterId, workload_id, workload_name, workload_type, user} Double Number of allocated GPUs per Workload runai_allocated_gpu_count_per_project {clusterId, project} Double Number of allocated GPUs per Project runai_gpu_memory_used_mebibytes_per_pod_per_gpu {clusterId, pod_name, pod_uuid, pod_namespace, node, gpu} MiB Used GPU Memory per Pod, per Gpu on which the workload is running runai_gpu_memory_used_mebibytes_per_workload {clusterId, workload_id, workload_name, workload_type, user} MiB Used GPU Memory per Workload runai_gpu_utilization_per_pod_per_gpu {clusterId, pod_name, pod_uuid, pod_namespace, node, gpu} % GPU Utilization per Pod per GPU runai_gpu_utilization_per_workload {clusterId, workload_id, workload_name, workload_type, user} % Average GPU Utilization per Workload runai_gpu_utilization_per_project {clusterId, project} % Average GPU Utilization per Project runai_last_gpu_utilization_time_per_workload {clusterId, workload_id, workload_name, workload_type, user} Seconds (Unix Timestamp) The Last Time (Unix Timestamp) That The Workload Utilized Any Of Its Allocated GPUs runai_gpu_idle_seconds_per_workload {clusterId, workload_id, workload_name, workload_type, user} Seconds Seconds Passed Since The Workload Utilized Any Of Its Allocated GPUs runai_allocated_gpu_count_per_pod {clusterId, pod_name, pod_uuid, pod_namespace, node} Double Number Of Allocated GPUs per Pod runai_allocated_gpu_count_per_node {clusterId, node} Double Number Of Allocated GPUs per Node runai_allocated_millicpus_per_pod {clusterId, pod_name, pod_uuid, pod_namespace, node} Integer Number Of Allocated Millicpus per Pod runai_allocated_memory_per_pod {clusterId, pod_name, pod_uuid, pod_namespace, node} Bytes Allocated Memory per Pod

                                  Following is a list of labels appearing in Run:ai metrics:

                                  Label Description clusterId Cluster Identifier department Name of Run:ai Department cpu_quota CPU limit per project gpu GPU index gpu_guaranteed_quota Guaranteed GPU quota per project image Name of Docker image namespace_name Namespace deployment_name Deployment name job_name Job name job_type Job type: training, interactive or inference job_uuid Job identifier workload_name Workload name workload_type Workload type: training, interactive or inference workload_uuid Workload identifier pod_name Pod name. A Workload can contain many pods. pod_namespace Pod namespace memory_quota CPU memory limit per project node Node name project Name of Run:ai Project status Workload status: Running, Pending, etc. For more information on Workload statuses see document user User identifier"},{"location":"developer/metrics/metrics/#other-metrics","title":"Other Metrics","text":"

                                  Run:ai exports other metrics emitted by NVIDIA and Kubernetes packages, as follows:

                                  Metric name Description runai_gpu_utilization_per_gpu GPU utilization kube_node_status_capacity The capacity for different resources of a node kube_node_status_condition The condition of a cluster node kube_pod_container_resource_requests_cpu_cores The number of CPU cores requested by container kube_pod_container_resource_requests_memory_bytes Bytes of memory requested by a container kube_pod_info Information about pod

                                  For additional information, see Kubernetes kube-state-metrics and NVIDIA dcgm exporter.
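
                                  As an example of the direct Prometheus path described above (deprecated from cluster version 2.17 onwards), published metrics can be queried through the standard Prometheus HTTP API. The Prometheus host and cluster ID below are placeholders:

                                  # Average GPU utilization per node over the last hour
                                  curl -sG "http://<prometheus-host>:9090/api/v1/query" \
                                    --data-urlencode 'query=avg_over_time(runai_gpu_utilization_per_node{clusterId="<cluster-id>"}[1h])'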

                                  "},{"location":"developer/metrics/metrics/#changed-metrics-and-api-mapping","title":"Changed metrics and API mapping","text":"

                                  Starting in cluster version 2.17, some of the metric names have been changed. In addition, some Run:ai metrics are available as API endpoints. Using the API endpoints is more efficient and provides an easier way of retrieving metrics in any application. The following table lists the metrics that were changed.

                                  Metric name in version 2.16 2.17 Change Description 2.17 API Endpoint runai_active_job_cpu_requested_cores available also via API https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with \"CPU_REQUEST_CORES\" metricType runai_active_job_memory_requested_bytes available also via API https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with \"CPU_MEMORY_REQUEST_BYTES\" metricType runai_cluster_cpu_utilization available also via API https://app.run.ai/api/v2/clusters/{clusterUuid}/metrics ; with \"CPU_UTILIZATION\" metricType runai_cluster_memory_utilization available also via API https://app.run.ai/api/v2/clusters/{clusterUuid}/metrics ; with \"CPU_MEMORY_UTILIZATION\" metricType runai_gpu_utilization_non_fractional_jobs no longer available runai_allocated_gpu_count_per_workload labels changed runai_gpu_utilization_per_pod_per_gpu available also via API https://app.run.ai/api/v1/workloads/{workloadId}/pods/{podId}/metrics ; with \"GPU_UTILIZATION_PER_GPU\" metricType runai_gpu_utilization_per_workload available also via API and labels changed https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with \"GPU_UTILIZATION\" metricType runai_job_image no longer available runai_job_requested_gpu_memory available also via API and renamed to: \"runai_requested_gpu_memory_mb_per_workload\" with different labels https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with \"GPU_MEMORY_REQUEST_BYTES\" metricType runai_job_requested_gpus renamed to: \"runai_requested_gpus_per_workload\" with different labels runai_job_total_runtime renamed to: \"runai_run_time_seconds_per_workload\" with different labels runai_job_total_wait_time renamed to: \"runai_wait_time_seconds_per_workload\" with different labels runai_gpu_memory_used_mebibytes_per_workload available also via API and labels changed https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with \"GPU_MEMORY_USAGE_BYTES\" metricType runai_gpu_memory_used_mebibytes_per_pod_per_gpu available also via API and labels changed https://app.run.ai/api/v1/workloads/{workloadId}/pods/{podId}/metrics ; with \"GPU_MEMORY_USAGE_BYTES_PER_GPU\" metricType runai_node_gpu_used_memory_bytes renamed and changed units: \"runai_gpu_memory_used_mebibytes_per_node\" runai_node_total_memory_bytes renamed and changed units: \"runai_gpu_memory_total_mebibytes_per_node\" runai_project_info labels changed runai_active_job_cpu_limits available also via API and renamed to: \"runai_cpu_limits_per_active_workload\" https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with \"CPU_LIMIT_CORES\" metricType runai_job_cpu_usage available also via API and labels changed https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with \"CPU_USAGE_CORES\" metricType runai_active_job_memory_limits available also via API and renamed to: \"runai_memory_limits_per_active_workload\" https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with \"CPU_MEMORY_LIMIT_BYTES\" metricType runai_running_job_memory_requested_bytes was a duplication of \"runai_active_job_memory_requested_bytes\", see above runai_job_memory_used_bytes available also via API and labels changed https://app.run.ai/api/v1/workloads/{workloadId}/metrics ; with \"CPU_MEMORY_USAGE_BYTES\" metricType runai_job_swap_memory_used_bytes no longer available runai_gpu_count_per_node added labels runai_last_gpu_utilization_time_per_workload labels changed runai_gpu_idle_time_per_workload renamed to: \"runai_gpu_idle_seconds_per_workload\" with different 
labels"},{"location":"developer/metrics/metrics/#create-custom-dashboards","title":"Create custom dashboards","text":"

                                  To create custom dashboards based on the above metrics, please contact Run:ai customer support.

                                  "},{"location":"home/components/","title":"Run:ai System Components","text":""},{"location":"home/components/#components","title":"Components","text":"

                                  Run:ai is made up of two components:

                                  • The Run:ai cluster provides scheduling services and workload management.
                                  • The Run:ai control plane provides resource management, Workload submission and cluster monitoring.

                                  Technology-wise, both are installed over a Kubernetes Cluster.

                                  Run:ai users:

                                  • Researchers submit Machine Learning workloads via the Run:ai Console, the Run:ai Command-Line Interface (CLI), or directly by sending YAML files to Kubernetes.
                                  • Administrators monitor and set priorities via the Run:ai User Interface.

                                  "},{"location":"home/components/#runai-cluster","title":"Run:ai Cluster","text":"
                                  • Run:ai comes with its own Scheduler. The Run:ai scheduler extends the Kubernetes scheduler. It uses business rules to schedule workloads sent by Researchers.
                                  • Run:ai schedules Workloads. Workloads include the actual researcher code running as a Kubernetes container, together with all the system resources required to run the code, such as user storage, network endpoints to access the container etc.
                                  • The cluster uses an outbound-only, secure connection to synchronize with the Run:ai control plane. Information includes meta-data sync and various metrics on Workloads, Nodes etc.
                                  • The Run:ai cluster is installed as a Kubernetes Operator.
                                  • Run:ai is installed in its own Kubernetes namespace named runai.
                                  • Workloads are run in the context of Run:ai Projects. Each Project is mapped to a Kubernetes namespace with its own settings and access control.
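
                                  For instance, on a cluster where Run:ai is installed, the system namespace and the per-Project namespaces (which appear as runai-<project-name> in the examples throughout this documentation) can be listed with a standard kubectl command:

                                  kubectl get namespaces | grep '^runai'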
                                  "},{"location":"home/components/#runai-control-plane-on-the-cloud","title":"Run:ai Control Plane on the cloud","text":"

                                  The Run:ai control plane is used by multiple customers (tenants) to manage resources (such as Projects & Departments), submit Workloads and monitor multiple clusters.

                                  A single Run:ai customer (tenant) defined in the control plane can manage multiple Run:ai clusters. This means a single customer can manage multiple GPU clusters in multiple locations/subnets from a single interface.

                                  "},{"location":"home/components/#self-hosted-control-plane","title":"Self-hosted Control-Plane","text":"

                                  The Run:ai control plane can also be locally installed. To understand the various installation options see the installation types document.

                                  "},{"location":"home/data-privacy-details/","title":"Data Privacy","text":"

                                  This article details the data privacy and compliance considerations for deploying Run:ai. It is intended to help administrators and compliance teams understand the data management practices involved with Run:ai, and to confirm that these practices align with organizational policies and regulatory requirements before installation and during the integration and onboarding of the various teams.

                                  When using the Run:ai SaaS cluster, the Control plane operates through the Run:ai cloud, requiring the transmission of certain data for control and analytics. Below is a detailed breakdown of the specific data sent to the Run:ai cloud in the SaaS offering.

                                  Note

                                  For organizations whose data privacy policies do not align with this data transmission, Run:ai offers a self-hosted version. This version includes an on-premises control plane and does not communicate with the cloud.

                                  "},{"location":"home/data-privacy-details/#data-sent-to-the-runai-cloud","title":"Data sent to the Run:ai cloud","text":"Asset Details Workload Metrics Includes workload names, CPU, GPU, and memory metrics, as well as parameters provided during the runai submit command. Workload Assets Covers environments, compute resources, and data resources associated with workloads. Resource Credentials Credentials for cluster resources, encrypted with a SHA-512 algorithm specific to each tenant. Node Metrics Node-specific data including names, IPs, and performance metrics (CPU, GPU, memory). Cluster Metrics Cluster-wide metrics such as names, CPU, GPU, and memory usage. Projects & Departments Includes names and quota information for projects and departments. Users User roles within Run:ai, email addresses, and passwords."},{"location":"home/data-privacy-details/#key-consideration","title":"Key consideration","text":"

                                  Run:ai ensures that no deep-learning artefacts, such as code, images, container logs, training data, models, or checkpoints, are transmitted to the cloud. These assets remain securely within your organization's firewalls, safeguarding sensitive intellectual property and data.

                                  "},{"location":"home/data-privacy-details/#see-also","title":"See Also","text":"

                                  The Run:ai privacy policy.

                                  "},{"location":"home/overview/","title":"Run:ai Documentation Library","text":"

                                  Welcome to the Run:ai documentation area. For an introduction to the Run:ai Platform, see Run:ai platform on the run.ai website.

                                  The Run:ai documentation targets four personas:

                                  • Infrastructure Administrator - An IT person, responsible for the installation, setup and IT maintenance of the Run:ai product. Infrastructure Administrator documentation can be found here.

                                  • Platform Administrator - Responsible for the day-to-day administration of the product. Platform Administrator documentation can be found here.

                                  • Researcher \u2014 Using Run:ai to spin up notebooks, submit Workloads, prompt models, etc. Researcher documentation can be found here.

                                  • Developer \u2014 Using various APIs to automate work with Run:ai. The Developer documentation can be found here.

                                  "},{"location":"home/overview/#how-to-get-support","title":"How to Get Support","text":"

                                  To get support use the following channels:

                                  • On the Run:ai user interface at <company-name>.run.ai, use the 'Contact Support' link on the top right.

                                  • Or submit a ticket by clicking the button below:

                                  Submit a Ticket

                                  "},{"location":"home/overview/#community","title":"Community","text":"

                                  Run:ai provides its customers with access to the Run:ai Customer Community portal to submit tickets, track ticket progress and update support cases.

                                  Customer Community Portal

                                  Reach out to customer support for credentials.

                                  "},{"location":"home/overview/#runai-cloud-status-page","title":"Run:ai Cloud Status Page","text":"

                                  Run:ai cloud availability is monitored at status.run.ai.

                                  "},{"location":"home/overview/#collect-logs-to-send-to-support","title":"Collect Logs to Send to Support","text":"

                                  As an IT Administrator, you can collect Run:ai logs to send to support. For more information see logs collection.

                                  "},{"location":"home/overview/#example-code","title":"Example Code","text":"

                                  Code for the Docker images referred to on this site is available at https://github.com/run-ai/docs/tree/master/quickstart.

                                  The following images are used throughout the documentation:

                                  Image Description Source runai.jfrog.io/demo/quickstart Basic training image. Multi-GPU support https://github.com/run-ai/docs/tree/master/quickstart/main runai.jfrog.io/demo/quickstart-distributed Distributed training using MPI and Horovod https://github.com/run-ai/docs/tree/master/quickstart/distributed zembutsu/docker-sample-nginx Build (interactive) with Connected Ports https://github.com/zembutsu/docker-sample-nginx runai.jfrog.io/demo/quickstart-x-forwarding Use X11 forwarding from Docker image https://github.com/run-ai/docs/tree/master/quickstart/x-forwarding runai.jfrog.io/demo/pycharm-demo Image used for tool integration (PyCharm and VSCode) https://github.com/run-ai/docs/tree/master/quickstart/python%2Bssh runai.jfrog.io/demo/example-triton-client and runai.jfrog.io/demo/example-triton-server Basic Inference https://github.com/run-ai/models/tree/main/models/triton"},{"location":"home/overview/#contributing-to-the-documentation","title":"Contributing to the documentation","text":"

                                  This documentation is made better by individuals from our customer and partner community. If you see something worth fixing, please comment at the bottom of the page or create a pull request via GitHub. The public GitHub repository can be found on the top-right of this page.

                                  "},{"location":"home/saas-updates/","title":"What's New for the Run:ai SaaS Platform","text":"

                                  This What's New page provides transparency into the latest changes and improvements to Run:ai\u2019s SaaS platform. The updates include new features, optimizations, and fixes aimed at improving performance and user experience.

                                  Latest GA release notes (https://docs.run.ai/latest/home/whats-new-2-20/)

                                  "},{"location":"home/saas-updates/#gradual-rollout","title":"Gradual Rollout","text":"

                                  SaaS features are gradually rolled out to customers over the course of a week to ensure a smooth transition and minimize any potential disruption.

                                  "},{"location":"home/saas-updates/#february-release","title":"February Release","text":""},{"location":"home/saas-updates/#product-enhancements","title":"Product Enhancements","text":"
                                  • Workload Events API, /api/v1/workloads/{workloadId}/events, now supports the sort order parameter (asc, desc).
                                  • MIG profile and MIG options are now marked as deprecated in CLI v2, following the deprecation notice in the last version.
                                  • As part of inference support in CLI v2, Knative readiness is now validated on submit requests.
                                  • Improved permission error messaging when attempting to delete a user with higher privileges.
                                  • Improved visibility of metrics in the Resources utilization widget by repositioning them above the graphs.
                                  • Added a new Idle workloads table widget to help users easily identify and manage underutilized resources.
                                  • Renamed and updated the \"Workloads by type\" widget to provide clearer insights into cluster usage with a focus on workloads.
                                  • Improved user experience by moving the date picker to a dedicated section within the overtime widgets, Resources allocation and Resources utilization.
                                  • Simplified configuration by enabling auto-creation of storage class for discovered storage classes.
                                  • Enhanced PVC underlying storage configuration by specifying allowed context for the selected storage (Workload Volume, PVC, both, or neither).
                                  • Added configurable grace period for workload preemption in CLI v2.
                                  "},{"location":"home/saas-updates/#resolved-bugs","title":"Resolved Bugs","text":"ID Description RUN-24838 Fixed an issue where an environment asset could not be created if it included an environment variable with no value specified. RUN-25031 Fixed an issue in the Templates form where existing credentials in the environment variables section were not displayed. RUN-25303 Fixed an issue where submitting with the --attach flag was supported only in a workspace workload. RUN-24354 Fixed an issue where migrating workloads failed due to slow network connection. RUN-25220 CLI v2: Changed --image flag from a required field to an optional one. RUN-25290 Fixed a security vulnerability in golang.org/x/net v0.33.0 with CVE-2024-45338 with severity HIGH. RUN-24688 Fixed an issue that blocked the Create Template submission due to a server error. This occurred when using the Copy & Edit Template form."},{"location":"home/whats-new-2-13/","title":"Run:ai version 2.13","text":""},{"location":"home/whats-new-2-13/#version-2137","title":"Version 2.13.7","text":""},{"location":"home/whats-new-2-13/#release-date","title":"Release date","text":"

                                  July 2023

                                  "},{"location":"home/whats-new-2-13/#release-content","title":"Release content","text":"
                                  • Added filters to the historic quota ratio widget on the Quota management dashboard.
                                  "},{"location":"home/whats-new-2-13/#fixed-issues","title":"Fixed issues","text":"Internal ID Description RUN-11080 Fixed an issue in OpenShift environments where log in via SSO with the kubeadmin user, gets blank pages for every page. RUN-11119 Fixed an issue where values that should be the Order of priority column are in the wrong column. RUN-11120 Fixed an issue where the Projects table does not show correct metrics when Run:ai version 2.13 is paired with a Run:ai 2.8 cluster. RUN-11121 Fixed an issue where the wrong over quota memory alert is shown in the Quota management pane in project edit form. RUN-11272 Fixed an issue in OpenShift environments where the selection in the cluster drop down in the main UI does not match the cluster selected on the login page."},{"location":"home/whats-new-2-13/#version-2134","title":"Version 2.13.4","text":""},{"location":"home/whats-new-2-13/#release-date_1","title":"Release date","text":"

                                  July 2023

                                  "},{"location":"home/whats-new-2-13/#fixed-issues_1","title":"Fixed issues","text":"Internal ID Description RUN-11089 Fixed an issue when creating an environment, commands in the Runtime settings pane and are not persistent and cannot be found in other assets (for example in a new Training)."},{"location":"home/whats-new-2-13/#version-2131","title":"Version 2.13.1","text":""},{"location":"home/whats-new-2-13/#release-date_2","title":"Release date","text":"

                                  July 2023

                                  "},{"location":"home/whats-new-2-13/#release-content_1","title":"Release content","text":"
                                  • Made an improvement so that labels that are no longer in use are deleted.
                                  "},{"location":"home/whats-new-2-13/#fixed-issues_2","title":"Fixed issues","text":"

                                  N/A

                                  "},{"location":"home/whats-new-2-13/#version-2130","title":"Version 2.13.0","text":""},{"location":"home/whats-new-2-13/#release-content_2","title":"Release content","text":"

                                  This version contains features and fixes from previous versions starting with 2.9. Refer to the prior versions for specific features and fixes.

                                  Projects

                                  • Improved the Projects UI for ease of use. The Projects view follows UI upgrades and changes designed to make setting up components and assets easier for administrators and researchers. To configure a project, see Projects.

                                  Dashboards

                                  • Added a new dashboard for Quota management, which provides an efficient means to monitor and manage resource utilization within the AI cluster. The dashboard filters the display of resource quotas based on Departments, Projects, and Node pools. For more information, see Quota management dashboard.

                                  • Added to the Overview dashboard, the ability to filter the cluster by one or more node pools. For more information, see Node pools.

                                  Nodes and Node pools

                                  • The Run:ai scheduler supports two scheduling strategies: Bin Packing (default) and Spread. For more information, see Scheduling strategies. You can configure the scheduling strategy at the node pool level to improve the support of clusters with mixed types of resources and workloads. For configuration information, see Creating new node pools.

                                  • GPU device-level DCGM metrics are collected per GPU and presented by Run:ai in the Nodes table. Each node contains a list of its embedded GPUs with their respective DCGM metrics. See DCGM Metrics for the list of metrics which are provided by NVIDIA DCGM and collected by Run:ai. Contact your Run:ai customer representative to enable this feature.

                                  • Added per node pool over-quota priority. Over-quota priority sets the relative amount of additional unused resources that an asset can get above its current quota. For more information, see Over-quota priority.
                                  • Added support for associating workspaces with node pools. The association between workspaces and node pools is done using the Compute resources section. To associate a compute resource with a node pool, in the Compute resource section, press More settings. Press Add new to add more node pools to the configuration. Drag and drop the node pools to set their priority.
                                  • Added Node pool selection as part of the workload submission form. This allows researchers to quickly determine the list of node pools available and their priority. Priority is set by dragging and dropping them in the desired order of priority. In addition, when the node pool priority list is locked by a policy, the list isn't editable by the Researcher even if the workspace is created from a template or copied from another workspace.

                                  Time limit duration

                                  • Improved the behavior of any workload time limit (for example, Idle time limit) so that the time limit will affect existing workloads that were created before the time limit was configured. This is an optional feature which provides help in handling situations where researchers leave sessions open even when they do not need to access the resources. For more information, see Limit duration of interactive training jobs.

                                  • Improved workspaces time limits. Workspaces that reach a time limit will now transition to a state of stopped so that they can be reactivated later.

                                  • Added time limits for training jobs per project. Administrators (Department Admin, Editor) can limit the duration of Run:ai Training jobs per Project using a specified time limit value. This capability can assist administrators to limit the duration and resources consumed over time by training jobs in specific projects. Each training job that reaches this duration will be terminated.

                                  Workload assets

                                  • Extended the collaboration functionality for any workload asset such as Environment, Compute resource, and some Data source types. These assets are now shared with Departments in the organization in addition to being shared with specific projects, or the entire cluster.
                                  • Added a search box for card galleries in any asset based workload creation form to provide an easy way to search for assets and resources. To filter use the asset name or one of the field values of the card.

                                  PVC data sources

                                  • Added support for PVC block storage in the New data source form. In the New data source form for a new PVC data source, in the Volume mode field, select from Filesystem or Block. For more information, see Create a PVC data source.

                                  Credentials

                                  • Added Docker registry to the Credentials menu. Users can create docker credentials for use in specific projects for image pulling. To configure credentials, see Configuring credentials.

                                  Policies

                                  • Improved policy support by adding DEFAULTS in the items section in the policy. The DEFAULTS section sets the default behavior for items declared in this section. For example, this can be used to limit the submission of workloads only to existing PVCs. For more information and an example, see Policies, Complex values.
                                  • Added support for making a PVC data source available to all projects. In the New data source form, when creating a new PVC data source, select All from the Project pane.

                                  Researcher API

                                  • Extended researcher's API to allow stopping and starting of workloads using the API. For more information, see Submitting Workloads via HTTP/REST.

                                  Integrations

                                  • Added support for Spark and Elastic jobs. For more information, see Running Spark jobs with Run:ai.
                                  • Added support for Ray jobs. Ray is an open-source unified framework for scaling AI and Python applications. For more information, see Integrate Run:ai with Ray.

                                  • Added integration with Weights & Biases Sweep to allow data scientists to submit hyperparameter optimization workloads directly from the Run:ai UI. To configure sweep, see Sweep configuration.

                                  • Added support for XGBoost. XGBoost, which stands for Extreme Gradient Boosting, is a scalable, distributed gradient-boosted decision tree (GBDT) machine learning library. It provides parallel tree boosting and is the leading machine learning library for regression, classification, and ranking problems. For more information, see runai submit-dist xgboost

                                  Compatibility

                                  • Added support for multiple OpenShift clusters. For configuration information, see Installing additional Clusters.
                                  "},{"location":"home/whats-new-2-13/#installation","title":"Installation","text":"
                                  • The manual process of upgrading Kubernetes CRDs is no longer needed when upgrading to the most recent version (2.13) of Run:ai.
                                  • From Run:ai 2.12 and above, the control-plane installation has been simplified and no longer requires the creation of a backend values file. Instead, install directly using helm as described in Install the Run:ai Control Plane.
                                  • From Run:ai 2.12 and above, the air-gapped, control-plane installation now generates a custom-env.yaml values file during the preparation stage. This is used when installing the control-plane.
                                  "},{"location":"home/whats-new-2-13/#known-issues","title":"Known issues","text":"Internal ID Description RUN-11005 Incorrect error messages when trying to run runai CLI commands in an OpenShift environment. RUN-11009 Incorrect error message when a user without permissions to tries to delete another user."},{"location":"home/whats-new-2-13/#fixed-issues_3","title":"Fixed issues","text":"Internal ID Description RUN-9039 Fixed an issue where in the new job screen, after toggling off the preemptible flag, and a job is submitted, the job still shows as preemptible. RUN-9323 Fixed an issue with a non-scaleable error message when scheduling hundreds of nodes is not successful. RUN-9324 Fixed an issue where the scheduler did not take into consideration the amount of storage so there is no explanation that pvc is not ready. RUN-9902 Fixed an issue in OpenShift environments, where there are no metrics in the dashboard because Prometheus doesn\u2019t have permissions to monitor the runai namespace after an installation or upgrade to 2.9. RUN-9920 Fixed an issue where the canEdit key in a policy is not validated properly for itemized fields when configuring an interactive policy. RUN-10052 Fixed an issue when loading a new job from a template gives an error until there are changes made on the form. RUN-10053 Fixed an issue where the Node pool column is unsearchable in the job list. RUN-10422 Fixed an issue where node details show running workloads that were actually finished (successfully/failed/etc.). RUN-10500 Fixed an issue where jobs are shown as running even though they don't exist in the cluster. RUN-10813 Fixed an issue in adding a data source where the path is case sensitive and didn't allow uppercase."},{"location":"home/whats-new-2-15/","title":"What's New 2.15 - December 3, 2023","text":""},{"location":"home/whats-new-2-15/#release-content","title":"Release Content","text":""},{"location":"home/whats-new-2-15/#researcher","title":"Researcher","text":""},{"location":"home/whats-new-2-15/#jobs-workloads-trainings-and-workspaces","title":"Jobs, Workloads, Trainings, and Workspaces","text":"
                                  • Added support to run distributed workloads via the training view in the UI. You can configure distributed training on the following:

                                    • Trainings form
                                    • Environments form

                                    You can select single or multi-node (distributed) training. When configuring distributed training, you will need to select a framework from the list. Supported frameworks now include:

                                    • PyTorch
                                  • TensorFlow
                                    • XGBoost
                                    • MPI

                                    For Trainings configuration, see Adding trainings. See your Run:ai representative to enable this feature. For Environments configuration, see Creating an Environment.

                                  • Preview the new Workloads view. Workloads is a new view for jobs that are running in the AI cluster. The Workloads view provides a more advanced UI than the previous Jobs UI. The new table format provides:

                                    • Improved views of the data
                                    • Improved filters and search
                                    • More information

                                  Use the toggle at the top of the Jobs page to switch to the Workloads view.

                                  • Improved support for Kubeflow Notebooks. Run:ai now supports the scheduling of Kubeflow notebooks with fractional GPUs. Kubeflow notebooks are identified automatically and appear with a dedicated icon in the Jobs UI.

                                  • Improved the Trainings and Workspaces forms. Now the runtime field for Command and Arguments can be edited directly in the new Workspace or Training creation form.
                                  • Added new functionality to the Run:ai CLI that allows submitting a workload with multiple service types at the same time in a CSV style format. Both the CLI and the UI now offer the same functionality. For more information, see runai submit.
                                  • Improved functionality in the runai submit command so that the port for the container is specified using the nodeport flag. For more information, see runai submit --service-type nodeport.
                                  "},{"location":"home/whats-new-2-15/#credentials","title":"Credentials","text":"
                                  • Improved Credentials creation. A Run:ai scope can now be added to credentials. For more information, see Credentials.
                                  "},{"location":"home/whats-new-2-15/#environments","title":"Environments","text":"
                                  • Added support for workload types when creating a new or editing existing environments. Select from single-node or multi-node (distributed) workloads. The environment is available only on feature forms which are relevant to the workload type selected.
                                  "},{"location":"home/whats-new-2-15/#volumes-and-storage","title":"Volumes and Storage","text":"
                                  • Added support for Ephemeral volumes in Workspaces. Ephemeral storage is temporary storage that gets wiped out and lost when the workspace is deleted. Adding Ephemeral storage to a workspace ties that storage to the lifecycle of the Workspace to which it was added. Ephemeral storage is added to the Workspace configuration form in the Volume pane. For configuration information, see Create a new workspace.
                                  "},{"location":"home/whats-new-2-15/#templates","title":"Templates","text":"
                                  • Added support for a Run:ai Scope in the template form. For configuration information, see Creating templates.
                                  "},{"location":"home/whats-new-2-15/#deployments","title":"Deployments","text":"
                                  • Improvements in the New Deployment form include:
                                    • Support for Tolerations. Tolerations guide the system to which node each pod can be scheduled to or evicted by matching between rules and taints defined for each Kubernetes node.
                                    • Support for Multi-Process Service (MPS). MPS is a service which allows the running of parallel processes on the same GPU, which are all run by the same userid. To enable MPS support, use the toggle switch on the Deployments form.

                                    Note

                                    If you do not use the same userid, the processes will run in serial and could possibly degrade performance.

                                  "},{"location":"home/whats-new-2-15/#auto-delete-jobs","title":"Auto Delete Jobs","text":"
                                  • Added new functionality to the UI and CLI that provides configuration options to automatically delete jobs after a specified amount of time upon completion. Auto-deletion provides more efficient use of resources and makes it easier for researchers to manage their jobs. For more configuration options in the UI, see Auto deletion (Step 9) in Create a new workspace. For more information on the CLI flag, see --auto-deletion-time-after-completion.
                                  "},{"location":"home/whats-new-2-15/#runai-administrator","title":"Run:ai Administrator","text":""},{"location":"home/whats-new-2-15/#authorization","title":"Authorization","text":"
                                  • Run:ai has now revised and updated the Role Based Access Control (RBAC) mechanism, expanding the scope of Kubernetes. Using the new RBAC mechanism makes it easier for administrators to manage access policies across multiple clusters and to define specific access rules over specific scopes for specific users and groups. Along with the revised RBAC mechanism, new user interface views are introduced to support the management of users, groups, and access rules. For more information, see Role based access control.
                                  "},{"location":"home/whats-new-2-15/#policies","title":"Policies","text":"
                                  • During Workspaces and Training creation, assets that do not comply with policies cannot be selected. These assets are greyed out and have a button on the cards when the item does not comply with a configured policy. The button displays information about which policies are non-compliant.
                                  • Added configuration options to Policies in order to prevent the submission of workloads that use data sources of type host path. This prevents data from being stored on the node, so that data is not lost when a node is deleted. For configuration information, see Prevent Data Storage on the Node.
                                  • Improved flexibility when creating policies by providing the ability to set minimum and maximum values for CPU and GPU memory. For configuration information, see GPU and CPU memory limits in Configuring policies.
                                  "},{"location":"home/whats-new-2-15/#nodes-and-node-pools","title":"Nodes and Node Pools","text":"
                                  • Node pools are now enabled by default. There is no need to enable the feature in the settings.
                                  "},{"location":"home/whats-new-2-15/#quotas-and-over-quota","title":"Quotas and Over-Quota","text":"
                                  • Improved control over how over-quota is managed by adding the ability to block over-subscription of the quota in Projects or Departments. For more information, see Limit Over-Quota.
                                  • Improved the scheduler fairness for departments using the over quota priority switch (in Settings). When the feature flag is disabled, over-quota weights are equal to the deserved quota and any excess resources are divided in the same proportion as the in-quota resources. For more information, see Over Quota Priority.
                                  • Added new functionality to always guarantee in-quota workloads at the expense of inter-Department fairness. Large distributed workloads from one department may preempt in-quota smaller workloads from another department. This new setting in the RunaiConfig file preserves in-quota workloads, even if the department quota or over-quota-fairness is not preserved. For more information, see Scheduler Fairness.
                                  "},{"location":"home/whats-new-2-15/#control-and-visibility","title":"Control and Visibility","text":""},{"location":"home/whats-new-2-15/#dashboards","title":"Dashboards","text":"
                                  • To ease the management of AI CPU and cluster resources, a new CPU-focused dashboard was added for CPU-based environments. The dashboards display specific information for CPU-based nodes, node pools, clusters, or tenants. These dashboards also include additional metrics that are specific to CPU-based environments. This helps optimize the visual information by eliminating views of empty GPU dashlets. For more information, see CPU Dashboard.
                                  • Improved the Consumption report interface by moving the Cost settings to the General settings menu.
                                  • Added table to the Consumption dashboard that displays the consumption and cost per department. For more information, see Consumption dashboard.
                                  "},{"location":"home/whats-new-2-15/#nodes","title":"Nodes","text":"
                                  • Improved the readability of the Nodes table to include more detailed statuses and descriptions. The added information in the table makes it easier to inspect issues that may impact resource availability in the cluster. For more information, see Node and Node Pool Status.
                                  "},{"location":"home/whats-new-2-15/#ui-enhancements","title":"UI Enhancements","text":"
                                  • Added the ability to download a CSV file from any page that contains a table. Downloading a CSV provides a snapshot of the page's history over time, and helps with compliance tracking. All the columns that are selected (displayed) in the table are downloaded to the file.
                                  "},{"location":"home/whats-new-2-15/#installation-and-configuration","title":"Installation and Configuration","text":""},{"location":"home/whats-new-2-15/#cluster-installation-and-configuration","title":"Cluster Installation and configuration","text":"
                                  • New cluster wizard for adding and installing new clusters to your system.
                                  "},{"location":"home/whats-new-2-15/#openshift-support","title":"OpenShift Support","text":"
• Added support for the restricted policy for Pod Security Admission (PSA) on OpenShift only. For more information, see Pod security admission.
• Added the ability, in OpenShift environments, to configure a dedicated certificate for the cluster routes created by Run:ai instead of using the OpenShift certificate. For more information, see the table entry Dedicated certificate for the researcher service route.
                                  "},{"location":"home/whats-new-2-16/","title":"Version 2.16","text":""},{"location":"home/whats-new-2-16/#release-content-january-25-2024","title":"Release Content - January 25, 2024","text":""},{"location":"home/whats-new-2-16/#researcher","title":"Researcher","text":"
• Added enterprise-level security for researcher tools such as Jupyter Notebooks, VSCode, or any other URL associated with the workload. Using this feature, anyone within the organization requesting access to a specific URL is redirected to the login page to be authenticated and authorized. This results in protected URLs which cannot be reached from outside the organization. Researchers can enhance URL privacy by using the Private toggle, which means that only the researcher who created the workload is authorized to access it. The Private toggle is available per tool that uses an external URL as a connection type and is located in the workload creation form in the UI, in the environment section. This toggle sets an isPrivate flag in the connections section of a policy for the connection type ExternalUrl. For more information, see Creating a new Workspace.
                                  "},{"location":"home/whats-new-2-16/#jobs-workloads-and-workspaces","title":"Jobs, Workloads, and Workspaces","text":"
• Added the capability to view and edit policies directly in the project submission form. Pressing Policy opens a window that displays the effective policy. For more information, see Viewing Project Policies.
                                  • Running machine learning workloads effectively on Kubernetes can be difficult, but Run:ai makes it easy. The new Workloads experience introduces a simpler and more efficient way to manage machine learning workloads, which will appeal to data scientists and engineers alike. The Workloads experience provides a fast, reliable, and easy to use unified interface.

                                    • Fast-query of data from the new workloads service.
                                    • Reliable data retrieval and presentation in the CLI, UI, and API.
                                    • Easy to use single unified view with all workload types in one place.

                                    For more information, see Workloads Overview.

• Changed the workload default auto-deletion time after completion from Never to 90 days. This ensures that environments are cleaned of old data. The field is editable by default, allowing researchers to change the value while submitting a workload. Using workload policies, administrators can increase or decrease the value, set the default back to never, or lock access to this value so researchers cannot edit it when they submit workloads.

                                  "},{"location":"home/whats-new-2-16/#assets","title":"Assets","text":"
                                  • When creating an asset such as data sources, credentials, or others, the scope is limited to the cluster selected at the top of the UI.
                                  "},{"location":"home/whats-new-2-16/#runai-administrator","title":"Run:ai Administrator","text":"
                                  • Added the capability for administrators to configure messages to users when they log into the platform. Messages are configured using the Message Editor screen. For more information, see Administrator Messages.
                                  "},{"location":"home/whats-new-2-16/#monitoring-and-analytics","title":"Monitoring and Analytics","text":"
• Added updated GPU and CPU resource availability information to the dashboard.

• Added a chart displaying the number of free GPUs per node. Free GPUs are GPUs that have not been allocated to a workload.
                                    • Added a dashlet that displays the total vs. ready resources for GPUs and CPUs. The dashlet indicates how many total nodes are in the platform, and how many are available.
                                  • Added additional columns to the consumption report for both Projects and Departments tables. The new columns are:

• GPU Idle allocated hours\u2014the portion of the total allocation hours during which the GPUs were idle.
                                    • CPU usage hours\u2014the actual usage time of CPU.
                                    • Memory usage time\u2014the actual usage time of CPU memory.

                                    For more information, see Consumption Dashboard.

                                  "},{"location":"home/whats-new-2-16/#authentication-and-authorization","title":"Authentication and Authorization","text":"
                                  • SSO users who have logged into the system will now be visible in the Users table. In addition, added a column to the Users table for the type of user that was created (Local or SSO). For more information, see Adding, Updating, and Deleting Users.
                                  "},{"location":"home/whats-new-2-16/#policies","title":"Policies","text":"
• Added a new Policy Manager. The new Policy Manager provides administrators the ability to impose restrictions and default values on system resources, and includes a YAML editor for configuring the policies. Administrators can easily add both Workspace and Training policies. The editor makes it easy to see the configuration that has been applied and provides a quick and easy way to edit the policies. The new Policy Manager also brings other important policy features, such as the ability to see non-compliant resources in workloads. For more information, see Policies.

• Enabling the new Policy Manager provides new tools to discover how resources are not compliant. Non-compliant resources appear greyed out and cannot be selected. To see how a resource is not compliant, press the clipboard icon in the upper right-hand corner of the resource. Policies can also be applied to specific scopes within the Run:ai platform. For more information, see Viewing Project Policies.

                                  "},{"location":"home/whats-new-2-16/#control-and-visibility","title":"Control and Visibility","text":"
• Improved the clarity of the status column in the Clusters view. Users now have more insight into the actual status of Run:ai on the cluster, and can see extended details about the state of the Run:ai installation and services on the cluster, and its connectivity state. For more information, see Cluster status.
                                  "},{"location":"home/whats-new-2-16/#deprecation-notifications","title":"Deprecation Notifications","text":"

                                  Deprecation notifications allow you to plan for future changes in the Run:ai Platform. Deprecated features will be available for two versions ahead of the notification. For questions, see your Run:ai representative.

                                  "},{"location":"home/whats-new-2-16/#project-migration","title":"Project migration","text":"
                                  • Run:ai will be deprecating the migration of projects between departments. This affects:

• API\u2014the departmentId field will be marked as deprecated in the PUT endpoint in the projects category.
                                    • User Interface\u2014there will no longer be an option to:
                                      • migrate projects to another department, when deleting departments.
                                      • change departments, when editing a project.
                                  "},{"location":"home/whats-new-2-16/#api-deprecations","title":"API deprecations","text":""},{"location":"home/whats-new-2-16/#removed-apis-and-api-fields-completed-deprecation","title":"Removed APIs and API fields (completed deprecation)","text":"

The following API endpoints and fields have completed their deprecation process and are changed as follows:

| Endpoint | Change |
|---|---|
| /v1/k8s/clusters | The endpoint was removed and is replaced by /api/v1/clusters |
| /v1/k8s/clusters/{uuid} | The endpoint was removed and is replaced by /api/v1/clusters/{uuid} |
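As a hedged illustration of the replacement endpoint, the sketch below obtains an API token and then lists clusters. The /api/v1/clusters and /api/v1/token paths appear in these notes; the token request body fields (grantType, AppId, AppSecret), the accessToken response field, and the tenant URL are assumptions to verify against the Run:ai API reference for your version.

```python
# Minimal sketch, not an official sample: authenticate with an application
# token and call the replacement /api/v1/clusters endpoint. The token request
# body fields and response shapes are assumptions; verify them against the
# Run:ai API reference for your version.
import requests

BASE_URL = "https://my-company.run.ai"   # hypothetical tenant URL

token_resp = requests.post(
    f"{BASE_URL}/api/v1/token",
    json={
        "grantType": "app_token",        # assumed field names
        "AppId": "<application-id>",
        "AppSecret": "<application-secret>",
    },
    timeout=30,
)
token_resp.raise_for_status()
token = token_resp.json()["accessToken"]  # assumed response field

clusters_resp = requests.get(
    f"{BASE_URL}/api/v1/clusters",
    headers={"Authorization": f"Bearer {token}"},
    timeout=30,
)
clusters_resp.raise_for_status()
for cluster in clusters_resp.json():
    # Each entry is expected to describe one cluster; field names may vary.
    print(cluster.get("name"), cluster.get("uuid"))
```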

                                  Run:ai does not recommend using API endpoints and fields marked as deprecated and will not add functionality to them. Once an API endpoint or field is marked as deprecated, Run:ai will stop supporting it after 2 major releases for self-hosted deployments, and after 6 months for SaaS deployments.

For a full explanation of the API Deprecation policy, see the Run:ai API Policy.

                                  "},{"location":"home/whats-new-2-17/","title":"Version 2.17","text":""},{"location":"home/whats-new-2-17/#release-content-april-14-2024","title":"Release Content - April 14, 2024","text":"
                                  • Deprecation notifications
                                  • Breaking changes
                                  "},{"location":"home/whats-new-2-17/#researcher","title":"Researcher","text":""},{"location":"home/whats-new-2-17/#scheduler","title":"Scheduler","text":"
• Added functionality to configure over-provisioning ratios for node pools running any kind of workload. Over-provisioning assumes that workloads are either under-utilizing or only intermittently using GPUs, meaning that real utilization is lower than the GPU allocation requested. Over-provisioning allows the administrator to condense more workloads onto a single GPU than the requested allocations would otherwise allow. For more information, see Optimize performance with Node Level Scheduler.

                                  • Added the GPU Resource Optimization feature to the UI. Now you can enable and configure GPU Portion (Fraction) limit and GPU Memory Limit from the UI. For more information, see Compute resources UI with Dynamic Fractions.

                                  • Added the ability to set Run:ai as the default scheduler for any project or namespace. This provides the administrator the ability to ensure that all workloads in a project or namespace are scheduled using the Run:ai scheduler. For more information, see Setting Run:ai as default scheduler.
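The supported way to make Run:ai the default scheduler for a project or namespace is described in Setting Run:ai as default scheduler. For orientation only, the sketch below shows the standard Kubernetes alternative of requesting a specific scheduler for a single pod via schedulerName, using the Kubernetes Python client; the scheduler name "runai-scheduler" and the namespace are assumptions, and the new feature removes the need for this per-pod field.

```python
# Illustrative sketch only: a single pod explicitly requesting the Run:ai
# scheduler via schedulerName. The scheduler name and namespace below are
# assumptions; the documented feature makes this the default per namespace.
from kubernetes import client, config

config.load_kube_config()  # requires kubectl access to the cluster

pod = client.V1Pod(
    metadata=client.V1ObjectMeta(name="demo-pod", namespace="my-project"),
    spec=client.V1PodSpec(
        scheduler_name="runai-scheduler",  # assumed scheduler name
        containers=[
            client.V1Container(
                name="main",
                image="python:3.11-slim",
                command=["sleep", "3600"],
            )
        ],
    ),
)
client.CoreV1Api().create_namespaced_pod(namespace="my-project", body=pod)
```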

                                  "},{"location":"home/whats-new-2-17/#jobs-workloads-and-workspaces","title":"Jobs, Workloads, and Workspaces","text":"
• Added to the workload details view the ability to filter by pod. You can now filter metrics and logs per pod or across all pods. The Workloads table also has additional columns, including connections and preemptibility, adding more at-a-glance information about the workload. In addition, using the Copy & edit button, you can submit a new workload via the CLI based on the selected workload. For more information, see Workloads.

                                  • Added Inference to workload types. Inference workloads can now be created and managed from the unified Workloads table. The Deployments workload type has been deprecated, and replaced with Inference workloads which are submitted using the workload form. For more information, see Inference and for submitting an Inference workload, see Submitting workloads.

• Added a single workload submission selection. You can now submit workloads by pressing + New workloads in the Workloads table. You can submit the following workload types from this table:

                                    • Workspace
                                    • Training
                                    • Inference

This improvement phases out the previous version's Workspaces and Jobs tables. The Jobs table and submission forms have been deprecated but can be reactivated. To re-enable the Jobs table and forms, press Tools & settings, then General, then Workloads, and then toggle the Jobs view and the Jobs submission buttons. For more information, see Submitting workloads.

                                  • Added the ability to configure a Kubernetes readiness probe. The readiness probe detects resources and workloads that are ready to receive traffic.
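For context, a Kubernetes readiness probe is the standard mechanism this setting maps to. The sketch below expresses one with the Kubernetes Python client; the path, port, and timing values are placeholders rather than Run:ai defaults, and in Run:ai the probe is configured through the workload submission flow rather than written by hand.

```python
# A generic Kubernetes readiness probe, sketched with the Python client to
# show what the Run:ai setting maps to under the hood. Path, port, and
# timing values are placeholders, not Run:ai defaults.
from kubernetes import client

readiness_probe = client.V1Probe(
    http_get=client.V1HTTPGetAction(path="/healthz", port=8080),
    initial_delay_seconds=5,   # wait before the first check
    period_seconds=10,         # how often to probe
    failure_threshold=3,       # consecutive failures before "not ready"
)

container = client.V1Container(
    name="inference-server",
    image="my-registry/my-model:latest",   # hypothetical image
    readiness_probe=readiness_probe,
)
```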

                                  "},{"location":"home/whats-new-2-17/#assets","title":"Assets","text":"
• Added the capability to use a ConfigMap as a data source. The ability to use a ConfigMap as a data source can be configured in the Data sources UI, the CLI, and as part of a policy. For more information, see Setup a ConfigMap as a data source and Setup a ConfigMap as a volume using the CLI.

                                  • Added a Status column to the Credentials table, and the Data sources table. The Status column displays the state of the resource and provides troubleshooting information about that asset. For more information, see the Credentials table and the Data sources table.

• Added functionality for asset creation that validates the asset based on version compatibility of the cluster or the control plane within a specific scope. At the time of asset creation, invalid scopes appear greyed out and show a pop-up with the reason for the invalidation. This improvement is designed to increase the confidence that an asset is created properly and successfully.

                                  "},{"location":"home/whats-new-2-17/#runai-administrator","title":"Run:ai Administrator","text":""},{"location":"home/whats-new-2-17/#configuration-and-administration","title":"Configuration and Administration","text":"
                                  • Introducing a new Tools & Settings menu. The new Tools & Settings menu provides a streamlined UI for administrators to configure the Run:ai environment. The new UI is divided into categories that easily identify the areas where the administrator can change settings. The new categories include:

                                    • Analytics\u2014features related to analytics and metrics.
                                    • Resources\u2014features related to resource configuration and allocation.
                                    • Workloads\u2014features related to configuration and submission of workloads.
                                    • Security\u2014features related to configuration of SSO (Single Sign On).
                                    • Notifications\u2014used for system notifications.
                                    • Cluster authentication\u2014snippets related to Researcher authentication.

Some features are now labeled either Experimental or Legacy. Experimental features are new features in the environment that may have certain instabilities and may not perform as expected. Legacy features are features that are in the process of being deprecated and may be removed in future versions.

                                  "},{"location":"home/whats-new-2-17/#clusters","title":"Clusters","text":"
                                  • Added new columns to the Clusters table to show Kubernetes distribution and version. This helps administrators view potential compatibility issues that may arise.

• Improved the location of the cluster filter. The cluster filter has been relocated to the filter bar, and the drop-down cluster filter in the header of the page has been removed. This improvement results in the following:

                                    • Filter assets by cluster in the following tables:

                                      • Data sources
                                      • Environments
• Compute resources
                                      • Templates
                                      • Credentials
• Creating a new asset automatically displays only the scope of the selected cluster.

• Prevents the account (the topmost level in the scope) from being selected when creating assets.
• Enforces a cluster-specific scope. This increases the confidence that an asset is created properly and successfully.

                                    Note

This feature is only applicable if all the clusters are version 2.17 and above.

                                  "},{"location":"home/whats-new-2-17/#monitoring-and-analytics","title":"Monitoring and Analytics","text":"
                                  • Improved GPU Overview dashboard. This improvement provides rich and extensive GPU allocation and performance data and now has interactive tiles that provide direct links to the Nodes, Workloads, and Departments tables. Hover over tiles with graphs to show rich data in the selected time frame filter. Tiles with graphs can be downloaded as CSV files. The new dashboard is enabled by default. Use the Go back to legacy view to return to the previous dashboard style. For more information, see Dashboard analysis.

• Updated the Knative and autoscaler metrics. Run:ai currently supports the following metrics:

                                    • Throughput
                                    • Concurrency

                                    For more information, see Autoscaling metrics.

                                  • Improved availability of metrics by using Run:ai APIs. Using the API endpoints is now the preferred method to retrieve metrics for use in any application. For more information, see Metrics.

                                  "},{"location":"home/whats-new-2-17/#authentication-and-authorization","title":"Authentication and Authorization","text":"
• Added new functionality to the SAML 2.0 identity provider configuration in the Security category of the General settings. The added functionality assists with troubleshooting SSO configuration and authentication issues that may arise. Administrators now have the ability to:

                                    • View and edit the identity provider settings for SAML 2.0
                                    • Upload or download the SAML 2.0 identity provider metadata XML file.

                                  For more information, see SSO UI configuration.

                                  "},{"location":"home/whats-new-2-17/#deprecation-notifications","title":"Deprecation Notifications","text":"

                                  Deprecation notifications allow you to plan for future changes in the Run:ai Platform.

                                  "},{"location":"home/whats-new-2-17/#feature-deprecations","title":"Feature deprecations","text":"

                                  Deprecated features will be available for two versions ahead of the notification. For questions, see your Run:ai representative. The following features have been marked for deprecation:

                                  • Jobs\u2014the Jobs feature (submission form and view) has been moved to the category of Legacy. To enable them, go to Tools & Settings, General, open the Workloads pane, and then toggle the Jobs view and Job submission switch to the enabled position.
                                  • Deployments\u2014the Deployments feature has been removed. It has been replaced by Inference workloads. For more information, see Jobs, Workloads, and Workspaces above.
                                  • Workspaces view\u2014the Workspaces menu has been removed. You can now submit a Workspace workload using the + New workload form from the Workloads table.
                                  "},{"location":"home/whats-new-2-17/#api-support-and-endpoint-deprecations","title":"API support and endpoint deprecations","text":"

                                  The endpoints and parameters specified in the API reference are the ones that are officially supported by Run:ai. For more information about Run:ai's API support policy and deprecation process, see Developer overview.

                                  "},{"location":"home/whats-new-2-17/#deprecated-apis-and-api-fields","title":"Deprecated APIs and API fields","text":"

The following API endpoints and fields have been marked for deprecation:

                                  "},{"location":"home/whats-new-2-17/#jobs-and-pods-api","title":"Jobs and Pods API","text":"Deprecated Replacement /v1/k8s/clusters/{uuid}/jobs /api/v1/workloads /v1/k8s/clusters/{uuid}/jobs/count /api/v1/workloads/count /v1/k8s/clusters/{uuid}/jobs/{jobId}/pods /api/v1/workloads/{workloadId}/pods /v1/k8s/clusters/{uuid}/pods /api/v1/workloads/pods"},{"location":"home/whats-new-2-17/#clusters-api","title":"Clusters API","text":"Deprecated Replacement /v1/k8s/clusters/{clusterUuid}/metrics /api/v1/clusters/{clusterUuid}/metrics"},{"location":"home/whats-new-2-17/#authorization-and-authentication-api","title":"Authorization and Authentication API","text":"Deprecated Replacement /v1/k8s/auth/token/exchange /api/v1/token /v1/k8s/auth/oauth/tokens/refresh /api/v1/token /v1/k8s/auth/oauth/apptoken /api/v1/token /v1/k8s/users/roles /api/v1/authorization/roles /v1/k8s/users /api/v1/users /v1/k8s/users/{userId} /api/v1/users/{userId} /v1/k8s/users/{userId}/roles /api/v1/authorization/access-rules /v1/k8s/apps /api/v1/apps /v1/k8s/apps/{clientId} /api/v1/apps/{appId} /v1/k8s/groups /api/v1/authorization/access-rules /v1/k8s/groups/{groupName} /api/v1/authorization/access-rules /v1/k8s/clusters/{clusterId}/departments/{department-id}/access-control /api/v1/authorization/access-rules /api/v1/authorization/access-rules - subjectIdFilter field Use filterBy / sortBy fields /api/v1/authorization/access-rules - scopeType field Use filterBy / sortBy fields /api/v1/authorization/access-rules - roleId field Use filterBy / sortBy fields"},{"location":"home/whats-new-2-17/#projects-api","title":"Projects API","text":"Deprecated Replacement /v1/k8s/clusters/{clusterId}/projects - permissions field /api/v1/authorization/access-rules /v1/k8s/clusters/{clusterId}/projects - resources field Use nodePoolResources field /v1/k8s/clusters/{clusterId}/projects - deservedGpus field Use nodePoolResources field /v1/k8s/clusters/{clusterId}/projects - maxAllowedGpus field Use nodePoolResources field /v1/k8s/clusters/{clusterId}/projects - gpuOverQuotaWeight field Use nodePoolResources field"},{"location":"home/whats-new-2-17/#departments-api","title":"Departments API","text":"Deprecated Replacement /v1/k8s/clusters/{clusterId}/departments - resources field Use nodePoolResources field /v1/k8s/clusters/{clusterId}/departments - deservedGpus field Use nodePoolResources field /v1/k8s/clusters/{clusterId}/departments - allowOverQuota field Use nodePoolResources field /v1/k8s/clusters/{clusterId}/departments - maxAllowedGpus field Use nodePoolResources field"},{"location":"home/whats-new-2-17/#policy-api","title":"Policy API","text":"Deprecated Replacement /api/v1/policy/workspace /api/v2/policy/workspaces /api/v1/policy/training /api/v2/policy/trainings"},{"location":"home/whats-new-2-17/#logo-api","title":"Logo API","text":"Deprecated Replacement /v1/k8s/tenant/{tenantId}/logo /api/v1/logo"},{"location":"home/whats-new-2-17/#removed-apis-and-api-fields-completed-deprecation","title":"Removed APIs and API fields (completed deprecation)","text":"

The following API endpoints and fields have completed their deprecation process and are changed as follows:

                                  "},{"location":"home/whats-new-2-17/#assets-api","title":"Assets API","text":"Endpoint Change /api/v1/asset/compute gpuRequest field was removed and is replaced by the following fields: * gpuDevicesRequest (New and mandatory) * gpuRequestType (New and mandatory if gpuDevicesRequest=1 otherwise optional for values 0 or greater than 1) * gpuPortion was changed to gpuPortionRequest and accepts values between 0 and 1 (for example 0.75) * gpuPortionLimit (New and optional) * gpuMemory was changed to gpuMemoryRequest * gpuMemoryLimit (New and optional)"},{"location":"home/whats-new-2-17/#metrics-deprecations","title":"Metrics deprecations","text":"

                                  The following metrics are deprecated and replaced by API endpoints. For details about the replacement APIs, see Changed Metrics:

• runai_active_job_cpu_requested_cores
• runai_active_job_memory_requested_bytes
• runai_cluster_cpu_utilization
• runai_cluster_memory_utilization
• runai_gpu_utilization_per_pod_per_gpu
• runai_gpu_utilization_per_workload
• runai_job_requested_gpu_memory
• runai_gpu_memory_used_mebibytes_per_workload
• runai_gpu_memory_used_mebibytes_per_pod_per_gpu
• runai_active_job_cpu_limits
• runai_job_cpu_usage
• runai_active_job_memory_limits
• runai_job_memory_used_bytes

                                  Run:ai does not recommend using API endpoints and fields marked as deprecated and will not add functionality to them. Once an API endpoint or field is marked as deprecated, Run:ai will stop supporting it after 2 major releases for self-hosted deployments, and after 6 months for SaaS deployments.

For a full explanation of the API Deprecation policy, see the Run:ai API Policy.

                                  "},{"location":"home/whats-new-2-17/#breaking-changes","title":"Breaking changes","text":"

Breaking change notifications allow you to plan around potential changes that may interfere with your current workflow when interfacing with the Run:ai Platform.

                                  "},{"location":"home/whats-new-2-17/#metrics","title":"Metrics","text":"

                                  Be aware that some names of metrics have been changed. For more information, see Changed Metrics.

                                  "},{"location":"home/whats-new-2-18/","title":"Version 2.18","text":""},{"location":"home/whats-new-2-18/#release-content-june-30-2024","title":"Release Content - June 30, 2024","text":"
                                  • Deprecation notifications
                                  • Breaking changes
                                  "},{"location":"home/whats-new-2-18/#researcher","title":"Researcher","text":""},{"location":"home/whats-new-2-18/#jobs-workloads-and-workspaces","title":"Jobs, Workloads, and Workspaces","text":"
• Added backoff limit functionality for Training and Workspace workloads to the UI. The backoff limit is the maximum number of retry attempts for failed workloads. After reaching the limit, the workload's status changes to Failed. The UI displays the default number of retries based on 6 attempts for each pod in the workload (for example, 6 pods = 36 attempts).

• Updated the Auto-deletion time default value from never to 30 days. The Auto-deletion time count starts when a Run:ai workload reaches a Completed or Failed status; once the time elapses, the workload (including its logs) is automatically deleted. This change only affects new or cloned workloads.

                                  • Added new Data sources of type Secret to workload form. Data sources of type Secret are used to hide 3rd party access credentials when submitting workloads. For more information, see Submitting Workloads.

                                  • Added new graphs for Inference workloads. The new graphs provide more information for Inference workloads to help analyze performance of the workloads. New graphs include Latency, Throughput, and number of replicas. For more information, see Workloads View. (Requires minimum cluster version v2.18).

• Added a latency metric for autoscaling. This feature allows automatic scaling up or down of the number of replicas of a Run:ai inference workload based on the threshold set by the ML Engineer. This ensures that response time is kept under the target SLA. (Requires minimum cluster version v2.18).

• Improved autoscaling for inference models by removing the ChatBot UI from model images. By moving the ChatBot UI to predefined Environments, autoscaling is more accurate because it takes into account all types of requests (API and ChatBot UI). A ChatBot UI environment preset provided by Run:ai allows AI practitioners to easily connect it to workloads.

• Added more precise control for triggering auto-scaling to zero. Users can now configure a precise consecutive idle threshold custom setting to trigger Run:ai inference workloads to scale to zero. (Requires minimum cluster version v2.18).

                                  • Added Hugging Face catalog integration of community models. Run:ai has added Hugging Face integration directly to the inference workload form, providing the ability to select models (vLLM models) from Hugging Face. This allows organizations to quickly experiment with the latest open source community language models. For more information on how Hugging Face is integrated, see Hugging Face.

                                  • Improved access permissions to external tools. This improvement now allows more granular control over which personas can access external tools (external URLs) such as Jupyter Notebooks, Chatbot UI, and others. For configuration information, see Submitting workloads. (Requires minimum cluster version v2.18).

                                  • Added a new API for submitting Run:ai inference workloads. This API allows users to easily submit inference workloads. This new API provides a consistent user experience for workload submission which maintains data integrity across all the user interfaces in the Run:ai platform. (Requires minimum cluster version v2.18).
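As a rough illustration of the new API, the sketch below submits an inference workload with an autoscaling threshold based on the supported Concurrency metric. The endpoint path (/api/v1/workloads/inferences) and every payload field name are assumptions to check against the Run:ai API reference for your version; only the existence of a dedicated inference submission API is stated above.

```python
# Hedged sketch of submitting an inference workload through the REST API.
# The endpoint path and all payload field names below are assumptions; check
# the Run:ai API reference for the exact schema in your version.
import requests

BASE_URL = "https://my-company.run.ai"   # hypothetical tenant URL
TOKEN = "<bearer-token>"

payload = {
    "name": "llm-serving-demo",                      # hypothetical name
    "projectId": "<project-id>",
    "clusterId": "<cluster-uuid>",
    "spec": {
        "image": "my-registry/vllm-model:latest",    # hypothetical image
        "autoscaling": {                             # assumed field names
            "metric": "concurrency",
            "metricThreshold": 4,
            "minReplicas": 0,
            "maxReplicas": 3,
        },
    },
}

resp = requests.post(
    f"{BASE_URL}/api/v1/workloads/inferences",       # assumed endpoint path
    headers={"Authorization": f"Bearer {TOKEN}"},
    json=payload,
    timeout=30,
)
resp.raise_for_status()
print(resp.json())
```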

                                  "},{"location":"home/whats-new-2-18/#command-line-interface-v2","title":"Command Line Interface V2","text":"
                                  • Added an improved, researcher-focused Command Line Interface (CLI). The improved CLI brings usability enhancements for the Researcher which include:

                                    • Support multiple clusters
                                    • Self-upgrade
                                    • Interactive mode
                                    • Align CLI to be data consistent with UI and API
                                    • Improved usability and performance

                                    This is an early access feature available for customers to use; however, be aware that there may be functional gaps versus the older, V1 CLI. For more information about installing and using the V2 CLI, see CLI V2. (Requires minimum cluster version v2.18).

                                  "},{"location":"home/whats-new-2-18/#gpu-memory-swap","title":"GPU memory swap","text":"
• Added new GPU-to-CPU memory swap. To ensure efficient usage of an organization\u2019s resources, Run:ai provides multiple features on multiple layers to help administrators and practitioners maximize their existing GPU resource utilization. Run:ai\u2019s GPU memory swap feature helps administrators and AI practitioners further increase the utilization of existing GPU hardware by improving GPU sharing between AI initiatives and stakeholders. This is done by extending the GPU physical memory to the CPU memory, which is typically an order of magnitude larger than that of the GPU. For more information, see GPU Memory Swap. (Requires minimum cluster version v2.18).
                                  "},{"location":"home/whats-new-2-18/#yaml-workload-reference-table","title":"YAML Workload Reference table","text":"
• Added a new YAML reference document that contains the value types and workload YAML references. Each table contains the field name, its description, and the supported Run:ai workload types. The YAML field details contain information on the value type and currently available example workload snippets. For more information, see the YAML Reference PDF.
                                  "},{"location":"home/whats-new-2-18/#email-notifications-workload-status-and-timeouts","title":"Email Notifications - Workload Status and timeouts","text":"
• Added a new email notification system. AI practitioners can set up the types of workload notifications they want to receive. To receive email notifications, you must ensure that the admin has enabled and configured notifications for the tenant. For more information, see Email notifications.
                                  "},{"location":"home/whats-new-2-18/#assets","title":"Assets","text":"
• Improved the UI asset creation form by adding a Description field. Asset creators can now add a free-text description (max 250 characters) to any asset they create. The description field is intended to help explain the nature and goal of the asset, so AI practitioners can make better decisions when choosing assets during workload creation.
                                  "},{"location":"home/whats-new-2-18/#runai-administrator","title":"Run:ai Administrator","text":""},{"location":"home/whats-new-2-18/#data-sources","title":"Data Sources","text":"
• Added the new Data Volumes feature. Data Volumes are snapshots of datasets stored in Kubernetes Persistent Volume Claims (PVCs). They act as a central repository for training data and offer several key benefits.

• Managed with dedicated permissions\u2014Data Admins, a new role within Run:ai, have exclusive control over data volume creation, data population, and sharing.
                                    • Shared between multiple scopes\u2014unlike other Run:ai data sources, data volumes can be shared across projects, departments, or clusters. This promotes data reuse and collaboration within your organization.
                                    • Coupled to workloads in the submission process\u2014similar to other Run:ai data sources, Data volumes can be easily attached to AI workloads during submission, specifying the data path within the workload environment.

                                    For more information, see Data Volumes. (Requires minimum cluster version v2.18).

• Added a new data source of type Secret. Run:ai now allows you to configure a Credential as a data source. A data source of type Secret is best used in workloads so that access credentials for 3rd party interfaces and storage used in containers remain hidden. For more information, see Secrets as a data source.

• Updated the logic of the data source initializing state, which keeps the workload in \u201cinitializing\u201d status until S3 data is fully mapped. For more information, see the Sidecar containers documentation.

• Additional storage unit sizes MiB, GiB, and TiB (mebibyte, gibibyte, and tebibyte respectively) were added to the UI and API when creating a new data source of type PVC.

                                  "},{"location":"home/whats-new-2-18/#credentials","title":"Credentials","text":"
• Added a new Generic secret to Credentials. Credentials had been used only for access to data sources (S3, Git, etc.); however, AI practitioners need to use secrets to access sensitive data (interacting with 3rd party APIs, or other services) without having to put their credentials in their source code. Generic secrets leverage multiple key-value pairs, which helps reduce the number of Kubernetes resources and simplifies resource management by reducing the overhead associated with maintaining multiple Secrets. Generic secrets are best used as a data source of type Secret so that they can be used in containers to keep access credentials hidden. (Requires minimum cluster version v2.18).
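For orientation only: a Generic secret corresponds conceptually to a single Kubernetes Secret holding several key-value pairs. The plain-Kubernetes sketch below (Python client) is not the Run:ai asset flow, which goes through the Run:ai platform; the names, keys, and namespace are placeholders.

```python
# Illustration of the underlying Kubernetes object a multi key-value secret
# maps to. In Run:ai, the Generic secret credential is created through the
# platform (UI/API), not with the raw Kubernetes client; names are placeholders.
from kubernetes import client, config

config.load_kube_config()

secret = client.V1Secret(
    metadata=client.V1ObjectMeta(
        name="third-party-api-creds",
        namespace="my-project",
    ),
    type="Opaque",
    string_data={                  # multiple key-value pairs in one Secret
        "API_KEY": "<key>",
        "API_ENDPOINT": "https://api.example.com",
    },
)
client.CoreV1Api().create_namespaced_secret(namespace="my-project", body=secret)
```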
                                  "},{"location":"home/whats-new-2-18/#single-sign-on","title":"Single Sign On","text":"
                                  • Added support for Single Sign On using OpenShift v4 (OIDC based). When using OpenShift, you must first define OAuthClient which interacts with OpenShift's OAuth server to authenticate users and request access tokens. For more information, see Single Sign-On.

• Added OIDC scopes to authentication requests. OIDC scopes are used to specify what access privileges are being requested for access tokens. The scopes associated with the access tokens determine what resources are available when they are used to access OAuth 2.0 protected endpoints. Protected endpoints may perform different actions and return different information based on the scope values and other parameters used when requesting the presented access token. For more information, see UI configuration.

                                  "},{"location":"home/whats-new-2-18/#ownership-protection","title":"Ownership protection","text":"
                                  • Added new ownership protection feature. Run:ai Ownership Protection ensures that only authorized users can delete or modify workloads. This feature is designed to safeguard important jobs and configurations from accidental or unauthorized modifications by users who did not originally create the workload. For configuration information, see your Run:ai representative.
                                  "},{"location":"home/whats-new-2-18/#email-notifications","title":"Email notifications","text":"
• Added a new email notifications feature. Email notifications send alerts for critical workload life cycle changes, empowering data scientists to take necessary actions and prevent delays.

                                    • System administrators will need to configure the email notifications. For more information, see System notifications.
                                  "},{"location":"home/whats-new-2-18/#policy-for-distributed-and-inference-workloads-in-the-api","title":"Policy for distributed and inference workloads in the API","text":"
• Added a new API for creating distributed training workload policies and inference workload policies. These new policies in the API allow administrators to set defaults, enforce rules, and impose setup on distributed training and inference workloads. For distributed policies, worker and master may require different rules due to their different specifications. The new capability is currently available via API only. Documentation on submitting policies will follow shortly.
                                  "},{"location":"home/whats-new-2-18/#policy-for-distributed-and-inference-workloads-in-the-api_1","title":"Policy for distributed and inference workloads in the API","text":"
                                  • Added a new API for creating distributed training workload policies and inference workload policies. These new policies in the API allow to set defaults, enforce rules and impose setup on distributed training and inference workloads. For distributed policies, worker and master may require different rules due to their different specifications. The new capability is currently available via API only. Documentation on submitting policies to follow shortly.
                                  "},{"location":"home/whats-new-2-18/#deprecation-notifications","title":"Deprecation Notifications","text":"

The existing notifications feature, which requires cluster configuration, is being deprecated in favor of an improved notification system. If you have been using the existing notifications feature in the cluster, you can continue to use it for the next two versions. It is recommended that you change to the new notification system in the Control Plane for better control and improved message granularity.

                                  "},{"location":"home/whats-new-2-18/#feature-deprecations","title":"Feature deprecations","text":"

                                  Deprecated features will be available for two versions ahead of the notification. For questions, see your Run:ai representative.

                                  "},{"location":"home/whats-new-2-18/#api-support-and-endpoint-deprecations","title":"API support and endpoint deprecations","text":"

                                  The endpoints and parameters specified in the API reference are the ones that are officially supported by Run:ai. For more information about Run:ai's API support policy and deprecation process, see note under Developer overview.

                                  "},{"location":"home/whats-new-2-18/#deprecated-apis-and-api-fields","title":"Deprecated APIs and API fields","text":""},{"location":"home/whats-new-2-18/#cluster-api-deprecation","title":"Cluster API Deprecation","text":"

The Run:ai REST API now supports job submission. The older Cluster API is now deprecated.

                                  "},{"location":"home/whats-new-2-18/#departments-api","title":"Departments API","text":"Deprecated Replacement /v1/k8s/clusters/{clusterId}/departments /api/v1/org-unit/departments /v1/k8s/clusters/{clusterId}/departments/{department-id} /api/v1/org-unit/departments/{departmentId} /v1/k8s/clusters/{clusterId}/departments/{department-id} /api/v1/org-unit/departments/{departmentId}+PUT/PATCH /api/v1/org-unit/departments/{departmentId}/resources"},{"location":"home/whats-new-2-18/#projects-api","title":"Projects API","text":"Deprecated Replacement /v1/k8s/clusters/{clusterId}/projects /api/v1/org-unit/projects /v1/k8s/clusters/{clusterId}/projects/{id} /api/v1/org-unit/projects/{projectId} /v1/k8s/clusters/{clusterId}/projects/{id} /api/v1/org-unit/projects/{projectId} +\u00a0/api/v1/org-unit/projects/{projectId}/resources

                                  Run:ai does not recommend using API endpoints and fields marked as deprecated and will not add functionality to them. Once an API endpoint or field is marked as deprecated, Run:ai will stop supporting it after 2 major releases for self-hosted deployments, and after 6 months for SaaS deployments.

For a full explanation of the API Deprecation policy, see the Run:ai API Policy.

                                  "},{"location":"home/whats-new-2-18/#breaking-changes","title":"Breaking changes","text":"

Breaking change notifications allow you to plan around potential changes that may interfere with your current workflow when interfacing with the Run:ai Platform.

                                  "},{"location":"home/whats-new-2-19/","title":"What\u2019s New in Version 2.19","text":""},{"location":"home/whats-new-2-19/#release-content","title":"Release Content
                                  • Deprecation notifications
                                  ","text":""},{"location":"home/whats-new-2-19/#researchers","title":"Researchers","text":""},{"location":"home/whats-new-2-19/#improved-visibility-into-pending-workloads","title":"Improved visibility into pending workloads","text":"

                                  For workloads with the status of \"Pending,\" the user can click the \u201ci\u201d icon next to the status to view details of why the workload hasn\u2019t been scheduled. (Requires a minimum cluster version of v2.19)

                                  "},{"location":"home/whats-new-2-19/#new-workload-events","title":"New workload events","text":"

                                  There are now new GPU resource optimization-related messages that are viewable as workload events. These events help users understand the decisions made by the Run:ai GPU toolkit while handling Run:ai\u2019s GPU resource optimization features. Run:ai\u2019s GPU resource optimization offers unique capabilities that take GPU utilization to a new level and helps customers increase their productivity while maximizing their return on GPU investment. (Requires a minimum cluster version of v2.19)

                                  "},{"location":"home/whats-new-2-19/#improved-command-line-interface-autocompletion","title":"Improved command line interface autocompletion","text":"

CLI V2 now autocompletes nouns such as project names and workload names, complementing its data consistency with the UI, auto-upgrades, and interactive mode.

                                  "},{"location":"home/whats-new-2-19/#details-pane-in-the-workloads-view","title":"Details pane in the Workloads view","text":"

                                  A new DETAILS tab for workloads has been added and presents additional workload information, including Container command, Environment variables, and CLI command syntax (if the workload was submitted via CLI).

                                  "},{"location":"home/whats-new-2-19/#container-path-outside-the-data-source-asset","title":"Container path outside the data source asset","text":"

                                  AI practitioners can now override the predefined container path for each data source when submitting a workload via the Run:ai UI. While the container path must still be specified as part of the data source asset, researchers can now override the default container path when submitting workloads. (Requires a minimum cluster version of v2.16)

                                  "},{"location":"home/whats-new-2-19/#node-toleration-for-workloads","title":"Node toleration for workloads","text":"

                                  Researchers can now optionally set tolerations for workloads, letting them bypass node taints during workload submission via the Run:ai UI. To use this feature, make sure it is activated under General Settings. For more information, refer to the Kubernetes Taints and Tolerations Guide. (Requires a minimum cluster version of v2.19)

                                  "},{"location":"home/whats-new-2-19/#topology-aware-scheduling","title":"Topology-aware scheduling","text":"

                                  When submitting a distributed training workload through the Run:ai UI, researchers can enable topology-aware scheduling. This feature allows an optimized placement within specific placement groups, such as regions, availability zones, or other topologies. To use this, make sure it is activated under General Settings. (Requires a minimum cluster version of v2.19)

                                  "},{"location":"home/whats-new-2-19/#bulk-deletion-of-workloads","title":"Bulk deletion of workloads","text":"

                                  Users can now delete workloads in bulk via the Run:ai UI. They\u2019ll be notified if they try to delete workloads for which they don\u2019t have permissions (and those workloads will not be deleted in this process). Multi-selection can also be done using standard keyboard functions. (Requires a minimum cluster version of v2.19)

                                  "},{"location":"home/whats-new-2-19/#enhanced-policy-representation-in-the-runai-ui","title":"Enhanced policy representation in the Run:ai UI","text":"

To improve AI practitioners' understanding of administrators\u2019 policy rules and defaults, the UI now represents more clearly the enforcement and default values for workload fields that are not encapsulated in the asset selection. This update aims to make policy enforcement more intuitive and transparent for practitioners. (Requires a minimum cluster version of v2.18)

                                  "},{"location":"home/whats-new-2-19/#configuration-of-credentials-as-environment-variables","title":"Configuration of credentials as environment variables","text":"

Researchers can now easily define pre-configured credentials as environment variables to access private resources. This is available through the Run:ai UI during the workload submission process, specifically under the runtime settings section. (Requires a minimum cluster version of v2.18)

                                  "},{"location":"home/whats-new-2-19/#expanded-scope-of-configmap-as-data-source","title":"Expanded scope of ConfigMap as data source","text":"

                                  When creating a data source of type ConfigMap, researchers can now not only select a project but also a cluster or department. (Requires a minimum cluster version of v2.19)

                                  "},{"location":"home/whats-new-2-19/#improved-workload-scheduling-algorithm","title":"Improved workload scheduling algorithm","text":"

The Run:ai scheduler algorithm for handling large distributed workloads has been improved and is now more efficient, resulting in better handling of large distributed workloads and better performance. (Requires a minimum cluster version of v2.19)

                                  "},{"location":"home/whats-new-2-19/#ml-engineer-inference","title":"ML Engineer (Inference)","text":""},{"location":"home/whats-new-2-19/#additional-data-sources-for-inference-workloads","title":"Additional data sources for inference workloads","text":"

                                  When submitting an inference workload via the UI and API, users can now use NFS and hostPath data sources. (Requires a minimum cluster version of v2.19)

                                  "},{"location":"home/whats-new-2-19/#hugging-face-integration-improvements","title":"Hugging Face integration improvements","text":"

                                  To reduce errors when submitting inference workloads, additional validations are done for the Hugging Face integration, ensuring that only valid models are submitted, thus enhancing overall reliability. (Requires a minimum cluster version of v2.19)

                                  "},{"location":"home/whats-new-2-19/#rolling-inference-updates","title":"Rolling inference updates","text":"

                                  ML engineers can now roll updates onto existing inference workloads. Once the revised workload (the update) is up and running, request traffic is redirected to the new version of the workload and the previous version is terminated, ensuring that services are not impacted during the update.

                                  See Inference overview for more information. (Requires a minimum cluster version of v2.19)

                                  "},{"location":"home/whats-new-2-19/#inference-endpoint-authorization","title":"Inference endpoint authorization","text":"

                                  When sharing inference endpoints securely using Run:ai, ML engineers can limit access to the endpoint by specifying the authorized users or groups allowed to use the service (i.e., send requests to the endpoint) after being authenticated. This restriction is especially important when handling sensitive information or when you want to manage costs by sharing the service with a controlled group of consumers. (Requires a minimum cluster version of v2.19)

                                  "},{"location":"home/whats-new-2-19/#runai-developer","title":"Run:ai Developer","text":""},{"location":"home/whats-new-2-19/#metrics-and-telemetry","title":"Metrics and telemetry","text":"

                                  Additional metrics and telemetry are available via the API. For more information, see the details below and in Metrics API:

                                  • Metrics (over time)
                                    • Cluster
                                      • TOTAL_GPU_NODES
                                      • GPU_UTILIZATION_DISTRIBUTION
                                      • UNALLOCATED_GPU
                                    • Nodepool
                                      • TOTAL_GPU_NODES
                                      • GPU_UTILIZATION_DISTRIBUTION
                                      • UNALLOCATED_GPU
                                    • Workload
                                      • GPU_ALLOCATION
                                    • Node
                                      • GPU_UTILIZATION_PER_GPU
                                      • GPU_MEMORY_UTILIZATION_PER_GPU
                                      • GPU_MEMORY_USAGE_BYTES_PER_GPU
                                      • CPU_USAGE_CORES
                                      • CPU_UTILIZATION
                                      • CPU_MEMORY_USAGE_BYTES
                                      • CPU_MEMORY_UTILIZATION
                                  • Telemetry (current time)
                                    • Node
                                      • ALLOCATED_GPUS
                                      • TOTAL_CPU_CORES
                                      • USED_CPU_CORES
                                      • ALLOCATED_CPU_CORES
                                      • TOTAL_GPU_MEMORY_BYTES
                                      • USED_GPU_MEMORY_BYTES
                                      • TOTAL_CPU_MEMORY_BYTES
                                      • USED_CPU_MEMORY_BYTES
                                      • ALLOCATED_CPU_MEMORY_BYTES
                                      • IDLE_ALLOCATED_GPUS
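A minimal sketch of pulling one of the cluster metrics listed above over the REST API follows. The endpoint path uses the /api/v1/clusters/{clusterUuid}/metrics form referenced elsewhere in these notes; the query-parameter names (metricType, start, end) and the tenant URL are assumptions to verify against the Metrics API reference.

```python
# Hedged sketch: querying the TOTAL_GPU_NODES cluster metric over time via
# the REST API. Parameter names (metricType, start, end) are assumptions;
# consult the Metrics API reference for the exact contract.
import requests

BASE_URL = "https://my-company.run.ai"   # hypothetical tenant URL
TOKEN = "<bearer-token>"
CLUSTER_UUID = "<cluster-uuid>"

resp = requests.get(
    f"{BASE_URL}/api/v1/clusters/{CLUSTER_UUID}/metrics",
    headers={"Authorization": f"Bearer {TOKEN}"},
    params={
        "metricType": "TOTAL_GPU_NODES",
        "start": "2024-07-01T00:00:00Z",
        "end": "2024-07-02T00:00:00Z",
    },
    timeout=30,
)
resp.raise_for_status()
print(resp.json())
```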
                                  "},{"location":"home/whats-new-2-19/#administrator","title":"Administrator","text":""},{"location":"home/whats-new-2-19/#pagination-in-user-api","title":"Pagination in user API","text":"

                                  Pagination has been added, removing the limitation to the number of users listed in the Run:ai UI.

                                  "},{"location":"home/whats-new-2-19/#audit-log","title":"Audit log","text":"

                                  The audit log has been updated, so system admins can view audit logs directly in the Run:ai UI and download them in CSV or JSON formats, providing flexible options for data analysis and compliance reporting. Version 2.19 reintroduces a fully functional audit log (event history), ensuring comprehensive tracking across projects, departments, access rules, and more. In the new version, all entities are logged except logins and workloads. For more information, see Audit logs.

                                  "},{"location":"home/whats-new-2-19/#platform-administrator","title":"Platform Administrator","text":""},{"location":"home/whats-new-2-19/#department-scheduling-rules","title":"Department scheduling rules","text":"

                                  Scheduling rules have been added at the department level. For more information, see scheduling rules.

                                  "},{"location":"home/whats-new-2-19/#department-node-pool-priority","title":"Department node pool priority","text":"

Node pool priority has been added at the department level. For more information, see node pools.

                                  "},{"location":"home/whats-new-2-19/#department-and-project-grids","title":"Department and project grids","text":"

                                  There is now improved filtering and sorting in the Projects and Departments views, including a multi-cluster view and new filters.

                                  "},{"location":"home/whats-new-2-19/#overview-dashboard","title":"Overview dashboard","text":"

                                  \u201cIdle allocated GPU devices\u201d has been added to the Overview dashboard.

                                  "},{"location":"home/whats-new-2-19/#workload-policy-for-distributed-training-workloads-in-the-runai-ui","title":"Workload policy for distributed training workloads in the Run:ai UI","text":"

Distributed workload policies can now be created via the Run:ai UI. Admins can set defaults, enforce rules, and impose setup on distributed training workloads through YAML in the UI, as well as view the distributed policies (both in the policy grid and while submitting workloads). For distributed policies, workers and leaders may require different rules due to their different specifications. (Requires a minimum cluster version of v2.18)

                                  "},{"location":"home/whats-new-2-19/#reconciliation-of-policy-rules","title":"Reconciliation of policy rules","text":"

A reconciliation mechanism for policy rules has been added to enhance flexibility in the policy submission process. Previously, if a specific field was governed by a policy for a certain hierarchy, other organizational units could not submit a policy with rules regarding that field. Now, new policies for hierarchies that mention an existing policy field are no longer blocked. The effective rules are selected based on the following logic: 1. For the compute and security sections in the workload spec of the Run:ai API, the highest hierarchy is chosen for the effective policy (tenant > cluster > department > project). 2. For any other fields in the policy, the lowest hierarchy closest to the actual workload becomes the effective policy (similar to policy defaults). Additionally, while viewing the effective policy, each rule displays the origin policy it comes from, allowing users to clearly understand the selected hierarchy of the effective policy. (Requires a minimum cluster version of v2.18)

                                  "},{"location":"home/whats-new-2-19/#infrastructure-administrator","title":"Infrastructure Administrator","text":""},{"location":"home/whats-new-2-19/#support-for-cos-over-gke","title":"Support for COS over GKE","text":"

                                  With Run:ai version 2.19, the Run:ai cluster on Google Kubernetes Engine (GKE) supports Container-Optimized OS (COS) when NVIDIA GPU Operator 24.6 or newer is installed. This is in addition to the already supported Ubuntu on GKE.

                                  "},{"location":"home/whats-new-2-19/#runai-and-karpenter","title":"Run:ai and Karpenter","text":"

                                  Run:ai now supports working with Karpenter. Karpenter is an open-source Kubernetes cluster auto-scaler built for cloud deployments. Karpenter optimizes the cloud cost of a customer\u2019s cluster by moving workloads between different node types, bin-packing nodes, using lower-cost nodes where possible, scaling up new nodes on demand, and shutting down unused nodes with the goal of optimizing and reducing costs. (Requires a minimum cluster version of v2.19)

                                  Please read the documentation for more information on Run:ai and Karpenter integration considerations.

                                  "},{"location":"home/whats-new-2-19/#control-and-visibility-ui-changes","title":"Control and Visibility (UI changes)","text":""},{"location":"home/whats-new-2-19/#new-runai-ui-navigation","title":"New Run:ai UI navigation","text":"

                                  The platform navigation has been updated to offer a more modern design, easier navigation, and address all personas interacting with the UI.

                                  The left-side menu now has seven categories, each with its own reorganized sub-options that appear in the pane next to the menu options.

                                  If you close the sub-options pane, you can hover over the categories, and the sub-options float and can be used in the same way.

                                  The options presented in the menu and categories continue to match each user\u2019s permissions, as in the legacy navigation.

                                  Below is the full list of menu and sub-options and changes:

Analytics Displays the Run:ai dashboards, allowing the different users to analyze, plan, and improve system performance and AI workload execution. This category contains the following options:

                                  • Overview
                                  • Quota management
                                  • Analytics
                                  • Consumption
                                  • Multi-cluster overview

Workload manager Enables AI practitioners to develop models, train them, and deploy them into production. All supported tools and capabilities can be found here. This category contains the following options:

                                  • Workloads
                                  • Deleted workloads (now separated from current workloads. If not visible, it can be activated from Settings -> Workloads -> Deleted workloads)
                                  • Templates
                                  • Assets (these options are visible via a collapsible menu)
                                    • Models
                                    • Environments
                                    • Compute resources
                                    • Data sources
                                    • Credentials

                                  Resources Enables viewing and managing all cluster resources. In the new navigation, nodes and node pools have been split into different grids. This category contains the following options:

                                  • Clusters
                                  • Node pools (separated from the Nodes page to its own page)
                                  • Nodes

                                  Organization Maps system organizations to ensure that resource allocation and policies align with the organizational structure, business projects, and priorities. This category contains the following options:

                                  • Departments
                                  • Projects

Access Enables authorizing the different system users to perform actions in alignment with their roles and their scope of projects within the organization. This was moved from the legacy menu where it appeared in the header of the screen under Tools and Settings. This category contains the following options:

                                  • Users
                                  • Applications
                                  • Roles (separated from the Access rules and roles page to its own page)
                                  • Access rules (separated from the Access rules and roles page to its own page)

                                  Policies Presents the tools to enforce controls over the AI infrastructure enabling different users to be effective while working in alignment with organizational policies. This category contains the following options:

                                  • Workload policies

                                  Admin Presents all administrator functions of the Run:ai platform. This was moved from the legacy menu where it appeared in the header of the screen under Tools and Settings. This category contains the following options:

                                  • General settings (previously General)
                                  • Event history

                                  For users with more than one cluster, in the legacy version the cluster selection appeared in the header of the page. In the new navigation, the cluster selection is part of the grid and changes only affect the items on that page.

                                  If a user prefers not to use the new UI navigation, there is an option to switch back to the legacy navigation by clicking the Back to legacy navigation option.

                                  Installation and configuration

• Tenant logos can now be uploaded to the Run:ai UI via API. The logo must be provided in base64 format, should not be white (to avoid blending into the background), and should be no more than 20px tall. See Upload logo for tenant API (a minimal encoding sketch follows this list).
• Run:ai now supports NVIDIA GPU Operator version 24.6.
• Run:ai now supports Kubernetes version 1.31.
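As a rough illustration of the base64 requirement only: the Python sketch below encodes a local PNG and sends it in a JSON payload. The endpoint path (/api/v1/tenant/logo), the payload field name (logo), and the bearer-token handling are assumptions made for this example and are not taken from the documentation; check the Upload logo for tenant API reference for the real contract.

```python
import base64

import requests

# Assumptions (not taken from this document): the endpoint path, the payload
# field name, and the authentication header format. Check the "Upload logo
# for tenant" API reference for the documented contract.
BASE_URL = "https://myorg.run.ai"   # hypothetical tenant URL
TOKEN = "<api-token>"               # obtained separately


def upload_logo(path: str) -> None:
    # The logo must be base64-encoded; keep it no more than 20px tall and
    # avoid a white logo so it does not blend into the background.
    with open(path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("ascii")

    resp = requests.post(
        f"{BASE_URL}/api/v1/tenant/logo",              # hypothetical path
        json={"logo": encoded},                        # hypothetical field
        headers={"Authorization": f"Bearer {TOKEN}"},
        timeout=30,
    )
    resp.raise_for_status()


if __name__ == "__main__":
    upload_logo("logo.png")
```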
                                  "},{"location":"home/whats-new-2-19/#deprecation-notifications","title":"Deprecation notifications","text":""},{"location":"home/whats-new-2-19/#feature-deprecations","title":"Feature deprecations","text":""},{"location":"home/whats-new-2-19/#legacy-jobs-view","title":"Legacy Jobs view","text":"

                                  The legacy Jobs view will be fully deprecated in the Q1/25 release. We recommend that all users adopt the Workloads view, which offers all the capabilities of the legacy Jobs view with additional enhancements. SaaS customers will gradually be transitioned to the Workloads view during Q4/24.

                                  Note

                                  Users can still submit workloads via the legacy Jobs submission form.

                                  "},{"location":"home/whats-new-2-19/#dynamic-mig-deprecation","title":"Dynamic MIG deprecation","text":"

The Dynamic MIG deprecation process starts with Run:ai v2.19 (Q4/24 release):

                                  • The feature is still available and MIG Profile APIs still function but are marked as Deprecated. See the table below for more details.
• In the Q1/25 release, \u2018Dynamic MIG\u2019 will no longer be usable, but the APIs will still be accessible.
                                  • In Q2/25 all \u2018Dynamic MIG\u2019 APIs will be fully deprecated.
                                  "},{"location":"home/whats-new-2-19/#legacy-navigation-runai-ui","title":"Legacy navigation - Run:ai UI","text":"

The legacy navigation will be fully deprecated in the Q1/25 release, and will be phased out for SaaS customers during Q1/25.

                                  "},{"location":"home/whats-new-2-19/#api-support-and-endpoint-deprecations","title":"API support and endpoint deprecations","text":"Deprecated Replacement /v1/k8s/audit /api/v1/audit/log /api/v1/asset/compute/spec/migProfile /api/v1/workloads/spec/compute/migProfile /api/v1/workloads/workspaces/spec/compute/migProfile /api/v1/workloads/Trainings/spec/compute/migProfile /api/v1/workloads/Inferences/spec/compute/migProfile /api/v1/workloads/distributed/spec/compute/migProfile /api/v1/workloads/distributed/masterSpec/compute/migProfile

                                  Run:ai does not recommend using API endpoints and fields marked as deprecated and will not add functionality to them. Once an API endpoint or field is marked as deprecated, Run:ai will stop supporting it after 2 major releases for self-hosted deployments, and after 6 months for SaaS deployments.

For a full explanation of the API Deprecation policy, see the Run:ai API Policy.

                                  "},{"location":"home/whats-new-2-19/#documentation-enhancements","title":"Documentation enhancements","text":""},{"location":"home/whats-new-2-19/#workload-policy-documentation","title":"Workload policy documentation","text":"

A comprehensive set of articles detailing the usage and the process of submitting new workload policies has been introduced. It covers the structure, syntax, best practices, and examples for configuring policy YAML files. The new documentation includes step-by-step explanations of how to create a new rule in a policy, together with information about the different value types, rule types, and policy spec sections. For more information, refer to the Policies section.

                                  "},{"location":"home/whats-new-2-20/","title":"What\u2019s New in Version 2.20","text":""},{"location":"home/whats-new-2-20/#release-content","title":"Release Content

The Run:ai v2.20 What's New provides a detailed summary of the latest features, enhancements, and updates introduced in this version. It serves as a guide to help users, administrators, and researchers understand the new capabilities and how to leverage them for improved workload management, resource optimization, and more.

                                  Important

For a complete list of deprecations, see Deprecation notifications. Deprecated features and capabilities will remain available for two versions after the notification.

                                  ","text":""},{"location":"home/whats-new-2-20/#researchers","title":"Researchers","text":""},{"location":"home/whats-new-2-20/#workloads-workspaces-and-training","title":"Workloads - Workspaces and Training","text":"
                                  • Stop/run actions for distributed workloads - You can now stop and run distributed workloads from the UI, CLI, and API. Scheduling rules for training workloads also apply to distributed workloads. This enhances control over distributed workloads, enabling greater flexibility and resource management. From cluster v2.20 onward

                                  • Visibility into idle GPU devices - Idle GPU devices are now displayed in the UI and API showing the number of allocated GPU devices that have been idle for more than 5 minutes. This provides better visibility into resource utilization, enabling more efficient workload management.

                                  • Configurable workload completion with multiple runs - You can now define the number of runs a training workload must complete to be considered finished directly in the UI, API, and CLI v2. Running training workloads multiple times improves the reliability and validity of training results. Additionally, you can configure how many runs can be scheduled in parallel, helping to significantly reduce training time and simplifying the process of managing jobs that require multiple runs. See Train models using a standard training workload for more details. From cluster v2.20 onward

                                  • Configurable grace period for workload preemption - You can now set a grace period in the UI, API and CLI v2 providing a buffer time for preempted workloads to reach a safe checkpoint before being forcibly preempted for standard and distributed training workloads. The grace period can be configured between 0 seconds and 5 minutes. This aims to minimize data loss and avoid unnecessary retraining, ensuring the latest checkpoints are saved. From cluster v2.20 onward

                                  • Pod deletion policy for terminal workloads - You can now specify which pods should be deleted when a distributed workload reaches a terminal state (completed/failed) using cleanPodPolicy in CLI v2 and API. This enhancement provides greater control over resource cleanup and helps maintain a more organized and efficient cluster environment. See cleanPodPolicy for more details.
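As a hedged sketch only, the snippet below shows how a cleanPodPolicy value might be included when submitting a distributed workload through the REST API. The payload structure and the accepted values (None, Running, All, following the Kubeflow training-operator convention) are assumptions here; the cleanPodPolicy documentation referenced above is the authoritative source.

```python
import requests

# Assumptions: the exact payload structure and the placement of
# cleanPodPolicy are illustrative only -- confirm them against the
# Run:ai API reference before use.
BASE_URL = "https://myorg.run.ai"   # hypothetical tenant URL
TOKEN = "<api-token>"

payload = {
    "name": "dist-train-example",
    "projectId": "<project-id>",
    "clusterId": "<cluster-id>",
    "spec": {
        "image": "ghcr.io/example/trainer:latest",   # placeholder image
        "numWorkers": 4,
        # Delete only running pods once the workload reaches a terminal
        # state; "None" keeps all pods, "All" deletes every pod.
        "cleanPodPolicy": "Running",
    },
}

resp = requests.post(
    f"{BASE_URL}/api/v1/workloads/distributed",
    json=payload,
    headers={"Authorization": f"Bearer {TOKEN}"},
    timeout=30,
)
resp.raise_for_status()
print(resp.json())
```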

                                  "},{"location":"home/whats-new-2-20/#workload-assets","title":"Workload Assets","text":"
                                  • Instructions for environment variables - You can now add instructions to environment variables when creating new environments via the UI and API. In addition, Run:ai's environments now include default instructions. Adding instructions provides guidance enabling anyone using the environment to set the environment variable values correctly. From cluster v2.20 onward

                                  • Enhanced environments and compute resource management - The action bar now contains \"Make a Copy\" and \"Edit\" while the \"Rename\" option has been removed. A new \"Last Updated\" column has also been added for easier tracking of asset modifications. From cluster v2.20 onward

                                  • Enhanced data sources and credentials tables - Added a new \"Kubernetes name\" column to data sources and credentials tables for visibility into Kubernetes resource associations. The credentials table now includes an \"Environments\" column displaying the environments associated with the credential. From cluster v2.20 onward

                                  "},{"location":"home/whats-new-2-20/#authentication-and-authorization","title":"Authentication and authorization","text":"
                                  • User applications for API authentication - You can now create your own applications for API integrations with Run:ai. Each application includes client credentials which can be used to obtain an authentication token to utilize for subsequent API calls. See User applications for more details. From cluster v2.20 onward
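A minimal sketch of the flow described above: the application's Client ID and Client secret are exchanged for a token, which is then sent as a bearer token on subsequent calls. The token endpoint path, payload field names, and the example projects call below are assumptions modeled on a standard OAuth client-credentials exchange; the User applications documentation has the exact request format.

```python
import requests

BASE_URL = "https://myorg.run.ai"   # hypothetical tenant URL


def get_api_token(client_id: str, client_secret: str) -> str:
    # Assumption: the token endpoint and field names follow a standard
    # client-credentials exchange; verify them against the API reference.
    resp = requests.post(
        f"{BASE_URL}/api/v1/token",
        json={
            "grantType": "client_credentials",
            "clientId": client_id,
            "clientSecret": client_secret,
        },
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json()["accessToken"]   # assumed response field name


token = get_api_token("<client-id>", "<client-secret>")

# Use the token as a bearer token on subsequent API calls (example call).
resp = requests.get(
    f"{BASE_URL}/api/v1/org-unit/projects",   # hypothetical example path
    headers={"Authorization": f"Bearer {token}"},
    timeout=30,
)
print(resp.status_code)
```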
                                  "},{"location":"home/whats-new-2-20/#scheduler","title":"Scheduler","text":"
                                  • Support for multiple fractional GPUs in a single workload - Run:ai now supports submitting workloads that utilize multiple fractional GPUs within a single workload using the UI and CLI. This feature enhances GPU utilization, increases scheduling probability in shorter timeframes, and allows workloads to consume only the memory they need. It maximizes quota usage and enables more workloads to share the same GPUs effectively. See Multi-GPU fractions and Multi-GPU dynamic fractions for more details. Beta for Dynamic Fractions From cluster v2.20 onward

                                  • Support for GPU memory swap with multiple GPUs per workload - Run:ai now supports GPU memory swap for workloads utilizing multiple GPUs. By leveraging GPU memory swap, you can maximize GPU utilization and serve more workloads using the same hardware. The swap scheduler on each node ensures that all GPUs of a distributed model run simultaneously, maintaining synchronization across GPUs. Workload configurations combine swap settings with multi-GPU dynamic fractions, providing flexibility and efficiency for managing large-scale workloads. See Multi-GPU memory swap. Beta From cluster v2.20 onward

                                  "},{"location":"home/whats-new-2-20/#command-line-interface-cli-v2","title":"Command Line Interface (CLI v2)","text":"
                                  • Support for Windows OS - CLI v2 now supports Windows operating systems, enabling you to leverage the full capabilities of the CLI. From cluster v2.18 onward

                                  • Unified training command structure - Unified the distributed command into the training command to align with the Run:ai UI. The training command now includes a new sub-command to support distributed workloads, ensuring a more consistent and streamlined user experience across both the CLI v2 and UI.

• New command for Kubernetes access - Added a new CLI v2 command, runai kubeconfig set, allowing users to set the kubeconfig file with the Run:ai authorization token. This enhancement enables users to gain access to the Kubernetes cluster, simplifying authentication and integration with Run:ai-managed environments.

• Added the ability to view workload labels - You can now view the labels associated with a workload when using the CLI v2 runai workload describe command for all workload types. This enhancement provides better visibility into workload metadata.

                                  "},{"location":"home/whats-new-2-20/#ml-engineers","title":"ML Engineers","text":""},{"location":"home/whats-new-2-20/#workloads-inference","title":"Workloads - Inference","text":"
• Enhanced visibility into rolling updates for inference workloads - Run:ai now provides a phase message offering detailed insights into the current state of the update when hovering over the workload's status. This helps users monitor and manage updates more effectively. See Rolling inference updates for more details. From cluster v2.20 onward

                                  • Inference serving endpoint configuration - You can now define an inference serving endpoint directly within the environment using the Run:ai UI. From cluster v2.19 onward

                                  • Persistent token management for Hugging Face models - Run:ai allows users to save their Hugging Face tokens persistently as part of their credentials within the Run:ai UI. Once saved, tokens can be easily selected from a list of stored credentials, removing the need to manually enter them each time. This enhancement improves the process of deploying Hugging Face models, making it more efficient and user-friendly. See Deploy inference workloads from Hugging Face for more details. From cluster v2.13 onward

                                  • Deploy and manage NVIDIA NIM models in inference workloads - Run:ai now supports NVIDIA NIM models, enabling you to easily deploy and manage these models when submitting inference workloads. You can select a NIM model and leverage NVIDIA\u2019s hardware optimizations directly through the Run:ai UI. This feature also allows you to take advantage of Run:ai capabilities such as autoscaling and GPU fractioning. See Deploy inference workloads with NVIDIA NIM for more details.

                                  • Customizable autoscaling plans for inference workloads - Run:ai allows advanced users practicing autoscaling for inference workloads to fine-tune their autoscaling plans using the Update inference spec API. This feature enables you to achieve optimal behavior to meet fluctuating request demands. Experimental From cluster v2.20 onward

                                  "},{"location":"home/whats-new-2-20/#platform-administrator","title":"Platform Administrator","text":""},{"location":"home/whats-new-2-20/#analytics","title":"Analytics","text":"
• New Reports view for analytics - The new Reports view enables generating and organizing large amounts of data in a structured, CSV-formatted layout. With this feature, you can monitor resource consumption, identify trends, and make informed decisions to optimize your AI workloads with greater efficiency.
                                  "},{"location":"home/whats-new-2-20/#authentication-and-authorization_1","title":"Authentication and authorization","text":"
• Client credentials for applications - Applications now use client credentials - Client ID and Client secret - to obtain an authentication token, aligned with the OAuth standard. See Applications for more details. From cluster v2.20 onward
                                  "},{"location":"home/whats-new-2-20/#node-pools","title":"Node pools","text":"
                                  • Enhanced metric graphs for node pools - Enhanced metric graphs in the DETAILS tab for node pools by aligning these graphs with the dashboard and the node pools API. As part of this improvement, the following columns have been removed from the Node pools table.

                                    • Node GPU Allocation
                                    • GPU Utilization Distribution
                                    • GPU Utilization
                                    • GPU Memory Utilization
                                    • CPU Utilization
                                    • CPU Memory Utilization
                                  "},{"location":"home/whats-new-2-20/#organizations-projectsdepartments","title":"Organizations - Projects/Departments","text":"
                                  • Enhanced project deletion - Deleting a project will now attempt to delete the project's associated workloads and assets, allowing better management of your organization's assets. From cluster v2.20 onward

                                  • Enhanced resource prioritization for projects and departments - Run:ai has introduced advanced prioritization capabilities to manage resources between projects or between departments more effectively using the Projects and Departments APIs. From cluster v2.20 onward

                                    This feature allows administrators to:

                                    • Prioritize resource allocation and reclaim between different projects and departments.
                                    • Prioritize projects within the same department.
                                    • Set priorities per node-pool for both projects and departments.
                                    • Implement distinct SLAs by assigning strict priority levels to over-quota resources.
                                  • Updated over quota naming - Renamed over quota priority to over quota weight to reflect its actual functionality.

                                  "},{"location":"home/whats-new-2-20/#policy","title":"Policy","text":"
                                  • Added policy-based default field values - Administrators can now set default values for fields that are automatically calculated based on the values of other fields using defaultFrom. This ensures that critical fields in the workload submission form are populated automatically if not provided by the user. From cluster v2.20 onward

                                    This feature supports various field types:

                                    • Integer fields (e.g., cpuCoresRequest),
                                    • Number fields (e.g., gpuPortionRequest),
                                    • Quantity fields (e.g., gpuMemoryRequest)
                                  "},{"location":"home/whats-new-2-20/#data-sources","title":"Data sources","text":"
• Improved control over data source and storage class visibility - Run:ai now provides administrators with the ability to control the visibility of data source types and storage classes in the UI. Data source types that are restricted by policy will no longer appear during workload submission or when creating new data source assets. Additionally, administrators can configure storage classes as internal using the Storage class configuration API. From cluster v2.20 onward
                                  "},{"location":"home/whats-new-2-20/#email-notifications","title":"Email notifications","text":"
                                  • Added email notifications API - Email notifications can now be configured via API in addition to the UI, enabling integration with external tools. See NotificationChannels API for more details.
                                  "},{"location":"home/whats-new-2-20/#infrastructure-administrator","title":"Infrastructure Administrator","text":""},{"location":"home/whats-new-2-20/#nvidia-data-center-gpus-grace-hopper","title":"NVIDIA Data Center GPUs - Grace-Hopper","text":"
                                  • Support for ARM-Based Grace-Hopper Superchip (GH200) - Run:ai now supports the ARM-based Grace-Hopper Superchip (GH200). Due to a limitation in version 2.20 with ARM64, the Run:ai control plane services must be scheduled on non-ARM based CPU nodes. This limitation will be addressed in a future release. See Self-Hosted installation over Kubernetes for more details. From cluster v2.20 onward
                                  "},{"location":"home/whats-new-2-20/#system-requirements","title":"System requirements","text":"
                                  • Run:ai now supports Kubernetes version 1.32.
                                  • Run:ai now supports OpenShift version 4.17.
                                  • Kubernetes version 1.28 is no longer supported.
                                  • OpenShift versions 4.12 to 4.13 are no longer supported.
                                  "},{"location":"home/whats-new-2-20/#advanced-cluster-configurations","title":"Advanced cluster configurations","text":"
                                  • Exclude nodes in mixed node clusters - Run:ai now allows you to exclude specific nodes in a mixed node cluster using the nodeSelectorTerms flag. See Advanced Cluster Configurations for more details. From cluster v2.20 onward

• Advanced configuration options for cluster services - Introduced new cluster configuration options for setting node affinity and tolerations for Run:ai cluster services. These configurations ensure that the Run:ai cluster services are scheduled on the desired nodes. See Advanced Cluster Configurations for more details. From cluster v2.20 onward

                                    • global.affinity
                                    • global.tolerations
                                    • daemonSetsTolerations
                                  • Added Argo workflows auto-pod grouping - Introduced a new cluster configuration option, gangScheduleArgoWorkflow, to modify the default behavior for grouping ArgoWorkflow pods, allowing you to prevent pods from being grouped into a single pod-group. See Advanced Cluster Configurations for more details. Cluster v2.20 and v2.18

• Added cloud auto-scaling for memory fractions - Run:ai now supports auto-scaling for workloads using memory fractions in cloud environments. Using the gpuMemoryToFractionRatio configuration option allows a failed scheduling attempt for a memory fractions workload to create Run:ai scaling pods, triggering the auto-scaler. See Advanced Cluster Configurations for more details. From cluster v2.19 onward

                                  • Added stale gang eviction timeout for improved stability - Run:ai has introduced a default timeout of 60 seconds for gang eviction in gang scheduling workloads using defaultStalenessGracePeriod. This timeout allows both the workload controller and the scheduler sufficient time to remediate the workload, improving the stability of large training jobs. See Advanced Cluster Configurations for more details. From cluster v2.18 onward

• Added custom labels for built-in alerts - Administrators can now add their own custom labels to the built-in alerts from Prometheus by setting spec.prometheus.additionalAlertLabels in their cluster. See Advanced Cluster Configurations for more details. From cluster v2.20 onward

• Enhanced configuration flexibility for cluster replica management - Administrators can now use spec.global.replicaCount to manage replicas for Run:ai services. See Advanced Cluster Configurations for more details. From cluster v2.20 onward
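The options above are cluster-installation settings rather than workload settings. As a hedged sketch of how an administrator might apply a few of them, the snippet below patches the Run:ai cluster configuration custom resource with the Kubernetes Python client. The resource coordinates (group run.ai, version v1, plural runaiconfigs, name runai in the runai namespace), the example values, and the exact field paths are assumptions for illustration; the Advanced Cluster Configurations article is the authoritative reference.

```python
from kubernetes import client, config

# Load kubeconfig credentials for the cluster (use
# config.load_incluster_config() when running inside the cluster).
config.load_kube_config()
api = client.CustomObjectsApi()

# Assumption: the Run:ai cluster configuration is a custom resource named
# "runai" in the "runai" namespace, group "run.ai", version "v1", plural
# "runaiconfigs". Verify these against your installation before patching.
patch = {
    "spec": {
        "global": {
            # Illustrative toleration so Run:ai services land on system nodes.
            "tolerations": [
                {"key": "dedicated", "operator": "Equal",
                 "value": "system", "effect": "NoSchedule"}
            ],
            "replicaCount": 2,   # replicas for Run:ai services
        },
        "prometheus": {
            # Custom labels attached to the built-in alerts.
            "additionalAlertLabels": {"team": "ml-platform", "env": "prod"},
        },
    }
}

api.patch_namespaced_custom_object(
    group="run.ai", version="v1", namespace="runai",
    plural="runaiconfigs", name="runai", body=patch,
)
```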

                                  "},{"location":"home/whats-new-2-20/#runai-built-in-alerts","title":"Run:ai built-in alerts","text":"
                                  • Added two new Run:ai built-in alerts for Kubernetes nodes hosting GPU workloads. The unknown state alert notifies when the node's health and readiness cannot be determined, and the low memory alert warns when the node has insufficient memory to support current or upcoming workloads. From cluster v2.20 onward
                                  "},{"location":"home/whats-new-2-20/#runai-developer","title":"Run:ai Developer","text":""},{"location":"home/whats-new-2-20/#metrics-and-telemetry","title":"Metrics and Telemetry","text":"
                                  • Additional metrics and telemetry are available via the API. For more details, see Metrics API:

                                    • Metrics (over time)

                                      • Project
                                        • GPU_QUOTA
                                        • CPU_QUOTA_MILLICORES
                                        • CPU_MEMORY_QUOTA_MB
                                        • GPU_ALLOCATION
                                        • CPU_ALLOCATION_MILLICORES
                                        • CPU_MEMORY_ALLOCATION_MB
                                      • Department
                                        • GPU_QUOTA
                                        • CPU_QUOTA_MILLICORES
                                        • CPU_MEMORY_QUOTA_MB
                                        • GPU_ALLOCATION
                                        • CPU_ALLOCATION_MILLICORES
                                        • CPU_MEMORY_ALLOCATION_MB
                                    • Telemetry (current time)

                                      • Project
                                        • GPU_QUOTA
                                        • CPU_QUOTA
                                        • MEMORY_QUOTA
                                        • GPU_ALLOCATION
                                        • CPU_ALLOCATION
                                        • MEMORY_ALLOCATION
                                        • GPU_ALLOCATION_NON_PREEMPTIBLE
                                        • CPU_ALLOCATION_NON_PREEMPTIBLE
                                        • MEMORY_ALLOCATION_NON_PREEMPTIBLE
                                      • Department
                                        • GPU_QUOTA
                                        • CPU_QUOTA
                                        • MEMORY_QUOTA
                                        • GPU_ALLOCATION
                                        • CPU_ALLOCATION
                                        • MEMORY_ALLOCATION
                                        • GPU_ALLOCATION_NON_PREEMPTIBLE
                                        • CPU_ALLOCATION_NON_PREEMPTIBLE
                                        • MEMORY_ALLOCATION_NON_PREEMPTIBLE
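The metric and telemetry enums listed above are values for the metricType query parameter of the Metrics API. The sketch below shows the general shape of such a request; the project metrics path, the response field names, and the parameter names other than metricType are assumptions here, so treat it as an outline and rely on the Metrics API reference for the exact contract.

```python
from datetime import datetime, timedelta, timezone

import requests

BASE_URL = "https://myorg.run.ai"   # hypothetical tenant URL
TOKEN = "<api-token>"

end = datetime.now(timezone.utc)
start = end - timedelta(hours=6)

# Assumption: project metrics are exposed under an org-unit style path and
# accept repeated metricType parameters plus a time range -- confirm the
# exact path and parameters in the Metrics API reference.
resp = requests.get(
    f"{BASE_URL}/api/v1/org-unit/projects/<project-id>/metrics",
    params={
        "metricType": ["GPU_QUOTA", "GPU_ALLOCATION", "CPU_ALLOCATION_MILLICORES"],
        "start": start.isoformat(),
        "end": end.isoformat(),
        "numberOfSamples": 20,
    },
    headers={"Authorization": f"Bearer {TOKEN}"},
    timeout=30,
)
resp.raise_for_status()
for measurement in resp.json().get("measurements", []):   # assumed shape
    print(measurement.get("type"), measurement.get("values"))
```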
                                  "},{"location":"home/whats-new-2-20/#deprecation-notifications","title":"Deprecation notifications","text":""},{"location":"home/whats-new-2-20/#ongoing-dynamic-mig-deprecation-process","title":"Ongoing Dynamic MIG deprecation process","text":"

                                  The Dynamic MIG deprecation process started in version 2.19. Run:ai supports standard MIG profiles as detailed in Configuring NVIDIA MIG profiles.

• Before upgrading to version 2.20, workloads submitted with Dynamic MIG and their associated node configurations must be removed.
                                  • In version 2.20, MIG was removed from the Run:ai UI under compute resources.
                                  • In Q2/25 all \u2018Dynamic MIG\u2019 APIs and CLI commands will be fully deprecated.
                                  "},{"location":"home/whats-new-2-20/#cli-v1-deprecation","title":"CLI v1 deprecation","text":"

CLI v1 is deprecated and no new features will be developed for it. It will remain available for the next two releases to ensure a smooth transition for all users. We recommend switching to CLI v2, which provides feature parity, backward compatibility, and ongoing support for new enhancements. CLI v2 is designed to deliver a more robust, efficient, and user-friendly experience.

                                  "},{"location":"home/whats-new-2-20/#legacy-jobs-view-deprecation","title":"Legacy Jobs view deprecation","text":"

                                  Starting with version 2.20, the legacy Jobs view will be discontinued in favor of the more advanced Workloads view. The legacy submission form will still be accessible via the Workload manager view for a smoother transition.

                                  "},{"location":"home/whats-new-2-20/#appid-and-appsecret-deprecation","title":"appID and appSecret deprecation","text":"

The appID and appSecret parameters used for requesting an API token are deprecated. They will remain available for the next two releases. To create application tokens, use your client credentials - Client ID and Client secret.

                                  "},{"location":"home/changelog/hotfixes-2-13/","title":"Changelog Version 2.13","text":"

                                  The following is a list of the known and fixed issues for Run:ai V2.13.

                                  "},{"location":"home/changelog/hotfixes-2-13/#version-21348-march-14-2024","title":"Version 2.13.48 - March 14, 2024","text":"Internal ID Description RUN-16787 Fixed an issue after an upgrade to 2.13 where distributed PyTorch jobs were not able to run due to PVCs being assigned to only worker pods. RUN-16626 Fixed an issue in SSO environments, where Workspaces created using a template were assigned the template creator's UID/GID and not the Workspace creator's UID/GID. RUN-16357 Fixed an issue where pressing the Project link in Jobs screen redirects the view to the Projects of a different cluster in multi-cluster environments."},{"location":"home/changelog/hotfixes-2-13/#version-21343-february-15-2024","title":"Version 2.13.43 - February 15, 2024","text":"Internal ID Description RUN-14946 Fixed an issue where Dashboards are displaying the hidden Grafana path."},{"location":"home/changelog/hotfixes-2-13/#version-21337","title":"Version 2.13.37","text":"Internal ID Description RUN-13300 Fixed an issue where projects will appear with a status of empty while waiting for the project controller to update its status. This was caused because the cluster-sync works faster than the project controller."},{"location":"home/changelog/hotfixes-2-13/#version-21335-december-19-2023","title":"Version 2.13.35 - December 19, 2023","text":""},{"location":"home/changelog/hotfixes-2-13/#release-content","title":"Release content","text":"
                                  • Added the ability to set node affinity for Prometheus.
                                  "},{"location":"home/changelog/hotfixes-2-13/#fixed-issues","title":"Fixed issues","text":"Internal ID Description RUN-14472 Fixed an issue where template updates were not being applied to the workload. RUN-14434 Fixed an issue where runai_allocated_gpu_count_per_gpu was multiplied by seven. RUN-13956 Fixed an issue where editing templates failed. RUN-13825 Fixed an issue when deleting a job that is allocated a fraction of a GPU, an associated configmap is not deleted. RUN-13343 Fixed an issue in pod status calculation."},{"location":"home/changelog/hotfixes-2-13/#version-21331","title":"Version 2.13.31","text":"Internal ID Description RUN-11367 Fixed an issue where a double click on SSO Users redirects to a blank screen. RUN-10560 Fixed an issue where the RunaiDaemonSetRolloutStuck alert did not work."},{"location":"home/changelog/hotfixes-2-13/#version-21325","title":"Version 2.13.25","text":"Internal ID Description RUN-13171 Fixed an issue when a cluster is not connected the actions in the Workspace and Training pages are still enabled. After the corrections, the actions will be disabled."},{"location":"home/changelog/hotfixes-2-13/#version-21321","title":"Version 2.13.21","text":"Internal ID Description RUN-12563 Fixed an issue where users are unable to login after upgrading the control plane from 2.9.16 to 2.13.16. To correct the issue, secrets need to be upgraded manually in keycloak."},{"location":"home/changelog/hotfixes-2-13/#version-21320-september-28-2023","title":"Version 2.13.20 - September 28, 2023","text":""},{"location":"home/changelog/hotfixes-2-13/#release-content_1","title":"Release content","text":"
                                  • Added the prevention of selecting tenant or department scopes for credentials, and the prevention of selecting s3, PVC, and Git data sources if the cluster version does not support these.
                                  • Quota management is now enabled by default.
Internal ID Description RUN-12923 Fixed an issue in upgrading due to a misconfigured Docker image for airgapped systems in 2.13.19. The helm chart contained an error, and the image is not used even though it is packaged as part of the tar. RUN-12928, RUN-12968 Fixed an issue in upgrading Prometheus due to a misconfigured image for airgapped systems in 2.13.19. The helm chart contained an error, and the image is not used even though it is packaged as part of the tar. RUN-12751 Fixed an issue where upgrading from 2.9 to 2.13 resulted in a missing engine-config file. RUN-12717 Fixed an issue where a user logged in as researcher manager could not see the clusters. RUN-12642 Fixed an issue where assets-sync could not restart due to failing to get a token from the control plane. RUN-12191 Fixed an issue where there was a timeout while waiting for the runai_allocated_gpu_count_per_project metric to return values. RUN-10474 Fixed an issue where the runai-container-toolkit-exporter DaemonSet failed to start."},{"location":"home/changelog/hotfixes-2-13/#version-21319-september-27-2023","title":"Version 2.13.19 - September 27, 2023","text":""},{"location":"home/changelog/hotfixes-2-13/#release-content_2","title":"Release content","text":"
                                  • Added the ability to identify Kubeflow notebooks and display them in the Jobs table.
• Added the ability to schedule Kubeflow workloads.
• Added functionality that displays only the Jobs that belong to the logged-in user.
• Added and refined alerts for the state of Run:ai components, scheduling latency, and out-of-memory warnings on Jobs.
                                  • Added the ability to work with restricted PSA policy.
                                  "},{"location":"home/changelog/hotfixes-2-13/#fixed-issues_1","title":"Fixed issues","text":"Internal ID Description RUN-12650 Fixed an issue that used an incorrect metric in analytics GPU ALLOCATION PER NODE panel. Now the correct allocation is in percentage. RUN-12602 Fixed an issue in runaiconfig where the WorkloadServices spec has memory requests/limits and cpu requests/limits and gets overwritten with the system default. RUN-12585 Fixed an issue where the workload-controller creates a delay in running jobs. RUN-12031 Fixed an issue when upgrading from 2.9 to 2.13 where the Scheduler pod fails to upgrade due to the change of owner. RUN-11091 Fixed an issue where the Departments feature is disabled, you are not able to schedule non-preemable jobs."},{"location":"home/changelog/hotfixes-2-13/#version-21313","title":"Version 2.13.13","text":"Internal ID Description RUN-11321 Fixed an issue where metrics always showed CPU Memory Utilization and CPU Compute Utilization as 0. RUN-11307 Fixed an issue where node affinity might change mid way through a job. Node affinity in now calculated only once at job submission. RUN-11129 Fixed an issue where CRDs are not automatically upgraded when upgrading from 2.9 to 2.13."},{"location":"home/changelog/hotfixes-2-13/#version-21312-august-7-2023","title":"Version 2.13.12 - August 7, 2023","text":"Internal ID Description RUN-11476 Fixed an issue with analytics node pool filter in Allocated GPUs per Project panel."},{"location":"home/changelog/hotfixes-2-13/#version-21311","title":"Version 2.13.11","text":"Internal ID Description RUN-11408 Added to the Run:ai job-controller 2 configurable parameters QPS and Burst which are applied as environment variables in the job-controller Deployment object."},{"location":"home/changelog/hotfixes-2-13/#version-2137-july-2023","title":"Version 2.13.7 - July 2023","text":""},{"location":"home/changelog/hotfixes-2-13/#release-content_3","title":"Release content","text":"
                                  • Added filters to the historic quota ratio widget on the Quota management dashboard.
                                  "},{"location":"home/changelog/hotfixes-2-13/#fixed-issues_2","title":"Fixed issues","text":"Internal ID Description RUN-11080 Fixed an issue in OpenShift environments where log in via SSO with the kubeadmin user, gets blank pages for every page. RUN-11119 Fixed an issue where values that should be the Order of priority column are in the wrong column. RUN-11120 Fixed an issue where the Projects table does not show correct metrics when Run:ai version 2.13 is paired with a Run:ai 2.8 cluster. RUN-11121 Fixed an issue where the wrong over quota memory alert is shown in the Quota management pane in project edit form. RUN-11272 Fixed an issue in OpenShift environments where the selection in the cluster drop down in the main UI does not match the cluster selected on the login page."},{"location":"home/changelog/hotfixes-2-13/#version-2134","title":"Version 2.13.4","text":""},{"location":"home/changelog/hotfixes-2-13/#release-date","title":"Release date","text":"

                                  July 2023

                                  "},{"location":"home/changelog/hotfixes-2-13/#fixed-issues_3","title":"Fixed issues","text":"Internal ID Description RUN-11089 Fixed an issue when creating an environment, commands in the Runtime settings pane and are not persistent and cannot be found in other assets (for example in a new Training)."},{"location":"home/changelog/hotfixes-2-13/#version-2131-july-2023","title":"Version 2.13.1 - July 2023","text":""},{"location":"home/changelog/hotfixes-2-13/#release-content_4","title":"Release content","text":"
                                  • Made an improvement so that occurrences of labels that are not in use anymore are deleted.
                                  "},{"location":"home/changelog/hotfixes-2-13/#fixed-issues_4","title":"Fixed issues","text":"

                                  N/A

                                  "},{"location":"home/changelog/hotfixes-2-15/","title":"Changelog Version 2.15","text":"

                                  The following is a list of the known and fixed issues for Run:ai V2.15.

                                  "},{"location":"home/changelog/hotfixes-2-15/#version-2159-february-5-2024","title":"Version 2.15.9 - February 5, 2024","text":""},{"location":"home/changelog/hotfixes-2-15/#fixed-issues","title":"Fixed issues","text":"Internal ID Description RUN-15296 Fixed an issue where the resources parameter was deprecated in the Projects and Departments API."},{"location":"home/changelog/hotfixes-2-15/#version-2154-january-5-2024","title":"Version 2.15.4 - January 5, 2024","text":""},{"location":"home/changelog/hotfixes-2-15/#fixed-issues_1","title":"Fixed issues","text":"Internal ID Description RUN-15026 Fixed an issue in workloads that were built on a cluster that does not support the NFS field. RUN-14907 Fixed an issue after an upgrade where the Analytics dashboard was missing the time ranges from before the upgrade. RUN-14903 Fixed an issue where internal operations were exposed to the customer audit log. RUN-14062 Fixed an issue in the Overview dashboard where the content for the Running Workload per Type panel did not fit."},{"location":"home/changelog/hotfixes-2-15/#version-2152-february-5-2024","title":"Version 2.15.2 - February 5, 2024","text":""},{"location":"home/changelog/hotfixes-2-15/#fixed-issues_2","title":"Fixed issues","text":"Internal ID Description RUN-14434 Fixed an issue where the Allocated GPUs metric was multiplied by seven."},{"location":"home/changelog/hotfixes-2-15/#version-2151-december-17-2023","title":"Version 2.15.1 - December 17, 2023","text":""},{"location":"home/changelog/hotfixes-2-15/#release-content","title":"Release content","text":"
                                  • Added environment variables for customizable QPS and burst support.

                                  • Added the ability to support running multiple Prometheus replicas.

                                  "},{"location":"home/changelog/hotfixes-2-15/#fixed-issues_3","title":"Fixed issues","text":"Internal ID Description RUN-14292 Fixed an issue where BCM installations were failing due to missing create cluster permissions. RUN-14289 Fixed an issue where metrics were not working due to an incorrect parameter in the cluster-config file. RUN-14198 Fixed an issue in services where multi nodepool jobs were not scheduled due to an unassigned nodepool status. RUN-14191 Fixed an issue where a consolidation failure would cause unnecessary evictions. RUN-14154 Fixed an issue in the New cluster form, whefre the dropdown listed versions that were incompatible with the installed control plane. RUN-13956 Fixed an issue in the Jobs table where templates were not edited successfully. RUN-13891 Fixed an issue where Ray job statuses were shown as empty. RUN-13825 Fixed an issue where GPU sharing configmaps were not deleted. RUN-13628 Fixed an issue where the pre-install pod failed to run pre-install tasks due to the request being denied (Unauthorized). RUN-13550 Fixed an issue where environments were not recovering from a node restart due to a missing GPU runtime class for containerized nodes. RUN-11895 Fixed an issue where the wrong amount of GPU memory usage was shown (is now MB). RUN-11681 Fixed an issue in OpenShift environments where some metrics were not shown on dashboards when the GPU Operator from the RedHat marketplace was installed."},{"location":"home/changelog/hotfixes-2-15/#version-2150","title":"Version 2.15.0","text":""},{"location":"home/changelog/hotfixes-2-15/#fixed-issues_4","title":"Fixed issues","text":"Internal ID Description RUN-13456 Fixed an issue where the Researcher L1 role did not have permissions to create and manage credentials. RUN-13282 Fixed an issue where Workspace logs crashed unexpectedly after restarting. RUN-13121 Fixed an issue in not being able to launch jobs using the API after an upgrade overrode a change in keycloak for applications which have a custom mapping to an email. RUN-13103 Fixed an issue in the Workspaces and Trainings table where the action buttons were not greyed out for users with only the view role. RUN-12993 Fixed an issue where Prometheus was reporting metrics even though the cluster was disconnected. RUN-12978 Fixed an issue after an upgrade, where permissions fail to sync to a project due to a missing application name in the CRD. RUN-12900 Fixed an issue in the Projects table, when sorting by Allocated GPUs, the projects were displayed alphabetically and not numerically. RUN-12846 Fixed an issue after a control-plane upgrade, where GPU, CPU, and Memory Cost fields (in the Consumption Reports) were missing when not using Grafana. RUN-12824 Fixed an issue where airgapped environments tried to pull an image from gcr.io (Internet). RUN-12769 Fixed an issue where SSO users were unable to see projects in Job Form unless the group they belong to was added directly to the project. RUN-12602 Fixed an issue in the documentation where the WorkloadServices configuration in the runaiconfig file was incorrect. RUN-12528 Fixed an issue where the Workspace duration scheduling rule was suspending workspaces regardless of the configured duration. RUN-12298 Fixed an issue where projects were not shown in the Projects table due to the API not sanitizing the project name at time of creation. RUN-12157 Fixed an issue where querying pods completion time returned a negative number. 
RUN-10560 Fixed an issue where no Prometheus alerts were sent due to a misconfiguration of the parameter RunaiDaemonSetRolloutStuck."},{"location":"home/changelog/hotfixes-2-16/","title":"Changelog Version 2.16","text":"

                                  The following is a list of the known and fixed issues for Run:ai V2.16.

                                  "},{"location":"home/changelog/hotfixes-2-16/#version-21665","title":"Version 2.16.65","text":"Internal ID Description RUN-21448 Fixed an issue with degraded workload so the condition would reflect the actual state. RUN-20680 Fixed an issue where the workload page did not present the requested GPU."},{"location":"home/changelog/hotfixes-2-16/#version-21657","title":"Version 2.16.57","text":"Internal ID Description RUN-20388 Fixed an issue where cluster-sync caused a memory leak."},{"location":"home/changelog/hotfixes-2-16/#version-21625","title":"Version 2.16.25","text":"Internal ID Description RUN-17241 Fixed an issue where the nodes page showed nodes as not ready due to \"tookit not installed\"."},{"location":"home/changelog/hotfixes-2-16/#version-21621","title":"Version 2.16.21","text":"Internal ID Description RUN-16463 Fixed an issue after a cluster upgrade to v2.16, where some metrics of pre-existing workloads were displayed incorrectly in the Overview Dashboard."},{"location":"home/changelog/hotfixes-2-16/#version-21618","title":"Version 2.16.18","text":"Internal ID Description RUN-16486 Fixed an issue in the Workloads creation form where the GPU fields of the compute resource tiles were showing no data."},{"location":"home/changelog/hotfixes-2-16/#version-21616","title":"Version 2.16.16","text":"Internal ID Description RUN-16340 Fixed an issue in the Workloads table where filters were not saved correctly."},{"location":"home/changelog/hotfixes-2-16/#version-21615","title":"Version 2.16.15","text":""},{"location":"home/changelog/hotfixes-2-16/#release-content","title":"Release content","text":"
                                  • Implemented a new Workloads API to support the Workloads feature.
                                  "},{"location":"home/changelog/hotfixes-2-16/#fixed-issues","title":"Fixed issues","text":"Internal ID Description RUN-16070 Fixed an issue where missing metrics caused the Nodepools table to appear empty."},{"location":"home/changelog/hotfixes-2-16/#version-21614","title":"Version 2.16.14","text":""},{"location":"home/changelog/hotfixes-2-16/#release-content_1","title":"Release content","text":"

• Improved overall performance by reducing the frequency of metrics updates from every 10 seconds to every 30 seconds.

                                  "},{"location":"home/changelog/hotfixes-2-16/#fixed-issues_1","title":"Fixed issues","text":"Internal ID Description RUN-16255 Fixed an issue in the Analytics dashboard where the GPU Allocation per Node and GPU Memory Allocation per Node panels were displaying incorrect data. RUN-16035 Fixed an issue in the Workloads table where completed pods continue to be counted in the requested resources column."},{"location":"home/changelog/hotfixes-2-16/#version-21612","title":"Version 2.16.12","text":""},{"location":"home/changelog/hotfixes-2-16/#fixed-issues_2","title":"Fixed issues","text":"Internal ID Description RUN-16110 Fixed an issue where creating a training workload (single or multi-node) with a new PVC or Volume, resulted in the Workloads table showing the workload in the Unknown/Pending status. RUN-16086 Fixed an issue in airgapped environments where incorrect installation commands were shown when upgrading to V2.15."},{"location":"home/changelog/hotfixes-2-16/#version-21611","title":"Version 2.16.11","text":"

                                  N/A

                                  "},{"location":"home/changelog/hotfixes-2-16/#version-2169","title":"Version 2.16.9","text":"

                                  N/A

                                  "},{"location":"home/changelog/hotfixes-2-16/#version-2168","title":"Version 2.16.8","text":""},{"location":"home/changelog/hotfixes-2-16/#release-content_2","title":"Release content","text":"

                                  N/A

                                  "},{"location":"home/changelog/hotfixes-2-16/#version-2167","title":"Version 2.16.7","text":""},{"location":"home/changelog/hotfixes-2-16/#release-content_3","title":"Release content","text":"
• Added an API endpoint that retrieves data from a workload's pod.
                                  "},{"location":"home/changelog/hotfixes-2-16/#fixed-issues_3","title":"Fixed issues","text":"

                                  N/A

                                  "},{"location":"home/changelog/hotfixes-2-16/#version-2166","title":"Version 2.16.6","text":"

                                  N/A

                                  "},{"location":"home/changelog/hotfixes-2-17/","title":"Changelog Version 2.17","text":"

                                  The following is a list of the known and fixed issues for Run:ai V2.17.

                                  "},{"location":"home/changelog/hotfixes-2-17/#version-21763","title":"Version 2.17.63","text":"Internal ID Description RUN-21448 Fixed an issue where a degraded workload was stuck and could not be released."},{"location":"home/changelog/hotfixes-2-17/#version-21746","title":"Version 2.17.46","text":"Internal ID Description RUN-20136 Updated postgres version."},{"location":"home/changelog/hotfixes-2-17/#version-21743","title":"Version 2.17.43","text":"Internal ID Description RUN-19949 Fixed an issue where runai submit arguments were not parsed correctly to the command."},{"location":"home/changelog/hotfixes-2-17/#version-21741","title":"Version 2.17.41","text":"Internal ID Description RUN-19870 Added debug logs to cluster-sync"},{"location":"home/changelog/hotfixes-2-17/#version-21726","title":"Version 2.17.26","text":"Internal ID Description RUN-19189 Fixed an issue in cluster-sync that sometimes caused unnecessary sync process to the control-plane."},{"location":"home/changelog/hotfixes-2-17/#version-21725","title":"Version 2.17.25","text":"Internal ID Description RUN-16357 Fixed an issue where the Project button in the Jobs screen redirects to the Projects page but on the wrong cluster."},{"location":"home/changelog/hotfixes-2-17/#version-21710","title":"Version 2.17.10","text":"Internal ID Description RUN-18065 Fixed an issue where the legacy job sumbission configuration was not available in the Settings page"},{"location":"home/changelog/hotfixes-2-17/#version-2170","title":"Version 2.17.0","text":"Internal ID Description RUN-20010 Fixed an issue of reduced permissions that run:ai grants users"},{"location":"home/changelog/hotfixes-2-18/","title":"Changelog Version 2.18","text":"

                                  The following is a list of the known and fixed issues for Run:ai V2.18.

                                  "},{"location":"home/changelog/hotfixes-2-18/#hotfixes","title":"Hotfixes","text":"Internal ID Hotfix # Description RUN-24521 2.18.83 Fixed a security vulnerability in golang.org.x.crypto with CVE CVE-2024-45337 with severity HIGH. RUN-24733 2.18.83 Fixed an issue where department admins were unable to load the quota management page. RUN-25094 2.18.82 Fixed an issue where OpenShift could not be upgraded due to a broken 3rd binary. RUN-24921 2.18.80 Fixed a security vulnerability in golang.org.x.net and golang.org.x.crypto. RUN-24632 2.18.80 Fixed an issue where an existing monitoring Prometheus setup deployed in an unexpected namespace was reported as missing, causing Run:ai installation to fail on the cluster. The installation mechanism now searches for the monitoring prerequisite in additional relevant namespaces. RUN-24693 2.18.80 Fixed an issue where users were unable to provide metric store authentication details using secret references. RUN-24752 2.18.79 Fixed an issue where a workload would move to a failed state when created with a custom NodePort that was already allocated. RUN-24649 2.18.79 Fixed an issue where submitting a workload with existingPvc=false and not providing a claimName resulted in auto-generating a claimName that included both upper and lower case letters. Since Kubernetes rejects uppercase letters, the workload would fail. The behavior has been updated to generate names using only lowercase letters. RUN-24595 2.18.78 Fixed an issue where the new CLI did not parse master and worker commands/args simultaneously for distributed workloads. RUN-23914 2.18.78 Fixed an issue where unexpected behavior could occur if an application was capturing a graph while memory was being swapped in as part of the GPU memory swap feature. RUN-24020 2.18.77 Fixed a security vulnerability in k8s.io.kubernetes with CVE CVE-2024-0793. RUN-24021 2.18.77 Fixed a security vulnerability in pam with CVE CVE-2024-10963. RUN-23798 2.18.75 Fixed an issue in distributed PyTorch workloads where the worker pods are deleted immediately after completion, not allowing logs to be viewed. RUN-23838 2.18.74 Fixed an issue where the command-line interface could not access resources when configured as single-sign on in a self-hosted environment. RUN-23561 2.18.74 Fixed an issue where the frontend in airgapped environment attempted to download font resources from the internet. RUN-23789 2.18.73 Fixed an issue where in some cases, it was not possible to download the latest version of the command line interface. RUN-23790 2.18.73 Fixed an issue where in some cases it was not possible to download the Windows version of the command line interface. RUN-23855 2.18.73 Fixed an issue where the pods list in the UI showed past pods. RUN-23909 2.18.73 Fixed an issue where users based on group permissions cannot see dashboards. RUN-23857 2.18.72 Dashboard to transition from Grafana v9 to v10. RUN-24010 2.18.72 Fixed an infinite loop issue in the cluster-sync service. RUN-23040 2.18.72 Fixed an edge case where the Run:ai container toolkit hangs when user is spawning hundreds of sub-processes. RUN-23802 2.18.70 Fixed an issue where new scheduling rules were not applied to existing workloads, if those new rules were set on existing projects which had no scheduling rules before. RUN-23211 2.18.70 Fixed an issue where workloads were stuck at \"Pending\" when the command-line interface flag --gpu-memory was set to zero. 
RUN-23778 2.18.68 Fixed an issue where in single-sign-on configuration, the mapping of UID and other properties would sometimes disappear. RUN-23762 2.18.68 Fixed an issue where the wrong version of a Grafana dashboard was displayed in the UI. RUN-21198 2.18.66 Fixed an issue where creating a training workload via yaml (kubectl apply -f) and specifying spec.namePrefix created infinite jobs. RUN-23541 2.18.65 Fixed an issue where in some cases workload authorization did not work properly due to a wrong oidc configuration. RUN-23291 2.18.64 CLI - changed text to be user friendly. RUN-23283 2.18.64 Fixed a permissions issue with the Analytics dashboard post upgrade for SSO users. RUN-23420 2.18.63 Replaced Redis with KeyDB. RUN-23140 2.18.63 Fixed an issue where distributed workloads were created with the wrong types. RUN-23130 2.18.63 Fixed an issue where inference-workload-controller crashed when WorkloadOwnershipProtection was enabled. RUN-23334 2.18.62 Updated core Dockerfiles to ubi9. RUN-23296 2.18.62 Fixed an issue in the CLI where runai attach did not work with auto-complete. RUN-23215 2.18.62 Fixed an issue where metrics requests from backend to mimir failed for certain tenants. RUN-22138 2.18.62 Fixed an issue where private URL user(s) input was an email and not a string. RUN-23282 2.18.61 CLI documentation fixes. RUN-23055 2.18.60 Fixed unified Distributed and Training CLI commands. RUN-23243 2.18.59 Fixed an issue where the scope tree wasn't calculating permissions correctly. RUN-22463 2.18.59 Fixed an error in the CLI bash command. RUN-22314 2.18.59 Fixed distributed framework filtering in API commands. RUN-23142 2.18.58 Fixed an issue where per-GPU advanced GPU metrics did not have a gpu label. RUN-23001 2.18.58 Fixed an issue of false overcommit on out-of-memory kills in the \u201cswap\u201d feature. RUN-22851 2.18.58 Fixed an issue where a client may get stuck on a device lock acquired during \u201cswap\u201d out-migration. RUN-22758 2.18.58 Fixed an issue where an inference workload showed the wrong status when submission failed. RUN-22544 2.18.58 Updated the Grafana version for security vulnerabilities. RUN-23055 2.18.57 Fixed the unified Distributed and Training CLI commands. RUN-23014 2.18.56 Fixed an issue where node-scale-adjuster might not create a scaling pod if it is in cool-down and the pod was not updated after that. RUN-22660 2.18.56 Fixed an issue where workload charts had an unclear state. RUN-22457 2.18.55 Fixed an issue where in rare edge cases the cluster-sync pod was out of memory. RUN-21825 2.18.55 Fixed all CVEs in Run:ai's Goofys-based image used for S3 integration. RUN-22871 2.18.55 Fixed an issue in runai-container-toolkit where in certain cases when a process is preempted, OOMKill metrics were not published correctly. RUN-22250 2.18.55 Fixed an issue where workloads trying to use an ingress URL which was already in use behaved inconsistently instead of failing immediately. RUN-22880 2.18.55 Fixed an issue where the minAvailable field for training-operator CRDs did not consider all possible replica specs. RUN-22073 2.18.55 Fixed an issue where runai-operator failed to parse cluster URLs ending with '/'. RUN-22453 2.18.55 Fixed an issue where in rare edge cases the workload-overseer pod experienced a crash. RUN-22763 2.18.55 Fixed an issue where in rare edge cases an 'attach' command from CLI-V2 caused a crash in the cluster-api service. RUN-21948 2.18.49 Fixed an issue where in rare edge cases workload child resources could have duplicate names, causing inconsistent behavior.
RUN-22623 2.18.49 Fixed an issue in Openshift where workloads were not suspended when reaching their idle GPU time limit. RUN-22600 2.18.49 Fixed an issue in AWS EKS clusters where the V1-CLI returned an empty table when listing all projects as an administrator. RUN-21878 2.18.49 Added a label, run.ai/container-toolkit-enabled, to disable the container toolkit from running on certain nodes. RUN-22452 2.18.47 Fixed an issue where the scheduler had signature errors if TopologySpreadConstraints was partially defined. RUN-22570 2.18.47 Updated git-sync image to version v4.3.0. RUN-22054 2.18.46 Fixed an issue where users could not attach to jobs. RUN-22377 2.18.46 Removed uncached client from accessrule-controller. RUN-21697 2.18.46 Fixed an issue where a client may deadlock on suspension during an allocation request. RUN-20073 2.18.45 Fixed an issue where it wasn't possible to authenticate with user credentials in the CLI. RUN-21957 2.18.45 Fixed an issue where there was a missing username-loader container in inference workloads. RUN-22276 2.18.39 Fixed an issue where the Knative external URL was missing from the Connections modal. RUN-22280 2.18.39 Fixed an issue when setting scale to zero - there was no pod counter in the Workload grid. RUN-19811 2.18.39 Added an option to set k8s tolerations to run:ai daemonsets (container-toolkit, runai-device-plugin, mig-parted, node-exporter, etc.). RUN-22128 2.18.39 Added GID, UID, Supplemental groups to the V1 CLI. RUN-21800 2.18.37 Fixed an issue with old workloads residing in the cluster. RUN-21907 2.18.34 Fixed an issue where the SSO user credentials contained supplementary groups as a string instead of an int. RUN-21272 2.18.31 Fixed an issue with multi-cluster credentials creation, specifically with the same name in different clusters. RUN-20680 2.18.29 Fixed an issue where the Workloads page did not present the requested GPU. RUN-21200 2.18.29 Fixed issues with upgrades and connections from v2.13. RUN-20970 2.18.27 Fixed an issue with PUT APIs. RUN-20927 2.18.26 Fixed an issue where node affinity was not updated correctly when editing projects. RUN-20084 2.18.26 Fixed an issue where the default department was deleted instead of a message being displayed. RUN-21062 2.18.26 Fixed issues with the API documentation. RUN-20434 2.18.25 Fixed an issue where creating a Project/Department with memory resources required 'units'. RUN-20923 2.18.25 Fixed an issue with the projects/departments page loading slowly. RUN-19872 2.18.23 Fixed an issue where the Toolkit crashed and failed to create and replace the publishing binaries. RUN-20861 2.18.22 Fixed an issue where a pod was stuck on pending due to a missing resource reservation pod. RUN-20842 2.18.22 Fixed an issue of an illegal model name with \".\" in the Hugging Face integration. RUN-20791 2.18.22 Fixed an issue where notifications froze after startup. RUN-20865 2.18.22 Fixed an issue where default departments were not deleted when a cluster was deleted. RUN-20698 2.18.21 Fixed an issue where two processes requesting a device at the same time received the same GPU, causing failures. RUN-20760 2.18.18 Fixed an issue where the workload protection UI showed the wrong status. RUN-20612 2.18.15 Fixed an issue where it was impossible with the use-table-data to hide node pool columns when there is only one default node pool. RUN-20735 2.18.15 Fixed an issue where nodePool.name was undefined. RUN-20721 2.18.12 Added error handling to nodes pages. RUN-20578 2.18.10 Fixed an issue regarding policy enforcement.
RUN-20188 2.18.10 Fixed an issue with defining SSO in the OpenShift identity provider. RUN-20673 2.18.9 Fixed an issue where, when a researcher used a distributed elastic job, in a specific flow it could be scheduled on more than one node pool. RUN-20360 2.18.7 Fixed an issue where the workload network status was misleading. RUN-22107 2.18.7 Fixed an issue where passwords containing $ were removed from the configuration. RUN-20510 2.18.5 Fixed an issue with external workloads - the argocd workflow failed to be updated. RUN-20516 2.18.4 Fixed an issue where, after deploying to production, the cluster-service and authorization-service got OOMKilled multiple times every ~1 hour. RUN-20485 2.18.2 Changed policy flags to Beta. RUN-20005 2.18.1 Fixed an issue where a sidecar container failure failed the workload. RUN-20169 2.18.1 Fixed an issue allowing the addition of annotations and labels to workload resources. RUN-20108 2.18.1 Fixed an issue exposing service node ports to workload status. RUN-20160 2.18.1 Fixed an issue with version display when installing a new cluster in an airgapped environment. RUN-19874 2.18.1 Fixed an issue when copying and editing a workload with group access to a tool, where the group wasn't removed when selecting the users option. RUN-19893 2.18.1 Fixed an issue when using a float number in the scale-to-zero inactivity value (custom), which sometimes caused the submission to fail. RUN-20087 2.18.1 Fixed an issue where inference graphs should be displayed only for minimum cluster versions. RUN-10733 2.18.1 Fixed an issue where we needed to minify and obfuscate our code in production. RUN-19962 2.18.1 Fixed an issue to fix the sentry domains regex and map them to relevant projects. RUN-20104 2.18.1 Fixed an issue where a frontend infinite loop on Keycloak caused an error. RUN-19906 2.18.1 Fixed an issue where inference workload name validation failed with a 2.16 cluster. RUN-19605 2.18.1 Fixed an issue where authorized users should support multiple users (workload-controller). RUN-19903 2.18.1 Fixed an issue where inference chatbot creation failed with a 2.16 cluster. RUN-20409 2.18.1 Fixed an issue where clicking on create new compute during the runai model flow did nothing. RUN-11224 2.18.1 Fixed an issue where runai-adm collect all logs was not collecting all logs. RUN-20478 2.18.1 Improved workloads error status in the overview panel. RUN-19850 2.18.1 Fixed an issue where an application administrator could not submit a job with the CLI. RUN-19863 2.18.1 Fixed an issue where a department admin received 403 on get tenants and could not log in to the UI. RUN-19904 2.18.1 Fixed an issue where filtering by allocatedGPU in get workloads with an operator returned an incorrect result. RUN-19925 2.18.1 Fixed an issue where an upgrade from v2.16 to v2.18 failed on workloads migrations. RUN-19887 2.18.1 Fixed an issue in the UI where, when there was a scheduling rule of timeout, the form opened with the rules collapsed and showing \"none\". RUN-19941 2.18.1 Fixed an issue where completed and failed jobs were shown in view pods in the nodes screen. RUN-19940 2.18.1 Fixed an issue where setting the gpu quota failed because the department quota was taken from the wrong department. RUN-19890 2.18.1 Fixed an issue where editing a project by removing its node-affinity got stuck updating. RUN-20120 2.18.1 Fixed an issue where a project update failed when there was no cluster version. RUN-20113 2.18.1 Fixed an issue in the Workloads table where a researcher did not see other workloads once they cleared their filters.
RUN-19915 2.18.1 Fixed an issue when turning departments toggles on on cluster v2.11+ the gpu limit is -1 and there is ui error. RUN-20178 2.18.1 Fixed an issue where dashboard CPU tabs appeared in new overview. RUN-20247 2.18.1 Fixed an issue where you couldn't create a workload with namespace of a deleted project. RUN-20138 2.18.1 Fixed an issue where the system failed to create node-type on override-backend env. RUN-18994 2.18.1 Fixed an issue where some limitations for department administrator are not working as expected. RUN-19830 2.18.1 Fixed an issue where resources (GPU, CPU, Memory) units were added to k8s events that are published by run:ai scheduler making our messages more readable."},{"location":"home/changelog/hotfixes-2-18/#version-2180-fixes","title":"Version 2.18.0 Fixes","text":"Internal ID Description RUN-20734 Fixed an issue where the enable/disable toggle for the feature was presenting wrong info. RUN-19895 Fixed an issue of empty state for deleted workloads which is incorrect. RUN-19507 Fixed an issue in V1 where get APIs are missing required field in swagger leading to omit empty. RUN-20246 Fixed an issue in Departments v1 org unit where if unrecognizable params are sent, an error is returned. RUN-19947 Fixed an issue where pending multi-nodepool podgroups got stuck after cluster upgrade. RUN-20047 Fixed an issue where Workload status shows as \"deleting\" rather than \"deleted\" in side panel. RUN-20163 Fixed an issue when a DV is shared with a department and a new project is added to this dep - no pvc/pv is created. RUN-20484 Fixed an issue where Create Projects Requests Returned 500 - services is not a valid ResourceType. RUN-20354 Fixed an issue when deleting a department with projects resulted in projects remaining in environment with the status NotReady."},{"location":"home/changelog/hotfixes-2-19/","title":"Changelog Version 2.19","text":"

                                  The following is a list of the known and fixed issues for Run:ai V2.19.

                                  "},{"location":"home/changelog/hotfixes-2-19/#hotfixes","title":"Hotfixes","text":"Internal ID Hotfix # Description RUN-17284 2.19.49 Fixed an issue where workloads were suspended when set with the termination after preemption option. RUN-25290 2.19.49 Fixed a security vulnerability in golang.org/x/net v0.33.0 with CVE-2024-45338 with severity HIGH. RUN-25234 2.19.49 Fixed security vulnerabilities by updating oauth2 proxy image to the latest. RUN-25234 2.19.48 Fixed an authentication issue in CLI V1. RUN-25062 2.19.45 Fixed a security vulnerability in github.com.go-git.go-git.v5 with CVE CVE-2025-21614 with severity HIGH. RUN-25061 2.19.45 Fixed a security vulnerability in github.com.go-git.go-git.v5 with CVE CVE-2025-21613 with severity HIGH. RUN-24857 2.19.45 Fixed a security vulnerability in golang.org.x.net with CVE CVE-2024-45338 with severity HIGH. RUN-24733 2.19.45 Fixed an issue where users were unable to load the quota management dashboard. RUN-25094 2.19.44 Fixed an issue where OpenShift could not be upgraded due to a broken 3rd binary. RUN-24026 2.19.40 Fixed a security vulnerability in krb5-libs with CVE CVE-2024-3596. RUN-24649 2.19.40 Fixed an issue where submitting a workload with existingPvc=false and not providing a claimName resulted in auto-generating a claimName that included both upper and lower case letters. Since Kubernetes rejects uppercase letters, the workload would fail. The behavior has been updated to generate names using only lowercase letters. RUN-24632 2.19.40 Fixed an issue where an existing Prometheus monitoring setup deployed in an unexpected namespace was reported as missing, causing Run:ai installation to fail on the cluster. The installation mechanism now searches for the monitoring prerequisite in additional relevant namespaces. RUN-24693 2.19.40 Fixed an issue where users were unable to provide metric store authentication details using secret references. RUN-23744 2.19.40 Fixed an issue where refreshing some pages (such as the settings, policy, and access rules) removed the side navigation. RUN-24715 2.19.40 Fixed an issue in the templates form where selecting Secret as a data source got stuck in an infinite loading page. RUN-24831 2.19.40 Fixed an issue where some edge cases triggered consolidation without it actually being necessary. RUN-24873 2.19.40 Fixed an issue where users were unable to configure email notifications regarding workload statuses. RUN-24921 2.19.40 Fixed a security vulnerability in golang.org.x.net and golang.org.x.crypto. RUN-23914 2.19.38 Fixed an issue where unexpected behavior could occur if an application was capturing a graph while memory was being swapped in as part of the GPU memory swap feature. RUN-24521 2.19.36 Fixed a security vulnerability in golang.org.x.crypto with CVE CVE-2024-45337 with severity HIGH. RUN-24595 2.19.36 Fixed an issue where the new command-line interface did not parse master and worker commands/args simultaneously for distributed workloads. RUN-24565 2.19.34 Fixed an issue where the UI was hanging at times during Hugging Face model memory calculation. RUN-24021 2.19.33 Fixed a security vulnerability in pam with CVE-2024-10963. RUN-24506 2.19.33 Fixed a security vulnerability in krb5-libs with CVE-2024-3596. RUN-24259 2.19.31 Fixed an issue where the option to reset a local user password is sometimes not available. 
RUN-23798 2.19.30 Fixed an issue in distributed PyTorch workloads where the worker pods were deleted immediately after completion, not allowing logs to be viewed. RUN-24184 2.19.28 Fixed an issue in database migration when upgrading from 2.16 to 2.19. RUN-23752 2.19.27 Fixed an issue in the distributed training submission form when a policy on the master pod was applied. RUN-23040 2.19.27 Fixed an edge case where the Run:ai container toolkit hung when a user was spawning hundreds of sub-processes. RUN-23211 2.19.27 Fixed an issue where workloads were stuck at \"Pending\" when the command-line interface flag --gpu-memory was set to zero. RUN-23561 2.19.27 Fixed an issue where the frontend in an airgapped environment attempted to download font resources from the internet. RUN-23789 2.19.27 Fixed an issue where in some cases, it was not possible to download the latest version of the command-line interface. RUN-23790 2.19.27 Fixed an issue where in some cases it was not possible to download the Windows version of the command-line interface. RUN-23802 2.19.27 Fixed an issue where new scheduling rules were not applied to existing workloads, if those new rules were set on existing projects which had no scheduling rules before. RUN-23838 2.19.27 Fixed an issue where the command-line interface could not access resources when configured as single-sign on in a self-hosted environment. RUN-23855 2.19.27 Fixed an issue where the pods list in the UI showed past pods. RUN-23857 2.19.27 Dashboard to transition from Grafana v9 to v10. RUN-24010 2.19.27 Fixed an infinite loop issue in the cluster-sync service. RUN-23669 2.19.25 Fixed an issue where the export function of the consumption Grafana dashboard was not showing. RUN-23778 2.19.24 Fixed an issue where the mapping of UID and other properties disappeared. RUN-23770 2.19.24 Fixed an issue where the older overview dashboard did not filter on cluster, even though a cluster was selected. RUN-23762 2.19.24 Fixed an issue where the wrong version of a Grafana dashboard was displayed in the UI. RUN-23752 2.19.24 Fixed an issue in the distributed training submission form when a policy on the master pod was applied. RUN-23664 2.19.24 Fixed an issue where the GPU quota numbers on the department overview page did not match the department edit page. RUN-21198 2.19.22 Fixed an issue where creating a training workload via yaml (kubectl apply -f) and specifying spec.namePrefix created infinite jobs. RUN-23583 2.19.21 Fixed an issue where the new UI navigation bar sometimes showed multiple selections. RUN-23541 2.19.21 Fixed an issue where authorization was not working properly in SaaS due to the wrong oidc URL being used. RUN-23376 2.19.21 Fixed an issue where the new command-line interface required re-login after 10 minutes. RUN-23162 2.19.21 Fixed an issue where older audit logs did not show on the new audit log UI.
RUN-23385 2.19.20 Fixed an issue where calls to api/v1/notifications/config/notifications would return 502. RUN-23382 2.19.20 Fixed an issue where all nodepools were deleted on cluster upgrade. RUN-23374 2.19.20 Fixed an issue where a \"ghost\" nodepool in project settings prevented workload creation via UI/API. RUN-23291 2.19.20 CLI - changed text to be user friendly. RUN-23283 2.19.20 Fixed a permissions issue with the Analytics dashboard post upgrade for SSO users. RUN-23208 2.19.20 Uploaded the source map to Sentry only. RUN-22642 2.19.20 Added infw-controller service tests for the reconcile. RUN-23373 2.19.19 Fixed an issue where a new data source couldn't be created from the \"New Workload\" form. RUN-23368 2.19.19 Fixed an issue where the getProjects v1 API returned a list of users which was not always in the same order. RUN-23333 2.19.19 Fixed an issue where a node pool with overProvisioningRatio greater than 1 could not be created. RUN-23215 2.19.18 Fixed an issue where metrics requests from backend to mimir failed for certain tenants. RUN-23334 2.19.17 Updated some dockerfiles to the latest ubi9 image for security vulnerabilities. RUN-23318 2.19.16 Fixed an issue where some projects held faulty data which caused the getProjectById API to fail. RUN-23140 2.19.16 Fixed an issue where distributed workloads were created with the wrong types. RUN-22069 2.19.16 Fixed an issue where JWT parse with claims failed to parse a token without Keyfunc. RUN-23321 2.19.15 Fixed an issue where the GetProjectById wrapper API of the org-unit client in the runai-common-packages ignored errors. RUN-23296 2.19.15 Fixed an issue in the CLI where runai attach did not work with auto-complete. RUN-23282 2.19.15 CLI documentation fixes. RUN-23245 2.19.15 Fixed an issue where the binder service didn't update the pod status. RUN-23057 2.19.15 OCP 2.19 upgrade troubleshooting. RUN-22138 2.19.15 Fixed an issue where private URL user(s) input was an email and not a string. RUN-23243 2.19.14 Fixed an issue where the scope tree wasn't calculating permissions correctly. RUN-23208 2.19.14 Uploaded the source map to Sentry only. RUN-23198 2.19.14 Fixed an issue where external-workload-integrator sometimes crashed for RayJob. RUN-23191 2.19.13 Fixed an issue where creating workloads in the UI returned only the first 50 projects. RUN-23142 2.19.12 Fixed an issue where advanced GPU metrics per-gpu did not have a gpu label. RUN-23139 2.19.12 Fixed an issue where an inference workload showed the wrong status. RUN-23027 2.19.12 Deprecated migProfiles API fields. RUN-23001 2.19.12 Fixed an issue of false overcommit on out-of-memory kills in the Swap feature. RUN-22851 2.19.12 Fixed an issue where a client may get stuck on a device lock acquired during \u201cswap\u201d out-migration. RUN-22771 2.19.12 Fixed an issue where get cluster by id with metadata verbosity returned zero values. RUN-22742 2.19.12 Fixed a user experience issue in inference autoscaling. RUN-22725 2.19.12 Fixed an issue where the cloud operator failed to get pods in the nodes UI. RUN-22720 2.19.12 Fixed an issue where the cloud operator failed to get projects in the node pools UI. RUN-22700 2.19.12 Added auto refresh to the overview dashboard, the Pods modal in the Workloads page, and the Event history page. RUN-22544 2.19.12 Updated the Grafana version for security vulnerabilities.
RUN-23083 2.19.11 Fixed an issue where workload actions were blocked in the UI when the cluster had any issues. RUN-22771 2.19.11 Fixed an issue where the getClusterById API with metadata verbosity returned zero values."},{"location":"home/changelog/hotfixes-2-19/#version-2190-fixes","title":"Version 2.19.0 Fixes","text":"Internal ID Description RUN-21756 Fixed an issue where the NFS mount path doesn\u2019t accept \u201c{}\u201d characters. RUN-21475 Fixed an issue where users failed to select the compute resource from the UI if the compute resource is last in the list and has a long name."},{"location":"home/changelog/hotfixes-2-20/","title":"Changelog Version 2.20","text":"

                                  This section provides details on all hotfixes available for version 2.20. Hotfixes are critical updates released between our major and minor versions to address specific issues or vulnerabilities. These updates ensure the system remains secure, stable, and optimized without requiring a full version upgrade.

                                  "},{"location":"home/changelog/hotfixes-2-20/#hotfixes","title":"Hotfixes","text":"Version Date Internal ID Description 2.20.15 24/01/2025 RUN-24354 Fixed an issue where migrating workloads failed due to slow network connection. 2.20.14 23/01/2025 RUN-24754 Fixed an issue where the status of training and interactive workloads was not updated correctly. 2.20.14 23/01/2025 RUN-24838 Fixed an issue where an environment asset could not be created if it included an environment variable with no value specified. 2.20.11 21/01/2025 RUN-25303 Fixed an issue where submitting with the --attach flag was supported only in a workspace workload. 2.20.11 21/01/2025 RUN-25291 Fixed a security vulnerability in golang.org/x/net v0.33.0 with CVE-2024-45338 with severity HIGH. 2.20.10 20/01/2025 RUN-25234 Fixed an authentication issue in CLI V1. 2.20.9 19/01/2025 RUN-25032 Fixed an issue where inference workloads with large container sizes skipped the Initializing state. 2.20.9 19/01/2025 RUN-24752 Fixed an issue where a workload would move to a failed state when created with a custom NodePort that was already allocated. 2.20.9 19/01/2025 RUN-25031 Fixed an issue in the Templates form where existing credentials in the environment variables section were not displayed. 2.20.5 14/01/2025 RUN-25061 Fixed a security vulnerability in github.com.go-git.go-git.v5 with CVE CVE-2025-21613 with severity HIGH."},{"location":"platform-admin/overview/","title":"Overview: Platform Administrator","text":"

                                  The Platform Administrator is responsible for the day-to-day administration of the product.

The Platform Administrator documentation covers how to:

                                  • Provide the right access level to users.
• Configure Run:ai metadata such as Projects, Departments, Node pools, etc.
                                  • Understand Researcher Workloads and set up Workload Policies and Assets.
                                  • Review possible integrations with third-party products.
                                  • Analyze system performance and perform suggested actions.
                                  "},{"location":"platform-admin/aiinitiatives/overview/","title":"Adapting AI initiatives to your organization","text":"

AI initiatives refer to advancing the research, development, and implementation of AI technologies. These initiatives represent your business needs and involve collaboration between individuals, teams, and other stakeholders. AI initiatives require compute resources and a methodology to use those compute resources effectively and efficiently, and to split them among the different AI initiative stakeholders. The building blocks of AI compute resources are GPUs, CPUs, and CPU memory, which are built into nodes (servers) and can be further grouped into node pools. Nodes and node pools are part of a Kubernetes cluster.

                                  To manage AI initiatives in Run:ai you should:

                                  • Map your organization and initiatives to projects and optionally departments
                                  • Map compute resources (node pools and quotas) to projects and optionally departments
                                  • Assign users (e.g. AI practitioners, ML engineers, Admins) to projects and departments
                                  "},{"location":"platform-admin/aiinitiatives/overview/#mapping-your-organization","title":"Mapping your organization","text":"

The way you map your AI initiatives and organization into Run:ai projects and departments should reflect your organization's structure and project management practices. There are multiple options; below are three typical ways to map your organization, initiatives, and users into Run:ai, but any other mapping that suits your requirements is also acceptable.

                                  "},{"location":"platform-admin/aiinitiatives/overview/#based-on-individuals","title":"Based on individuals","text":"

                                  A typical use case would be students (individual practitioners) within a faculty (business unit) - an individual practitioner may be involved in one or more initiatives. In this example, the resources are accounted for by the student (project) and aggregated per faculty (department). Department = business unit / Project = individual practitioner

                                  "},{"location":"platform-admin/aiinitiatives/overview/#based-on-business-units","title":"Based on business units","text":"

                                  A typical use case would be an AI service (business unit) split into AI capabilities (initiatives) - an individual practitioner may be involved in several initiatives. In this example, the resources are accounted for by Initiative (project) and aggregated per AI service (department).

                                  Department = business unit / Project = initiative

                                  "},{"location":"platform-admin/aiinitiatives/overview/#based-on-the-organizational-structure","title":"Based on the organizational structure","text":"

                                  A typical use case would be a business unit split into teams - an individual practitioner is involved in a single team (project) but the team may be involved in several AI initiatives. In this example, the resources are accounted for by team (project) and aggregated per business unit (department).

                                  Department = business unit / Project = team

                                  "},{"location":"platform-admin/aiinitiatives/overview/#mapping-your-resources","title":"Mapping your resources","text":"

AI initiatives require compute resources such as GPUs and CPUs to run. Compute resources in any organization are limited, whether because the number of servers (nodes) the organization owns is limited or because the budget for leasing cloud resources or buying in-house servers is limited. Every organization strives to optimize the use of its resources by maximizing utilization while still meeting every user's needs. To do so, the organization needs to split resources according to its internal priorities and budget constraints. But even after splitting the resources, the orchestration layer should still provide fairness between the resource consumers and allow access to unused resources, to minimize idle resources.

Another aspect of resource management is how to group your resources effectively, especially in large environments or environments made of heterogeneous hardware types, where some users need specific hardware types and others should be kept from occupying critical hardware needed by certain users or initiatives.

Run:ai assists you with all of these complex issues by allowing you to map your cluster resources to node pools, assign each Project and Department a quota allocation per node pool, and set access rights to unused resources (over quota) per node pool.

                                  "},{"location":"platform-admin/aiinitiatives/overview/#grouping-your-resources","title":"Grouping your resources","text":"

                                  There are several reasons why you would group resources (nodes) into node pools:

• Control the GPU type to use in a heterogeneous hardware environment - in many cases, AI models are optimized for the hardware type they run on, e.g. a training workload that is optimized for an H100 does not necessarily run optimally on an A100, and vice versa. Therefore, segmenting into node pools, each with a different hardware type, gives the AI researcher and ML engineer better control of where to run.
• Quota control - splitting into node pools allows the admin to set a specific quota per hardware type, e.g. give a high-priority project guaranteed access to advanced GPU hardware, while keeping a lower-priority project at a lower quota, or even with no quota at all for that high-end GPU and only "best-effort" access (i.e. only when the high-priority, guaranteed project is not using those resources).
• Multi-region or multi-availability-zone cloud environments - if some or all of your clusters run in the cloud (or even on-premises) and any of your clusters uses different physical locations or different topologies (e.g. racks), you probably want to segment your resources per region/zone/topology so you can control where your workloads run and how much quota to assign to specific environments (per project, per department), even if all those locations use the same hardware type. This methodology can also help optimize workload performance by exploiting locality, such as the locality of distributed workloads, local storage, etc.
• Explainability and predictability - large environments are complex to understand, and this becomes even more complex when the environment is loaded. To maintain users' satisfaction and their understanding of the resources' state, as well as to keep your workloads' chances of being scheduled predictable, segmenting your cluster into smaller pools can help significantly.
• Scale - Run:ai's implementation of node pools has many benefits, and one of the main ones is scale. Each node pool has its own scheduler instance, allowing the cluster to handle more nodes and schedule workloads faster when segmented into node pools vs. one large cluster. To allow your workloads to use any resource within a cluster that is split into node pools, a second-level scheduler is in charge of scheduling workloads between node pools according to your preferences and resource availability.
• Prevent mutual exclusion - some AI workloads consume CPU-only resources. To prevent those workloads from consuming the CPU resources of GPU nodes and thus blocking GPU workloads from using those nodes, it is recommended to group CPU-only nodes into one or more dedicated node pools and assign quota for CPU projects to the CPU node pools only, while keeping the GPU node pools at zero quota and, optionally, "best-effort" over-quota access for CPU-only projects (see the sketch after this list).
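One common way to realize the last point is to mark CPU-only nodes with a Kubernetes node label and then create a node pool that matches that label. The snippet below is a minimal sketch only: the node names and the label key/value are hypothetical placeholders, and the exact label your node pools key off depends on how they are configured in your cluster.

```bash
# Hypothetical node names and label key/value - adjust to your environment.
# Tag the CPU-only nodes so they can be grouped into a dedicated node pool.
kubectl label nodes cpu-worker-01 cpu-worker-02 example.com/pool=cpu-only

# Verify which nodes carry the label before creating the matching node pool.
kubectl get nodes -l example.com/pool=cpu-only
```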
                                  "},{"location":"platform-admin/aiinitiatives/overview/#grouping-examples","title":"Grouping Examples","text":"

                                  Set out below are illustrations of different grouping options.

                                  Example: grouping nodes by topology

                                  Example: grouping nodes by hardware type

                                  "},{"location":"platform-admin/aiinitiatives/overview/#assigning-your-resources","title":"Assigning your resources","text":"

After the initial grouping of resources, it is time to associate resources with AI initiatives. This is done by assigning quotas to projects and, optionally, to departments. Assigning GPU quota to a project, on a node pool basis, means that the workloads submitted by that project are entitled to use those GPUs as guaranteed resources and can use them for all workload types.

However, what happens if the project requires more resources than its quota? This depends on the type of workloads that the user wants to submit. If the user requires more resources for non-preemptible workloads, then the quota must be increased, because non-preemptible workloads require guaranteed resources. On the other hand, if the workload is preemptible - for example, a model training workload - the project can exploit unused resources of other projects, as long as the other projects don't need them. Over-quota is set per project on a node pool basis, and per department.

Administrators can use quota allocations to prioritize resources between users, teams, and AI initiatives. The administrator can completely prevent the use of certain node pools by a project or department by setting the node pool quota to 0 and disabling over-quota for that node pool, or keep the quota at 0 and enable over-quota for that node pool, allowing access based on resource availability only (e.g. unused GPUs). However, when a project with a non-zero quota needs to use those resources, the Scheduler reclaims them and preempts the preemptible workloads of over-quota projects. As an administrator, you can also control the amount of over-quota resources a project or department uses.

It is essential to make sure that the sum of all projects' quotas does NOT surpass that of the department, and that the sum of all departments' quotas does not surpass the number of physical resources, per node pool and for the entire cluster (exceeding these limits is called 'over-subscription'). Over-subscription is not recommended because it may produce unexpected scheduling decisions, such as preempting 'non-preemptible' workloads or failing to schedule workloads that are within quota (whether non-preemptible or preemptible), in which case quota can no longer be considered 'guaranteed'. Admins can opt in to a system flag that helps prevent over-subscription scenarios.
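To make the over-subscription rule concrete, the sketch below (with made-up quota figures) sums the GPU quotas of a department's projects and warns if they exceed the department's quota; the same check can be repeated for the sum of department quotas against the physical capacity of each node pool.

```bash
# Hypothetical figures: per-project GPU quotas under one department vs. the department quota.
dept_quota=10
printf 'team-a 4\nteam-b 3\nteam-c 5\n' | awk -v dept="$dept_quota" '
  { total += $2 }                         # sum the per-project GPU quotas
  END {
    printf "sum of project quotas: %s / department quota: %s\n", total, dept
    if (total > dept) print "WARNING: projects are over-subscribed against the department"
  }'
```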

                                  Example: assigning resources to projects

                                  "},{"location":"platform-admin/aiinitiatives/overview/#assigning-users-to-projects-and-departments","title":"Assigning users to projects and departments","text":"

The Run:ai system uses Role-Based Access Control (RBAC) to manage users' access rights to the different objects of the system, its resources, and the set of allowed actions. To allow AI researchers, ML engineers, Project Admins, or any other stakeholders of your AI initiatives to access projects and use AI compute resources for their AI initiatives, the administrator needs to assign users to projects. After a user is assigned to a project with the proper role, e.g. 'L1 Researcher', the user can submit and monitor their workloads under that project. Assigning users to departments is usually done to assign a 'Department Admin' to manage a specific department. Other roles, such as 'L1 Researcher', can also be assigned to departments; this gives the researcher access to all projects within that department.

                                  "},{"location":"platform-admin/aiinitiatives/overview/#scopes-in-the-organization","title":"Scopes in the organization","text":"

                                  This is an example of an organization, as represented in the Run:ai platform:

The organizational tree is structured top-down, with the account as the single root node. The account comprises clusters, departments, and projects.

                                  Note

                                  Different roles and permissions can be granted to specific clusters, departments and projects within an organization.

After mapping and building your hierarchically structured organization as shown above, you can assign or associate various Run:ai components (e.g. workloads, roles, assets, policies, and more) with different parts of the organization - these organizational parts are the Scopes. The following organizational example consists of 5 optional scopes:

                                  Note

When a scope is selected, that unit and all of its subordinates (both existing ones and any added in the future) are selected as well.

                                  "},{"location":"platform-admin/aiinitiatives/overview/#next-steps","title":"Next Steps","text":"

                                  Now that resources are grouped into node pools, organizational units or business initiatives are mapped into projects and departments, projects\u2019 quota parameters are set per node pool, and users are assigned to projects, you can finally submit workloads from a project and use compute resources to run your AI initiatives.
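For example, once a user is assigned to a project, submitting a first workload from the CLI can look roughly like the sketch below. The project name, workload name, image, and GPU count are placeholders, and the submit flags shown follow the older CLI syntax, so check runai submit --help for the exact flags in your CLI version.

```bash
# Placeholders throughout - replace "team-a", the job name, and the image with your own values.
runai project set team-a        # make team-a the default project for subsequent commands
runai submit my-first-job \
    -i ubuntu -g 1              # request one GPU; flag names may differ between CLI versions
runai list jobs                 # check the workload's scheduling status
```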

                                  "},{"location":"platform-admin/aiinitiatives/org/departments/","title":"Departments","text":"

This article explains the procedure for managing departments.

Departments are a grouping of projects. By grouping projects into a department, you can set quota limitations for a set of projects, create policies that are applied to the department, and create assets that can be scoped to the whole department or to a subset of its descendant projects.

For example, in an academic environment, a department can be the Physics Department, grouping various projects (AI initiatives) within it, or grouping projects where each project represents a single student.

                                  "},{"location":"platform-admin/aiinitiatives/org/departments/#departments","title":"Departments","text":"

                                  The Departments table can be found under Organization in the Run:ai platform.

                                  Note

Departments are disabled by default. If you cannot see Departments in the menu, it must be enabled by your Administrator, under General settings \u2192 Resources \u2192 Departments.

                                  The Departments table lists all departments defined for a specific cluster and allows you to manage them. You can switch between clusters by selecting your cluster using the filter at the top.

                                  The Departments table consists of the following columns:

Column Description Department The name of the department Node pool(s) with quota The node pools associated with this department. By default, all node pools within a cluster are associated with each department. Administrators can change the node pools\u2019 quota parameters for a department. Click the values under this column to view the list of node pools with their parameters (as described below) GPU quota GPU quota associated with the department Total GPUs for projects The sum of all projects\u2019 GPU quotas associated with this department Project(s) List of projects associated with this department Subject(s) The users, SSO groups, or applications with access to the department. Click the values under this column to view the list of subjects with their parameters (as described below). This column is only viewable if your role in the Run:ai platform allows you those permissions. Allocated GPUs The total number of GPUs allocated by successfully scheduled workloads in projects associated with this department GPU allocation ratio The ratio of Allocated GPUs to GPU quota. This number reflects how well the department\u2019s GPU quota is utilized by its descendant projects. A number higher than 100% means the department is using over-quota GPUs. A number lower than 100% means not all projects are utilizing their quotas. A quota becomes allocated once a workload is successfully scheduled. Creation time The timestamp for when the department was created Workload(s) The list of workloads under projects associated with this department. Click the values under this column to view the list of workloads with their resource parameters (as described below) Cluster The cluster that the department is associated with"},{"location":"platform-admin/aiinitiatives/org/departments/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
• Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
                                  "},{"location":"platform-admin/aiinitiatives/org/departments/#node-pools-with-quota-associated-with-the-department","title":"Node pools with quota associated with the department","text":"

Click one of the values in the Node pool(s) with quota column to view the list of node pools and their parameters.

Column Description Node pool The name of the node pool, as given by the administrator during node pool creation. All clusters have a default node pool created automatically by the system and named \u2018default\u2019. GPU quota The amount of GPU quota the administrator dedicated to the department for this node pool (floating number, e.g. 2.3 means 230% of a GPU's capacity) CPU (Cores) The amount of CPU (cores) quota the administrator has dedicated to the department for this node pool (floating number, e.g. 1.3 Cores = 1300 milli-cores). The \u2018unlimited\u2019 value means the CPU (Cores) quota is not bounded and workloads using this node pool can use as many CPU (Cores) resources as they need (if available) CPU memory The amount of CPU memory quota the administrator has dedicated to the department for this node pool (floating number, in MB or GB). The \u2018unlimited\u2019 value means the CPU memory quota is not bounded and workloads using this node pool can use as much CPU memory resource as they need (if available). Allocated GPUs The total amount of GPUs allocated by workloads using this node pool under projects associated with this department. The number of allocated GPUs may temporarily surpass the GPU quota of the department if over-quota is used. Allocated CPU (Cores) The total amount of CPUs (cores) allocated by workloads using this node pool under all projects associated with this department. The number of allocated CPUs (cores) may temporarily surpass the CPUs (Cores) quota of the department if over-quota is used. Allocated CPU memory The actual amount of CPU memory allocated by workloads using this node pool under all projects associated with this department. The amount of allocated CPU memory may temporarily surpass the CPU memory quota if over-quota is used."},{"location":"platform-admin/aiinitiatives/org/departments/#subjects-authorized-for-the-project","title":"Subjects authorized for the department","text":"

Click one of the values in the Subject(s) column to view the list of subjects and their parameters. This column is only viewable if your role in the Run:ai system grants you those permissions.

Column Description Subject A user, SSO group, or application assigned with a role in the scope of this department Type The type of subject assigned to the access rule (user, SSO group, or application). Scope The scope of this department within the organizational tree. Click the name of the scope to view the organizational tree diagram; you can only view the parts of the organizational tree that you have permission to view. Role The role assigned to the subject in this department\u2019s scope Authorized by The user who granted the access rule Last updated The last time the access rule was updated

                                  Note

A role given in a certain scope means that the role applies to this scope and any descendant scopes in the organizational tree.

                                  "},{"location":"platform-admin/aiinitiatives/org/departments/#adding-a-new-department","title":"Adding a new department","text":"

                                  To create a new Department:

                                  1. Click +NEW DEPARTMENT
                                  2. Select a scope. By default, the field contains the scope of the current UI context cluster, viewable at the top left side of your screen. You can change the current UI context cluster by clicking the \u2018Cluster: cluster-name\u2019 field and applying another cluster as the UI context. Alternatively, you can choose another cluster within the \u2018+ New Department\u2019 form by clicking the organizational tree icon on the right side of the scope field, opening the organizational tree and selecting one of the available clusters.
3. Enter a name for the department. Department names must start with a letter and can only contain lowercase Latin letters, numbers, or hyphens ('-'). See the name-validation sketch at the end of this procedure.
4. Under Quota Management, select a quota for the department. The Quota management section may contain different fields depending on the pre-existing system configuration. Possible system configurations are:
                                    • Existence of Node Pools
                                    • CPU Quota - Allow setting a quota for CPU resources.

                                  When no node pools are configured, you can set the following quota parameters:

• GPU Devices - The number of GPUs you want to allocate for this department (decimal number). This quota is consumed by the department's subordinated projects.
• CPUs (cores) (when CPU quota is set) - The number of CPU cores you want to allocate for this department (decimal number). This quota is consumed by the department's subordinated projects.
• CPUs memory (when CPU quota is set) - The amount of CPU memory you want to allocate for this department (in Megabytes or Gigabytes). This quota is consumed by the department's subordinated projects.

                                  When node pools are enabled, it is possible to set the above quota parameters for each node-pool separately.

• Order of priority - This column is displayed only if more than one node pool exists. It sets the default order in which the Scheduler uses node pools to schedule a workload: the Scheduler first tries to allocate resources using the highest-priority node pool, followed by the next in priority, until it reaches the end of the list, and then starts from the highest priority again. The Scheduler uses the department's list of prioritized node pools only if the order of priority of node pools is not set in the project or in the workload during submission (either by an admin policy or by the user). An empty value indicates that the node pool is not part of the department's default node pool priority list, but a node pool can still be chosen by an admin policy or by a user during workload submission. The department's node pool priority sets the default for its subordinate projects but does not enforce it, meaning projects are free to change their priority.
• In addition, you can decide whether to allow a department to go over-quota. Allowing over-quota at the department level means that one department can receive more resources than its quota when they are not required by other departments. If over-quota is disabled, workloads running under subordinated projects cannot use more resources than the department's quota, but each project can still go over-quota (if enabled at the project level) up to the department's quota.

                                  Unlimited CPU (Cores) and CPU memory quotas are an exception: in this case, workloads of subordinated projects can consume available resources up to the physical limitation of the cluster or any of the node pools.


                                  5. Click CREATE DEPARTMENT
                                  "},{"location":"platform-admin/aiinitiatives/org/departments/#adding-an-access-rule-to-a-department","title":"Adding an access rule to a department","text":"

                                  To create a new access rule for a department:

                                  1. Select the department you want to add an access rule for
                                  2. Click ACCESS RULES
                                  3. Click +ACCESS RULE
                                  4. Select a subject
                                  5. Select or enter the subject identifier:
                                    • User Email for a local user created in Run:ai or for an SSO user as recognized by the IDP
                                    • Group name as recognized by the IDP
                                    • Application name as created in Run:ai
                                  6. Select a role
                                  7. Click SAVE RULE
                                  8. Click CLOSE
                                  "},{"location":"platform-admin/aiinitiatives/org/departments/#deleting-an-access-rule-from-a-department","title":"Deleting an access rule from a department","text":"

                                  To delete an access rule from a department:

                                  1. Select the department you want to remove an access rule from
                                  2. Click ACCESS RULES
                                  3. Find the access rule you would like to delete
                                  4. Click on the trash icon
                                  5. Click CLOSE
                                  "},{"location":"platform-admin/aiinitiatives/org/departments/#editing-a-department","title":"Editing a department","text":"
                                  1. Select the Department you want to edit
                                  2. Click EDIT
                                  3. Update the Department and click SAVE
                                  "},{"location":"platform-admin/aiinitiatives/org/departments/#viewing-a-departments-policy","title":"Viewing a department\u2019s policy","text":"

                                  To view the policy of a department:

                                  1. Select the department whose policies you want to view. This option is only active if the department has defined policies in place.
                                  2. Click VIEW POLICY and select the workload type for which you want to view the policies: a. Workspace workload type policy with its set of rules. b. Training workload type policy with its set of rules.
                                  3. In the Policy form, view the workload rules that are enforced on your department for the selected workload type, as well as the defaults:
                                    • Parameter - The workload submission parameter that the Rule and Default are applied to
                                    • Type (applicable for data sources only) - The data source type (Git, S3, NFS, PVC, etc.)
                                    • Default - The default value of the Parameter
                                    • Rule - Set up constraints on workload policy fields
                                    • Source - The origin of the applied policy (cluster, department or project)

                                  Notes

                                  • The policy affecting the department consists of rules and defaults. Some of these rules and defaults may be derived from the policies of a parent cluster (source). You can see the source of each rule in the policy form.
                                  • A policy set for a department affects all subordinated projects and their workloads, according to the policy workload type
                                  "},{"location":"platform-admin/aiinitiatives/org/departments/#deleting-a-department","title":"Deleting a department","text":"
                                  1. Select the department you want to delete
                                  2. Click DELETE
                                  3. On the dialog, click DELETE to confirm the deletion

                                  Note

                                  Deleting a department permanently deletes its subordinated projects and any assets created in the scope of this department or of its subordinated projects, such as compute resources, environments, data sources, templates, and credentials. However, workloads running within the department's subordinated projects, and the policies defined for this department or its subordinated projects, remain intact and running.

                                  "},{"location":"platform-admin/aiinitiatives/org/departments/#reviewing-a-department","title":"Reviewing a department","text":"
                                  1. Select the department you want to review
                                  2. Click REVIEW
                                  3. Review and click CLOSE
                                  "},{"location":"platform-admin/aiinitiatives/org/departments/#using-api","title":"Using API","text":"

                                  Go to the Departments API reference to view the available actions
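                                  As a minimal illustration, the sketch below lists departments with curl. The endpoint path (/api/v1/org-unit/departments), the control-plane URL placeholder, and the $TOKEN variable are assumptions for illustration only; confirm the exact paths, parameters, and authentication flow in the Departments API reference.

                                    # Sketch only: the endpoint path and token handling are assumptions; see the Departments API reference
                                    curl -s -H "Authorization: Bearer $TOKEN" \
                                      "https://<control-plane-url>/api/v1/org-unit/departments"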

                                  "},{"location":"platform-admin/aiinitiatives/org/projects/","title":"Projects","text":"

                                  This article explains the procedure to manage Projects.

                                  Researchers submit AI workloads. To streamline resource allocation and prioritize work, Run:ai introduces the concept of Projects. Projects are the tool to implement resource allocation policies as well as the segregation between different initiatives. A project may represent a team, an individual, or an initiative that shares resources or has a specific resource quota. Projects may be aggregated in Run:ai departments.

                                  For example, you may have several people involved in a specific face-recognition initiative collaborating under one project named \u201cface-recognition-2024\u201d. Alternatively, you can have a project per person in your team, where each member receives their own quota.

                                  "},{"location":"platform-admin/aiinitiatives/org/projects/#projects-table","title":"Projects table","text":"

                                  The Projects table can be found under Organization in the Run:ai platform.

                                  The Projects table provides a list of all projects defined for a specific cluster, and allows you to manage them. You can switch between clusters by selecting your cluster using the filter at the top.

                                  The Projects table consists of the following columns:

                                  • Project - The name of the project
                                  • Department - The name of the parent department. Several projects may be grouped under a department.
                                  • Status - The project creation status. Projects are manifested as Kubernetes namespaces; the project status represents the namespace creation status.
                                  • Node pool(s) with quota - The node pools associated with the project. By default, a new project is associated with all node pools within its associated cluster. Administrators can change the node pools' quota parameters for a project. Click the values under this column to view the list of node pools with their parameters (as described below).
                                  • Subject(s) - The users, SSO groups, or applications with access to the project. Click the values under this column to view the list of subjects with their parameters (as described below). This column is only viewable if your role in the Run:ai platform allows you those permissions.
                                  • Allocated GPUs - The total number of GPUs allocated by successfully scheduled workloads under this project.
                                  • GPU allocation ratio - The ratio of Allocated GPUs to GPU quota. This number reflects how well the project's GPU quota is utilized by its descendent workloads. A number higher than 100% indicates the project is using over-quota GPUs.
                                  • GPU quota - The GPU quota allocated to the project. This number represents the sum of all node pools' GPU quota allocated to this project.
                                  • Allocated CPUs (Cores) - The total number of CPU cores allocated by workloads submitted within this project. This column is only available if the CPU Quota setting is enabled, as described below.
                                  • Allocated CPU memory - The total amount of CPU memory allocated by successfully scheduled workloads under this project. This column is only available if the CPU Quota setting is enabled, as described below.
                                  • CPU quota (Cores) - The CPU quota allocated to this project. This column is only available if the CPU Quota setting is enabled, as described below. This number represents the sum of all node pools' CPU quota allocated to this project. The 'unlimited' value means the CPU (Cores) quota is not bounded and workloads using this project can use as many CPU (Cores) resources as they need (if available).
                                  • CPU memory quota - The CPU memory quota allocated to this project. This column is only available if the CPU Quota setting is enabled, as described below. This number represents the sum of all node pools' CPU memory quota allocated to this project. The 'unlimited' value means the CPU memory quota is not bounded and workloads using this project can use as much CPU memory as they need (if available).
                                  • CPU allocation ratio - The ratio of Allocated CPUs (Cores) to CPU quota (Cores). This number reflects how well the project's CPU quota is utilized by its descendent workloads. A number higher than 100% indicates the project is using over-quota CPU cores.
                                  • CPU memory allocation ratio - The ratio of Allocated CPU memory to CPU memory quota. This number reflects how well the project's CPU memory quota is utilized by its descendent workloads. A number higher than 100% indicates the project is using over-quota CPU memory.
                                  • Node affinity of training workloads - The list of Run:ai node affinities. Any training workload submitted within this project must specify one of these node affinities, otherwise it is not submitted.
                                  • Node affinity of interactive workloads - The list of Run:ai node affinities. Any interactive (workspace) workload submitted within this project must specify one of these node affinities, otherwise it is not submitted.
                                  • Idle time limit of training workloads - The time in days:hours:minutes after which the project stops a training workload not using its allocated GPU resources.
                                  • Idle time limit of preemptible workloads - The time in days:hours:minutes after which the project stops a preemptible interactive (workspace) workload not using its allocated GPU resources.
                                  • Idle time limit of non-preemptible workloads - The time in days:hours:minutes after which the project stops a non-preemptible interactive (workspace) workload not using its allocated GPU resources.
                                  • Interactive workloads time limit - The duration in days:hours:minutes after which the project stops an interactive (workspace) workload.
                                  • Training workloads time limit - The duration in days:hours:minutes after which the project stops a training workload.
                                  • Creation time - The timestamp for when the project was created.
                                  • Workload(s) - The list of workloads associated with the project. Click the values under this column to view the list of workloads with their resource parameters (as described below).
                                  • Cluster - The cluster that the project is associated with.
                                  "},{"location":"platform-admin/aiinitiatives/org/projects/#node-pools-with-quota-associated-with-the-project","title":"Node pools with quota associated with the project","text":"

                                  Click one of the values of Node pool(s) with quota column, to view the list of node pools and their parameters

                                  • Node pool - The name of the node pool, given by the administrator during node pool creation. All clusters have a default node pool, created automatically by the system and named 'default'.
                                  • GPU quota - The amount of GPU quota the administrator dedicated to the project for this node pool (floating number, e.g. 2.3 means 230% of GPU capacity).
                                  • CPU (Cores) - The amount of CPU (Cores) quota the administrator dedicated to the project for this node pool (floating number, e.g. 1.3 Cores = 1300 millicores). The 'unlimited' value means the CPU (Cores) quota is not bounded and workloads using this node pool can use as many CPU (Cores) resources as they require (if available).
                                  • CPU memory - The amount of CPU memory quota the administrator dedicated to the project for this node pool (floating number, in MB or GB). The 'unlimited' value means the CPU memory quota is not bounded and workloads using this node pool can use as much CPU memory as they need (if available).
                                  • Allocated GPUs - The actual amount of GPUs allocated by workloads using this node pool under this project. The number of allocated GPUs may temporarily surpass the GPU quota if over-quota is used.
                                  • Allocated CPU (Cores) - The actual amount of CPU (Cores) allocated by workloads using this node pool under this project. The number of allocated CPU (Cores) may temporarily surpass the CPU (Cores) quota if over-quota is used.
                                  • Allocated CPU memory - The actual amount of CPU memory allocated by workloads using this node pool under this project. The amount of allocated CPU memory may temporarily surpass the CPU memory quota if over-quota is used.
                                  • Order of priority - The default order in which the Scheduler uses node pools to schedule a workload. This is used only if the order of priority of node pools is not set in the workload during submission, either by an admin policy or the user. An empty value means the node pool is not part of the project's default list, but can still be chosen by an admin policy or the user during workload submission.
                                  "},{"location":"platform-admin/aiinitiatives/org/projects/#subjects-authorized-for-the-project","title":"Subjects authorized for the project","text":"

                                  Click one of the values in the Subject(s) column to view the list of subjects and their parameters. This column is only viewable if your role in the Run:ai system grants you those permissions.

                                  • Subject - A user, SSO group, or application assigned with a role in the scope of this project
                                  • Type - The type of subject assigned to the access rule (user, SSO group, or application)
                                  • Scope - The scope of this project in the organizational tree. Click the name of the scope to view the organizational tree diagram; you can only view the parts of the organizational tree that you have permission to view.
                                  • Role - The role assigned to the subject, in this project's scope
                                  • Authorized by - The user who granted the access rule
                                  • Last updated - The last time the access rule was updated
                                  "},{"location":"platform-admin/aiinitiatives/org/projects/#workloads-associated-with-the-project","title":"Workloads associated with the project","text":"

                                  Click one of the values of Workload(s) column, to view the list of workloads and their parameters

                                  • Workload - The name of the workload, given during its submission. Optionally, an icon describing the type of workload is also visible.
                                  • Type - The type of the workload, e.g. Workspace, Training, Inference
                                  • Status - The state of the workload and the time elapsed since the last status change
                                  • Created by - The subject that created this workload
                                  • Running/requested pods - The number of running pods out of the number of requested pods for this workload. For example, a distributed workload requesting 4 pods may be in a state where only 2 are running and 2 are pending.
                                  • Creation time - The date and time the workload was created
                                  • GPU compute request - The amount of GPU compute requested (floating number; represents either a portion of the GPU compute or the number of whole GPUs requested)
                                  • GPU memory request - The amount of GPU memory requested (floating number; can be presented as a portion of the GPU memory, an absolute memory size in MB or GB, or a MIG profile)
                                  • CPU memory request - The amount of CPU memory requested (floating number, presented as an absolute memory size in MB or GB)
                                  • CPU compute request - The amount of CPU compute requested (floating number, represents the number of requested Cores)
                                  "},{"location":"platform-admin/aiinitiatives/org/projects/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
                                  • Download table - Click MORE and then Click Download as CSV. Export to CSV is limited to 20,000 rows.
                                  "},{"location":"platform-admin/aiinitiatives/org/projects/#adding-a-new-project","title":"Adding a new project","text":"

                                  To create a new Project:

                                  1. Click +NEW PROJECT
                                  2. Select a scope. You can only view clusters if you have permission to do so, within the scope of the roles assigned to you.
                                  3. Enter a name for the project. Project names must start with a letter and can only contain lowercase Latin letters, numbers, or a hyphen (-).
                                  4. Namespace associated with the project. Each project has an associated (Kubernetes) namespace in the cluster, and all workloads under this project use this namespace. a. By default, Run:ai creates a namespace based on the project name (in the form of runai-<name>). b. Alternatively, you can choose an existing namespace created for you by the cluster administrator. A minimal check of the resulting namespace is sketched after this procedure.
                                  5. In the Quota management section, you can set the quota parameters and prioritize resources

                                    • Order of priority - This column is displayed only if more than one node pool exists. It is the default order in which the Scheduler uses node pools to schedule a workload: the Scheduler first tries to allocate resources using the highest priority node pool, then the next in priority, until it reaches the lowest priority node pool in the list, and then starts from the highest priority again. The Scheduler uses the project's list of prioritized node pools only if the order of priority of node pools is not set in the workload during submission, either by an admin policy or by the user. An empty value means the node pool is not part of the project's default node pool priority list, but a node pool can still be chosen by the admin policy or a user during workload submission.
                                    • Node pool This column is displayed only if more than one node pool exists. It represents the name of the node pool.
                                    • GPU devices The number of GPUs you want to allocate for this project in this node pool (decimal number).
                                    • CPUs (Cores) This column is displayed only if CPU quota is enabled via the General settings. Represents the number of CPU cores you want to allocate for this project in this node pool (decimal number).
                                    • CPU memory This column is displayed only if CPU quota is enabled via the General settings. The amount of CPU memory you want to allocate for this project in this node pool (in Megabytes or Gigabytes).

                                    • Over quota / Over quota weight - If over-quota weight is enabled via the General settings then over-quota weight is presented, otherwise over-quota is presented

                                      • Over quota When enabled, the project can use non-guaranteed overage resources above its quota in this node pool. The amount of the non-guaranteed overage resources for this project is calculated proportionally to the project quota in this node pool. When disabled, the project cannot use more resources than the guaranteed quota in this node pool.
                                      • Over quota weight - Represents a weight used to calculate the amount of non-guaranteed overage resources a project can get on top of its quota in this node pool. All unused resources are split between projects that require the use of overage resources:
                                        • Medium The default value. The Admin can change the default to any of the following values: High, Low, Lowest, or None.
                                        • None When set, the project cannot use more resources than the guaranteed quota in this node pool.
                                        • Lowest - Over-quota weight 'Lowest' has a unique behavior: because its weight is 0, the project can only use over-quota (unused overage) resources if no other project needs them, and any project with a higher over-quota weight can take those overage resources at any time.

                                  Note

                                  Setting the quota to 0 (either GPU, CPU, or CPU memory) and the over-quota to \u2018disabled\u2019 or over-quota weight to \u2018none\u2019 means the project is blocked from using those resources on this node pool.

                                  When no node pools are configured, you can set the same parameters for the whole project instead of per node pool.

                                  After node pools are created, you can set the above parameters for each node-pool separately.

                                  6. Set Scheduling rules as required. You can have a scheduling rule for:

                                    • Idle GPU timeout - Preempt a workload that does not use GPUs for more than a specified duration. You can apply a single rule per workload type - Preemptible Workspaces, Non-preemptible Workspaces, and Training.

                                    Note

                                    To make 'Idle GPU timeout' effective, it must be set to a shorter duration than the workload duration of the same workload type.

                                    • Workspace duration Preempt workspaces after a specified duration. This applies to both preemptive and non-preemptive Workspaces.
                                    • Training duration Preempt a training workload after a specified duration.
                                    • Node type (Affinity) Node type is used to select a group of nodes, usually with specific characteristics such as a hardware feature, storage type, fast networking interconnection, etc. The scheduler uses node type as an indication of which nodes should be used for your workloads, within this project. Node type is a label in the form of run.ai/type and a value (e.g. run.ai/type = dgx200) that the administrator uses to tag a set of nodes. Adding the node type to the project\u2019s scheduling rules enables the user to submit workloads with any node type label/value pairs in this list, according to the workload type - Workspace or Training. The Scheduler then schedules workloads using a node selector, targeting nodes tagged with the Run:ai node type label/value pair. Node pools and a node type can be used in conjunction with each other. For example, specifying a node pool and a smaller group of nodes from that node pool that includes a fast SSD memory or other unique characteristics.
                                  7. Click CREATE PROJECT
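
                                  As a quick sanity check after creation, the sketch below verifies the project's auto-created namespace on the cluster. It assumes a hypothetical project named team-a and the default runai-<name> naming convention; adjust the name, or skip this check if an existing namespace was chosen instead.

                                    # Assumes a project named 'team-a' using the default runai-<name> namespace convention
                                    kubectl get namespace runai-team-a
                                    # Workloads submitted under the project run in this namespace
                                    kubectl get pods -n runai-team-a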

                                  "},{"location":"platform-admin/aiinitiatives/org/projects/#adding-an-access-rule-to-a-project","title":"Adding an access rule to a project","text":"

                                  To create a new access rule for a project:

                                  1. Select the project you want to add an access rule for
                                  2. Click ACCESS RULES
                                  3. Click +ACCESS RULE
                                  4. Select a subject
                                  5. Select or enter the subject identifier:
                                    • User Email for a local user created in Run:ai or for an SSO user as recognized by the IDP
                                    • Group name as recognized by the IDP
                                    • Application name as created in Run:ai
                                  6. Select a role
                                  7. Click SAVE RULE
                                  8. Click CLOSE
                                  "},{"location":"platform-admin/aiinitiatives/org/projects/#deleting-an-access-rule-from-a-project","title":"Deleting an access rule from a project","text":"

                                  To delete an access rule from a project:

                                  1. Select the project you want to remove an access rule from
                                  2. Click ACCESS RULES
                                  3. Find the access rule you want to delete
                                  4. Click on the trash icon
                                  5. Click CLOSE
                                  "},{"location":"platform-admin/aiinitiatives/org/projects/#editing-a-project","title":"Editing a project","text":"

                                  To edit a project:

                                  1. Select the project you want to edit
                                  2. Click EDIT
                                  3. Update the Project and click SAVE
                                  "},{"location":"platform-admin/aiinitiatives/org/projects/#viewing-a-projects-policy","title":"Viewing a project\u2019s policy","text":"

                                  To view the policy of a project:

                                  1. Select the project whose policies you want to view. This option is only active for projects with defined policies in place.
                                  2. Click VIEW POLICY and select the workload type for which you want to view the policies: a. Workspace workload type policy with its set of rules. b. Training workload type policy with its set of rules.
                                  3. In the Policy form, view the workload rules that are enforced on your project for the selected workload type, as well as the defaults:
                                    • Parameter - The workload submission parameter that Rules and Defaults are applied to
                                    • Type (applicable for data sources only) - The data source type (Git, S3, NFS, PVC, etc.)
                                    • Default - The default value of the Parameter
                                    • Rule - Set up constraints on workload policy fields
                                    • Source - The origin of the applied policy (cluster, department or project)

                                  Note

                                  The policy affecting the project consists of rules and defaults. Some of these rules and defaults may be derived from policies of a parent cluster and/or department (source). You can see the source of each rule in the policy form.

                                  "},{"location":"platform-admin/aiinitiatives/org/projects/#deleting-a-project","title":"Deleting a project","text":"

                                  To delete a project:

                                  1. Select the project you want to delete
                                  2. Click DELETE
                                  3. On the dialog, click DELETE to confirm

                                  Note

                                  • Clusters < v2.20

                                    Deleting a project does not delete its associated namespace, any of the running workloads using this namespace, or the policies defined for this project. However, any assets created in the scope of this project such as compute resources, environments, data sources, templates and credentials, are permanently deleted from the system.

                                  • Clusters >=v2.20

                                    Deleting a project does not delete its associated namespace, but will attempt to delete its associated workloads and assets. Any assets created in the scope of this project such as compute resources, environments, data sources, templates and credentials, are permanently deleted from the system.

                                  "},{"location":"platform-admin/aiinitiatives/org/projects/#using-api","title":"Using API","text":"

                                  Go to the Projects API reference to view the available actions

                                  "},{"location":"platform-admin/aiinitiatives/org/scheduling-rules/","title":"Scheduling Rules","text":"

                                  This article explains the procedure for configuring and managing scheduling rules. Scheduling rules are restrictions applied to workloads. These restrictions apply either to the resources (nodes) on which workloads can run, or to the duration of the workload's run time. Scheduling rules are set for projects or departments and apply to a specific workload type. Once scheduling rules are set, all matching workloads associated with the project (or its subordinate projects, in the case of a department) are subject to the restrictions that were defined when the workload was submitted. Newly added scheduling rules are not applied to already created workloads associated with that project/department.

                                  "},{"location":"platform-admin/aiinitiatives/org/scheduling-rules/#scheduling-rules","title":"Scheduling Rules","text":"

                                  There are 3 types of scheduling rules:

                                  "},{"location":"platform-admin/aiinitiatives/org/scheduling-rules/#workload-duration-time-limit","title":"Workload duration (time limit)","text":"

                                  This rule limits the duration of a workload's run time. Workload run time is calculated as the total time in which the workload was in status Running. You can apply a single rule per workload type - Preemptible Workspaces, Non-preemptible Workspaces, and Training.

                                  "},{"location":"platform-admin/aiinitiatives/org/scheduling-rules/#idle-gpu-time-limit","title":"Idle GPU time limit","text":"

                                  This rule limits the total time a workload's allocated GPUs can remain idle. Workload idle time is counted from the first time the workload is in status Running and the GPU is idle. Idleness is calculated using the runai_gpu_idle_seconds_per_workload metric. This metric determines the total duration of zero GPU utilization within each 30-second interval: if the GPU remains idle throughout the 30-second window, 30 seconds are added to the idleness sum; otherwise, the idleness count is reset. You can apply a single rule per workload type - Preemptible Workspaces, Non-preemptible Workspaces, and Training.

                                  Note

                                  To make Idle GPU timeout effective, it must be set to a shorter duration than the workload duration of the same workload type.
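
                                  For reference, the idleness metric can be inspected directly. The sketch below queries it through the standard Prometheus HTTP API; the Prometheus address is a placeholder and assumes you have access to the cluster's metrics store.

                                    # Query the per-workload GPU idle seconds metric (the Prometheus address is a placeholder)
                                    curl -G 'http://<prometheus-host>:9090/api/v1/query' \
                                      --data-urlencode 'query=runai_gpu_idle_seconds_per_workload'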

                                  "},{"location":"platform-admin/aiinitiatives/org/scheduling-rules/#node-type-affinity","title":"Node type (Affinity)","text":"

                                  Node type is used to select a group of nodes, typically with specific characteristics such as a hardware feature, storage type, fast networking interconnection, etc. The scheduler uses node type as an indication of which nodes should be used for your workloads, within this project.

                                  Node type is a label in the form of run.ai/type and a value (e.g. run.ai/type = dgx200) that the administrator uses to tag a set of nodes. Adding the node type to the project's scheduling rules mandates that users submit workloads with a node type label/value pair from this list, according to the workload type - Workspace or Training. The Scheduler then schedules workloads using a node selector, targeting nodes tagged with the Run:ai node type label/value pair. Node pools and a node type can be used in conjunction, for example, by specifying a node pool and a smaller group of nodes from that node pool that have fast SSD storage or other unique characteristics.

                                  "},{"location":"platform-admin/aiinitiatives/org/scheduling-rules/#labelling-nodes-for-node-types-grouping","title":"Labelling nodes for node types grouping","text":"

                                  The administrator should use a node label with the key run.ai/type and any value of their choice.

                                  To assign a label to nodes you want to group, set the \u2018node type (affinity)\u2019 on each relevant node:

                                  1. Obtain the list of nodes and their current labels by copying the following to your terminal:

                                    kubectl get nodes --show-labels\n

                                  2. Label a specific node by copying the following to your terminal:

                                    kubectl label node <node-name> run.ai/type=<value>\n
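
                                    To verify which nodes now carry a given node type, you can filter by the label. A small sketch using the example value dgx200 from this article; substitute your own value:

                                    kubectl get nodes -l run.ai/type=dgx200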

                                  "},{"location":"platform-admin/aiinitiatives/org/scheduling-rules/#adding-a-scheduling-rule-to-a-projectdepartment","title":"Adding a scheduling rule to a project/department","text":"

                                  To add a scheduling rule:

                                  1. Select the project/department for which you want to add a scheduling rule
                                  2. Click EDIT
                                  3. In the Scheduling rules section click +RULE
                                  4. Select the rule type
                                  5. Select the workload type and time limitation period
                                  6. For Node type, choose one or more labels for the desired nodes.
                                  7. Click SAVE

                                  Note

                                  You can review the defined rules in the Projects table in the relevant column.

                                  "},{"location":"platform-admin/aiinitiatives/org/scheduling-rules/#editing-the-projectdepartment-scheduling-rule","title":"Editing the project/department scheduling rule","text":"

                                  To edit a scheduling rule:

                                  1. Select the project/department for which you want to edit its scheduling rule
                                  2. Click EDIT
                                  3. Find the scheduling rule you would like to edit
                                  4. Edit the rule
                                  5. Click SAVE

                                  Note

                                  When editing a rule that a project inherited from its department, you can only make the rule's limitation more restrictive.

                                  "},{"location":"platform-admin/aiinitiatives/org/scheduling-rules/#deleting-the-projectdepartment-scheduling-rule","title":"Deleting the project/department scheduling rule","text":"

                                  To delete a scheduling rule:

                                  1. Select the project/department from which you want to delete a scheduling rule
                                  2. Click EDIT
                                  3. Find the scheduling rule you would like to delete
                                  4. Click on the x icon
                                  5. Click SAVE

                                  Note

                                  Rules inherited from the department cannot be deleted from the project's set of rules.

                                  "},{"location":"platform-admin/aiinitiatives/org/scheduling-rules/#using-api","title":"Using API","text":"

                                  Go to the Projects API reference to view the available actions

                                  "},{"location":"platform-admin/aiinitiatives/resources/configuring-mig-profiles/","title":"Configuring NVIDIA MIG Profiles","text":"

                                  NVIDIA\u2019s Multi-Instance GPU (MIG) enables splitting a GPU into multiple logical GPU devices, each with its own memory and compute portion of the physical GPU.

                                  NVIDIA provides two MIG strategies for splitting a GPU into profiles:

                                  • Single - A GPU can be divided evenly. This means all MIG profiles are the same.
                                  • Mixed - A GPU can be divided into different profiles.

                                  The Run:ai platform supports running workloads using NVIDIA MIG. Administrators can set the Kubernetes nodes to their preferred MIG strategy and configure the appropriate MIG profiles for researchers and MLOps engineers to use.

                                  This guide explains how to configure MIG in each strategy to submit workloads. It also outlines the individual implications of each strategy and best practices for administrators.

                                  Note

                                  • Starting from v2.19, the Dynamic MIG feature began a deprecation process and is no longer supported. With Dynamic MIG, the Run:ai platform automatically configured MIG profiles according to on-demand user requests for different MIG profiles or memory fractions.
                                  • GPU fractions and memory fractions are not supported with MIG profiles.
                                  • The single strategy supports both Run:ai and third-party workloads. The mixed strategy can only be used with third-party workloads. For more details on Run:ai and third-party workloads, see Introduction to workloads.
                                  "},{"location":"platform-admin/aiinitiatives/resources/configuring-mig-profiles/#before-you-start","title":"Before you start","text":"

                                  To use MIG single and mixed strategy effectively, make sure to familiarize yourself with the following NVIDIA resources:

                                  • NVIDIA Multi-Instance GPU
                                  • MIG User Guide
                                  • GPU Operator with MIG
                                  "},{"location":"platform-admin/aiinitiatives/resources/configuring-mig-profiles/#configuring-single-mig-strategy","title":"Configuring single MIG strategy","text":"

                                  When deploying MIG using single strategy, all GPUs within a node are configured with the same profile. For example, a node might have GPUs configured with 3 MIG slices of profile type 1g.20gb, or 7 MIG slices of profile 1g.10gb. With this strategy, MIG profiles are displayed as whole GPU devices by CUDA.

                                  The Run:ai platform discovers these MIG profiles as whole GPU devices as well, ensuring MIG devices are transparent to the end-user (practitioner). For example, a node that consists of 8 physical GPUs, each split into 3 MIG slices of 2g.20gb, is discovered by the Run:ai platform as a node with 24 GPU devices.

                                  Users can submit workloads by requesting a specific number of GPU devices (X GPU) and Run:ai will allocate X MIG slices (logical devices). The Run:ai platform deducts X GPUs from the workload\u2019s Project quota, regardless of whether this \u2018logical GPU\u2019 represents 1/3 of a physical GPU device or 1/7 of a physical GPU device.
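
                                  To see how MIG slices surface as whole GPU devices under the single strategy, you can inspect a node's GPU capacity. A minimal sketch; the node name is a placeholder, and it assumes the NVIDIA device plugin is configured with the single MIG strategy:

                                    # Under single strategy, each MIG slice is advertised as one nvidia.com/gpu device
                                    kubectl describe node <node-name> | grep -i nvidia.com/gpu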

                                  "},{"location":"platform-admin/aiinitiatives/resources/configuring-mig-profiles/#configuring-mixed-mig-strategy","title":"Configuring mixed MIG strategy","text":"

                                  When deploying MIG using mixed strategy, each GPU in a node can be configured with a different combination of MIG profiles such as 2\u00d72g.20gb and 3\u00d71g.10gb. For details on supported combinations per GPU type, refer to Supported MIG Profiles.

                                  In mixed strategy, physical GPU devices continue to be displayed as physical GPU devices by CUDA, and each MIG profile is shown individually. The Run:ai platform identifies the physical GPU devices as usual; however, MIG profiles are not visible in the UI or in the node APIs.

                                  When submitting third-party workloads with this strategy, the user should explicitly specify the exact requested MIG profile (for example, nvidia.com/gpu.product: A100-SXM4-40GB-MIG-3g.20gb). The Run:ai Scheduler finds a node that can provide this specific profile and binds it to the workload.

                                  A third-party workload submitted with a MIG profile of type Xg.Ygb (e.g. 3g.40gb or 2g.20gb) is considered as consuming X GPUs. These X GPUs will be deducted from the workload\u2019s project quota of GPUs. For example, a 3g.40gb profile deducts 3 GPUs from the associated Project\u2019s quota, while 2g.20gb deducts 2 GPUs from the associated Project\u2019s quota. This is done to maintain a logical ratio according to the characteristics of the MIG profile.
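
                                  To see which MIG profiles a node exposes under the mixed strategy, you can list its allocatable resources. A minimal sketch; the node name is a placeholder, and it assumes the NVIDIA device plugin's mixed-strategy convention of advertising each profile as its own resource (e.g. nvidia.com/mig-3g.20gb):

                                    # Mixed strategy advertises each profile as its own resource, e.g. nvidia.com/mig-3g.20gb
                                    kubectl get node <node-name> -o jsonpath='{.status.allocatable}'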

                                  "},{"location":"platform-admin/aiinitiatives/resources/configuring-mig-profiles/#best-practices-for-administrators","title":"Best practices for administrators","text":""},{"location":"platform-admin/aiinitiatives/resources/configuring-mig-profiles/#single-strategy","title":"Single strategy","text":"
                                  • Configure proper and uniform sizes of MIG slices (profiles) across all GPUs within a node.
                                  • Set the same MIG profiles on all nodes of a single node pool.
                                  • Create separate node pools with different MIG profile configurations allowing users to select the pool that best matches their workloads\u2019 needs.
                                  • Ensure Project quotas are allocated according to the MIG profile sizes.
                                  "},{"location":"platform-admin/aiinitiatives/resources/configuring-mig-profiles/#mixed-strategy","title":"Mixed strategy","text":"
                                  • Use mixed strategy with workloads that require diverse resources. Make sure to evaluate the workload requirements and plan accordingly.
                                  • Configure individual MIG profiles on each node by using a limited set of MIG profile combinations to minimize complexity. Make sure to evaluate your requirements and node configurations.
                                  • Ensure Project quotas are allocated according to the MIG profile sizes.

                                  Note

                                  Since MIG slices are a fixed size, once configured, changing MIG profiles requires administrative intervention.

                                  "},{"location":"platform-admin/aiinitiatives/resources/node-pools/","title":"Node Pools","text":"

                                  This article explains the procedure for managing Node pools.

                                  Node pools assist in managing heterogeneous resources effectively. A node pool is a Run:ai construct representing a set of nodes grouped into a bucket of resources using a predefined node label (e.g. NVIDIA GPU type) or an administrator-defined node label (any key/value pair).

                                  Typically, the grouped nodes share a common feature or property, such as GPU type or other HW capability (such as Infiniband connectivity), or represent a proximity group (i.e. nodes interconnected via a local ultra-fast switch). Researchers and ML Engineers would typically use node pools to run specific workloads on specific resource types.

                                  Platform administrators can create, view, edit, and delete node pools. Creating a new node pool creates a new instance of the Run:ai scheduler; workloads submitted to a node pool are scheduled using the node pool's designated scheduler instance.

                                  Once a new node pool is created, it is automatically assigned to all Projects and Departments with a quota of zero GPU resources, unlimited CPU resources, and over-quota enabled (Medium priority if over-quota weight is enabled). This allows any Project and Department to use any node pool when over-quota is enabled, even if the administrator has not assigned a quota for a specific node pool in a Project or Department.

                                  Workloads can be submitted using a prioritized list of node pools. The node pool selector picks one node pool at a time (according to the prioritized list), and the designated node pool scheduler instance handles the submission request and tries to match the requested resources within that node pool. If the scheduler cannot find resources to satisfy the submitted workload, the node pool selector moves the request to the next node pool in the prioritized list. If no node pool satisfies the request, the node pool selector starts from the first node pool again, until one of the node pools satisfies the request.
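
                                  When submitting from the CLI, a prioritized node pool list can typically be passed on the submit command. A minimal sketch; the workload name, project, and the flag syntax (including --node-pools) are assumptions here, so check runai submit --help or the CLI reference for the exact flags:

                                    # Sketch only: flag names and syntax should be verified against the CLI reference
                                    runai submit my-job --project team-a --gpu 1 --node-pools "a100-pool h100-pool"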

                                  "},{"location":"platform-admin/aiinitiatives/resources/node-pools/#node-pools-table","title":"Node pools table","text":"

                                  The Node pools table can be found under Resources in the Run:ai platform.

                                  The Node pools table lists all the node pools defined in the Run:ai platform and allows you to manage them.

                                  Note

                                  By default, the Run:ai platform includes a single node pool named \u2018default\u2019. When no other node pool is defined, all existing and new nodes are associated with the \u2018default\u2019 node pool. When deleting a node pool, if no other node pool matches any of the nodes\u2019 labels, the node will be included in the default node pool.

                                  The Node pools table consists of the following columns:

                                  • Node pool - The node pool name, set by the administrator during its creation (the node pool name cannot be changed after its creation).
                                  • Status - The node pool status. A 'Ready' status means the scheduler can use this node pool to schedule workloads. An 'Empty' status means no nodes are currently included in that node pool.
                                  • Label key / Label value - The node-label key-value pair the node pool controller uses to match nodes into this node pool.
                                  • Node(s) - The list of nodes included in this node pool. Click the field to view details (the details are in the Nodes article).
                                  • GPU devices - The total number of GPU devices installed in nodes included in this node pool. For example, a node pool that includes 12 nodes, each with 8 GPU devices, shows a total of 96 GPU devices.
                                  • GPU memory - The total amount of GPU memory installed in nodes included in this node pool. For example, a node pool that includes 12 nodes, each with 8 GPU devices, and each device with 80 GB of memory, shows a total memory amount of 7.68 TB.
                                  • Allocated GPUs - The total allocation of GPU devices in units of GPUs (decimal number). For example, if 3 GPUs are 50% allocated, the field shows the value 1.50. This value represents the portion of GPU memory consumed by all running pods using this node pool. 'Allocated GPUs' can be larger than the projects' GPU quota if over-quota is used by workloads, but not larger than GPU devices.
                                  • GPU resource optimization ratio - Shows the Node Level Scheduler mode.
                                  • CPUs (Cores) - The number of CPU cores installed on nodes included in this node pool.
                                  • CPU memory - The total amount of CPU memory installed on nodes included in this node pool.
                                  • Allocated CPUs (Cores) - The total allocation of CPU compute in units of Cores (decimal number). This value represents the amount of CPU cores consumed by all running pods using this node pool. 'Allocated CPUs' can be larger than the projects' CPU quota if over-quota is used by workloads, but not larger than CPUs (Cores).
                                  • Allocated CPU memory - The total allocation of CPU memory in units of TB/GB/MB (decimal number). This value represents the amount of CPU memory consumed by all running pods using this node pool. 'Allocated CPU memory' can be larger than the projects' CPU memory quota if over-quota is used by workloads, but not larger than CPU memory.
                                  • GPU placement strategy - Sets the Scheduler strategy for the assignment of pods requesting both GPU and CPU resources to nodes, which can be either Bin-pack or Spread. By default, Bin-pack is used, but it can be changed to Spread by editing the node pool. When set to Bin-pack, the scheduler tries to fill nodes as much as possible before using empty or sparse nodes. When set to Spread, the scheduler tries to keep nodes as sparse as possible by spreading workloads across as many nodes as it can.
                                  • CPU placement strategy - Sets the Scheduler strategy for the assignment of pods requesting only CPU resources to nodes, which can be either Bin-pack or Spread. By default, Bin-pack is used, but it can be changed to Spread by editing the node pool. When set to Bin-pack, the scheduler tries to fill nodes as much as possible before using empty or sparse nodes. When set to Spread, the scheduler tries to keep nodes as sparse as possible by spreading workloads across as many nodes as it can.
                                  • Last update - The date and time when the node pool was last updated.
                                  • Creation time - The date and time when the node pool was created.
                                  • Workload(s) - The list of workloads running on nodes included in this node pool. Click the field to view details (described below in this article).
                                  "},{"location":"platform-admin/aiinitiatives/resources/node-pools/#workloads-associated-with-the-node-pool","title":"Workloads associated with the node pool","text":"

                                  Click one of the values in the Workload(s) column, to view the list of workloads and their parameters.

                                  Note

                                  This column is only viewable if your role in the Run:ai platform gives you read access to workloads. Even if you are allowed to view workloads, you can only view the workloads within your allowed scope. This means there might be more pods running on the nodes in this node pool than appear in the list you are viewing.

                                  • Workload - The name of the workload. If the workload's type is one of the recognized types (for example: PyTorch, MPI, Jupyter, Ray, Spark, Kubeflow, and many more), an appropriate icon is displayed.
                                  • Type - The Run:ai platform type of the workload - Workspace, Training, or Inference.
                                  • Status - The state of the workload. The workload states are described in the 'Run:ai Workloads' article.
                                  • Created by - The user or application that created this workload.
                                  • Running/requested pods - The number of running pods out of the number of requested pods within this workload.
                                  • Creation time - The workload's creation date and time.
                                  • Allocated GPU compute - The total amount of GPU compute allocated by this workload. A workload with 3 pods, each allocating 0.5 GPU, shows a value of 1.5 GPUs for the workload.
                                  • Allocated GPU memory - The total amount of GPU memory allocated by this workload. A workload with 3 pods, each allocating 20 GB, shows a value of 60 GB for the workload.
                                  • Allocated CPU compute (cores) - The total amount of CPU compute allocated by this workload. A workload with 3 pods, each allocating 0.5 cores, shows a value of 1.5 cores for the workload.
                                  • Allocated CPU memory - The total amount of CPU memory allocated by this workload. A workload with 3 pods, each allocating 5 GB of CPU memory, shows a value of 15 GB of CPU memory for the workload.
                                  "},{"location":"platform-admin/aiinitiatives/resources/node-pools/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
                                  • Download table - Click MORE and then Click Download as CSV. Export to CSV is limited to 20,000 rows.
                                  • Show/Hide details - Click to view additional information on the selected row
                                  "},{"location":"platform-admin/aiinitiatives/resources/node-pools/#showhide-details","title":"Show/Hide details","text":"

                                  Select a row in the Node pools table and then click Show details in the upper-right corner of the action bar. The details window appears, presenting metrics graphs for the whole node pool:

                                  • Node GPU allocation - This graph shows an overall sum of the Allocated, Unallocated, and Total number of GPUs for this node pool, over time. From observing this graph, you can learn about the occupancy of GPUs in this node pool, over time.

                                  • GPU Utilization Distribution - This graph shows the distribution of GPU utilization in this node pool over time. Observing this graph, you can learn how many GPUs are utilized up to 25%, 25%-50%, 50%-75%, and 75%-100%. This information helps to understand how many available resources you have in this node pool, and how well those resources are utilized by comparing the allocation graph to the utilization graphs, over time.

                                  • GPU Utilization - This graph shows the average GPU utilization in this node pool over time. Comparing this graph with the GPU Utilization Distribution helps to understand the actual distribution of GPU occupancy over time.

                                  • GPU Memory Utilization - This graph shows the average GPU memory utilization in this node pool over time, for example an average of all nodes\u2019 GPU memory utilization over time.

                                  • CPU Utilization - This graph shows the average CPU utilization in this node pool over time, for example, an average of all nodes\u2019 CPU utilization over time.

                                  • CPU Memory Utilization - This graph shows the average CPU memory utilization in this node pool over time, for example an average of all nodes\u2019 CPU memory utilization over time.

                                  "},{"location":"platform-admin/aiinitiatives/resources/node-pools/#adding-a-new-node-pool","title":"Adding a new node pool","text":"

                                  To create a new node pool:

                                  1. Click +NEW NODE POOL
                                  2. Enter a name for the node pool. Node pool names must start with a letter and can only contain lowercase Latin letters, numbers, or a hyphen (-)
                                  3. Enter the node pool label: The node pool controller will use this node-label key-value pair to match nodes into this node pool.

                                    • Key is the unique identifier of a node label.

                                      • The key must fit the following regular expression: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?/?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$
• The administrator can use an automatically preset label, such as nvidia.com/gpu.product (which identifies the GPU type), or any other node label key.
• Value is the value of that label identifier (key). The same key may have different values; in that case, each key-value pair is considered a different label.

• The value must fit the following regular expression: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ (see the validation sketch after these steps)
• A node pool is defined by a single key-value pair. Do not use label key-value pairs that are set on the same node by different node pools; this may lead to unexpected results.
                                  4. Set the GPU placement strategy:

                                    • Bin-pack - Place as many workloads as possible in each GPU and node to use fewer resources and maximize GPU and node vacancy.
• Spread - Spread workloads across as many GPUs and nodes as possible to minimize the load and maximize the available resources per workload.
                                    • GPU workloads are workloads that request both GPU and CPU resources
                                  5. Set the CPU placement strategy:

                                    • Bin-pack - Place as many workloads as possible in each CPU and node to use fewer resources and maximize CPU and node vacancy.
                                    • Spread - Spread workloads across as many CPUs and nodes as possible to minimize the load and maximize the available resources per workload.
                                    • CPU workloads are workloads that request purely CPU resources
                                  6. Click CREATE NODE POOL
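
The label key and value formats from step 3 can be checked before creating the node pool. Below is a minimal Python sketch that uses only the regular expressions quoted above; the example labels (nvidia.com/gpu.product, faculty=physics) are illustrations, not required values:

  import re

  # Regular expressions quoted in step 3 above
  KEY_PATTERN = r"^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?/?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$"
  VALUE_PATTERN = r"^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$"

  def is_valid_node_pool_label(key: str, value: str) -> bool:
      """Return True if the key-value pair fits the node pool label format."""
      return bool(re.match(KEY_PATTERN, key)) and bool(re.match(VALUE_PATTERN, value))

  print(is_valid_node_pool_label("nvidia.com/gpu.product", "NVIDIA-H100-80GB-HBM3"))  # True
  print(is_valid_node_pool_label("faculty", "physics"))                               # True
  print(is_valid_node_pool_label("faculty", "-physics"))                              # False: must start and end with an alphanumeric character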

                                  "},{"location":"platform-admin/aiinitiatives/resources/node-pools/#labeling-nodes-for-node-pool-grouping","title":"Labeling nodes for node-pool grouping:","text":"

The Infrastructure Administrator can use a preset node label, such as nvidia.com/gpu.product which identifies the GPU type, or configure any other node label (e.g. faculty=physics).

                                  To assign a label to nodes you want to group into a node pool, set a node label on each node:

                                  1. Get the list of nodes and their current labels using the following command:

  kubectl get nodes --show-labels

2. Label a specific node with a new key-value pair using the following command (a programmatic alternative follows these steps):

  kubectl label node <node-name> <key>=<value>
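
As a programmatic alternative to the kubectl commands above, nodes can also be labeled with the official Kubernetes Python client. This is a minimal sketch; the node names and the faculty=physics label are example values only:

  from kubernetes import client, config

  def label_nodes(node_names, key, value):
      """Attach the node label key=value to each of the given nodes."""
      config.load_kube_config()        # use config.load_incluster_config() when running inside the cluster
      v1 = client.CoreV1Api()
      patch = {"metadata": {"labels": {key: value}}}
      for name in node_names:
          v1.patch_node(name, patch)   # equivalent to: kubectl label node <name> <key>=<value>
          print(f"Labeled node {name} with {key}={value}")

  # Example: group two GPU nodes into a 'physics' node pool (node names are hypothetical)
  label_nodes(["worker-gpu-01", "worker-gpu-02"], "faculty", "physics")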

                                  "},{"location":"platform-admin/aiinitiatives/resources/node-pools/#editing-a-node-pool","title":"Editing a node pool","text":"
                                  1. Select the node pool you want to edit
                                  2. Click EDIT
                                  3. Update the node pool and click SAVE
                                  "},{"location":"platform-admin/aiinitiatives/resources/node-pools/#deleting-a-node-pool","title":"Deleting a node pool","text":"
                                  1. Select the node pool you want to delete
                                  2. Click DELETE
                                  3. On the dialog, click DELETE to confirm the deletion

                                  Note

The default node pool cannot be deleted. When a node pool is deleted, any node whose labels do not match another node pool is included in the default node pool.

                                  "},{"location":"platform-admin/aiinitiatives/resources/node-pools/#using-api","title":"Using API","text":"

                                  Go to the Node pools API reference to view the available actions
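
As an illustration of the kind of call documented there, the sketch below lists node pools over REST with Python. The base URL placeholder, endpoint path, and response field names are assumptions made for this example; confirm the exact routes and schema in the Node pools API reference:

  import requests

  BASE_URL = "https://<company>.run.ai"     # replace with your tenant URL
  TOKEN = "<bearer-token>"                  # obtained as described in the API authentication article
  CLUSTER_ID = "<cluster-uuid>"

  # Endpoint path assumed for illustration -- check the Node pools API reference for the exact route
  resp = requests.get(
      f"{BASE_URL}/api/v1/clusters/{CLUSTER_ID}/node-pools",
      headers={"Authorization": f"Bearer {TOKEN}"},
  )
  resp.raise_for_status()
  for pool in resp.json():
      # Field names are assumptions; inspect the real response to see the actual schema
      print(pool.get("name"), pool.get("labelKey"), pool.get("labelValue"))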

                                  "},{"location":"platform-admin/aiinitiatives/resources/nodes/","title":"Nodes","text":"

                                  This article explains the procedure for managing Nodes.

Nodes are Kubernetes elements automatically discovered by the Run:ai platform. Once a node is discovered, an associated instance is created in the Nodes table, administrators can view the node's relevant information, and the Run:ai scheduler can use the node for scheduling.

                                  "},{"location":"platform-admin/aiinitiatives/resources/nodes/#nodes-table","title":"Nodes table","text":"

                                  The Nodes table can be found under Resources in the Run:ai platform.

                                  The Nodes table displays a list of predefined nodes available to users in the Run:ai platform.

                                  Note

• It is not possible to create additional nodes, or to edit or delete existing nodes.
                                  • Only users with relevant permissions can view the table.

                                  The Nodes table consists of the following columns:

• Node - The Kubernetes name of the node
• Status - The state of the node. Nodes in the Ready state are eligible for scheduling. If the state is Not ready, the main reason appears in parentheses on the right side of the state field. Hovering over the state lists the reasons why a node is Not ready.
• Node pool - The name of the associated node pool. By default, every node in the Run:ai platform is associated with the default node pool, if no other node pool is associated
• GPU type - The GPU model, for example, H100 or V100
• GPU devices - The number of GPU devices installed on the node. Clicking this field pops up a dialog with details per GPU (described below in this article)
• Free GPU devices - The current number of fully vacant GPU devices
• GPU memory - The total amount of GPU memory installed on this node. For example, if the number is 640GB and the number of GPU devices is 8, then each GPU is installed with 80GB of memory (assuming the node is assembled of homogeneous GPU devices)
• Allocated GPUs - The total allocation of GPU devices in units of GPUs (decimal number). For example, if 3 GPUs are 50% allocated, the field prints out the value 1.50. This value represents the portion of GPU memory consumed by all running pods using this node
• Used GPU memory - The actual amount of memory (in GB or MB) used by pods running on this node
• GPU compute utilization - The average compute utilization of all GPU devices in this node
• GPU memory utilization - The average memory utilization of all GPU devices in this node
• CPU (Cores) - The number of CPU cores installed on this node
• CPU memory - The total amount of CPU memory installed on this node
• Allocated CPU (Cores) - The number of CPU cores allocated by pods running on this node (decimal number, e.g. a pod allocating 350 millicores shows an allocation of 0.35 cores)
• Allocated CPU memory - The total amount of CPU memory allocated by pods running on this node (in GB or MB)
• Used CPU memory - The total amount of CPU memory actually used by pods running on this node. Pods may allocate memory but not use all of it, or go beyond their CPU memory allocation if using Limit > Request for CPU memory (burstable workload)
• CPU compute utilization - The utilization of all CPU compute resources on this node (percentage)
• CPU memory utilization - The utilization of all CPU memory resources on this node (percentage)
• Used swap CPU memory - The amount of CPU memory (in GB or MB) used for GPU swap memory (* future)
• Pod(s) - A list of pods running on this node; click the field to view details (described below in this article)
"},{"location":"platform-admin/aiinitiatives/resources/nodes/#gpu-devices-for-node","title":"GPU devices for node","text":"

                                  Click one of the values in the GPU devices column, to view the list of GPU devices and their parameters.

• Index - The GPU index, read from the GPU hardware. The same index is used when accessing the GPU directly
• Used memory - The amount of memory used by pods and drivers using the GPU (in GB or MB)
• Compute utilization - The portion of time the GPU is being used by applications (percentage)
• Memory utilization - The portion of the GPU memory that is being used by applications (percentage)
• Idle time - The elapsed time since the GPU was last used (i.e. the GPU has been idle for 'Idle time')
"},{"location":"platform-admin/aiinitiatives/resources/nodes/#pods-associated-with-node","title":"Pods associated with node","text":"

                                  Click one of the values in the Pod(s) column, to view the list of pods and their parameters.

                                  Note

This column is only viewable if your role in the Run:ai platform gives you read access to workloads. Even if you are allowed to view workloads, you can only view the workloads within your allowed scope. This means there might be more pods running on this node than appear in the list you are viewing.

• Pod - The Kubernetes name of the pod. The pod name is usually made up of the parent workload's name (if there is one) and an index that is unique for that pod instance within the workload
• Status - The state of the pod. In steady state this should be Running, together with the amount of time the pod has been running
• Project - The Run:ai project name the pod belongs to. Clicking this field takes you to the Projects table filtered by this project name
• Workload - The workload name the pod belongs to. Clicking this field takes you to the Workloads table filtered by this workload name
• Image - The full path of the image used by the main container of this pod
• Creation time - The pod's creation date and time
"},{"location":"platform-admin/aiinitiatives/resources/nodes/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
• Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
                                  • Show/Hide details - Click to view additional information on the selected row
                                  "},{"location":"platform-admin/aiinitiatives/resources/nodes/#showhide-details","title":"Show/Hide details","text":"

                                  Click a row in the Nodes table and then click the Show details button at the upper right side of the action bar. The details screen appears, presenting the following metrics graphs:

• GPU utilization - A per-GPU graph and an average-of-all-GPUs graph, shown on the same chart over an adjustable period, let you see the trends of all GPUs' compute utilization (percentage of GPU compute) in this node.
• GPU memory utilization - A per-GPU graph and an average-of-all-GPUs graph, shown on the same chart over an adjustable period, let you see the trends of all GPUs' memory usage (percentage of GPU memory) in this node.
• CPU compute utilization - A graph of the average compute utilization of all CPU cores, over an adjustable period, lets you see the trend of CPU compute utilization (percentage of CPU compute) in this node.
• CPU memory utilization - A single graph of the utilization of all CPU memory, over an adjustable period, lets you see the trend of CPU memory utilization (percentage of CPU memory) in this node.
• CPU memory usage - A single graph of the usage of all CPU memory, over an adjustable period, lets you see the trend of CPU memory usage (in GB or MB of CPU memory) in this node.

• For GPU charts - Click the GPU legend on the right-hand side of the chart to activate or deactivate any of the GPU lines.

                                  • You can click the date picker to change the presented period
                                  • You can use your mouse to mark a sub-period in the graph for zooming in, and use the \u2018Reset zoom\u2019 button to go back to the preset period
                                  • Changes in the period affect all graphs on this screen.
                                  "},{"location":"platform-admin/aiinitiatives/resources/nodes/#using-api","title":"Using API","text":"

                                  Go to the Nodes API reference to view the available actions

                                  "},{"location":"platform-admin/authentication/accessrules/","title":"Access Rules","text":"

                                  This article explains the procedure to manage Access rules.

                                  Access rules provide users, groups, or applications privileges to system entities.

                                  An access rule is the assignment of a role to a subject in a scope: <Subject> is a <Role> in a <Scope>.

                                  For example, user user@domain.com is a department admin in department A.
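
Programmatically, the <Subject> / <Role> / <Scope> triplet maps onto a small JSON payload. The sketch below creates an access rule over REST with Python; the endpoint path, field names, and ID values are assumptions made for illustration, so check the Access rules API reference for the exact schema:

  import requests

  BASE_URL = "https://<company>.run.ai"    # your tenant URL
  TOKEN = "<bearer-token>"                 # see the API authentication article

  # <Subject> is a <Role> in a <Scope>: user@domain.com is a department admin in department A
  access_rule = {
      "subjectType": "user",               # user / group / app (field names assumed)
      "subjectId": "user@domain.com",
      "roleId": "<department-admin-role-id>",
      "scopeType": "department",
      "scopeId": "<department-A-id>",
  }

  # Endpoint path assumed for illustration
  resp = requests.post(
      f"{BASE_URL}/api/v1/authorization/access-rules",
      headers={"Authorization": f"Bearer {TOKEN}"},
      json=access_rule,
  )
  resp.raise_for_status()
  print("Created access rule:", resp.json())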

                                  "},{"location":"platform-admin/authentication/accessrules/#access-rules-table","title":"Access rules table","text":"

                                  The Access rules table can be found under Access in the Run:ai platform.

                                  The Access rules table provides a list of all the access rules defined in the platform and allows you to manage them.

                                  Note

                                  Flexible management

                                  It is also possible to manage access rules directly for a specific user, application, project, or department.

                                  The Access rules table consists of the following columns:

• Type - The type of subject assigned to the access rule (user, SSO group, or application)
• Subject - The user, SSO group, or application assigned with the role
• Role - The role assigned to the subject
• Scope - The scope to which the subject has access. Click the name of the scope to see the scope and its subordinates
• Authorized by - The user who granted the access rule
• Creation time - The timestamp for when the rule was created
• Last updated - The last time the access rule was updated
"},{"location":"platform-admin/authentication/accessrules/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
• Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
                                  "},{"location":"platform-admin/authentication/accessrules/#adding-new-access-rules","title":"Adding new access rules","text":"

                                  To add a new access rule:

                                  1. Click +NEW ACCESS RULE
2. Select a subject: User, SSO Group, or Application
                                  3. Select or enter the subject identifier:
• User - the email of a local user created in Run:ai, or of an SSO user as recognized by the IDP
• Group - the group name as recognized by the IDP
• Application - the application name as created in Run:ai
                                  4. Select a role
                                  5. Select a scope
                                  6. Click SAVE RULE

                                  Note

                                  An access rule consists of a single subject with a single role in a single scope. To assign multiple roles or multiple scopes to the same subject, multiple access rules must be added.

                                  "},{"location":"platform-admin/authentication/accessrules/#editing-an-access-rule","title":"Editing an access rule","text":"

                                  Access rules cannot be edited. To change an access rule, you must delete the rule, and then create a new rule to replace it.

                                  "},{"location":"platform-admin/authentication/accessrules/#deleting-an-access-rule","title":"Deleting an access rule","text":"
                                  1. Select the access rule you want to delete
                                  2. Click DELETE
                                  3. On the dialog, click DELETE to confirm the deletion
                                  "},{"location":"platform-admin/authentication/accessrules/#using-api","title":"Using API","text":"

                                  Go to the Access rules API reference to view the available actions

                                  "},{"location":"platform-admin/authentication/applications/","title":"Applications","text":"

                                  This article explains the procedure to manage your organization's applications.

                                  Applications are used for API integrations with Run:ai. An application contains a client ID and a client secret. With the client credentials, you can obtain a token as detailed in API authentication and use it within subsequent API calls.
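
For illustration, the sketch below exchanges an application's client credentials for a token and uses it in a follow-up call. The endpoint path, request fields, and response field name are assumptions for this example; the API authentication article documents the exact flow:

  import requests

  BASE_URL = "https://<company>.run.ai"        # your tenant URL

  # Token endpoint and field names assumed for illustration -- see the API authentication article
  token_resp = requests.post(
      f"{BASE_URL}/api/v1/token",
      json={
          "grantType": "client_credentials",
          "clientId": "<client-id>",           # from the application created as described below
          "clientSecret": "<client-secret>",
      },
  )
  token_resp.raise_for_status()
  token = token_resp.json()["accessToken"]     # response field name assumed

  # Use the token in subsequent API calls (endpoint shown is a placeholder)
  headers = {"Authorization": f"Bearer {token}"}
  print(requests.get(f"{BASE_URL}/api/v1/some-resource", headers=headers).status_code)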

                                  Applications are assigned with Access Rules to manage permissions. For example, application ci-pipeline-prod is assigned with a Researcher role in Cluster: A.

                                  "},{"location":"platform-admin/authentication/applications/#applications-table","title":"Applications table","text":"

                                  The Applications table can be found under Access in the Run:ai platform.

                                  The Applications table provides a list of all the applications defined in the platform, and allows you to manage them.

                                  The Applications table consists of the following columns:

• Application - The name of the application
• Client ID - The client ID of the application
• Access rule(s) - The access rules assigned to the application
• Last login - The timestamp for the last time the user signed in
• Created by - The user who created the application
• Creation time - The timestamp for when the application was created
• Last updated - The last time the application was updated
"},{"location":"platform-admin/authentication/applications/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
• Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
                                  "},{"location":"platform-admin/authentication/applications/#creating-an-application","title":"Creating an application","text":"

                                  To create an application:

                                  1. Click +NEW APPLICATION
                                  2. Enter the application\u2019s name
                                  3. Click CREATE
                                  4. Copy the Client ID and Client secret and store them securely
                                  5. Click DONE

                                  Note

                                  The client secret is visible only at the time of creation. It cannot be recovered but can be regenerated.

                                  "},{"location":"platform-admin/authentication/applications/#adding-an-access-rule-to-an-application","title":"Adding an access rule to an application","text":"

                                  To create an access rule:

                                  1. Select the application you want to add an access rule for
                                  2. Click ACCESS RULES
                                  3. Click +ACCESS RULE
                                  4. Select a role
                                  5. Select a scope
                                  6. Click SAVE RULE
                                  7. Click CLOSE
                                  "},{"location":"platform-admin/authentication/applications/#deleting-an-access-rule-from-an-application","title":"Deleting an access rule from an application","text":"

                                  To delete an access rule:

                                  1. Select the application you want to remove an access rule from
                                  2. Click ACCESS RULES
3. Find the access rule you would like to delete
                                  4. Click on the trash icon
                                  5. Click CLOSE
                                  "},{"location":"platform-admin/authentication/applications/#regenerating-client-secret","title":"Regenerating client secret","text":"

                                  To regenerate a client secret:

1. Locate the application whose client secret you want to regenerate
                                  2. Click REGENERATE CLIENT SECRET
                                  3. Click REGENERATE
                                  4. Copy the New client secret and store it securely
                                  5. Click DONE

                                  Warning

                                  Regenerating a client secret revokes the previous one.

                                  "},{"location":"platform-admin/authentication/applications/#deleting-an-application","title":"Deleting an application","text":"
                                  1. Select the application you want to delete
                                  2. Click DELETE
                                  3. On the dialog, click DELETE to confirm
                                  "},{"location":"platform-admin/authentication/applications/#using-api","title":"Using API","text":"

                                  Go to the Applications, Access rules API reference to view the available actions

                                  "},{"location":"platform-admin/authentication/roles/","title":"Roles","text":"

                                  This article explains the available roles in the Run:ai platform.

                                  A role is a set of permissions that can be assigned to a subject in a scope.

                                  A permission is a set of actions (View, Edit, Create and Delete) over a Run:ai entity (e.g. projects, workloads, users).

                                  "},{"location":"platform-admin/authentication/roles/#roles-table","title":"Roles table","text":"

                                  The Roles table can be found under Access in the Run:ai platform.

The Roles table displays a list of predefined roles available to users in the Run:ai platform. It is not possible to create additional roles, or to edit or delete existing roles.

                                  The Roles table consists of the following columns:

• Role - The name of the role
• Created by - The name of the role creator
• Creation time - The timestamp when the role was created
"},{"location":"platform-admin/authentication/roles/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
• Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
                                  "},{"location":"platform-admin/authentication/roles/#reviewing-a-role","title":"Reviewing a role","text":"
1. To review a role, click the role name in the table
2. In the role form, review the following:
  • Role name - The name of the role
  • Entity - A system-managed object that can be viewed, edited, created, or deleted by a user based on their assigned role and scope
  • Actions - The actions that the role assignee is authorized to perform for each entity
    • View - If checked, an assigned user with this role can view instances of this type of entity within their defined scope
    • Edit - If checked, an assigned user with this role can change the settings of an instance of this type of entity within their defined scope
    • Create - If checked, an assigned user with this role can create new instances of this type of entity within their defined scope
    • Delete - If checked, an assigned user with this role can delete instances of this type of entity within their defined scope
                                  "},{"location":"platform-admin/authentication/roles/#roles-in-runai","title":"Roles in Run:ai","text":"

Run:ai supports the following roles and their permissions. Under each role is a detailed list of the actions that the role assignee is authorized to perform for each entity.

                                  Compute resource administrator

                                  Data source administrator

                                  Data volume administrator

                                  Department administrator

                                  Department viewer

                                  Editor

                                  Environment administrator

                                  L1 researcher

                                  L2 researcher

                                  ML engineer

                                  Research manager

                                  System administrator

                                  Template administrator

                                  Viewer

                                  Notes

                                  Keep the following in mind when upgrading from versions 2.13 or earlier:

                                  • Admin becomes System Admin with full access to all managed objects and scopes
                                  • Research Manager is not automatically assigned to all projects, but to projects set by the relevant Admin when assigning this role to a user, group or app
                                  • To preserve backwards compatibility, users with the role of Research Manager are assigned to all current projects, but not to new projects
• To allow the Department Admin to assign a Researcher role to a user, group, or app, the Department Admin must have VECD (View, Edit, Create, and Delete) permissions for jobs and workspaces. This creates a broader span of managed objects
• To preserve backwards compatibility, users with the role of Editor are assigned the same scope they had before the upgrade. However, for new user assignments, the Admin can limit the scope to only part of the organizational scope.
                                  "},{"location":"platform-admin/authentication/roles/#permitted-workloads","title":"Permitted workloads","text":"

When assigning a role with any combination of the View, Edit, Create, and Delete permissions for workloads, the subject can manage not only Run:ai native workloads (Workspace, Training, Inference), but also the following 3rd-party workloads:

                                  • k8s: StatefulSet
                                  • k8s: ReplicaSet
                                  • k8s: Pod
                                  • k8s: Deployment
                                  • batch: Job
                                  • batch: CronJob
                                  • machinelearning.seldon.io: SeldonDeployment
                                  • kubevirt.io: VirtualMachineInstance
                                  • kubeflow.org: TFJob
                                  • kubeflow.org: PyTorchJob
                                  • kubeflow.org: XGBoostJob
                                  • kubeflow.org: MPIJob
                                  • kubeflow.org: Notebook
                                  • kubeflow.org: ScheduledWorkflow
                                  • amlarc.azureml.com: AmlJob
                                  • serving.knative.dev: Service
                                  • workspace.devfile.io: DevWorkspace
                                  • ray.io: RayCluster
                                  • ray.io: RayJob
                                  • ray.io: RayService
                                  • tekton.dev: TaskRun
                                  • tekton.dev: PipelineRun
                                  • argoproj.io: Workflow
                                  "},{"location":"platform-admin/authentication/roles/#using-api","title":"Using API","text":"

                                  Go to the Roles API reference to view the available actions.

                                  "},{"location":"platform-admin/authentication/users/","title":"Users","text":"

                                  This article explains the procedure to manage users and their permissions.

Users can be managed locally or via the identity provider, and are assigned Access Rules to manage their permissions.

                                  For example, user user@domain.com is a department admin in department A.

                                  "},{"location":"platform-admin/authentication/users/#users-table","title":"Users table","text":"

                                  The Users table can be found under Access in the Run:ai platform.

                                  The users table provides a list of all the users in the platform. You can manage local users and manage user permissions (access rules) for both local and SSO users.

                                  Note

                                  Single Sign-On users

                                  SSO users are managed by the identity provider and appear once they have signed in to Run:ai

                                  The Users table consists of the following columns:

• User - The unique identity of the user (email address)
• Type - The type of the user - SSO / local
• Last login - The timestamp for the last time the user signed in
• Access rule(s) - The access rules assigned to the user
• Created By - The user who created the user
• Creation time - The timestamp for when the user was created
• Last updated - The last time the user was updated
"},{"location":"platform-admin/authentication/users/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
• Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
                                  "},{"location":"platform-admin/authentication/users/#creating-a-local-user","title":"Creating a local user","text":"

                                  To create a local user:

                                  1. Click +NEW LOCAL USER
                                  2. Enter the user\u2019s Email address
                                  3. Click CREATE
                                  4. Review and copy the user\u2019s credentials:
                                    • User Email
                                    • Temporary password to be used on first sign-in
                                  5. Click DONE

                                  Note

                                  The temporary password is visible only at the time of user\u2019s creation, and must be changed after the first sign-in

                                  "},{"location":"platform-admin/authentication/users/#adding-an-access-rule-to-a-user","title":"Adding an access rule to a user","text":"

                                  To create an access rule:

                                  1. Select the user you want to add an access rule for
                                  2. Click ACCESS RULES
                                  3. Click +ACCESS RULE
                                  4. Select a role
                                  5. Select a scope
                                  6. Click SAVE RULE
                                  7. Click CLOSE
                                  "},{"location":"platform-admin/authentication/users/#deleting-users-access-rule","title":"Deleting user\u2019s access rule","text":"

                                  To delete an access rule:

                                  1. Select the user you want to remove an access rule from
                                  2. Click ACCESS RULES
                                  3. Find the access rule assigned to the user you would like to delete
                                  4. Click on the trash icon
                                  5. Click CLOSE
                                  "},{"location":"platform-admin/authentication/users/#resetting-a-user-password","title":"Resetting a user password","text":"

                                  To reset a user\u2019s password:

1. Select the user whose password you want to reset
                                  2. Click RESET PASSWORD
                                  3. Click RESET
                                  4. Review and copy the user\u2019s credentials:
                                    • User Email
                                    • Temporary password to be used on next sign-in
                                  5. Click DONE
                                  "},{"location":"platform-admin/authentication/users/#deleting-a-user","title":"Deleting a user","text":"
                                  1. Select the user you want to delete
                                  2. Click DELETE
                                  3. In the dialog, click DELETE to confirm the deletion

                                  Note

                                  To ensure administrative operations are always available, at least one local user with System Administrator role should exist.

                                  "},{"location":"platform-admin/authentication/users/#using-api","title":"Using API","text":"

                                  Go to the Users, Access rules API reference to view the available actions

                                  "},{"location":"platform-admin/integrations/integration-overview/","title":"Integrations with Run:ai","text":"

                                  The table below summarizes the integration capabilities with various third-party products.

                                  "},{"location":"platform-admin/integrations/integration-overview/#integration-support","title":"Integration support","text":"

Support for integrations varies. Where noted below, the integration is supported out of the box with Run:ai. For other integrations, our customer success team has prior experience integrating with the third-party software, and in many cases the community portal contains additional reference documentation provided on an as-is basis.

                                  The Run:ai community portal is password protected and access is provided to customers and partners.

                                  "},{"location":"platform-admin/integrations/integration-overview/#integrations","title":"Integrations","text":"Tool Category Run:ai support details Additional Information Triton Orchestration Supported Usage via docker base image. Quickstart inference example Spark Orchestration Community Support It is possible to schedule Spark workflows with the Run:ai scheduler. For details, please contact Run:ai customer support. Sample code can be found in the Run:ai customer success community portal: https://runai.my.site.com/community/s/article/How-to-Run-Spark-jobs-with-Run-AI Kubeflow Pipelines Orchestration Community Support It is possible to schedule kubeflow pipelines with the Run:ai scheduler. For details please contact Run:ai customer support. Sample code can be found in the Run:ai customer success community portalhttps://runai.my.site.com/community/s/article/How-to-integrate-Run-ai-with-Kubeflow Apache Airflow Orchestration Community Support It is possible to schedule Airflow workflows with the Run:ai scheduler. For details, please contact Run:ai customer support. Sample code can be found in the Run:ai customer success community portal: https://runai.my.site.com/community/s/article/How-to-integrate-Run-ai-with-Apache-Airflow Argo workflows Orchestration Community Support It is possible to schedule Argo workflows with the Run:ai scheduler. For details, please contact Run:ai customer support. Sample code can be found in the Run:ai customer success community portal https://runai.my.site.com/community/s/article/How-to-integrate-Run-ai-with-Argo-Workflows SeldonX Orchestration Community Support It is possible to schedule Seldon Core workloads with the Run:ai scheduler. For details, please contact Run:ai customer success. Sample code can be found in the Run:ai customer success community portal: https://runai.my.site.com/community/s/article/How-to-integrate-Run-ai-with-Seldon-Core Jupyter Notebook Development Supported Run:ai provides integrated support with Jupyter Notebooks. Quickstart example: https://docs.run.ai/latest/Researcher/Walkthroughs/quickstart-jupyter/ Jupyter Hub Development Community Support It is possible to submit Run:ai workloads via JupyterHub. For more information please contact Run:ai customer support PyCharm Development Supported Containers created by Run:ai can be accessed via PyCharm. PyCharm example VScode Development Supported - Containers created by Run:ai can be accessed via Visual Studio Code. example - You can automatically launch Visual Studio code web from the Run:ai console. example. Kubeflow notebooks Development Community Support It is possible to launch a kubeflow notebook with the Run:ai scheduler. For details please contact Run:ai customer support Sample code can be found in the Run:ai customer success community portal:https://runai.my.site.com/community/s/article/How-to-integrate-Run-ai-with-Kubeflow Ray training, inference, data processing. Community Support It is possible to schedule Ray jobs with the Run:ai scheduler. Sample code can be found in the Run:ai customer success community portal https://runai.my.site.com/community/s/article/How-to-Integrate-Run-ai-with-Ray TensorBoard Experiment tracking Supported Run:ai comes with a preset Tensorboard Environment asset. TensorBoard example. Additional sample Weights & Biases Experiment tracking Community Support It is possible to schedule W&B workloads with the Run:ai scheduler. For details, please contact Run:ai customer success. 
ClearML Experiment tracking Community Support It is possible to schedule ClearML workloads with the Run:ai scheduler. For details, please contact Run:ai customer success. MLFlow Model Serving Community Support It is possible to use ML Flow together with the Run:ai scheduler. For details, please contact Run:ai customer support. Sample code can be found in the Run:ai customer success community portal: https://runai.my.site.com/community/s/article/How-to-integrate-Run-ai-with-MLflow Additional MLFlow sample Hugging Face Repositories Supported Run:ai provides an out of the box integration with Hugging Face Docker Registry Repositories Supported Run:ai allows using a docker registry as a Credentials asset. S3 Storage Supported Run:ai communicates with S3 by defining a data source asset. Github Storage Supported Run:ai communicates with GitHub by defining it as a data source asset Tensorflow Training Supported Run:ai provides out of the box support for submitting TensorFlow workloads via API or by submitting workloads via user interface. Pytorch Training Supported Run:ai provides out of the box support for submitting PyTorch workloads via API or by submitting workloads via user interface. Kubeflow MPI Training Supported Run:ai provides out of the box support for submitting MPI workloads via API or by submitting workloads via user interface XGBoost Training Supported Run:ai provides out of the box support for submitting XGBoost workloads via API or by submitting workloads via user interface Karpenter Cost Optimization Supported Run:ai provides out of the box support for Karpenter to save cloud costs. Integration notes with Karpenter can be found here"},{"location":"platform-admin/integrations/integration-overview/#kubernetes-workloads-integration","title":"Kubernetes Workloads Integration","text":"

                                  Kubernetes has several built-in resources that encapsulate running Pods. These are called Kubernetes Workloads and should not be confused with Run:ai Workloads.

                                  Examples of such resources are a Deployment that manages a stateless application, or a Job that runs tasks to completion.

Run:ai natively runs Run:ai Workloads. A Run:ai workload encapsulates all the resources needed to run, creates them, and deletes them together. However, Run:ai, being an open platform, also allows the scheduling of any Kubernetes Workload.

                                  For more information see Kubernetes Workloads Integration.

                                  "},{"location":"platform-admin/integrations/karpenter/","title":"Working with Karpenter","text":"

                                  Karpenter is an open-source, Kubernetes cluster autoscaler built for cloud deployments. Karpenter optimizes the cloud cost of a customer\u2019s cluster by moving workloads between different node types, consolidating workloads into fewer nodes, using lower-cost nodes where possible, scaling up new nodes when needed, and shutting down unused nodes.

                                  Karpenter\u2019s main goal is cost optimization. Unlike Karpenter, Run:ai\u2019s scheduler optimizes for fairness and resource utilization. Therefore, there are a few potential friction points when using both on the same cluster.

                                  "},{"location":"platform-admin/integrations/karpenter/#friction-points-using-karpenter-with-runai","title":"Friction points using Karpenter with Run:ai","text":"
                                  1. Karpenter looks for \u201cunschedulable\u201d pending workloads and may try to scale up new nodes to make those workloads schedulable. However, in some scenarios, these workloads may exceed their quota parameters, and the Run:ai scheduler will put them into a pending state.
                                  2. Karpenter is not aware of the Run:ai fractions mechanism and may try to interfere incorrectly.
                                  3. Karpenter preempts any type of workload (i.e., high-priority, non-preemptible workloads will potentially be interrupted and moved to save cost).
                                  4. Karpenter has no pod-group (i.e., workload) notion or gang scheduling awareness, meaning that Karpenter is unaware that a set of \u201carbitrary\u201d pods is a single workload. This may cause Karpenter to schedule those pods into different node pools (in the case of multi-node-pool workloads) or scale up or down a mix of wrong nodes.
                                  "},{"location":"platform-admin/integrations/karpenter/#mitigating-the-friction-points","title":"Mitigating the friction points","text":"

                                  Run:ai scheduler mitigates the friction points using the following techniques (each numbered bullet below corresponds to the related friction point listed above):

                                  1. Karpenter uses a \u201cnominated node\u201d to recommend a node for the scheduler. The Run:ai scheduler treats this as a \u201cpreferred\u201d recommendation, meaning it will try to use this node, but it\u2019s not required and it may choose another node.
                                  2. Fractions - Karpenter won\u2019t consolidate nodes with one or more pods that cannot be moved. The Run:ai reservation pod is marked as \u2018do not evict\u2019 to allow the Run:ai scheduler to control the scheduling of fractions.
                                  3. Non-preemptible workloads - Run:ai marks non-preemptible workloads as \u2018do not evict\u2019 and Karpenter respects this annotation.
                                  4. Run:ai node pools (single-node-pool workloads) - Karpenter respects the \u2018node affinity\u2019 that Run:ai sets on a pod, so Karpenter uses the node affinity for its recommended node. For the gang-scheduling/pod-group (workload) notion, Run:ai scheduler considers Karpenter directives as preferred recommendations rather than mandatory instructions and overrides Karpenter instructions where appropriate.
                                  "},{"location":"platform-admin/integrations/karpenter/#deployment-considerations","title":"Deployment Considerations","text":"
                                  • Using multi-node-pool workloads
• Workloads may include a list of optional node pools. Karpenter is not aware that only a single node pool should be selected out of that list for the workload. It may therefore recommend putting pods of the same workload into different node pools, and may scale up nodes from different node pools to serve a "multi-node-pool" workload instead of nodes in the selected single node pool.
• If this becomes an issue (i.e., if Karpenter scales up the wrong node types), users can set an inter-pod affinity using the node pool label or another common label as a 'topology' identifier (see the sketch after this list). This forces Karpenter to choose nodes from a single node pool per workload, selecting from any of the node pools the workload allows.
                                    • An alternative approach is to use a single-node pool for each workload instead of multi-node pools.
                                  • Consolidation
• To make Karpenter more effective when using its consolidation function, users should consider separating preemptible and non-preemptible workloads, either by using node pools, node affinities, taints/tolerations, or inter-pod anti-affinity.
                                    • If users don\u2019t separate preemptible and non-preemptible workloads (i.e., make them run on different nodes), Karpenter\u2019s ability to consolidate (binpack) and shut down nodes will be reduced, but it is still effective.
                                  • Conflicts between binpacking and spread policies
• If Run:ai is used with a scheduling spread policy, it will clash with Karpenter's default bin-packing/consolidation policy, and the outcome may be a deployment that is not optimized for either policy.
                                    • Usually spread is used for Inference, which is non-preemptible and therefore not controlled by Karpenter (Run:ai scheduler will mark those workloads as \u2018do not evict\u2019 for Karpenter), so this should not present a real deployment issue for customers.
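
To illustrate the inter-pod affinity mentioned in the first deployment consideration, the sketch below builds the pod-spec affinity stanza that keeps all pods of one workload on nodes that share the same node pool label. The label keys shown (faculty as the node pool label key, workload-name as the pod selector) are examples only; substitute the keys used in your cluster:

  # Pod-spec 'affinity' stanza expressed as a Python dict, e.g. for a templated manifest.
  def same_node_pool_affinity(workload_name: str, node_pool_label_key: str = "faculty") -> dict:
      return {
          "podAffinity": {
              "requiredDuringSchedulingIgnoredDuringExecution": [
                  {
                      # Select the other pods of the same workload (label key is an example)
                      "labelSelector": {"matchLabels": {"workload-name": workload_name}},
                      # Using the node pool label as the topology key forces all pods of the
                      # workload onto nodes carrying the same node pool label value.
                      "topologyKey": node_pool_label_key,
                  }
              ]
          }
      }

  # Example usage: merge the returned dict into each pod template of the workload
  print(same_node_pool_affinity("distributed-train-01"))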
                                  "},{"location":"platform-admin/performance/dashboard-analysis/","title":"Introduction","text":"

The Run:ai Administration User Interface provides a set of dashboards that help you monitor Clusters, Cluster Nodes, Projects, and Workloads. This document describes the key metrics to monitor, how to assess them, and suggested actions.

                                  Dashboards are used by system administrators to analyze and diagnose issues that relate to:

                                  • Physical Resources.
                                  • Organization resource allocation and utilization.
                                  • Usage characteristics.

System administrators need to know important information about the physical resources that are currently being used, such as:

                                  • Resource health.
                                  • Available resources and their distribution.
• Whether there is a lack of resources.
• Whether resources are being utilized correctly.

                                  With this information, system administrators can hone in on:

                                  • How resources are allocated across the organization.
• How the different organizational units utilize quotas and resources within those quotas.
                                  • The actual performance of the organizational units.

                                  These dashboards give system administrators the ability to drill down to see details of the different types of workloads that each of the organizational units is running. These usage and performance metrics ensure that system administrators can then take actions to correct issues that affect performance.

                                  There are 5 dashboards:

                                  • GPU/CPU Overview dashboard\u2014Provides information about what is happening right now in the cluster.
                                  • Quota Management dashboard\u2014Provides information about quota utilization.
                                  • Analytics dashboard\u2014Provides long term analysis of cluster behavior.
                                  • Multi-Cluster Overview dashboard\u2014Provides a more holistic, multi-cluster view of what is happening right now. The dashboard is intended for organizations that have more than one connected cluster.
                                  • Consumption dashboard\u2014Provides information about resource consumption.
                                  "},{"location":"platform-admin/performance/dashboard-analysis/#gpucpu-overview-dashboard-new-and-legacy","title":"GPU/CPU Overview Dashboard (New and legacy)","text":"

                                  The Overview dashboard provides information about what is happening right now in the cluster. Administrators can view high-level information on the state of the cluster. The dashboard has two tabs that change the display to provide a focused view for GPU Dashboards (default view) and CPU Dashboards.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#gpu-dashboard","title":"GPU Dashboard","text":"

The GPU dashboard displays specific information for GPU-based nodes, node pools, clusters, or tenants. These dashboards also include additional metrics that are specific to GPU-based environments. The dashboard contains tiles that show information about specific resource allocation and performance metrics. The tiles are interactive, allowing you to link directly to the assets or drill down to specific scopes. Use the time frame selector to choose a time frame for all the tiles in the dashboard.

                                  The dashboard has the following tiles:

                                  • Ready nodes\u2014displays GPU nodes that are in the ready state.
                                  • Ready GPU devices\u2014displays the number of GPUs in nodes that are in the ready state.
                                  • Allocated GPU compute\u2014displays the total number of GPUs allocated from all the nodes.
                                  • Idle allocated GPU devices\u2014displays the number of allocated GPU devices that have been idle for more than 5 minutes.
                                  • Running workloads\u2014displays the number of running workloads.
                                  • Pending workloads\u2014displays the number of workloads in the pending status.
• Allocation ratio by node pool\u2014displays the percentage of GPUs allocated per node pool. Hover over the bar for detailed information. Use the scope selector at the bottom of the graph to drill down for more details.
• Free resources by node pool\u2014the graph displays the amount of free resources per node pool. Press an entry in the graph for more details. Hover over the resource bubbles for specific details for the workers in the node. Use the ellipsis to download the graph as a CSV file.
• Resource allocation by workload type\u2014displays the resource allocation by workload type. Hover over the bar for detailed information. Use the scope selector at the bottom of the graph to drill down for more details. Use the ellipsis to download the graph as a CSV file.
• Workload by status\u2014displays the number of workloads for each status in the workloads table. Hover over the bar for detailed information. Use the scope selector at the bottom of the graph to drill down for more details. Use the ellipsis to download the graph as a CSV file.
                                  • Resources utilization\u2014displays the resource utilization over time. The right pane of the graph shows the average utilization of the selected time frame of the dashboard. Hover over the graph to see details of a specific time in the graph. Use the ellipsis to download the graph as a CSV file.
                                  • Resource allocation\u2014displays the resource allocation over time. The right pane of the graph shows the average allocation of the selected time frame of the dashboard. Hover over the graph to see details of a specific time in the graph. Use the ellipsis to download the graph as a CSV file.
                                  "},{"location":"platform-admin/performance/dashboard-analysis/#cpu-dashboard","title":"CPU Dashboard","text":"

                                  The CPU dashboards display specific information for CPU-based nodes, node-pools, clusters, or tenants. These dashboards also include additional metrics that are specific to CPU-based environments.

                                  To enable CPU Dashboards:

                                  1. Click General settings
                                  2. Open the Analytics pane and toggle the Show CPU dashboard switch to enable the feature.

                                  Toggle the switch off to disable the CPU Dashboards option.

                                  The dashboard contains the following tiles:

                                  • Total CPU Nodes\u2014displays the total number of CPU nodes.
                                  • Ready CPU nodes\u2014displays the total number of CPU nodes in the ready state.
                                  • Total CPUs\u2014displays the total number of CPUs.
                                  • Ready CPUs\u2014displays the total number of CPUs in the ready state.
                                  • Allocated CPUs\u2014displays the number of allocated CPUs.
                                  • Running workloads\u2014displays the number of workloads in the running state.
                                  • Pending workloads\u2014displays the number of workloads in the pending state.
                                  • Allocated CPUs per project\u2014displays the number of CPUs allocated per project.
                                  • Active projects\u2014displays the active projects with their CPU allocation and the number of running and pending workloads.
                                  • Utilization per resource type\u2014displays the CPU compute and CPU memory utilization over time.
                                  • CPU compute utilization\u2014displays the current CPU compute utilization.
                                  • CPU memory utilization\u2014displays the current CPU memory utilization.
                                  • Pending workloads\u2014displays the requested resources and wait time for workloads in the pending status.
                                  • Workloads with error\u2014displays the number of workloads that are currently not running due to an error.
                                  • Workload Count per CPU Compute Utilization\u2014displays the number of workloads grouped by their CPU compute utilization.
                                  • 5 longest running workloads\u2014displays up to 5 workloads with the longest running time.

                                  Analysis and Suggested actions:

                                  • Interactive Workloads are too frequently idle - Consider setting time limits for interactive Workloads through the Projects tab. Consider also reducing GPU/CPU quotas for specific Projects to encourage users to run more training Workloads as opposed to interactive Workloads (note that interactive Workloads cannot use more than the GPU/CPU quota assigned to their Project).
                                  • Training Workloads are too frequently idle - Identify and notify the right users and work with them to improve the utilization of their training scripts.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#workloads-with-an-error","title":"Workloads with an Error","text":"

                                  Search for Workloads with an error status. These Workloads may be holding GPUs/CPUs without actually using them.

                                  Analysis and Suggested actions:

                                  Search for workloads with an Error status on the Workloads view and discuss with the Job owner. Consider deleting these Workloads to free up the resources for other users.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#workloads-with-a-long-duration","title":"Workloads with a Long Duration","text":"

                                  View the list of the 5 longest-running Workloads.

                                  Analysis and Suggested actions:

                                  • Training Workloads run for too long - Ask users to view their Workloads and analyze whether useful work is being done. If needed, stop their Workloads.
                                  • Interactive Workloads run for too long - Consider setting time limits for interactive Workloads via the Project editor.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#job-queue","title":"Job Queue","text":"

                                  Identify queueing bottlenecks.

                                  Analysis and Suggested actions:

                                  • Cluster is fully loaded - Go over the table of active Projects and check that fairness between Projects was enforced by reviewing the number of allocated GPUs/CPUs for each Project, ensuring each Project was allocated its fair-share portion of the cluster.
                                  • Cluster is not fully loaded - Go to the Workloads view to review the resources requested for that Job (CPU, CPU memory, GPU, GPU memory). Go to the Nodes view to verify that there is no Node with enough free resources to host that Job.

                                  Also, check the command that the user used to submit the job. The Researcher may have requested a specific Node for that Job.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#analytics-dashboard","title":"Analytics Dashboard","text":"

                                  The Analytics dashboard provides a means of viewing historical data on cluster information, such as:

                                  • Utilization across the cluster
                                  • GPU usage by different Projects, including allocation and utilization, broken down into interactive and training Workloads
                                  • Breakdown of running Workloads into interactive, training, and GPU versus CPU-only Workloads, including information on queueing (number of pending Workloads and requested GPUs),
                                  • Status of Nodes in terms of availability and allocated and utilized resources.

                                  The dashboard has a dropdown filter for node pools and Departments. From the dropdown, select one or more node pools. The default setting is all.

                                  The information presented in Analytics can be used in different ways for identifying problems and fixing them. Below are a few examples.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#node-downtime","title":"Node Downtime","text":"

                                  View the overall available resources per Node and identify cases where a Node is down and there was a reduction in the number of available resources.

                                  How to: view the following panel.

                                  Analysis and Suggested actions:

                                  Filter according to time range to understand how long the Node has been down.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#gpu-allocation","title":"GPU Allocation","text":"

                                  Track GPU allocation across time.

                                  How to: view the following panels.

                                  The panel on the right-hand side shows the cluster-wide GPU allocation and utilization versus time, whereas the panels on the left-hand side show the cluster-wide GPU allocation and utilization averaged across the filtered time range.

                                  Analysis and Suggested actions:

                                  If the allocation is too low for a long period, work with users to run more workloads and to better utilize the Cluster.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#track-gpu-utilization","title":"Track GPU utilization","text":"

                                  Track whether Researchers efficiently use the GPU resources they have allocated for themselves.

                                  How to: view the following panel:

                                  Analysis and Suggested actions:

                                  If utilization is too low for a long period, you will want to identify the source of the problem:

                                  • Go to \u201cAverage GPU Allocation & Utilization\u201d
                                  • Look for Projects with large GPU allocations for interactive Workloads or Projects that poorly utilize their training Workloads. Users tend to poorly utilize their GPUs in interactive sessions because of the dev & debug nature of their work which typically is an iterative process with long idle GPU time. On many occasions users also don\u2019t shut down their interactive Workloads, holding their GPUs idle and preventing others from using them.
                                  • Low GPU utilization is due to interactive Workloads being used too frequently - Consider setting time limits for interactive Workloads through the Projects tab or reducing GPU quotas to encourage users to run more training Workloads as opposed to interactive Workloads (note that interactive Workloads cannot use more than the GPU quota assigned to their Project).
                                  • Low GPU utilization is due to users poorly utilizing their GPUs in training sessions - Identify Projects with bad GPU utilization in training Workloads, notify the users, and work with them to improve their code and the way they utilize their GPUs.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#training-vs-interactive-researcher-maturity","title":"Training vs. Interactive -- Researcher maturity","text":"

                                  Track the number of running Workloads and the breakdown into interactive, training, and CPU-only Workloads.

                                  How to: view the following panel:

                                  Analysis and Suggested actions:

                                  We would want to encourage users to run more training Workloads than interactive Workloads, as it is the key to achieving high GPU utilization across the Cluster:

                                  • Training Workloads run to completion and free up their resources automatically when training ends
                                  • Training Workloads can be preempted, queued, and resumed automatically by the Run:ai system according to predefined policies which increases fairness and Cluster utilization.
                                  "},{"location":"platform-admin/performance/dashboard-analysis/#pending-queue-size","title":"Pending Queue Size","text":"

                                  Track the length of the queue of pending Workloads.

                                  How to: view the following panels:

                                  Analysis and Suggested actions:

                                  Consider buying more GPUs:

                                  • When too many Workloads are waiting in the queue for too long.
                                  • With a large number of requested GPUs.
                                  • While the Cluster is fully loaded and well utilized.
                                  "},{"location":"platform-admin/performance/dashboard-analysis/#cpu-memory-utilization","title":"CPU & Memory Utilization","text":"

                                  Track CPU and memory Node utilization and identify times where the load on specific Nodes is high.

                                  How to: view the following panel:

                                  Analysis and Suggested actions:

                                  If the load on specific Nodes is too high, it may cause problems with the proper operation of the Cluster and the way workloads are running.

                                  Consider adding more CPUs, or adding additional CPU-only nodes for Workloads that do only CPU processing.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#multi-cluster-overview-dashboard","title":"Multi-Cluster overview dashboard","text":"

                                  Provides a holistic, aggregated view across Clusters, including information about Cluster and Node utilization, available resources, and allocated resources. With this dashboard, you can identify Clusters that are down or underutilized and go to the Overview of that Cluster to explore further.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#consumption-dashboard","title":"Consumption dashboard","text":"

                                  This dashboard enables users and administrators to view resource consumption across Run:ai services. The dashboard provides views based on configurable filters and timelines, and also provides a costing analysis of GPU, CPU, and memory costs for the system.

                                  The dashboard has 4 tiles for:

                                  • Cumulative GPU allocation per Project or Department
                                  • Cumulative CPU allocation per Project or Department
                                  • Cumulative memory allocation per Project or Department
                                  • Consumption types

                                  Use the dropdown menus at the top of the dashboard to apply filters for:

                                  • Project or department
                                  • Per project (single, multiple, or all)
                                  • Per department (single, multiple, or all)
                                  • Per cluster (single, multiple, or all)

                                  To enable the Consumption Dashboard:

                                  1. Press the General settings icon, then press General.
                                  2. Open the Analytics pane and toggle the Consumption switch to enable the feature.
                                  3. Enter the cost of:
                                    • GPU compute / Hour
                                    • CPU compute / Hour
                                    • CPU memory / Hour

                                  Use the time picker dropdown to select relative time range options and set custom absolute time ranges. You can change the Timezone and fiscal year settings from the time range controls by clicking the Change time settings button.

                                  Note

                                  Dashboard data updates once an hour.

                                  You can change the refresh interval using the refresh interval dropdown.

                                  The dashboard has 2 consumption tables that display the total consumption of resources. Hover over an entry in the table to filter it in or out of the table.

                                  The Total consumption table includes consumption details based on the filters selected. Fields include:

                                  • Project
                                  • Department
                                  • GPU hours
                                  • CPU hours
                                  • Memory hours
                                  • GPU Idle allocated hours\u2014the portion of time the GPUs spend idle from the total allocation hours.
                                  • CPU usage hours\u2014the actual usage time of CPU.
                                  • Memory usage time\u2014the actual usage time of CPU memory.
                                  • GPU cost (only when configured)
                                  • CPU cost (only when configured)
                                  • CPU memory cost (only when configured)
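
                                  As a rough illustration of how the cost columns relate to the configured hourly rates (this is an assumption about the costing model rather than a documented formula): if a project consumed 200 GPU hours and the GPU compute rate is configured as $2.00 per hour, the GPU cost column would show approximately 200 x $2.00 = $400. The CPU and CPU memory cost columns follow the same pattern with their respective configured rates.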

                                  The Total department consumption table includes consumption details for each department, or details for departments selected in the filters. Fields include:

                                  • Department
                                  • GPU hours
                                  • CPU hours
                                  • Memory hours
                                  • GPU Idle allocated hours\u2014the portion of time the GPUs spend idle from the total allocation hours.
                                  • CPU usage hours\u2014the actual usage time of CPU.
                                  • Memory usage time\u2014the actual usage time of CPU memory.
                                  • GPU cost (only when configured)
                                  • CPU cost (only when configured)
                                  • CPU memory cost (only when configured)

                                  The dashboard has a graph of the GPU allocation over time.

                                  The dashboard has a graph of the Project over-quota GPU consumption.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#quota-management-dashboard","title":"Quota management dashboard","text":"

                                  The Quota management dashboard provides an efficient means to monitor and manage resource utilization within the AI cluster. The dashboard is divided into sections with essential metrics and data visualizations to identify resource usage patterns, potential bottlenecks, and areas for optimization. The sections of the dashboard include:

                                  • Add Filter
                                  • Quota / Total
                                  • Allocated / Quota
                                  • Pending workloads
                                  • Quota by node pool
                                  • Allocation by node pool
                                  • Pending workloads by node pool
                                  • Departments with lowest allocation by node pool
                                  • Projects with lowest allocation ratio by node pool
                                  • Over time allocation / quota
                                  "},{"location":"platform-admin/performance/dashboard-analysis/#add-filter","title":"Add Filter","text":"

                                  Use the Add Filter dropdown to select filters for the dashboard. The filters will change the data shown on the dashboard. Available filters are:

                                  • Departments
                                  • Projects
                                  • Nodes

                                  Select a filter from the dropdown, then select an item from the list, and press Apply.

                                  Note

                                  You can create a filter with multiple categories, but you can use each category and item only once.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#quota-total","title":"Quota / Total","text":"

                                  This section shows the number of GPUs that are in the quota based on the filter selection. The quota of GPUs is the number of GPUs that are reserved for use.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#allocated-quota","title":"Allocated / Quota","text":"

                                  This section shows the number of GPUs that are allocated based on the filter selection. Allocated GPUs are the number of GPUs that are being used.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#pending-workloads","title":"Pending workloads","text":"

                                  This section shows the number of workloads that are pending based on the filter selection. Pending workloads are workloads that have not started.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#quota-by-node-pool","title":"Quota by node pool","text":"

                                  This section shows the quota of GPUs by node pool based on the filter. The quota is the number of GPUs that are reserved for use. You can drill down into the data in this section by pressing on the graph or the link at the bottom of the section.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#allocation-by-node-pool","title":"Allocation by node pool","text":"

                                  This section shows the allocation of GPUs by node pool based on the filter. The allocation is the number of GPUs that are being used. You can drill down into the data in this section by pressing on the graph or the link at the bottom of the section.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#pending-workloads-by-node-pool","title":"Pending workloads by node pool","text":"

                                  This section shows the number of pending workloads by node pool. You can drill down into the data in this section by pressing on the graph or the link at the bottom of the section.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#departments-with-lowest-allocation-by-node-pool","title":"Departments with lowest allocation by node pool","text":"

                                  This section shows the departments with the lowest allocation of GPUs by percentage relative to the total number of GPUs.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#projects-with-lowest-allocation-ratio-by-node-pool","title":"Projects with lowest allocation ratio by node pool","text":"

                                  This section shows the projects with the lowest allocation of GPUs by percentage relative to the total number of GPUs.

                                  "},{"location":"platform-admin/performance/dashboard-analysis/#over-time-allocation-quota","title":"Over time allocation / quota","text":"

                                  This section shows the allocation of GPUs from the quota over a period of time.

                                  "},{"location":"platform-admin/performance/reports/","title":"Reports","text":"

                                  This article explains the procedure of managing reports in Run:ai.

                                  Reports allow users to access and organize large amounts of data in a clear, CSV-formatted layout. They enable users to monitor resource consumption, analyze trends, and make data-driven decisions to optimize their AI workloads effectively.

                                  Note

                                  The Reports feature is enabled by default for SaaS tenants. To enable the feature for self-hosted tenants, additional configuration must be added. See Enabling Reports for self-hosted accounts.

                                  "},{"location":"platform-admin/performance/reports/#report-types","title":"Report types","text":"

                                  Currently, only "Consumption Reports" are available, which provide insights into the consumption of resources such as GPU, CPU, and CPU memory across organizational units.

                                  "},{"location":"platform-admin/performance/reports/#reports-table","title":"Reports table","text":"

                                  The Reports table can be found under Analytics in the Run:ai platform.

                                  The Reports table provides a list of all the reports defined in the platform and allows you to manage them.

                                  Users are able to access the reports they have generated themselves. Users with project viewing permissions throughout the tenant can access all reports within the tenant.

                                  The Reports table comprises the following columns:

                                  • Report - The name of the report
                                  • Description - The description of the report
                                  • Status - The different lifecycle phases and representation of the report condition
                                  • Type - The type of the report, e.g., consumption
                                  • Created by - The user who created the report
                                  • Creation time - The timestamp of when the report was created
                                  • Collection period - The period in which the data was collected

                                  "},{"location":"platform-admin/performance/reports/#reports-status","title":"Reports status","text":"

                                  The following table describes the reports' condition and whether they were created successfully:

                                  • Ready - Report is ready and can be downloaded as CSV
                                  • Pending - Report is in the queue and waiting to be processed
                                  • Failed - The report couldn't be created
                                  • Processing... - The report is being created

                                  "},{"location":"platform-admin/performance/reports/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
                                  "},{"location":"platform-admin/performance/reports/#creating-a-new-report","title":"Creating a\u00a0new report","text":"

                                  Before you start, make sure you have a project.

                                  To create a new report:

                                  1. Click +NEW REPORT
                                  2. Enter a name for the report (if the name already exists, you will need to choose a different one)
                                  3. Optional: Provide a description of the report
                                  4. Set the report\u2019s data collection period
                                    • Start date - The date at which the report data commenced
                                    • End date - The date at which the report data concluded
                                  5. Set the report segmentation and filters
                                    • Filters - Filter by project or department name
                                    • Segment by - Data is collected and aggregated based on the segment
                                  6. Click CREATE REPORT
                                  "},{"location":"platform-admin/performance/reports/#deleting-a-report","title":"Deleting a report","text":"
                                  1. Select the report you want to delete
                                  2. Click DELETE
                                  3. On the dialog, click DELETE to confirm
                                  "},{"location":"platform-admin/performance/reports/#downloading-a-report","title":"Downloading a report","text":"

                                  Note

                                  To download, the report must be in status \u201cReady\u201d.

                                  1. Select the report you want to download
                                  2. Click DOWNLOAD CSV
                                  "},{"location":"platform-admin/performance/reports/#using-api","title":"Using API","text":"

                                  To view the available actions, go to the Reports API reference.

                                  "},{"location":"platform-admin/performance/reports/#enabling-reports-for-self-hosted-accounts","title":"Enabling Reports for self-hosted accounts","text":"

                                  Reports must be saved in a storage solution compatible with S3. To activate this feature for self-hosted accounts, the storage needs to be linked to the account. The configuration should be incorporated into two ConfigMap objects within the Control Plane.

                                  1. Edit the runai-backend-org-unit-service ConfigMap:

                                    kubectl edit cm runai-backend-org-unit-service -n runai-backend

                                  2. Add the following lines to the file:

                                    S3_ENDPOINT: <S3_END_POINT_URL>
                                    S3_ACCESS_KEY_ID: <S3_ACCESS_KEY_ID>
                                    S3_ACCESS_KEY: <S3_ACCESS_KEY>
                                    S3_USE_SSL: "true"
                                    S3_BUCKET: <BUCKET_NAME>

                                  3. Edit the runai-backend-metrics-service ConfigMap:

                                    kubectl edit cm runai-backend-metrics-service -n runai-backend

                                  4. Add the following lines to the file:

                                    S3_ENDPOINT: <S3_END_POINT_URL>
                                    S3_ACCESS_KEY_ID: <S3_ACCESS_KEY_ID>
                                    S3_ACCESS_KEY: <S3_ACCESS_KEY>
                                    S3_USE_SSL: "true"

                                  5. In addition, in the same file, under the config.yaml section, add the following right after log_level: \\\"Info\\\"\\n (see the rendered example after these steps):

                                    reports:\\n s3_config:\\n bucket: \\\"<BUCKET_NAME>\\\"\\n\n

                                  6. Restart the deployments:

                                    kubectl rollout restart deployment runai-backend-metrics-service runai-backend-org-unit-service -n runai-backend

                                  7. Refresh the page to see Reports under Analytics in the Run:ai platform.
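
                                  For reference, once the escaped string in step 5 is rendered inside config.yaml, the intended structure is roughly the following (a sketch only; match the indentation already used in your file):

                                    log_level: "Info"
                                    reports:
                                      s3_config:
                                        bucket: "<BUCKET_NAME>"

                                  To check that the ConfigMap values were applied and that the deployments restarted cleanly, standard kubectl commands can be used (these are generic kubectl checks, not Run:ai-specific tooling):

                                    kubectl get cm runai-backend-metrics-service -n runai-backend -o yaml | grep -E "S3_|bucket"
                                    kubectl rollout status deployment runai-backend-metrics-service -n runai-backend
                                    kubectl rollout status deployment runai-backend-org-unit-service -n runai-backend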

                                  "},{"location":"platform-admin/workloads/assets/compute/","title":"Compute Resources","text":"

                                  This article explains what compute resources are and how to create and use them.

                                  Compute resources are one type of workload asset. A compute resource is a template that simplifies how workloads are submitted and can be used by AI practitioners when they submit their workloads.

                                  A compute resource asset is a preconfigured building block that encapsulates all the specifications of compute requirements for the workload including:

                                  • GPU devices and GPU memory
                                  • CPU memory and CPU compute
                                  "},{"location":"platform-admin/workloads/assets/compute/#compute-resource-table","title":"Compute resource table","text":"

                                  The Compute resource table can be found under Workload manager in the Run:ai UI.

                                  The Compute resource table provides a list of all the compute resources defined in the platform and allows you to manage them.

                                  The Compute resource table consists of the following columns:

                                  • Compute resource - The name of the compute resource
                                  • Description - A description of the essence of the compute resource
                                  • GPU devices request per pod - The number of requested physical devices per pod of the workload that uses this compute resource
                                  • GPU memory request per device - The amount of GPU memory per requested device that is granted to each pod of the workload that uses this compute resource
                                  • CPU memory request - The minimum amount of CPU memory per pod of the workload that uses this compute resource
                                  • CPU memory limit - The maximum amount of CPU memory per pod of the workload that uses this compute resource
                                  • CPU compute request - The minimum number of CPU cores per pod of the workload that uses this compute resource
                                  • CPU compute limit - The maximum number of CPU cores per pod of the workload that uses this compute resource
                                  • Scope - The scope of this compute resource within the organizational tree. Click the name of the scope to view the organizational tree diagram
                                  • Workload(s) - The list of workloads associated with the compute resource
                                  • Template(s) - The list of workload templates that use this compute resource
                                  • Created by - The name of the user who created the compute resource
                                  • Creation time - The timestamp of when the compute resource was created
                                  • Last updated - The timestamp of when the compute resource was last updated
                                  • Cluster - The cluster that the compute resource is associated with

                                  "},{"location":"platform-admin/workloads/assets/compute/#workloads-associated-with-the-compute-resource","title":"Workloads associated with the compute resource","text":"

                                  Click one of the values in the Workload(s) column to view the list of workloads and their parameters.

                                  • Workload - The workload that uses the compute resource
                                  • Type - Workspace/Training/Inference
                                  • Status - Represents the workload lifecycle. See the full list of workload status.

                                  "},{"location":"platform-admin/workloads/assets/compute/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
                                  "},{"location":"platform-admin/workloads/assets/compute/#adding-new-compute-resource","title":"Adding new compute resource","text":"

                                  To add a new compute resource:

                                  1. Go to the Compute resource table
                                  2. Click +NEW COMPUTE RESOURCE
                                  3. Select under which cluster to create the compute resource
                                  4. Select a scope
                                  5. Enter a name for the compute resource. The name must be unique.
                                  6. Optional: Provide a description of the essence of the compute resource
                                  7. Set the resource types needed within a single node (the Run:ai scheduler tries to match a single node that complies with the compute resource for each of the workload\u2019s pods)

                                    • GPU

                                      • GPU devices per pod - The number of devices (physical GPUs) per pod (for example, if you requested 3 devices per pod and the running workload using this compute resource consists of 3 pods, there are 9 physical GPU devices used in total)

                                      Note

                                      • When set to zero, the workload using this compute resource neither requests nor uses GPU resources while running
                                      • You can set any number of GPU devices and specify the memory requirement as any portion size (1..100) of a device, or as a memory size value using GB or MB units per device
                                      • GPU memory per device
                                        • Select the memory request format
                                          • % (of device) - Fraction of a GPU device\u2019s memory
                                          • MB (memory size) - An explicit GPU memory unit
                                          • GB (memory size) - An explicit GPU memory unit
                                        • Set the memory Request - The minimum amount of GPU memory that is provisioned per device. This means that any pod of a running workload that uses this compute resource receives this amount of GPU memory for each device the pod utilizes
                                        • Optional: Set the memory Limit - The maximum amount of GPU memory that is provisioned per device. This means that any pod of a running workload that uses this compute resource receives at most this amount of GPU memory for each device the pod utilizes. To set a Limit, first enable the limit toggle. The limit value must be equal to or higher than the request.

                                      Note

                                      • GPU memory limit is disabled by default. If you cannot see the Limit toggle in the compute resource form, then it must be enabled by your Administrator, under General settings \u2192 Resources \u2192 GPU resource optimization
                                      • When a Limit is set and is bigger than the Request, the scheduler allows each pod to reach the maximum amount of GPU memory in an opportunistic manner (only upon availability).
                                      • If the GPU Memory Limit is bigger than the Request, the pod is prone to be killed by the Run:ai toolkit (out of memory signal). The greater the difference between the GPU memory used and the request, the higher the risk of being killed
                                      • If GPU resource optimization is turned off, the minimum and maximum are in fact equal
                                    • CPU

                                      • CPU compute per pod
                                        • Select the units for the CPU compute (Cores / Millicores)
                                        • Set the CPU compute Request - the minimum amount of CPU compute that is provisioned per pod. This means that any pod of a running workload that uses this compute resource, receives this amount of CPU compute for each pod.
                                        • Optional: Set the CPU compute Limit - The maximum amount of CPU compute that is provisioned per pod. This means that any pod of a running workload that uses this compute resource, receives at most this amount of CPU compute. To set a Limit, first enable the limit toggle. The limit value must be equal to or higher than the request. By default, the limit is set to \u201cUnlimited\u201d - which means that the pod may consume all the node's free CPU compute resources.
                                      • CPU memory per pod
                                        • Select the units for the CPU memory (MB / GB)
                                        • Set the CPU memory Request - The minimum amount of CPU memory that is provisioned per pod. This means that any pod of a running workload that uses this compute resource, receives this amount of CPU memory for each pod.
                                        • Optional: Set the CPU memory Limit - The maximum amount of CPU memory that is provisioned per pod. This means that any pod of a running workload that uses this compute resource, receives at most this amount of CPU memory. To set a Limit, first enable the limit toggle. The limit value must be equal to or higher than the request. By default, the limit is set to \u201cUnlimited\u201d - Meaning that the pod may consume all the node's free CPU memory resources.

                                      Note

                                      If the CPU Memory Limit is bigger than the Request, the pod is prone to be killed by the operating system (out of memory signal). The greater the difference between the CPU memory used and the request, the higher the risk of being killed.

                                  8. Optional: More settings

                                    • Increase shared memory size - When enabled, the shared memory size available to the pod is increased from the default 64MB to the node's total available memory or the CPU memory limit, if set above.
                                    • Set extended resource(s) - Click +EXTENDED RESOURCES to add resource/quantity pairs. For more information on how to set extended resources, see the Extended resources and Quantity guides
                                  9. Click CREATE COMPUTE RESOURCE

                                    Note

                                    It is also possible to add compute resources directly when creating a specific Workspace, training or inference workload.

                                  "},{"location":"platform-admin/workloads/assets/compute/#editing-a-compute-resource","title":"Editing a compute resource","text":"

                                  To edit a compute resource:

                                  1. Select the compute resource you want to edit
                                  2. Click Edit
                                  3. Click SAVE COMPUTE RESOURCE

                                  Note

                                  Workloads that are already bound to this asset are not affected.

                                  "},{"location":"platform-admin/workloads/assets/compute/#copying-a-compute-resource","title":"Copying a compute resource","text":"

                                  To make a copy of an existing compute resource:

                                  1. Select the compute resource you want to copy
                                  2. Click MAKE A COPY
                                  3. Enter a name for the compute resource. The name must be unique.
                                  4. Update the compute resource
                                  5. Click CREATE COMPUTE RESOURCE
                                  "},{"location":"platform-admin/workloads/assets/compute/#deleting-a-compute-resource","title":"Deleting a compute resource","text":"
                                  1. Select the compute resource you want to delete
                                  2. Click DELETE
                                  3. On the dialog, click DELETE to confirm

                                  Note

                                  Workloads that are already bound to this asset are not affected.

                                  "},{"location":"platform-admin/workloads/assets/compute/#using-api","title":"Using API","text":"

                                  Go to the Compute resources API reference to view the available actions

                                  "},{"location":"platform-admin/workloads/assets/credentials/","title":"Credentials","text":"

                                  This article explains what credentials are and how to create and use them.

                                  Credentials are a workload asset that simplifies the complexities of Kubernetes secrets. They store and mask sensitive access information, such as passwords, tokens, and access keys, which are necessary for gaining access to various resources.

                                  Credentials are crucial for the security of AI workloads and the resources they require, as they restrict access to authorized users, verify identities, and ensure secure interactions. By enforcing the protection of sensitive data, credentials help organizations comply with industry regulations, fostering a secure environment overall.

                                  Essentially, credentials enable AI practitioners to access relevant protected resources, such as private data sources and Docker images, thereby streamlining the workload submission process.

                                  "},{"location":"platform-admin/workloads/assets/credentials/#credentials-table","title":"Credentials table","text":"

                                  The Credentials table can be found under Workload manager in the Run:ai User interface.

                                  The Credentials table provides a list of all the credentials defined in the platform and allows you to manage them.

                                  The Credentials table comprises the following columns:

                                  • Credentials - The name of the credentials
                                  • Description - A description of the credentials
                                  • Type - The type of credentials, e.g., Docker registry
                                  • Status - The different lifecycle phases and representation of the credentials' condition
                                  • Scope - The scope of these credentials within the organizational tree. Click the name of the scope to view the organizational tree diagram
                                  • Kubernetes name - The unique Kubernetes name of the credentials, as it appears in the cluster
                                  • Environment(s) - The environment(s) that are associated with the credentials
                                  • Data source(s) - The private data source(s) that are accessed using the credentials
                                  • Created by - The user who created the credentials
                                  • Creation time - The timestamp of when the credentials were created
                                  • Cluster - The cluster with which the credentials are associated

                                  "},{"location":"platform-admin/workloads/assets/credentials/#credentials-status","title":"Credentials status","text":"

                                  The following table describes the credentials\u2019 condition and whether they were created successfully for the selected scope.

                                  • No issues found - No issues were found while creating the credentials (this status may change while propagating the credentials to the selected scope)
                                  • Issues found - Issues found while propagating the credentials
                                  • Issues found - Failed to access the cluster
                                  • Creating… - Credentials are being created
                                  • Deleting… - Credentials are being deleted
                                  • No status - When the credentials' scope is an account, or the current version of the cluster is not up to date, the status cannot be displayed

                                  "},{"location":"platform-admin/workloads/assets/credentials/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
                                  • Download table - Click MORE and then click \u2018Download as CSV\u2019. Export to CSV is limited to 20,000 rows.
                                  • Refresh - Click REFRESH to update the table with the latest data
                                  "},{"location":"platform-admin/workloads/assets/credentials/#adding-new-credentials","title":"Adding new credentials","text":"

                                  Creating credentials is limited to specific roles.

                                  To add a new credential:

                                  1. Go to the Credentials table:
                                  2. Click +NEW CREDENTIALS
                                  3. Select the credential type from the list. Follow the step-by-step guide for each credential type:
                                  "},{"location":"platform-admin/workloads/assets/credentials/#docker-registry","title":"Docker registry","text":"

                                  These credentials allow users to authenticate and pull images from a Docker registry, enabling access to containerized applications and services.

                                  After the credentials are created, they are used automatically when pulling images.

                                  1. Select a scope.
                                  2. Enter a name for the credential. The name must be unique.
                                  3. Optional: Provide a description of the credentials
                                  4. Set how the credential is created
                                    • Existing secret (in the cluster) This option applies when the purpose is to create credentials based on an existing secret
                                      • Select a secret from the list (The list is empty if no secrets were created in advance)
                                    • New secret (recommended) A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
                                      • Enter the username, password, and Docker registry URL
                                  5. Click CREATE CREDENTIALS

                                  After the credentials are created, check their status to monitor their proper creation across the selected scope.

                                  "},{"location":"platform-admin/workloads/assets/credentials/#access-key","title":"Access key","text":"

                                  These credentials are unique identifiers used to authenticate and authorize access to cloud services or APIs, ensuring secure communication between applications. They typically consist of two parts:

                                  • An access key ID
                                  • A secret access key

                                  The purpose of this credential type is to allow access to restricted data.

                                  1. Select a scope.
                                  2. Enter a name for the credential. The name must be unique.
                                  3. Optional: Provide a description of the credential
                                  4. Set how the credential is created
                                    • Existing secret (in the cluster) This option applies when the purpose is to create credentials based on an existing secret
                                      • Select a secret from the list (The list is empty if no secrets were created in advance)
                                    • New secret (recommended) A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
                                      • Enter the Access key and Access secret
                                  5. Click CREATE CREDENTIALS

                                  After the credentials are created, check their status to monitor their proper creation across the selected scope.

                                  "},{"location":"platform-admin/workloads/assets/credentials/#username-password","title":"Username & password","text":"

                                  These credentials require a username and corresponding password to access various resources, ensuring that only authorized users can log in.

                                  The purpose of this credential type is to allow access to restricted data.

                                  1. Select a scope
                                  2. Enter a name for the credential. The name must be unique.
                                  3. Optional: Provide a description of the credentials
                                  4. Set how the credential is created
                                    • Existing secret (in the cluster) This option applies when the purpose is to create credentials based on an existing secret
                                      • Select a secret from the list (The list is empty if no secrets were created in advance)
                                    • New secret (recommended) A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
                                      • Enter the username and password
                                  5. Click CREATE CREDENTIALS

                                  After the credentials are created, check their status to monitor their proper creation across the selected scope.

                                  "},{"location":"platform-admin/workloads/assets/credentials/#generic-secret","title":"Generic secret","text":"

                                  These credentials are a flexible option that consists of multiple keys & values and can store various sensitive information, such as API keys or configuration data, to be used securely within applications.

                                  The purpose of this credential type is to allow access to restricted data.

                                  1. Select a scope
                                  2. Enter a name for the credential. The name must be unique.
                                  3. Optional: Provide a description of the credentials
                                  4. Set how the credential is created
                                    • Existing secret (in the cluster) This option applies when the purpose is to create credentials based on an existing secret
                                      • Select a secret from the list (The list is empty if no secrets were created in advance)
                                    • New secret (recommended) A new secret is created together with the credentials. New secrets are not added to the list of existing secrets.
                                      • Click +KEY & VALUE - to add key/value pairs to store in the new secret
                                  5. Click CREATE CREDENTIALS
                                  "},{"location":"platform-admin/workloads/assets/credentials/#editing-credentials","title":"Editing credentials","text":"

                                  To rename a credential:

                                  1. Select the credential from the table
                                  2. Click Rename to edit its name and description
                                  "},{"location":"platform-admin/workloads/assets/credentials/#deleting-credentials","title":"Deleting credentials","text":"

                                  To delete a credential:

                                  1. Select the credential you want to delete
                                  2. Click DELETE
                                  3. In the dialog, click DELETE to confirm

                                  Note

                                  Credentials cannot be deleted if they are being used by a workload or a template.

                                  "},{"location":"platform-admin/workloads/assets/credentials/#using-credentials","title":"Using credentials","text":"

                                  You can use credentials (secrets) in various ways within the system:

                                  "},{"location":"platform-admin/workloads/assets/credentials/#access-private-data-sources","title":"Access private data sources","text":"

                                  To access private data sources, attach credentials to data sources of the following types: Git, S3 Bucket

                                  "},{"location":"platform-admin/workloads/assets/credentials/#use-directly-within-the-container","title":"Use directly within the container","text":"

                                  To use the secret directly from within the container, you can choose between the following options:

                                  1. Get the secret mounted to the file system by using the Generic secret data source
                                  2. Get the secret as an environment variable injected into the container. There are two equivalent ways to inject the environment variable.

                                    a. By adding it to the Environment asset. b. By adding it ad-hoc as part of the workload.
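
                                  For illustration only, inside the running container the two options typically surface as a mounted file or as an environment variable. The path and variable name below are placeholders, not values created by Run:ai:

                                    # Secret mounted to the file system through a Generic secret data source (placeholder path)
                                    cat /etc/secrets/my-generic-secret/API_KEY
                                    # Secret injected as an environment variable via the Environment asset or ad-hoc at submission (placeholder name)
                                    echo "$API_KEY"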

                                  "},{"location":"platform-admin/workloads/assets/credentials/#creating-secrets-in-advance","title":"Creating secrets in advance","text":"

                                  Add secrets in advance to be used when creating credentials via the Run:ai UI.

                                  Follow the steps below for each required scope:

                                  Cluster scopeDepartment scopeProject scope
                                  1. Create the secret in the Run:ai namespace (runai)
                                  2. To authorize Run:ai to use the secret, label it: run.ai/cluster-wide: \"true\"
                                  3. Label the secret with the correct credential type:
                                    1. Docker registry - run.ai/resource: \"docker-registry\"
                                    2. Access key - run.ai/resource: \"access-key\"
                                    3. Username and password - run.ai/resource: \"password\"
                                    4. Generic secret - run.ai/resource: \"generic\" \u05bf
                                  1. Create the secret in the Run:ai namespace (runai)
                                  2. To authorize Run:ai to use the secret, label it: run.ai/department: \"<department id>\"
                                  3. Label the secret with the correct credential type:
                                    1. Docker registry - run.ai/resource: \"docker-registry\"
                                    2. Access key - run.ai/resource: \"access-key\"
                                    3. Username and password - run.ai/resource: \"password\"
                                    4. Generic secret - run.ai/resource: \"generic\"
                                  Project scope:
                                  1. Create the secret in the project\u2019s namespace
                                  2. Label the secret with the correct credential type:
                                    1. Docker registry - run.ai/resource: \"docker-registry\"
                                    2. Access key - run.ai/resource: \"access-key\"
                                    3. Username and password - run.ai/resource: \"password\"
                                    4. Generic secret - run.ai/resource: \"generic\"

                                  The secret is now displayed for that scope in the list of existing secrets.
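
                                  For example, a minimal kubectl sketch of the project-scope steps above (the namespace runai-team-a, the secret name, and its contents are hypothetical; Run:ai project namespaces typically follow the runai-<project name> convention):

                                    # Create the secret in the project namespace (hypothetical name, namespace and value)
                                    kubectl create secret generic shared-api-token \
                                      --from-literal=token=REDACTED \
                                      -n runai-team-a

                                    # Label the secret with the matching credential type so it is listed in the Run:ai UI
                                    kubectl label secret shared-api-token run.ai/resource=generic -n runai-team-a

                                    # For the cluster scope, create the secret in the runai namespace instead and also add:
                                    #   kubectl label secret shared-api-token run.ai/cluster-wide=true -n runai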

                                  "},{"location":"platform-admin/workloads/assets/credentials/#using-api","title":"Using API","text":"

                                  To view the available actions, go to the Credentials API reference

                                  "},{"location":"platform-admin/workloads/assets/data-volumes/","title":"Data Volumes","text":"

                                  Data volumes offer a powerful solution for storing, managing, and sharing AI training data within the Run:ai platform. They promote collaboration, simplify data access control, and streamline the AI development lifecycle.

                                  Data volumes are snapshots of datasets stored in Kubernetes Persistent Volume Claims (PVCs). They act as a central repository for training data.

                                  "},{"location":"platform-admin/workloads/assets/data-volumes/#why-use-a-data-volume","title":"Why use a data volume?","text":"
                                  1. Sharing with multiple scopes Unlike other Run:ai data sources, data volumes can be shared across projects, departments, or clusters, encouraging data reuse and collaboration within the organization.
                                  2. Storage saving A single copy of the data can be used across multiple scopes
                                  "},{"location":"platform-admin/workloads/assets/data-volumes/#typical-use-cases","title":"Typical use cases","text":"
                                  1. Sharing large data sets In large organizations, the data is often stored in a remote location, which can be a barrier for large model training. Even if the data is transferred into the cluster, sharing it easily with multiple users is still challenging. Data volumes can help share the data seamlessly, with maximum security and control.
                                  2. Sharing data with colleagues When sharing training results, generated data sets, or other artifacts with team members is needed, data volumes can help make the data available easily.
                                  "},{"location":"platform-admin/workloads/assets/data-volumes/#prerequisites","title":"Prerequisites","text":"

                                  To create a data volume, there must be a project with a PVC in its namespace.

                                  Working with data volumes is currently available using the API. To view the available actions, go to the Data volumes API reference.
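
                                  As a sketch of this prerequisite, a PVC in a project namespace could look like the following (the namespace, storage class, and size are placeholders):

                                    kubectl apply -n runai-team-a -f - <<EOF
                                    apiVersion: v1
                                    kind: PersistentVolumeClaim
                                    metadata:
                                      name: training-data
                                    spec:
                                      accessModes:
                                        - ReadWriteMany
                                      storageClassName: standard    # replace with a storage class available in your cluster
                                      resources:
                                        requests:
                                          storage: 500Gi
                                    EOF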

                                  "},{"location":"platform-admin/workloads/assets/data-volumes/#adding-a-new-data-volume","title":"Adding a new data volume","text":"

                                  Data volume creation is limited to specific roles

                                  "},{"location":"platform-admin/workloads/assets/data-volumes/#adding-scopes-for-a-data-volume","title":"Adding scopes for a data volume","text":"

                                  Data volume sharing (adding scopes) is limited to specific roles

                                  Once created, the data volume is available to its originating project (see the prerequisites above).

                                  Data volumes can be shared with additional scopes in the organization.

                                  "},{"location":"platform-admin/workloads/assets/data-volumes/#who-can-use-a-data-volume","title":"Who can use a data volume?","text":"

                                  Data volumes are used when submitting workloads. Any user, application or SSO group with a role that has permissions to create workloads can also use data volumes.

                                  Researchers can list available data volumes within their permitted scopes for easy selection.

                                  "},{"location":"platform-admin/workloads/assets/datasources/","title":"Data Sources","text":"

                                  This article explains what data sources are and how to create and use them.

                                  Data sources are a type of workload asset and represent a location where data is actually stored. They may represent a remote data location, such as NFS, Git, or S3, or a Kubernetes local resource, such as PVC, ConfigMap, HostPath, or Secret.

                                  This configuration simplifies the mapping of the data into the workload\u2019s file system and handles the mounting process during workload creation for reading and writing. These data sources are reusable and can be easily integrated and used by AI practitioners while submitting workloads across various scopes.

                                  "},{"location":"platform-admin/workloads/assets/datasources/#data-sources-table","title":"Data sources table","text":"

                                  The data sources table can be found under Workload manager in the Run:ai platform.

                                  The data sources table provides a list of all the data sources defined in the platform and allows you to manage them.

                                  The data sources table comprises the following columns:

                                  Column Description Data source The name of the data source Description A description of the data source Type The type of data source connected \u2013 e.g., S3 bucket, PVC, or others Status The different lifecycle phases and representation of the data source condition Scope The scope of the data source within the organizational tree. Click the scope name to view the organizational tree diagram Kubernetes name The data source\u2019s unique Kubernetes name as it appears in the cluster Workload(s) The list of existing workloads that use the data source Template(s) The list of workload templates that use the data source Created by The user who created the data source Creation time The timestamp for when the data source was created Cluster The cluster that the data source is associated with"},{"location":"platform-admin/workloads/assets/datasources/#data-sources-status","title":"Data sources status","text":"

                                  The following table describes the data sources' condition and whether they were created successfully for the selected scope.

                                  Status Description No issues found No issues were found while creating the data source Issues found Issues were found while propagating the data source credentials Issues found The data source couldn\u2019t be created at the cluster Creating\u2026 The data source is being created No status / \u201c-\u201d When the data source\u2019s scope is an account, the current version of the cluster is not up to date, or the asset is not a cluster-syncing entity, the status can\u2019t be displayed"},{"location":"platform-admin/workloads/assets/datasources/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
                                  • Download table - Click MORE and then click \u2018Download as CSV\u2019
                                  • Refresh - Click REFRESH to update the table with the latest data
                                  "},{"location":"platform-admin/workloads/assets/datasources/#adding-a-new-data-source","title":"Adding a new data source","text":"

                                  To create a new data source:

                                  1. Click +NEW DATA SOURCE
                                  2. Select the data source type from the list. Follow the step-by-step guide for each data source type:
                                  "},{"location":"platform-admin/workloads/assets/datasources/#nfs","title":"NFS","text":"

                                  A Network File System (NFS) is a Kubernetes concept used for sharing storage in the cluster among different pods. Like a PVC, the NFS volume\u2019s content remains preserved, even outside the lifecycle of a single pod. However, unlike PVCs, which abstract storage management, NFS provides a method for network-based file sharing. The NFS volume can be pre-populated with data and can be mounted by multiple pod writers simultaneously. At Run:ai, an NFS-type data source is an abstraction that is mapped directly to a Kubernetes NFS volume. This integration allows multiple workloads under various scopes to mount and present the NFS data source.
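
                                  For reference, the Kubernetes NFS volume concept that this data source maps to looks roughly like the following pod snippet (server address, paths, and names are placeholders, not values managed by Run:ai):

                                    kubectl apply -f - <<EOF
                                    apiVersion: v1
                                    kind: Pod
                                    metadata:
                                      name: nfs-example
                                    spec:
                                      containers:
                                        - name: main
                                          image: busybox
                                          command: [\"sleep\", \"3600\"]
                                          volumeMounts:
                                            - name: shared-data
                                              mountPath: /data              # the container path target location
                                      volumes:
                                        - name: shared-data
                                          nfs:
                                            server: 10.0.0.10               # NFS server (host name or IP)
                                            path: /exports/datasets         # NFS path
                                            readOnly: true                  # equivalent of preventing data modification
                                    EOF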

                                  1. Select the cluster under which to create this data source
                                  2. Select a scope
                                  3. Enter a name for the data source. The name must be unique.
                                  4. Optional: Provide a description of the data source
                                  5. Set the data origin
                                    • Enter the NFS server (host name or host IP)
                                    • Enter the NFS path
                                  6. Set the data target location
                                    • Container path
                                  7. Optional: Restrictions
                                    • Prevent data modification - When enabled, the data will be mounted with read-only permissions
                                  8. Click CREATE DATA SOURCE
                                  "},{"location":"platform-admin/workloads/assets/datasources/#pvc","title":"PVC","text":"

                                  A Persistent Volume Claim (PVC) is a Kubernetes concept used for managing storage in the cluster, which can be provisioned by an administrator or dynamically by Kubernetes using a StorageClass. PVCs allow users to request specific sizes and access modes (read/write once, read-only many). Run:ai ensures that data remains consistent and accessible across various scopes and workloads, beyond the lifecycle of individual pods, which is efficient while working with large datasets typically associated with AI projects.

                                  1. Select the cluster under which to create this data source
                                  2. Select a scope
                                  3. Enter a name for the data source. The name must be unique.
                                  4. Optional: Provide a description of the data source
                                  5. Select PVC:
                                    • Existing PVC This option is relevant when the purpose is to create a PVC-type data source based on an existing PVC in the cluster
                                      • Select a PVC from the list - (The list is empty if no existing PVCs were created in advance)
                                    • New PVC - creates a new PVC in the cluster. New PVCs are not added to the Existing PVCs list. When creating a PVC-type data source and selecting the \u2018New PVC\u2019 option, the PVC is immediately created in the cluster (even if no workload has requested this PVC).
                                  6. Select the storage class
                                    • None - Proceed without defining a storage class
                                    • Custom storage class - This option applies when selecting a storage class based on existing storage classes. To add new storage classes to the storage class list, and for additional information, check Kubernetes storage classes
                                  7. Select the access mode(s) (multiple modes can be selected)
                                    • Read-write by one node - The volume can be mounted as read-write by a single node.
                                    • Read-only by many nodes - The volume can be mounted as read-only by many nodes.
                                    • Read-write by many nodes - The volume can be mounted as read-write by many nodes.
                                  8. Set the claim size and its units
                                  9. Select the volume mode
                                    • File system (default) - allows the volume to be mounted as a filesystem, enabling the usage of directories and files.
                                    • Block - exposes the volume as a block storage, which can be formatted or used by applications directly without a filesystem.
                                  10. Set the data target location
                                    • container path
                                  11. Optional: Prevent data modification - When enabled, the data will be mounted with read-only permission.
                                  12. Click CREATE DATA SOURCE

                                  After the data source is created, check its status to monitor its proper creation across the selected scope.
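
                                  For reference, the options above map to standard PVC fields; a new PVC created through such a data source is roughly equivalent to the following (the names, size, and storage class are placeholders):

                                    kubectl apply -n runai-team-a -f - <<EOF
                                    apiVersion: v1
                                    kind: PersistentVolumeClaim
                                    metadata:
                                      name: experiments-pvc
                                    spec:
                                      accessModes:
                                        - ReadWriteOnce             # read-write by one node
                                      storageClassName: standard    # custom storage class option
                                      volumeMode: Filesystem        # or Block
                                      resources:
                                        requests:
                                          storage: 200Gi            # claim size and units
                                    EOF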

                                  "},{"location":"platform-admin/workloads/assets/datasources/#s3-bucket","title":"S3 Bucket","text":"

                                  The S3 bucket data source enables the mapping of a remote S3 bucket into the workload\u2019s file system. Similar to a PVC, this mapping remains accessible across different workload executions, extending beyond the lifecycle of individual pods. However, unlike PVCs, data stored in an S3 bucket resides remotely, which may lead to decreased performance during the execution of heavy machine learning workloads. As part of the Run:ai connection to the S3 bucket, you can create credentials in order to access and map private buckets.

                                  1. Select the cluster under which to create this data source
                                  2. Select a scope
                                  3. Enter a name for the data source. The name must be unique.
                                  4. Optional: Provide a description of the data source
                                  5. Set the data origin
                                    • Set the S3 service URL
                                    • Select the credentials
                                      • None - for public buckets
                                      • Credential names - This option is relevant for private buckets based on existing credentials that were created for the scope. To add new credentials to the credentials list, and for additional information, check the Credentials article.
                                    • Enter the bucket name
                                  6. Set the data target location
                                    • container path
                                  7. Click CREATE DATA SOURCE

                                  After a private data source is created, check its status to monitor its proper creation across the selected scope.

                                  "},{"location":"platform-admin/workloads/assets/datasources/#git","title":"Git","text":"

                                  A Git-type data source is a Run:ai integration, that enables code to be copied from a Git branch into a dedicated folder in the container. It is mainly used to provide the workload with the latest code repository. As part of the integration with Git, in order to access private repositories, you can add predefined credentials to the data source mapping.

                                  1. Select the cluster under which to create this data source
                                  2. Select a scope
                                  3. Enter a name for the data source. The name must be unique.
                                  4. Optional: Provide a description of the data source
                                  5. Set the data origin
                                    • Set the Repository URL
                                    • Set the Revision (branch, tag, or hash) - If left empty, the 'HEAD' (latest) revision is used
                                    • Select the credentials
                                      • None - for public repositories
                                      • Credential names - This option applies to private repositories based on existing credentials that were created for the scope. To add new credentials to the credentials list, and for additional information, check the Credentials article.
                                  6. Set the data target location
                                    • container path
                                  7. Click CREATE DATA SOURCE

                                  After a private data source is created, check its status to monitor its proper creation across the selected scope.

                                  "},{"location":"platform-admin/workloads/assets/datasources/#host-path","title":"Host path","text":"

                                  A Host path volume is a Kubernetes concept that enables mounting a host path file or a directory on the workload\u2019s file system. Like a PVC, the host path volume\u2019s data persists across workloads under various scopes. It also enables data serving from the hosting node.
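
                                  For reference, the Kubernetes hostPath volume behind this data source looks roughly like the following pod snippet (paths and names are placeholders):

                                    kubectl apply -f - <<EOF
                                    apiVersion: v1
                                    kind: Pod
                                    metadata:
                                      name: hostpath-example
                                    spec:
                                      containers:
                                        - name: main
                                          image: busybox
                                          command: [\"sleep\", \"3600\"]
                                          volumeMounts:
                                            - name: local-data
                                              mountPath: /data            # container path
                                              readOnly: true              # equivalent of preventing data modification
                                      volumes:
                                        - name: local-data
                                          hostPath:
                                            path: /mnt/datasets           # host path on the node
                                            type: Directory
                                    EOF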

                                  1. Select the cluster under which to create this data source
                                  2. Select a scope
                                  3. Enter a name for the data source. The name must be unique.
                                  4. Optional: Provide a description of the data source
                                  5. Set the data origin
                                    • host path
                                  6. Set the data target location
                                    • container path
                                  7. Optional: Prevent data modification - When enabled, the data will be mounted with read-only permissions.
                                  8. Click CREATE DATA SOURCE
                                  "},{"location":"platform-admin/workloads/assets/datasources/#configmap","title":"ConfigMap","text":"

                                  A ConfigMap data source is a Run:ai abstraction for the Kubernetes ConfigMap concept. The ConfigMap is used mainly for storage that can be mounted on the workload container for non-confidential data. It is usually represented in key-value pairs (e.g., environment variables, command-line arguments etc.). It allows you to decouple environment-specific system configurations from your container images, so that your applications are easily portable. ConfigMaps must be created on the cluster prior to being used within the Run:ai system.
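
                                  Since the ConfigMap must already exist in the cluster, here is a minimal kubectl sketch of creating one in advance (the name, namespace, and keys are placeholders):

                                    # Create a ConfigMap in the project namespace (hypothetical name and values)
                                    kubectl create configmap training-config \
                                      --from-literal=LOG_LEVEL=info \
                                      --from-literal=BATCH_SIZE=32 \
                                      -n runai-team-a

                                    # Verify it exists so it can be selected when creating the data source
                                    kubectl get configmap training-config -n runai-team-a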

                                  1. Select the cluster under which to create this data source
                                  2. Select a scope
                                  3. Enter a name for the data source. The name must be unique.
                                  4. Optional: Provide a description of the data source
                                  5. Set the data origin
                                    • Select the ConfigMap name (The list is empty if no existing ConfigMaps were created in advance).
                                  6. Set the data target location
                                    • container path
                                  7. Click CREATE DATA SOURCE
                                  "},{"location":"platform-admin/workloads/assets/datasources/#secret","title":"Secret","text":"

                                  A secret-type data source enables the mapping of a credential into the workload\u2019s file system. Credentials are a workload asset that simplify the complexities of Kubernetes Secrets. The credentials mask sensitive access information, such as passwords, tokens, and access keys, which are necessary for gaining access to various resources.

                                  1. Select the cluster under which to create this data source
                                  2. Select a scope
                                  3. Enter a name for the data source. The name must be unique.
                                  4. Optional: Provide a description of the data source
                                  5. Set the data origin
                                    • Select the credentials. To add new credentials, and for additional information, check the Credentials article.
                                  6. Set the data target location
                                    • container path
                                  7. Click CREATE DATA SOURCE

                                  After the data source is created, check its status to monitor its proper creation across the selected scope.
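
                                  For illustration, once the workload runs, the credential is exposed as files under the chosen container path (the path /etc/creds and the key name below are hypothetical):

                                    # Inside the workload container
                                    ls /etc/creds            # each key of the underlying secret appears as a file
                                    cat /etc/creds/token     # hypothetical key name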

                                  Note

                                  It is also possible to add data sources directly when creating a specific workspace, training or inference workload

                                  "},{"location":"platform-admin/workloads/assets/datasources/#editing-a-data-source","title":"Editing a data source","text":"

                                  To edit a data source:

                                  1. Select the data source from the table
                                  2. Click Rename to provide it with a new name
                                  3. Click Copy & Edit to make any changes to the data source
                                  "},{"location":"platform-admin/workloads/assets/datasources/#deleting-a-data-source","title":"Deleting a data source","text":"

                                  To delete a data source:

                                  1. Select the data source you want to delete
                                  2. Click DELETE
                                  3. Confirm you want to delete the data source

                                  Note

                                  It is not possible to delete a data source being used by an existing workload or template.

                                  "},{"location":"platform-admin/workloads/assets/datasources/#using-api","title":"Using API","text":"

                                  To view the available actions, go to the Data sources API reference.

                                  "},{"location":"platform-admin/workloads/assets/environments/","title":"Environments","text":"

                                  This article explains what environments are and how to create and use them.

                                  Environments are one type of workload asset. An environment consists of a configuration that simplifies how workloads are submitted and can be used by AI practitioners when they submit their workloads.

                                  An environment asset is a preconfigured building block that encapsulates aspects for the workload such as:

                                  • Container image and container configuration
                                  • Tools and connections
                                  • The type of workload it serves
                                  "},{"location":"platform-admin/workloads/assets/environments/#environments-table","title":"Environments table","text":"

                                  The Environments table can be found under Workload manager in the Run:ai platform.

                                  The Environments table provides a list of all the environments defined in the platform and allows you to manage them.

                                  The Environments table consists of the following columns:

                                  Column Description Environment The name of the environment Description A description of the environment Scope The scope of this environment within the organizational tree. Click the name of the scope to view the organizational tree diagram Image The application or service to be run by the workload Workload Architecture This can be either standard for running workloads on a single node or distributed for running distributed workloads on multiple nodes Tool(s) The tools and connection types the environment exposes Workload(s) The list of existing workloads that use the environment Workload types The workload types that can use the environment (Workspace / Training / Inference) Template(s) The list of workload templates that use this environment Created by The user who created the environment. By default, the Run:ai UI comes with preinstalled environments created by Run:ai Creation time The timestamp of when the environment was created Last updated The timestamp of when the environment was last updated Cluster The cluster with which the environment is associated"},{"location":"platform-admin/workloads/assets/environments/#tools-associated-with-the-environment","title":"Tools associated with the environment","text":"

                                  Click one of the values in the tools column to view the list of tools and their connection type.

                                  Column Description Tool name The name of the tool or application the AI practitioner can set up within the environment. Connection type The method by which you can access and interact with the running workload. It's essentially the \"doorway\" through which you can reach and use the tools the workload provides (e.g., node port, external URL, etc.)"},{"location":"platform-admin/workloads/assets/environments/#workloads-associated-with-the-environment","title":"Workloads associated with the environment","text":"

                                  Click one of the values in the Workload(s) column to view the list of workloads and their parameters.

                                  Column Description Workload The workload that uses the environment Type The workload type (Workspace/Training/Inference) Status Represents the workload lifecycle. See the full list of workload status"},{"location":"platform-admin/workloads/assets/environments/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
                                  • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
                                  "},{"location":"platform-admin/workloads/assets/environments/#environments-created-by-runai","title":"Environments created by Run:ai","text":"

                                  When installing Run:ai, you automatically get the environments created by Run:ai to ease up the onboarding process and support different use cases out of the box. These environments are created at the scope of the account.

                                  Environment Image Jupyter-lab jupyter/scipy-notebook jupyter-tensorboard gcr.io/run-ai-demo/jupyter-tensorboard tensorboard tensorflow/tensorflow:latest llm-server runai.jfrog.io/core-llm/runai-vllm:v0.6.4-0.10.0 chatbot-ui runai.jfrog.io/core-llm/llm-app gpt2 runai.jfrog.io/core-llm/quickstart-inference:gpt2-cpu"},{"location":"platform-admin/workloads/assets/environments/#adding-a-new-environment","title":"Adding a new environment","text":"

                                  Environment creation is limited to specific roles

                                  To add a new environment:

                                  1. Go to the Environments table
                                  2. Click +NEW ENVIRONMENT
                                  3. Select under which cluster to create the environment
                                  4. Select a scope
                                  5. Enter a name for the environment. The name must be unique.
                                  6. Optional: Provide a description of the essence of the environment
                                  7. Enter the Image URL. If a token or secret is required to pull the image, it is possible to create it via credentials of type docker registry (see the example after these steps). These credentials are used automatically when the image is pulled (which happens when the workload is submitted)
                                  8. Set the image pull policy - the condition for when to pull the image from the registry
                                  9. Set the workload architecture:
                                    • Standard Only standard workloads can use the environment. A standard workload consists of a single process.
                                    • Distributed Only distributed workloads can use the environment. A distributed workload consists of multiple processes working together. These processes can run on different nodes.
                                    • Select a framework from the list.
                                  10. Set the workload type:
                                    • Workspace
                                    • Training
                                    • Inference
                                    • When inference is selected, define the endpoint of the model by providing both the protocol and the container\u2019s serving port
                                  11. Optional: Set the connection for your tool(s). The tools must be configured in the image. When submitting a workload using the environment, it is possible to connect to these tools
                                    • Select the tool from the list (the available tools vary, from IDEs to experiment tracking and more, including a custom tool of your choice)
                                    • Select the connection type
                                      • External URL
                                        • Auto generate A unique URL is automatically created for each workload using the environment
                                        • Custom URL The URL is set manually
                                      • Node port
                                        • Auto generate A unique port is automatically exposed for each workload using the environment
                                        • Custom port Set the port manually
                                      • Set the container port
                                  12. Optional: Set a command and arguments for the container running the pod
                                    • When no command is added, the default command of the image is used (the image entrypoint)
                                    • The command can be modified while submitting a workload using the environment
                                    • The argument(s) can be modified while submitting a workload using the environment
                                  13. Optional: Set the environment variable(s)
                                    • Click +ENVIRONMENT VARIABLE
                                    • Enter a name
                                    • Select the source for the environment variable
                                    • Custom
                                      • Enter a value
                                      • Leave empty
                                      • Add instructions for the expected value if any
                                    • Credentials - Select existing credentials as the environment variable
                                      • Select a credential name To add new credentials to the credentials list, and for additional information, see Credentials.
                                      • Select a secret key
                                    • The environment variables can be modified and new variables can be added while submitting a workload using the environment
                                  14. Optional: Set the container\u2019s working directory to define where the container\u2019s process starts running. When left empty, the default directory is used.
                                  15. Optional: Set where the UID, GID and supplementary groups are taken from, this can be:
                                    • From the image
                                    • From the IdP token (only available in SSO installations)
                                    • Custom (manually set) - decide whether the submitter can modify these values upon submission.
                                    • Set the User ID (UID), Group ID (GID) and the supplementary groups that can run commands in the container
                                      • Enter UID
                                      • Enter GID
                                      • Add Supplementary groups (multiple groups can be added, separated by commas)
                                      • Disable \u2018Allow the values above to be modified within the workload\u2019 if you want the values above to be used as the default
                                  16. Optional: Select Linux capabilities - Grant certain privileges to a container without granting all the privileges of the root user.
                                  17. Click CREATE ENVIRONMENT
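
                                  As referenced in step 7, if you prefer to prepare the secret in advance (see Creating secrets in advance in the Credentials article), a kubectl sketch of a docker registry credential could look like this (the registry address, namespace, and account details are placeholders):

                                    # Hypothetical docker-registry secret in the project namespace
                                    kubectl create secret docker-registry private-registry-creds \
                                      --docker-server=registry.example.com \
                                      --docker-username=ci-bot \
                                      --docker-password=REDACTED \
                                      -n runai-team-a

                                    # Label it so Run:ai recognizes it as a docker registry credential
                                    kubectl label secret private-registry-creds run.ai/resource=docker-registry -n runai-team-a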

                                  Note

                                  It is also possible to add environments directly when creating a specific workspace, training or inference workload.

                                  "},{"location":"platform-admin/workloads/assets/environments/#editing-an-environment","title":"Editing an environment","text":"

                                  To edit an environment:

                                  1. Select the environment you want to edit
                                  2. Click Edit
                                  3. Click SAVE ENVIRONMENT

                                  Note

                                  • Workloads that are already using this asset will not be affected.
                                  • llm-server and chatbot-ui environments cannot be edited.
                                  "},{"location":"platform-admin/workloads/assets/environments/#copying-an-environment","title":"Copying an environment","text":"

                                  To make a copy of an existing environment:

                                  1. Select the environment you want to copy
                                  2. Click MAKE A COPY
                                  3. Enter a name for the environment. The name must be unique.
                                  4. Update the environment
                                  5. Click CREATE ENVIRONMENT
                                  "},{"location":"platform-admin/workloads/assets/environments/#deleting-an-environment","title":"Deleting an environment","text":"

                                  To delete an environment:

                                  1. Select the environment you want to delete
                                  2. Click DELETE
                                  3. On the dialog, click DELETE to confirm

                                  Note

                                  Workloads that are already using this asset will not be affected.

                                  "},{"location":"platform-admin/workloads/assets/environments/#using-api","title":"Using API","text":"

                                  Go to the Environment API reference to view the available actions

                                  "},{"location":"platform-admin/workloads/assets/overview/","title":"Overview","text":"

                                  Workload assets enable organizations to:

                                  • Create and reuse preconfigured setup for code, data, storage and resources to be used by AI practitioners to simplify the process of submitting workloads
                                  • Share the preconfigured setup with a wide audience of AI practitioners with similar needs

                                  Note

                                  • The creation of assets is possible only via API and the Run:ai UI
                                  • The submission of workloads using assets is possible only via the Run:ai UI
                                  "},{"location":"platform-admin/workloads/assets/overview/#workload-asset-types","title":"Workload asset types","text":"

                                  There are four workload asset types used by the workload:

                                  • Environments The container image, tools and connections for the workload
                                  • Data sources The type of data, its origin and the target storage location such as PVCs or cloud storage buckets where datasets are stored
                                  • Compute resources The compute specification, including GPU and CPU compute and memory
                                  • Credentials The secrets to be used to access sensitive data, services, and applications such as docker registry or S3 buckets
                                  "},{"location":"platform-admin/workloads/assets/overview/#asset-scope","title":"Asset scope","text":"

                                  When a workload asset is created, a scope is required. The scope defines who in the organization can view and/or use the asset.

                                  Note

                                  When an asset is created via API, the scope can be the entire account; this is currently an experimental feature.

                                  "},{"location":"platform-admin/workloads/assets/overview/#who-can-create-an-asset","title":"Who can create an asset?","text":"

                                  Any subject (user, application, or SSO group) with a role that has permissions to Create an asset, can do so within their scope.

                                  "},{"location":"platform-admin/workloads/assets/overview/#who-can-use-an-asset","title":"Who can use an asset?","text":"

                                  Assets are used when submitting workloads. Any subject (user, application or SSO group) with a role that has permissions to Create workloads, can also use assets.

                                  "},{"location":"platform-admin/workloads/assets/overview/#who-can-view-an-asset","title":"Who can view an asset?","text":"

                                  Any subject (user, application, or SSO group) with a role that has permission to View an asset, can do so within their scope.

                                  "},{"location":"platform-admin/workloads/assets/templates/","title":"Workspace Templates","text":"

                                  This article explains the procedure to manage templates.

                                  A template is a pre-set configuration that is used to quickly configure and submit workloads using existing assets. A template consists of all the assets a workload needs, allowing researchers to submit a workload in a single click, or make subtle adjustments to differentiate them from each other.

                                  "},{"location":"platform-admin/workloads/assets/templates/#workspace-templates-table","title":"Workspace templates table","text":"

                                  The Templates table can be found under Workload manager in the Run:ai User interface.

                                  The Templates table provides a list of all the templates defined in the platform, and allows you to manage them.

                                  Flexible Management

                                  It is also possible to manage templates directly for a specific user, application, project, or department.

                                  The Templates table consists of the following columns:

                                  Column Description Scope The scope to which the subject has access. Click the name of the scope to see the scope and its subordinates Environment The name of the environment related to the workspace template Compute resource The name of the compute resource connected to the workspace template Data source(s) The name of the data source(s) connected to the workspace template Created by The subject that created the template Creation time The timestamp for when the template was created Cluster The cluster name containing the template"},{"location":"platform-admin/workloads/assets/templates/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
                                  • Download table - Click MORE and then click Download as CSV. Export to CSV is limited to 20,000 rows.
                                  • Refresh (optional) - Click REFRESH to update the table with the latest data
                                  • Show/Hide details (optional) - Click to view additional information on the selected row
                                  "},{"location":"platform-admin/workloads/assets/templates/#adding-a-new-workspace-template","title":"Adding a new workspace template","text":"

                                  To add a new template:

                                  1. Click +NEW TEMPLATE
                                  2. Set the scope for the template
                                  3. Enter a name for the template
                                  4. Select the environment for your workload
                                  5. Select the node resources needed to run your workload - or - Click +NEW COMPUTE RESOURCE

                                  6. Set the volume needed for your workload

                                  7. Create a new data source
                                  8. Set auto-deletion, annotations and labels, as required
                                  9. Click CREATE TEMPLATE
                                  "},{"location":"platform-admin/workloads/assets/templates/#editing-a-template","title":"Editing a template","text":"

                                  To edit a template:

                                  1. Select the template from the table
                                  2. Click Rename to provide it with a new name
                                  3. Click Copy & Edit to make any changes to the template
                                  "},{"location":"platform-admin/workloads/assets/templates/#deleting-a-template","title":"Deleting a template","text":"

                                  To delete a template:

                                  1. Select the template you want to delete
                                  2. Click DELETE
                                  3. Confirm you want to delete the template
                                  "},{"location":"platform-admin/workloads/assets/templates/#using-api","title":"Using API**","text":"

                                  Go to the Workload template API reference to view the available actions

                                  "},{"location":"platform-admin/workloads/overviews/introduction-to-workloads/","title":"Introduction to Workloads","text":"

                                  Run:ai enhances visibility and simplifies management by monitoring, presenting, and orchestrating all AI workloads in the clusters it is installed on. Workloads are the fundamental building blocks for consuming resources, enabling AI practitioners such as researchers, data scientists and engineers to efficiently support the entire life cycle of an AI initiative.

                                  "},{"location":"platform-admin/workloads/overviews/introduction-to-workloads/#workloads-across-the-ai-lifecycle","title":"Workloads across the AI lifecycle","text":"

                                  A typical AI initiative progresses through several key stages, each with distinct workloads and objectives. With Run:ai, research and engineering teams can host and manage all these workloads to achieve the following:

                                  • Data preparation: Aggregating, cleaning, normalizing, and labeling data to prepare for training.
                                  • Training: Conducting resource-intensive model development and iterative performance optimization.
                                  • Fine-tuning: Adapting pre-trained models to domain-specific data sets while balancing efficiency and performance.
                                  • Inference: Deploying models for real-time or batch predictions with a focus on low latency and high throughput.
                                  • Monitoring and optimization: Ensuring ongoing performance by addressing data drift, usage patterns, and retraining as needed.
                                  "},{"location":"platform-admin/workloads/overviews/introduction-to-workloads/#what-is-a-workload","title":"What is a workload?","text":"

                                  A workload runs in the cluster, is associated with a namespace, and operates to fulfill its targets, whether that is running to completion for a batch job, allocating resources for experimentation in an integrated development environment (IDE)/notebook, or serving inference requests in production.

                                  The workload, defined by the AI practitioner, consists of:

                                  • Container images: This includes the application, its dependencies, and the runtime environment.
                                  • Compute resources: CPU, GPU, and RAM to execute efficiently and address the workload\u2019s needs.
                                  • Data sets: The data needed for processing, such as training data sets or input from external databases.
                                  • Credentials: The access to certain data sources or external services, ensuring proper authentication and authorization.
                                  "},{"location":"platform-admin/workloads/overviews/introduction-to-workloads/#workload-scheduling-and-orchestration","title":"Workload scheduling and orchestration","text":"

                                  Run:ai\u2019s core mission is to optimize AI resource usage at scale. This is achieved through efficient scheduling and orchestrating of all cluster workloads using the Run:ai Scheduler. The Scheduler allows the prioritization of workloads across different departments and projects within the organization at large scales, based on the resource distribution set by the system administrator.

                                  "},{"location":"platform-admin/workloads/overviews/introduction-to-workloads/#runai-and-third-party-workloads","title":"Run:ai and third-party workloads","text":"
                                  • Run:ai workloads: These workloads are submitted via the Run:ai platform. They are represented by Kubernetes Custom Resource Definitions (CRDs) and APIs. When using Run:ai workloads, a complete Workload and Scheduling Policy solution is offered for administrators to ensure optimizations, governance and security standards are applied.
                                  • Third-party workloads: These workloads are submitted via third-party applications that use the Run:ai Scheduler. The Run:ai platform manages and monitors these workloads. They enable seamless integrations with external tools, allowing teams and individuals flexibility.
                                  "},{"location":"platform-admin/workloads/overviews/introduction-to-workloads/#levels-of-support","title":"Levels of support","text":"

                                  Different types of workloads have different levels of support. It is important to understand which capabilities are needed before selecting the workload type to work with. The table below details the level of support for each workload type in Run:ai. Run:ai workloads are fully supported with all of Run:ai\u2019s advanced features and capabilities, while third-party workloads are only partially supported. The list of capabilities can change between different Run:ai versions.

                                  Functionality Workload Type Run:ai workloads Third-party workloads Training - Standard Workspace Inference Training - distributed Fairness v v v v v Priority and preemption v v v v v Over quota v v v v v Node pools v v v v v Bin packing / Spread v v v v v Multi-GPU fractions v v v v v Multi-GPU dynamic fractions v v v v v Node level scheduler v v v v v Multi-GPU memory swap v v v v v Elastic scaling NA NA v v v Gang scheduling v v v v v Monitoring v v v v v RBAC v v v v Workload awareness v v v v Workload submission v v v v Workload actions (stop/run) v v v v Workload Policies v v v v Scheduling rules v v v v

                                  Note

                                  Workload awareness

                                  Specific workload-aware visibility, so that different pods are identified and treated as a single workload (for example GPU utilization, workload view, dashboards).

                                  "},{"location":"platform-admin/workloads/overviews/managing-workloads/","title":"Workloads","text":"

                                  This article explains the procedure for managing workloads.

                                  "},{"location":"platform-admin/workloads/overviews/managing-workloads/#workloads-table","title":"Workloads table","text":"

                                  The Workloads table can be found under Workload manager in the Run:ai platform.

                                  The Workloads table provides a list of all the workloads scheduled on the Run:ai Scheduler and allows you to manage them.

                                  The Workloads table consists of the following columns:

                                  Column Description Workload The name of the workload Type The workload type Preemptible Is the workload preemptible Status The different phases in a workload life cycle. Project The project in which the workload runs. Department The department that the workload is associated with. This column is visible only if the department toggle is enabled by your administrator. Created by The user who created the workload Running/requested pods The number of running pods out of the requested Creation time The timestamp for when the workload was created Completion time The timestamp the workload reached a terminal state (failed/completed) Connection(s) The method by which you can access and interact with the running workload. It's essentially the \"doorway\" through which you can reach and use the tools the workload provides (e.g., node port, external URL, etc.). Click one of the values in the column to view the list of connections and their parameters Data source(s) Data resources used by the workload Environment The environment used by the workload Workload architecture Standard or distributed. A standard workload consists of a single process. A distributed workload consists of multiple processes working together. These processes can run on different nodes. GPU compute request Amount of GPU devices requested GPU compute allocation Amount of GPU devices allocated GPU memory request Amount of GPU memory requested GPU memory allocation Amount of GPU memory allocated Idle GPU devices The number of allocated GPU devices that have been idle for more than 5 minutes CPU compute request Amount of CPU cores requested CPU compute allocation Amount of CPU cores allocated CPU memory request Amount of CPU memory requested CPU memory allocation Amount of CPU memory allocated Cluster The cluster that the workload is associated with"},{"location":"platform-admin/workloads/overviews/managing-workloads/#workload-status","title":"Workload status","text":"

                                  The following table describes the different phases in a workload life cycle. The UI provides additional details for some of the below workload statuses which can be viewed by clicking the icon next to the status.

                                  Status Description Entry Condition Exit Condition Creating Workload setup is initiated in the cluster. Resources and pods are now provisioning. A workload is submitted. A multi-pod group is created. Pending Workload is queued and awaiting resource allocation. A pod group exists. All pods are scheduled. Initializing Workload is retrieving images, starting containers, and preparing pods. All pods are scheduled. All pods are initialized or a failure to initialize is detected. Running Workload is currently in progress with all pods operational. All pods initialized (all containers in pods are ready). Workload completion or failure. Degraded Pods may not align with specifications, network services might be incomplete, or persistent volumes may be detached. Check your logs for specific details. Pending - All pods are running but have issues. Running - All pods are running with no issues. Running - All resources are OK. Completed - Workload finished with fewer resources. Failed - Workload failure or user-defined rules. Deleting Workload and its associated resources are being decommissioned from the cluster. Deleting the workload. Resources are fully deleted. Stopped Workload is on hold and resources are intact but inactive. Stopping the workload without deleting resources. Transitioning back to the initializing phase or proceeding to deleting the workload. Failed Image retrieval failed or containers experienced a crash. Check your logs for specific details. An error occurs preventing the successful completion of the workload. Terminal state. Completed Workload has successfully finished its execution. The workload has finished processing without errors. Terminal state."},{"location":"platform-admin/workloads/overviews/managing-workloads/#pods-associated-with-workload","title":"Pods Associated with Workload","text":"

                                  Click one of the values in the Running/requested pods column, to view the list of pods and their parameters.

                                  Column Description Pod Pod name Status Pod lifecycle stages Node The node on which the pod resides Node pool The node pool in which the pod resides (applicable if node pools are enabled) Image The pod\u2019s main image GPU compute allocation Amount of GPU devices allocated for the pod GPU memory allocation Amount of GPU memory allocated for the pod"},{"location":"platform-admin/workloads/overviews/managing-workloads/#connections-associated-with-workload","title":"Connections Associated with Workload","text":"

                                  A connection refers to the method by which you can access and interact with the running workloads. It is essentially the \"doorway\" through which you can reach and use the applications (tools) these workloads provide.

                                  Click one of the values in the Connection(s) column, to view the list of connections and their parameters. Connections are network interfaces that communicate with the application running in the workload. Connections are either the URL the application exposes or the IP and the port of the node that the workload is running on.

                                  Column Description Name The name of the application running on the workload Connection type The network connection type selected for the workload Access Who is authorized to use this connection (everyone, specific groups/users) Address The connection URL Copy button Copy URL to clipboard Connect button Enabled only for supported tools"},{"location":"platform-admin/workloads/overviews/managing-workloads/#data-sources-associated-with-workload","title":"Data Sources Associated with Workload","text":"

                                  Click one of the values in the Data source(s) column, to view the list of data sources and their parameters.

                                   • Data source - The name of the data source mounted to the workload
                                   • Type - The data source type
                                   "},{"location":"platform-admin/workloads/overviews/managing-workloads/#customizing-the-table-view","title":"Customizing the table view","text":"
                                  • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                  • Search - Click SEARCH and type the value to search by
                                  • Sort - Click each column header to sort by
                                  • Column selection - Click COLUMNS and select the columns to display in the table
                                  • Download table - Click MORE and then Click Download as CSV. Export to CSV is limited to 20,000 rows.
                                  • Refresh - Click REFRESH to update the table with the latest data
                                  • Show/Hide details - Click to view additional information on the selected row
                                  "},{"location":"platform-admin/workloads/overviews/managing-workloads/#showhide-details","title":"Show/Hide details","text":"

                                  Click a row in the Workloads table and then click the SHOW DETAILS button at the upper-right side of the action bar. The details pane appears, presenting the following tabs:

                                  "},{"location":"platform-admin/workloads/overviews/managing-workloads/#event-history","title":"Event History","text":"

                                   Displays the workload status over time, including events that describe the workload lifecycle and alerts on notable events. Use the filter to search the history for specific events.

                                  "},{"location":"platform-admin/workloads/overviews/managing-workloads/#metrics","title":"Metrics","text":"
                                   • GPU utilization - A per-GPU graph and an average of all GPUs, shown on the same chart over an adjustable period, lets you see the trends of GPU compute utilization (percentage of GPU compute) in this node.
                                   • GPU memory utilization - A per-GPU graph and an average of all GPUs, shown on the same chart over an adjustable period, lets you see the trends of GPU memory usage (percentage of GPU memory) in this node.
                                   • CPU compute utilization - The average compute utilization of all CPU cores, shown in a single graph over an adjustable period, lets you see the trends of CPU compute utilization (percentage of CPU compute) in this node.
                                   • CPU memory utilization - The utilization of all CPU memory, shown in a single graph over an adjustable period, lets you see the trends of CPU memory utilization (percentage of CPU memory) in this node.
                                   • CPU memory usage - The usage of all CPU memory, shown in a single graph over an adjustable period, lets you see the trends of CPU memory usage (in GB or MB of CPU memory) in this node.

                                   • For GPU charts - Click the GPU legend on the right-hand side of the chart to activate or deactivate any of the GPU lines.

                                  • You can click the date picker to change the presented period
                                  • You can use your mouse to mark a sub-period in the graph for zooming in, and use Reset zoom to go back to the preset period
                                  • Changes in the period affect all graphs on this screen.
                                  "},{"location":"platform-admin/workloads/overviews/managing-workloads/#logs","title":"Logs","text":"

                                   Workload events are listed in chronological order. The logs contain events from the workload\u2019s lifecycle to help monitor and debug issues.

                                  "},{"location":"platform-admin/workloads/overviews/managing-workloads/#adding-new-workload","title":"Adding new workload","text":"

                                   Before starting, make sure you have created a project, or have had one created for you, so that you can work with workloads.

                                  To create a new workload:

                                  1. Click +NEW WORKLOAD
                                  2. Select a workload type - Follow the links below to view the step-by-step guide for each workload type:
                                    • Workspace. Used for data preparation and model-building tasks.
                                    • Training. Used for standard training tasks of all sorts
                                    • Distributed Training. Used for distributed tasks of all sorts
                                    • Inference. Used for inference and serving tasks
                                    • Job (legacy). This type is displayed only if enabled by your Administrator, under General settings \u2192 Workloads \u2192 Workload policies
                                  3. Click CREATE WORKLOAD
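                                   If you prefer the command line, a workload can also be submitted with the CLI. The following is a minimal sketch reusing the legacy submit syntax shown in the Policies (YAML-based) examples later on this page; team-a is a placeholder project name, and any defaults or policies configured in your environment still apply:
                                   runai submit --gpu 1 --interactive -p team-a\n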
                                  "},{"location":"platform-admin/workloads/overviews/managing-workloads/#stopping-a-workload","title":"Stopping a workload","text":"

                                  Stopping a workload kills the workload pods and releases the workload resources.

                                  1. Select the workload you want to stop
                                  2. Click STOP
                                  "},{"location":"platform-admin/workloads/overviews/managing-workloads/#running-a-workload","title":"Running a workload","text":"

                                   Running a workload spins up new pods and resumes the workload\u2019s work after it has been stopped.

                                  1. Select the workload you want to run again
                                  2. Click RUN
                                  "},{"location":"platform-admin/workloads/overviews/managing-workloads/#connecting-to-a-workload","title":"Connecting to a workload","text":"

                                   To connect to an application running in the workload (for example, Jupyter Notebook):

                                   1. Select the workload you want to connect to
                                   2. Click CONNECT
                                   3. Select the tool from the drop-down list
                                   4. The selected tool opens in a new browser tab
                                  "},{"location":"platform-admin/workloads/overviews/managing-workloads/#deleting-a-workload","title":"Deleting a workload","text":"
                                  1. Select the workload you want to delete
                                  2. Click DELETE
                                  3. On the dialog, click DELETE to confirm the deletion

                                  Note

                                  Once a workload is deleted you can view it in the Deleted tab in the workloads view. This tab is displayed only if enabled by your Administrator, under General settings \u2192 Workloads \u2192 Deleted workloads

                                  "},{"location":"platform-admin/workloads/overviews/managing-workloads/#copy-edit-a-workload","title":"Copy & Edit a workload","text":"
                                  1. Select the workload you want to copy and edit
                                  2. Click COPY & EDIT
                                  3. Update the workload and click CREATE WORKLOAD
                                  "},{"location":"platform-admin/workloads/overviews/managing-workloads/#using-api","title":"Using API","text":"

                                  Go to the Workloads API reference to view the available actions

                                  "},{"location":"platform-admin/workloads/overviews/managing-workloads/#troubleshooting","title":"Troubleshooting","text":"

                                   To understand the condition of the workload, review the workload status in the Workloads table. For more information, check the workload\u2019s event history.

                                  Listed below are a number of known issues when working with workloads and how to fix them:

                                   • Issue: Cluster connectivity issues (there are issues with your connection to the cluster error message). Mediation: Verify that you are on a network that has been granted access to the cluster. Reach out to your cluster admin for instructions on verifying this. If you are an admin, see the troubleshooting section in the cluster documentation.
                                   • Issue: Workload in \u201cInitializing\u201d status for some time. Mediation: Check that you have access to the Container image registry. Check the statuses of the pods in the pods\u2019 modal. Check the event history for more details.
                                   • Issue: Workload has been pending for some time. Mediation: Check that you have the required quota. Check the project\u2019s available quota in the project dialog. Check that all services needed to run are bound to the workload. Check the event history for more details.
                                   • Issue: PVCs created using the K8s API or kubectl are not visible or mountable in Run:ai. Mediation: This is by design. Create a new data source of type PVC in the Run:ai UI; in the Data mount section, select Existing PVC; select the PVC you created via the K8s API. You are now able to select and mount this PVC in your Run:ai submitted workloads.
                                   • Issue: Workload is not visible in the UI. Mediation: Check that the workload hasn\u2019t been deleted. See the \u201cDeleted\u201d tab in the workloads view.
                                   "},{"location":"platform-admin/workloads/overviews/workload-types/","title":"Run:ai Workload Types","text":"

                                  In the world of machine learning (ML), the journey from raw data to actionable insights is a complex process that spans multiple stages. Each stage of the AI lifecycle requires different tools, resources, and frameworks to ensure optimal performance. Run:ai simplifies this process by offering specialized workload types tailored to each phase, facilitating a smooth transition across various stages of the ML workflows.

                                   The ML lifecycle usually begins with experimental work on data and exploration of different modeling techniques to identify the best approach for accurate predictions. At this stage, resource consumption is usually moderate as experimentation is done on a smaller scale. As confidence grows in the model's potential and its accuracy, the demand for compute resources increases. This is especially true during the training phase, where vast amounts of data need to be processed, particularly with complex models such as large language models (LLMs), whose huge parameter counts often require distributed training across multiple GPUs to handle the intensive computational load.

                                  Finally, once the model is ready, it moves to the inference stage, where it is deployed to make predictions on new, unseen data. Run:ai's workload types are designed to correspond with the natural stages of this lifecycle. They are structured to align with the specific resource and framework requirements of each phase, ensuring that AI researchers and data scientists can focus on advancing their models without worrying about infrastructure management.

                                   Run:ai offers three workload types, each corresponding to a specific phase of the researcher\u2019s work:

                                  • Workspaces \u2013 For experimentation with data and models.
                                  • Training \u2013 For resource-intensive tasks such as model training and data preparation.
                                  • Inference \u2013 For deploying and serving the trained model.
                                  "},{"location":"platform-admin/workloads/overviews/workload-types/#workspaces-the-experimentation-phase","title":"Workspaces: the experimentation phase","text":"

                                  The Workspace is where data scientists conduct initial research, experiment with different data sets, and test various algorithms. This is the most flexible stage in the ML lifecycle, where models and data are explored, tuned, and refined. The value of workspaces lies in the flexibility they offer, allowing the researcher to iterate quickly without being constrained by rigid infrastructure.

                                  • Framework flexibility

                                    Workspaces support a variety of machine learning frameworks, as researchers need to experiment with different tools and methods.

                                  • Resource requirements

                                    Workspaces are often lighter on resources compared to the training phase, but they still require significant computational power for data processing, analysis, and model iteration.

                                     Hence, by default, Run:ai schedules workspaces so that they cannot be preempted once their resources have been allocated. However, this non-preemptible state also means a workspace cannot use resources beyond the project\u2019s deserved quota.

                                  See Running workspaces to learn more about how to submit a workspace via the Run:ai platform. For quick starts, see Running Jupyter Notebook using workspaces.

                                  "},{"location":"platform-admin/workloads/overviews/workload-types/#training-scaling-resources-for-model-development","title":"Training: scaling resources for model development","text":"

                                  As models mature and the need for more robust data processing and model training increases, Run:ai facilitates this shift through the Training workload. This phase is resource-intensive, often requiring distributed computing and high-performance clusters to process vast data sets and train models.

                                  • Training architecture

                                     For training workloads, Run:ai allows you to specify the architecture - standard or distributed. The distributed architecture is relevant for larger data sets and more complex models that require utilizing multiple nodes. For the distributed architecture, Run:ai allows you to specify different configurations for the master and workers and to select which framework to use - PyTorch, XGBoost, MPI, or TensorFlow. In addition, as part of the distributed configuration, Run:ai enables researchers to schedule their distributed workloads on nodes within the same region, zone, placement group, or any other topology.

                                  • Resource requirements

                                     Training tasks demand high memory, compute power, and storage. Run:ai ensures that the allocated resources match the scale of the task and allows those workloads to utilize more compute resources than the project\u2019s deserved quota. If you do not want your training workload to be preempted, make sure to request no more GPUs than are within your project\u2019s quota.

                                  See Standard training and Distributed training to learn more about how to submit a training workload via the Run:ai UI. For quick starts, see Run your first standard training and Run your first distributed training.

                                  "},{"location":"platform-admin/workloads/overviews/workload-types/#inference-deploying-and-serving-models","title":"Inference: deploying and serving models","text":"

                                  Once a model is trained and validated, it moves to the Inference stage, where it is deployed to make predictions (usually in a production environment). This phase is all about efficiency and responsiveness, as the model needs to serve real-time or batch predictions to end-users or other systems.

                                  • Inference-specific use cases

                                     Inference workloads must adapt to ever-changing demand in order to meet their SLAs. For example, additional replicas may be deployed, manually or automatically, to increase compute resources as part of a horizontal scaling approach, or a new version of the deployment may need to be rolled out without affecting the running services.

                                  • Resource requirements

                                    Inference models differ in size and purpose, leading to varying computational requirements. For example, small OCR models can run efficiently on CPUs, whereas LLMs typically require significant GPU memory for deployment and serving. Inference workloads are considered production-critical and are given the highest priority to ensure compliance with SLAs. Additionally, Run:ai ensures that inference workloads cannot be preempted, maintaining consistent performance and reliability.

                                  See Deploy a custom inference workload to learn more about how to submit an inference workload via the Run:ai UI.

                                  "},{"location":"platform-admin/workloads/policies/old-policies/","title":"Policies (YAML-based)","text":"

                                  Warning

                                   The following describes the older V1 policies. While these still work, they have been replaced with control-plane-based V2 policies, which are accessible via the API and the user interface. For a description of the new policies, see API-based Policies.

                                  "},{"location":"platform-admin/workloads/policies/old-policies/#what-are-policies","title":"What are Policies?","text":"

                                  Policies allow administrators to impose restrictions and set default values for Researcher Workloads. For example:

                                  1. Restrict researchers from requesting more than 2 GPUs, or less than 1GB of memory for an interactive workload.
                                  2. Set the default memory of each training job to 1GB, or mount a default volume to be used by any submitted Workload.

                                  Policies are stored as Kubernetes custom resources.
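                                   Because policies are stored as custom resources, you can verify that the policy CRDs are installed in your cluster with a standard kubectl query (the grep filter is just a convenience and assumes a Unix-like shell):
                                   kubectl get crds | grep -i policy\n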

                                   Policies are specific to the workload type; as such, there are several kinds of policies:

                                   • Interactive - Kubernetes workload name: InteractiveWorkload. Kubernetes policy name: InteractivePolicy.
                                   • Training - Kubernetes workload name: TrainingWorkload. Kubernetes policy name: TrainingPolicy.
                                   • Distributed Training - Kubernetes workload name: DistributedWorkload. Kubernetes policy name: DistributedPolicy.
                                   • Inference - Kubernetes workload name: InferenceWorkload. Kubernetes policy name: InferencePolicy.

                                  A Policy can be created per Run:ai Project (Kubernetes namespace). Additionally, a Policy resource can be created in the runai namespace. This special Policy will take effect when there is no project-specific Policy for the relevant workload kind.

                                   When researchers create a new interactive workload or workspace, they see a list of available node pools and their priority. Priority is set by dragging and dropping the node pools in the desired order of priority. When the node pool priority list is locked by an administrator policy, the node pool list isn't editable by the researcher, even if the workspace is created from a template or copied from another workspace.

                                  Note

                                  Policies on this page cannot be added to platform 2.16 or higher that have the New Policy Manager enabled.

                                  "},{"location":"platform-admin/workloads/policies/old-policies/#creating-a-policy","title":"Creating a Policy","text":""},{"location":"platform-admin/workloads/policies/old-policies/#creating-your-first-policy","title":"Creating your First Policy","text":"

                                   To create a sample InteractivePolicy, prepare a file (e.g. gpupolicy.yaml) containing the following YAML:

                                  gpupolicy.yaml
                                  apiVersion: run.ai/v2alpha1\nkind: InteractivePolicy\nmetadata:\n  name: interactive-policy1\n  namespace: runai-team-a # (1)\nspec:\n  gpu:\n    rules:\n      required: true\n      min: \"1\"  # (2)\n      max: \"4\"  \n    value: \"1\"\n
                                  1. Set the Project namespace here.
                                  2. GPU values are quoted as they can contain non-integer values.

                                  The policy places a default and limit on the available values for GPU allocation. To apply this policy, run:

                                  kubectl apply -f gpupolicy.yaml \n

                                  Now, try the following command:

                                  runai submit --gpu 5 --interactive -p team-a\n

                                  The following message will appear:

                                  gpu: must be no greater than 4\n

                                   A similar message appears in the New Job form of the Run:ai user interface when attempting to enter a number of GPUs that is out of range for an interactive job.
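                                   To confirm that the policy object itself exists, you can also list the interactive policies in the project namespace (a quick check using the same resource name shown in the Modifying/Deleting Policies section below):
                                   kubectl get interactivepolicies -n runai-team-a\n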

                                  "},{"location":"platform-admin/workloads/policies/old-policies/#gpu-and-cpu-memory-limits","title":"GPU and CPU memory limits","text":"

                                   The following policy places limits on the available values for CPU and GPU memory allocation.

                                  gpumemorypolicy.yaml
                                   apiVersion: run.ai/v2alpha1\nkind: TrainingPolicy\nmetadata:\n  name: training-policy\n  namespace: runai\nspec:\n  gpuMemory:\n    rules:\n      min: 100M\n      max: 2G\n  memory:\n    rules:\n      min: 100M\n      max: 2G\n
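                                   As with the GPU policy above, apply this policy with kubectl (assuming you saved the YAML under the file name shown in the label):
                                   kubectl apply -f gpumemorypolicy.yaml\n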
                                  "},{"location":"platform-admin/workloads/policies/old-policies/#read-only-values","title":"Read-only values","text":"

                                  When you do not want the user to be able to change a value, you can force the corresponding user interface control to become read-only by using the canEdit key. For example,

                                  runasuserpolicy.yaml
                                  apiVersion: run.ai/v2alpha1\nkind: TrainingPolicy\nmetadata:\n  name: train-policy1\n  namespace: runai-team-a # (1) \n\nspec:\n  runAsUser:\n    rules:\n      required: true  # (2)\n      canEdit: false  # (3)\n    value: true # (4)\n
                                  1. Set the Project namespace here.
                                  2. The field is required.
                                  3. The field will be shown as read-only in the user interface.
                                  4. The field value is true.
                                  "},{"location":"platform-admin/workloads/policies/old-policies/#complex-values","title":"Complex Values","text":"

                                  The example above illustrated rules for parameters of \"primitive\" types, such as GPU allocation, CPU memory, working directory, etc. These parameters contain a single value.

                                  Other workload parameters, such as ports or volumes, are \"complex\", in the sense that they may contain multiple values: a workload may contain multiple ports and multiple volumes.

                                   The following is an example of a policy for the ports value, which is complex. A ports entry typically contains two values: an external port that is mapped to an internal container port. Multiple port tuples can be defined for a single workload:

                                  apiVersion: run.ai/v2alpha1\nkind: InteractivePolicy\nmetadata:\n  name: interactive-policy\n  namespace: runai\nspec:\n  ports:\n    rules:\n      canAdd: true\n    itemRules:\n      container:\n        min: 30000\n        max: 32767\n      external:\n        max: 32767\n    items:\n      admin-port-a:\n        rules:\n          canRemove: false\n          canEdit: false\n        value:\n          container: 30100\n          external: 8080\n      admin-port-b:\n        value:\n          container: 30101\n          external: 8081\n

                                  A policy for a complex field is composed of three parts:

                                   • Rules: Rules apply to the ports parameter as a whole. In this example, the administrator specifies the canAdd rule with a value of true, indicating that a researcher submitting an interactive job can add additional ports to the ports listed by the policy (true is the default for canAdd, so it could actually have been omitted from the policy above). When canAdd is set to false, the researcher is not able to add any additional port except those already specified by the policy.
                                   • itemRules: itemRules impose restrictions on the data members of each item, in this case container and external. In the above example, the administrator has limited the value of container to 30000-32767, and the value of external to a maximum of 32767.
                                   • Items: Specifies a list of default ports. Each port is an item in the ports list and is given a label (e.g. admin-port-b). The administrator can also specify whether a researcher can change or delete ports from the submitted workload. In the above example, admin-port-a is hardwired and cannot be changed or deleted, while admin-port-b can be changed or deleted by the researcher when submitting the workload. It is possible to specify a label using the reserved name DEFAULTS. This item provides the defaults for all other items.

                                  The following is an example of a complex policy for PVCs which contains DEFAULTS.

                                  apiVersion: run.ai/v2alpha1\nkind: TrainingPolicy\nmetadata:\n  name: tp # use your name.\n  namespace: runai-team-a # use your namespace\nspec:\n  pvcs:\n    itemRules:\n      existingPvc:\n        canEdit: false\n      claimName:\n        required: true\n    items:\n      DEFAULTS:\n        value:\n          existingPvc: true\n          path: nil\n
                                  "},{"location":"platform-admin/workloads/policies/old-policies/#syntax","title":"Syntax","text":"

                                  The complete syntax of the policy YAML can be obtained using the explain command of kubectl. For example:

                                  kubectl explain trainingpolicy.spec\n
                                  Should provide the list of all possible fields in the spec of training policies:

                                  KIND:     TrainingPolicy\nVERSION:  run.ai/v2alpha1\n\nRESOURCE: spec <Object>\n\nDESCRIPTION:\nThe specifications of this TrainingPolicy\n\nFIELDS:\nannotations <Object>\nSpecifies annotations to be set in the container running the created\nworkload.\n\narguments   <Object>\nIf set, the arguments are sent along with the command which overrides the\nimage's entry point of the created workload.\n\ncommand <Object>\nIf set, overrides the image's entry point with the supplied command.\n...\n

                                  You can further drill down to get the syntax for ports by running:

                                  kubectl explain trainingpolicy.spec.ports\n
                                  KIND:     TrainingPolicy\nVERSION:  run.ai/v2alpha1\n\nRESOURCE: ports <Object>\n\nDESCRIPTION:\n     Specify the set of ports exposed from the container running the created\n     workload. Used together with --service-type.\n\nFIELDS:\n   itemRules    <Object>\n\n   items    <map[string]Object>\n\n   rules    <Object>\n     these rules apply to a value of type map (=non primitive) as a whole\n     additionally there are rules which apply for specific items of the map\n

                                  Drill down into the ports.rules object by running:

                                  kubectl explain trainingpolicy.spec.ports.rules\n
                                  KIND:     TrainingPolicy\nVERSION:  run.ai/\n\nRESOURCE: rules <Object>\n\nDESCRIPTION:\n     these rules apply to a value of type map (=non primitive) as a whole\n     additionally there are rules which apply for specific items of the map\n\nFIELDS:\n   canAdd   <boolean>\n     is it allowed for a workload to add items to this map\n\n   required <boolean>\n     if the map as a whole is required\n

                                  Note that each kind of policy has a slightly different set of parameters. For example, an InteractivePolicy has a jupyter parameter that is not available under TrainingPolicy.
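                                   For example, to inspect that interactive-only jupyter parameter, the same explain technique can be pointed at the interactive policy (a sketch; the exact field set depends on the CRD version installed in your cluster):
                                   kubectl explain interactivepolicy.spec.jupyter\n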

                                  "},{"location":"platform-admin/workloads/policies/old-policies/#using-secrets-for-environment-variables","title":"Using Secrets for Environment Variables","text":"

                                  It is possible to add values from Kubernetes secrets as the value of environment variables included in the policy. The secret will be extracted from the secret object when the Job is created. For example:

                                    environment:\n    items:\n      MYPASSWORD:\n        value: \"SECRET:my-secret,password\"\n

                                   When submitting a workload that is affected by this policy, the created container has an environment variable called MYPASSWORD whose value is taken from the key password in the Kubernetes secret my-secret, which must be pre-created in the namespace where the workload runs.
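                                   The referenced secret must exist before the workload is submitted. The following is a minimal sketch of creating it with kubectl, assuming the runai-team-a project namespace used in the earlier examples and a placeholder password value:
                                   kubectl create secret generic my-secret --from-literal=password='my-password' -n runai-team-a\n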

                                  "},{"location":"platform-admin/workloads/policies/old-policies/#prevent-data-storage-on-the-node","title":"Prevent Data Storage on the Node","text":"

                                   You can configure policies to prevent the submission of workloads that use data sources consisting of a host path. This prevents data from being stored on the node, where it would be lost if the node is deleted.

                                   Example of rejecting workloads that request a host path:

                                  spec:\n  volumes:\n    itemRules:\n      nfsServer:\n        required: true\n
                                  "},{"location":"platform-admin/workloads/policies/old-policies/#terminate-runai-training-jobs-after-preemption-policy","title":"Terminate Run:ai training Jobs after preemption policy","text":"

                                   Administrators can set a \u2018termination after preemption\u2019 policy for Run:ai training jobs. After this policy is applied, a training job is terminated once it has been preempted for any reason. For example, when a training job is using over-quota resources (e.g. GPUs) and the owner of those GPUs wants to reclaim them, the training job is preempted and typically goes back to the pending queue. However, if the termination policy is applied, the job is terminated instead of being reinstated as pending. The termination-after-preemption policy can be set as a cluster-wide policy (applicable to all namespaces/projects) or per project/namespace.

                                   To use this feature, the administrator should configure either a cluster-wide or a namespace policy.

                                   For cluster-wide scope (all namespaces/projects), use this YAML-based policy:

                                  apiVersion: run.ai/v2alpha1\nkind: TrainingPolicy\nmetadata:\n  name: training-policy\n  namespace: runai\nspec:\n  terminateAfterPreemption:\n    value: true\n

                                   For per-namespace (project) scope, use this YAML-based policy:

                                  apiVersion: run.ai/v2alpha1\nkind: TrainingPolicy\nmetadata:\n  name: training-policy\n  namespace: runai-<PROJECT_NAME>\nspec:\n  terminateAfterPreemption:\n    value: false\n
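                                   Either variant is applied in the same way as the other policies on this page (a sketch; substitute the file name you saved the YAML under):
                                   kubectl apply -f terminate-policy.yaml\n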
                                  "},{"location":"platform-admin/workloads/policies/old-policies/#modifyingdeleting-policies","title":"Modifying/Deleting Policies","text":"

                                  Use the standard kubectl get/apply/delete commands to modify and delete policies.

                                  For example, to view the global interactive policy:

                                  kubectl get interactivepolicies -n runai\n

                                  Should return the following:

                                  NAME                 AGE\ninteractive-policy   2d3h\n

                                  To delete this policy:

                                  kubectl delete InteractivePolicy interactive-policy -n runai\n

                                  To access project-specific policies, replace the -n runai parameter with the namespace of the relevant project.
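                                   For example, to view the full YAML of the project-specific policy created earlier on this page (substitute your own policy name and project namespace):
                                   kubectl get interactivepolicy interactive-policy1 -n runai-team-a -o yaml\n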

                                  "},{"location":"platform-admin/workloads/policies/old-policies/#see-also","title":"See Also","text":"
                                  • For creating workloads based on policies, see the Run:ai submitting workloads
                                  "},{"location":"platform-admin/workloads/policies/overview/","title":"Overview","text":"

                                   Run:ai administrators can access a suite of tools designed to facilitate efficient account management. This article focuses on two key features: workload policies and workload scheduling rules. These features empower admins to establish default values and implement restrictions, allowing enhanced control, ensuring compatibility with organizational policies, and optimizing resource usage and utilization.

                                  "},{"location":"platform-admin/workloads/policies/overview/#workload-policies","title":"Workload policies","text":"

                                  A workload policy is an end-to-end solution for AI managers and administrators to control and simplify how workloads are submitted. This solution allows them to set best practices, enforce limitations, and standardize processes for the submission of workloads for AI projects within their organization. It acts as a key guideline for data scientists, researchers, ML & MLOps engineers by standardizing submission practices and simplifying the workload submission process.

                                  "},{"location":"platform-admin/workloads/policies/overview/#older-and-newer-policy-technologies","title":"Older and Newer Policy technologies","text":"

                                  Run:ai provides two policy technologies.

                                  YAML-Based policies are the older policies. These policies:

                                  • Require access to Kubernetes to view or change.
                                   • Require contacting Run:ai support to convert them to the new V2 policy format.

                                   API-based policies are the newer policies. These policies:

                                   • Are shown in the Run:ai user interface.
                                   • Can be viewed and modified via the user interface and the Control-plane API.
                                   • Enable new rules addressing differences between project, department, and cluster policies.
                                   • Are only available with Run:ai clusters of version 2.18 and up.
                                  "},{"location":"platform-admin/workloads/policies/overview/#why-use-a-workload-policy","title":"Why use a workload policy?","text":"

                                  Implementing workload policies is essential when managing complex AI projects within an enterprise for several reasons:

                                   1. Resource control and management - Defining or limiting the use of costly resources across the enterprise via a centralized management system to ensure efficient allocation and prevent overuse.
                                   2. Setting best practices - Provide managers with the ability to establish guidelines and standards to follow, reducing errors amongst AI practitioners within the organization.
                                   3. Security and compliance - Define and enforce permitted and restricted actions to uphold organizational security and meet compliance requirements.
                                   4. Simplified setup - Conveniently allow setting defaults and streamline the workload submission process for AI practitioners.
                                  5. Scalability and diversity
                                    1. Multi-purpose clusters with various workload types that may have different requirements and characteristics for resource usage.
                                    2. The organization has multiple hierarchies, each with distinct goals, objectives and degrees of flexibility.
                                    3. Manage multiple users and projects with distinct requirements and methods, ensuring appropriate utilization of resources.
                                  "},{"location":"platform-admin/workloads/policies/overview/#understanding-the-mechanism","title":"Understanding the mechanism","text":"

                                  The following sections provide details of how the workload policy mechanism works.

                                  "},{"location":"platform-admin/workloads/policies/overview/#cross-interface-enforcement","title":"Cross-interface enforcement","text":"

                                   The policy is enforced on workloads regardless of whether they were submitted via the UI, CLI, REST APIs, or Kubernetes YAMLs.

                                  "},{"location":"platform-admin/workloads/policies/overview/#policy-types","title":"Policy types","text":"

                                   Run:ai\u2019s policies enforce Run:ai workloads. There is a policy type for each Run:ai workload type, which allows administrators to set different policies per workload type.

                                  Policy type Workload type Kubernetes name Workspace Workspace Interactive workload Training Standard Training Standard Training workload Distributed Distributed Distributed workload Inference* Inference Inference workload

                                  * The submission of this policy type is supported currently via API only

                                  "},{"location":"platform-admin/workloads/policies/overview/#policy-structure-rules-defaults-and-imposed-assets","title":"Policy structure - rules, defaults, and imposed assets","text":"

                                   A policy consists of rules that limit and control the values of workload fields. In addition to rules, defaults can set default values for different workload fields. These defaults are not rules; they simply suggest values that can be overridden during workload submission.

                                  Furthermore, policies allow the enforcement of workload assets. For example, as an admin, you can impose a data source of type PVC to be used by any workload submitted.

                                  For more information see rules, defaults and imposed assets.
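                                   The following is a minimal sketch of what such a policy body can look like, combining a default, a rule, and an imposed asset; the field names are taken from the policy reference and examples later in this document, and the asset ID is a placeholder:
                                   defaults:\n  createHomeDir: true\nrules:\n  imagePullPolicy:\n    canEdit: false\nimposedAssets:\n  - <asset-id>\n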

                                  "},{"location":"platform-admin/workloads/policies/overview/#scope-of-effectiveness","title":"Scope of effectiveness","text":"

                                   Different teams working on various projects require different tools, requirements, and safeguards, so one policy may not suit all teams. Hence, administrators can select the scope in which the policy is effective. When a scope is selected, all of its subordinate units are also affected. As a result, all workloads submitted within the selected scope are controlled by the policy.

                                  For example, if a policy is set for Department A, all workloads submitted by any of the projects within this department are controlled.

                                  A scope for a policy can be:

                                      The entire account *  \n        L Specific cluster  \n            L Specific department  \n                L Specific project\n

                                  * The policy submission to the entire account scope is supported via API only

                                  The different scoping of policies also allows the breakdown of the responsibility between different administrators. This allows delegation of ownership between different levels within the organization. The policies, containing rules and defaults, propagate* down the organizational tree, forming an \u201ceffective\u201d policy that enforces any workload submitted by users within the project.

                                  "},{"location":"platform-admin/workloads/policies/overview/#policy-rules-reconciliation","title":"Policy rules reconciliation","text":"

                                   When a rule or a default for a specific field is already governed by a policy, newly submitted policies for additional organizational units that mention the same field are not blocked from submission. In those instances, the effective rules and defaults are selected based on the following logic:

                                  • For policy defaults - The lowest organizational hierarchy \u201cclosest\u201d to the actual workload becomes the effective policy defaults (project defaults > department defaults > cluster defaults > tenant defaults).
                                  • For policy rules -
                                  • If the rule belongs to the compute and security sections in the workload spec of the Run:ai API, the highest hierarchy is chosen for the effective policy for the field (tenant rules > cluster rules > department rules > project rules).
                                  • If the rule does not belong to the compute or security sections, the lowest hierarchy \u201cclosest\u201d to the actual workload becomes the effective policy for the field (similar to defaults).

                                  While viewing the effective policy, for each rule and default the source of the policy origin is visible, allowing users to clearly understand the selected hierarchy of the effective policy.

                                  "},{"location":"platform-admin/workloads/policies/overview/#runai-policies-vs-kyverno-policies","title":"Run:ai Policies vs. Kyverno Policies","text":"

                                  Kyverno runs as a dynamic admission controller in a Kubernetes cluster. Kyverno receives validating and mutating admission webhook HTTP callbacks from the Kubernetes API server and applies matching policies to return results that enforce admission policies or reject requests. Kyverno policies can match resources using the resource kind, name, label selectors, and much more. For more information, see How Kyverno Works.

                                  "},{"location":"platform-admin/workloads/policies/policy-examples/","title":"Policies Examples","text":"

                                  This article provides examples of:

                                  1. Creating a new rule within a policy
                                  2. Best practices for adding sections to a policy
                                  3. A full example of a policy.
                                  "},{"location":"platform-admin/workloads/policies/policy-examples/#creating-a-new-rule-within-a-policy","title":"Creating a new rule within a policy","text":"

                                   This example shows how to add a new limitation on GPU usage for workloads of type workspace:

                                  1. Check the workload API fields documentation and select the field(s) that are most relevant for GPU usage.

                                    {\n\"spec\": {\n    \"compute\": {\n    \"gpuDevicesRequest\": 1,\n    \"gpuRequestType\": \"portion\",\n    \"gpuPortionRequest\": 0.5,\n    \"gpuPortionLimit\": 0.5,\n    \"gpuMemoryRequest\": \"10M\",\n    \"gpuMemoryLimit\": \"10M\",\n    \"migProfile\": \"1g.5gb\",\n    \"cpuCoreRequest\": 0.5,\n    \"cpuCoreLimit\": 2,\n    \"cpuMemoryRequest\": \"20M\",\n    \"cpuMemoryLimit\": \"30M\",\n    \"largeShmRequest\": false,\n    \"extendedResources\": [\n        {\n        \"resource\": \"hardware-vendor.example/foo\",\n        \"quantity\": 2,\n        \"exclude\": false\n        }\n    ]\n    },\n}\n}\n
                                   2. Search for the field in the Policy YAML fields - reference table. For example, gpuDevicesRequest appears under the Compute fields sub-table as follows:

                                   Field: gpuDevicesRequest. Description: Specifies the number of GPUs to allocate for the created workload. gpuRequestType can be defined only if gpuDevicesRequest = 1. Value type: integer. Supported Run:ai workload type: Workspace & Training
                                   3. Use the value type of the gpuDevicesRequest field indicated in the table - \u201cinteger\u201d - and navigate to the Value types table to view the possible rules that can be applied to this value type -

                                    for integer, the options are:

                                    • canEdit
                                    • required
                                    • min
                                    • max
                                    • step
                                   4. Proceed to the Rule Type table, select the required rule for limiting the field - for example, \u201cmax\u201d - and use the example syntax to indicate the maximum number of GPU devices requested.

                                  compute:\n    gpuDevicesRequest:\n        max: 2\n
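                                   Placed in a full policy body, the new rule sits under the top-level rules key, alongside any defaults or imposed assets (a sketch consistent with the full policy example at the end of this article):
                                   rules:\n  compute:\n    gpuDevicesRequest:\n      max: 2\n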
                                  "},{"location":"platform-admin/workloads/policies/policy-examples/#policy-yaml-best-practices","title":"Policy YAML best practices","text":"Create a policy that has multiple defaults and rules

                                  Best practice description: Presentation of the syntax while adding a set of defaults and rules

                                   defaults:\n  createHomeDir: true\n  environmentVariables:\n    instances:\n    - name: MY_ENV\n      value: my_value\n  security:\n    allowPrivilegeEscalation: false\n\nrules:\n  storage:\n    s3:\n      attributes:\n        url:\n          options:\n            - value: https://www.google.com\n              displayed: https://www.google.com\n            - value: https://www.yahoo.com\n              displayed: https://www.yahoo.com\n
                                  Allow only single selection out of many

                                   Best practice description: Block the creation of all data source types except the one that is allowed.

                                  rules:\n  storage:\n    dataVolume:\n      instances:\n        canAdd: false\n    hostPath:\n      instances:\n        canAdd: false\n    pvc:\n      instances:\n        canAdd: false\n    git:\n      attributes:\n        repository:\n          required: true\n        branch:\n          required: true\n        path:\n          required: true\n    nfs:\n      instances:\n        canAdd: false\n    s3:\n      instances:\n        canAdd: false\n
                                  Create a robust set of guidelines

                                  Best practice description: Set rules for specific compute resource usage, addressing most relevant spec fields

                                  rules:\n  compute:\n    cpuCoreRequest:\n      required: true\n      min: 0\n      max: 8\n    cpuCoreLimit:\n      min: 0\n      max: 8\n    cpuMemoryRequest:\n      required: true\n      min: '0'\n      max: 16G\n    cpuMemoryLimit:\n      min: '0'\n      max: 8G\n    migProfile:\n      canEdit: false\n    gpuPortionRequest:\n      min: 0\n      max: 1\n    gpuMemoryRequest:\n      canEdit: false\n    extendedResources:\n      instances:\n        canAdd: false\n
                                  Environment creation (specific section)
                                  rules:\n  imagePullPolicy:\n    required: true\n    options:\n    - value: Always\n      displayed: Always\n    - value: Never\n      displayed: Never\n  createHomeDir:\n    canEdit: false\n
                                  Setting security measures (specific section)
                                  rules:\n  security:\n    runAsUid:\n      min: 1\n      max: 32700\n    allowPrivilegeEscalation:\n      canEdit: false\n
                                  Policy for distributed training workloads (specific section)

                                  Best practice description: Set rules and defaults for a distributed training workload with different settings for master and worker

                                   defaults:\n  worker:\n    command: my-command-worker-1\n    environmentVariables:\n      instances:\n        - name: LOG_DIR\n          value: policy-worker-to-be-ignored\n        - name: ADDED_VAR\n          value: policy-worker-added\n    security:\n      runAsUid: 500\n    storage:\n      s3:\n        attributes:\n          bucket: bucket1-worker\n  master:\n    command: my-command-master-2\n    environmentVariables:\n      instances:\n        - name: LOG_DIR\n          value: policy-master-to-be-ignored\n        - name: ADDED_VAR\n          value: policy-master-added\n    security:\n      runAsUid: 800\n    storage:\n      s3:\n        attributes:\n          bucket: bucket1-master\nrules:\n  worker:\n    command:\n      options:\n        - value: my-command-worker-1\n          displayed: command1\n        - value: my-command-worker-2\n          displayed: command2\n    storage:\n      nfs:\n        instances:\n          canAdd: false\n      s3:\n        attributes:\n          bucket:\n            options:\n              - value: bucket1-worker\n              - value: bucket2-worker\n  master:\n    command:\n      options:\n        - value: my-command-master-1\n          displayed: command1\n        - value: my-command-master-2\n          displayed: command2\n    storage:\n      nfs:\n        instances:\n          canAdd: false\n      s3:\n        attributes:\n          bucket:\n            options:\n              - value: bucket1-master\n              - value: bucket2-master\n
                                  Impose an asset (specific section)
                                   defaults: null\nrules: null\nimposedAssets:\n  - f12c965b-44e9-4ff6-8b43-01d8f9e630cc\n
                                  "},{"location":"platform-admin/workloads/policies/policy-examples/#example-of-a-full-policy","title":"Example of a full policy","text":"
                                  defaults:\n  createHomeDir: true\n  imagePullPolicy: IfNotPresent\n  nodePools:\n    - node-pool-a\n    - node-pool-b\n  environmentVariables:\n    instances:\n      - name: WANDB_API_KEY\n        value: REPLACE_ME!\n      - name: WANDB_BASE_URL\n        value: https://wandb.mydomain.com\n  compute:\n    cpuCoreRequest: 0.1\n    cpuCoreLimit: 20\n    cpuMemoryRequest: 10G\n    cpuMemoryLimit: 40G\n    largeShmRequest: true\n  security:\n    allowPrivilegeEscalation: false\n  storage:\n    git:\n      attributes:\n        repository: https://git-repo.my-domain.com\n        branch: master\n    hostPath:\n      instances:\n        - name: vol-data-1\n          path: /data-1\n          mountPath: /mount/data-1\n        - name: vol-data-2\n          path: /data-2\n          mountPath: /mount/data-2\nrules:\n  createHomeDir:\n    canEdit: false\n  imagePullPolicy:\n    canEdit: false\n  environmentVariables:\n    instances:\n      locked:\n        - WANDB_BASE_URL\n  compute:\n    cpuCoreRequest:\n      max: 32\n    cpuCoreLimit:\n      max: 32\n    cpuMemoryRequest:\n      min: 1G\n      max: 20G\n    cpuMemoryLimit:\n      min: 1G\n      max: 40G\n    largeShmRequest:\n      canEdit: false\n    extendedResources:\n      instances:\n        canAdd: false\n  security:\n    allowPrivilegeEscalation:\n      canEdit: false\n    runAsUid:\n      min: 1\n  storage:\n    hostPath:\n      instances:\n        locked:\n          - vol-data-1\n          - vol-data-2\nimposedAssets:\n  - 4ba37689-f528-4eb6-9377-5e322780cc27\n
                                  "},{"location":"platform-admin/workloads/policies/policy-reference/","title":"Policies Reference","text":"

                                  A workload policy is an end-to-end solution for AI managers and administrators to control and simplify how workloads are submitted, setting best practices, enforcing limitations, and standardizing processes for AI projects within their organization.

                                  This article explains the policy YAML fields and the possible rules and defaults that can be set for each field.

                                  "},{"location":"platform-admin/workloads/policies/policy-reference/#policy-yaml-fields-reference-table","title":"Policy YAML fields - reference table","text":"

                                  The policy fields are structured in a similar format to the workload API fields. The following tables represent a structured guide designed to help you understand and configure policies in a YAML format. It provides the fields, descriptions, defaults and rules for each workload type.

                                  Click the link to view the value type of each field.

                                  Fields Description Value type Supported Run:ai workload type args When set, contains the arguments sent along with the command. These override the entry point of the image in the created workload string Workspace Training command A command to serve as the entry point of the container running the workspace string Workspace Training createHomeDir Instructs the system to create a temporary home directory for the user within the container. Data stored in this directory is not saved when the container exists. When the runAsUser flag is set to true, this flag defaults to true as well boolean Workspace Training environmentVariables Set of environmentVariables to populate the container running the workspace array Workspace Training image Specifies the image to use when creating the container running the workload string Workspace Training imagePullPolicy Specifies the pull policy of the image when starting t a container running the created workload. Options are: always, ifNotPresent, or never string Workspace Training workingDir Container\u2019s working directory. If not specified, the container runtime default is used, which might be configured in the container image string Workspace Training nodeType Nodes (machines) or a group of nodes on which the workload runs string Workspace Training nodePools A prioritized list of node pools for the scheduler to run the workspace on. The scheduler always tries to use the first node pool before moving to the next one when the first is not available. array Workspace Training annotations Set of annotations to populate into the container running the workspace itemized Workspace Training labels Set of labels to populate into the container running the workspace itemized Workspace Training terminateAfterPreemtpion Indicates whether the job should be terminated, by the system, after it has been preempted boolean Workspace Training autoDeletionTimeAfterCompletionSeconds Specifies the duration after which a finished workload (Completed or Failed) is automatically deleted. If this field is set to zero, the workload becomes eligible to be deleted immediately after it finishes. integer Workspace Training backoffLimit Specifies the number of retries before marking a workload as failed integer Workspace Training cleanPodPolicy

                                  Specifies which pods will be deleted when the workload reaches a terminal state (completed/failed). The policy can be one of the following values:

                                  • Running - Only pods still running when a job completes (for example, parameter servers) will be deleted immediately. Completed pods will not be deleted so that the logs will be preserved. (Default).
                                  • All - All (including completed) pods will be deleted immediately when the job finishes.
                                  • None - No pods will be deleted when the job completes. It will keep running pods that consume GPU, CPU and memory over time. It is recommended to set to None only for debugging and obtaining logs from running pods.
string Distributed completions Used with Hyperparameter Optimization. Specifies the number of successful pods the job should reach to be completed. The Job is marked as successful once the specified number of pods has succeeded. integer Workspace Training parallelism Used with Hyperparameter Optimization. Specifies the maximum desired number of pods the workload should run at any given time. integer Workspace Training exposeUrls Specifies a set of exposed URLs (e.g. ingress) from the container running the created workload. itemized Workspace Training largeShmRequest Specifies a large /dev/shm device to mount into a container running the created workload. SHM is a shared file system mounted on RAM. boolean Workspace Training podAffinitySchedulingRule Indicates whether to use the Pod affinity rule as the \u201chard\u201d (required) or the \u201csoft\u201d (preferred) option. This field can be specified only if PodAffinity is set to true. string Workspace Training podAffinityTopology Specifies the Pod Affinity Topology to be used for scheduling the job. This field can be specified only if PodAffinity is set to true. string Workspace Training ports Specifies a set of ports exposed from the container running the created workload. More information in Ports fields below. itemized Workspace Training probes Specifies the ReadinessProbe to use to determine if the container is ready to accept traffic. More information in Probes fields below - Workspace Training tolerations Toleration rules which apply to the pods running the workload. Toleration rules guide (but do not require) the system as to which node each pod can be scheduled on or evicted from, based on matching between those rules and the set of taints defined for each Kubernetes node. itemized Workspace Training priorityClass Priority class of the workload. The values for workspace are build (default) or interactive-preemptible. For training only, use train. Enum: \"build\", \"train\", \"interactive-preemptible\" string Workspace storage Contains all the fields related to storage configurations. More information in Storage fields below. - Workspace Training security Contains all the fields related to security configurations. More information in Security fields below. - Workspace Training compute Contains all the fields related to compute configurations. More information in Compute fields below. - Workspace Training"},{"location":"platform-admin/workloads/policies/policy-reference/#ports-fields","title":"Ports fields","text":"Fields Description Value type Supported Run:ai workload type container The port that the container running the workload exposes. string Workspace Training serviceType Specifies the default service exposure method for ports. The default is used for ports that do not specify a service type. Options are: LoadBalancer, NodePort or ClusterIP. For more information see the External Access to Containers guide. string Workspace Training external The external port which allows a connection to the container port. If not specified, the port is auto-generated by the system. integer Workspace Training toolType The tool type that runs on this port. string Workspace Training toolName A name describing the tool that runs on this port.
string Workspace Training"},{"location":"platform-admin/workloads/policies/policy-reference/#probes-fields","title":"Probes fields","text":"Fields Description Value type Supported Run:ai workload type readiness Specifies the Readiness Probe to use to determine if the container is ready to accept traffic. - Workspace Training Readiness field details Spec fields readiness Description Specifies the Readiness Probe to use to determine if the container is ready to accept traffic Supported Run:ai workload types Workspace Training Value type itemized Spec Readiness fields Description Value type initialDelaySeconds Number of seconds after the container has started before liveness or readiness probes are initiated. integer periodSeconds How often (in seconds) to perform the probe. integer timeoutSeconds Number of seconds after which the probe times out integer successThreshold Minimum consecutive successes for the probe to be considered successful after having failed. integer failureThreshod When a probe fails, the number of times to try before giving up. integer

                                  Example workload snippet:

defaults:\n  probes:\n    readiness:\n      initialDelaySeconds: 2\n
                                  "},{"location":"platform-admin/workloads/policies/policy-reference/#security-fields","title":"Security fields","text":"Fields Description Value type Supported Run:ai workload type uidGidSource Indicates the way to determine the user and group ids of the container. The options are: fromTheImage - user and group IDs are determined by the docker image that the container runs. This is the default option. custom - user and group IDs can be specified in the environment asset and/or the workspace creation request. idpToken - user and group IDs are determined according to the identity provider (idp) access token. This option is intended for internal use of the environment UI form. For more information, see Non-root containers string Workspace Training capabilities The capabilities field allows adding a set of unix capabilities to the container running the workload. Capabilities are Linux distinct privileges traditionally associated with superuser which can be independently enabled and disabled Array Workspace Training seccompProfileType Indicates which kind of seccomp profile is applied to the container. The options are: RuntimeDefault - the container runtime default profile should be used Unconfined - no profile should be applied string Workspace Training runAsNonRoot Indicates that the container must run as a non-root user. boolean Workspace Training readOnlyRootFilesystem If true, mounts the container's root filesystem as read-only. boolean Workspace Training runAsUid Specifies the Unix user id with which the container running the created workload should run. integer Workspace Training runasGid Specifies the Unix Group ID with which the container should run. integer Workspace Training supplementalGroups Comma separated list of groups that the user running the container belongs to, in addition to the group indicated by runAsGid. string Workspace Training allowPrivilegeEscalation Allows the container running the workload and all launched processes to gain additional privileges after the workload starts boolean Workspace Training hostIpc Whether to enable hostIpc. Defaults to false. boolean Workspace Training hostNetwork Whether to enable host network. boolean Workspace Training"},{"location":"platform-admin/workloads/policies/policy-reference/#compute-fields","title":"Compute fields","text":"Fields Description Value type Supported Run:ai workload type cpuCoreRequest CPU units to allocate for the created workload (0.5, 1, .etc). The workload receives at least this amount of CPU. Note that the workload is not scheduled unless the system can guarantee this amount of CPUs to the workload. number Workspace Training cpuCoreLimit Limitations on the number of CPUs consumed by the workload (0.5, 1, .etc). The system guarantees that this workload is not able to consume more than this amount of CPUs. number Workspace Training cpuMemoryRequest The amount of CPU memory to allocate for this workload (1G, 20M, .etc). The workload receives at least this amount of memory. Note that the workload is not scheduled unless the system can guarantee this amount of memory to the workload quantity Workspace Training cpuMemoryLimit Limitations on the CPU memory to allocate for this workload (1G, 20M, .etc). The system guarantees that this workload is not be able to consume more than this amount of memory. The workload receives an error when trying to allocate more memory than this limit. 
quantity Workspace Training largeShmRequest A large /dev/shm device to mount into a container running the created workload (shm is a shared file system mounted on RAM). boolean Workspace Training gpuRequestType Sets the unit type for GPU resource requests to either portion, memory, or migProfile. The request type can be stated only if gpuDeviceRequest = 1. string Workspace Training migProfile Specifies the memory profile to be used for workloads running on NVIDIA Multi-Instance GPU (MIG) technology. string Workspace Training (Deprecated) gpuPortionRequest Specifies the fraction of GPU to be allocated to the workload, between 0 and 1. For backward compatibility, it also supports a number of GPU devices larger than 1, currently provided using the gpuDevices field. number Workspace Training gpuDeviceRequest Specifies the number of GPUs to allocate for the created workload. The gpuRequestType can be defined only if gpuDeviceRequest = 1. integer Workspace Training gpuPortionLimit When a fraction of a GPU is requested, the GPU limit specifies the portion limit to allocate to the workload. The range of the value is from 0 to 1. number Workspace Training gpuMemoryRequest Specifies GPU memory to allocate for the created workload. The workload receives this amount of memory. Note that the workload is not scheduled unless the system can guarantee this amount of GPU memory to the workload. quantity Workspace Training gpuMemoryLimit Specifies a limit on the GPU memory to allocate for this workload. Should be no less than gpuMemoryRequest. quantity Workspace Training extendedResources Specifies values for extended resources. Extended resources are third-party devices (such as high-performance NICs, FPGAs, or InfiniBand adapters) that you want to allocate to your Job. itemized Workspace Training"},{"location":"platform-admin/workloads/policies/policy-reference/#storage-fields","title":"Storage fields","text":"Fields Description Value type Supported Run:ai workload type dataVolume Set of data volumes to use in the workload. Each data volume is mapped to a file-system mount point within the container running the workload. itemized Workspace Training hostPath Maps a folder to a file-system mount point within the container running the workload. itemized Workspace Training git Details of the git repository and items mapped to it. itemized Workspace Training pvc Specifies persistent volume claims to mount into a container running the created workload. itemized Workspace Training nfs Specifies NFS volume to mount into the container running the workload. itemized Workspace Training s3 Specifies S3 buckets to mount into the container running the workload. itemized Workspace Training configMapVolumes Specifies ConfigMaps to mount as volumes into a container running the created workload. itemized Workspace Training secretVolume Set of secret volumes to use in the workload. A secret volume maps a secret resource in the cluster to a file-system mount point within the container running the workload. itemized Workspace Training Storage field details Spec fields hostPath Description Maps a folder to a file-system mount point within the container running the workload Supported Run:ai workload types Workspace Training Value type itemized hostPath fields Description Value type name Unique name to identify the instance. Primarily used for policy locked rules. string path Local path within the controller to which the host volume is mapped.
string readOnly Force the volume to be mounted with read-only permissions. Defaults to false. boolean mountPath The path that the host volume is mounted to when in use. string mountPropagation Enum: \"None\" \"HostToContainer\" Share this volume mount with other containers. If set to HostToContainer, this volume mount receives all subsequent mounts that are mounted to this volume or any of its subdirectories. In case of multiple hostPath entries, this field should have the same value for all of them string

                                  Example workload snippet:

defaults:\n  storage:\n    hostPath:\n      instances:\n        - path: h3-path-1\n          mountPath: h3-mount-1\n        - path: h3-path-2\n          mountPath: h3-mount-2\n      attributes:\n        readOnly: true\n
Spec fields git Description Details of the git repository and items mapped to it. Supported Run:ai workload types Workspace Training Value type itemized Git fields Description Value type repository URL to a remote git repository. The content of this repository is mapped to the container running the workload. string revision Specific revision to synchronize the repository from string path Local path within the workspace to which the git repository is mapped. string secretName Optional name of Kubernetes secret that holds your git username and password. string username If secretName is provided, this field should contain the key, within the provided Kubernetes secret, which holds the value of your git username. Otherwise, this field should specify your git username in plain text (example: myuser). string

                                  Example workload snippet:

defaults:\n  storage:\n    git:\n      attributes:\n        repository: https://runai.public.github.com\n      instances:\n        - branch: \"master\"\n          path: /container/my-repository\n          passwordSecret: my-password-secret\n
Spec fields pvc Description Specifies persistent volume claims to mount into a container running the created workload Supported Run:ai workload types Workspace Training Value type itemized Spec PVC fields Description Value type claimName (mandatory) A given name for the PVC. Allows referencing it across workspaces. string ephemeral Use true to set PVC to ephemeral. If set to true, the PVC is deleted when the workspace is stopped. boolean path Local path within the workspace to which the PVC is mapped. string readonly Permits read-only access to the PVC and prevents additions or modifications to its content. boolean readWriteOnce Requesting claim that can be mounted in read/write mode to exactly 1 host. If none of the modes are specified, the default is readWriteOnce. boolean size Requested size for the PVC. Mandatory when existingPvc is false. string storageClass Storage class name to associate with the PVC. This parameter may be omitted if there is a single storage class in the system, or you are using the default storage class. Further details at Kubernetes storage classes. string readOnlyMany Requesting claim that can be mounted in read-only mode to many hosts. boolean readWriteMany Requesting claim that can be mounted in read/write mode to many hosts. boolean

                                  Example workload snippet:

                                  defaults:\n  storage:\n    pvc:\n      instances:\n        - claimName: pvc-staging-researcher1-home\n          existingPvc: true\n          path: /myhome\n          readOnly: false\n          claimInfo:\n            accessModes:\n              readWriteMany: true\n
Spec fields nfs Description Specifies NFS volume to mount into the container running the workload Supported Run:ai workload types Workspace Training Value type itemized Spec NFS fields Description Value type mountPath The path that the NFS volume is mounted to when in use. string path Path that is exported by the NFS server. string readOnly Whether to force the NFS export to be mounted with read-only permissions. boolean nfsServer The hostname or IP address of the NFS server. string

                                  Example workload snippet:

defaults:\n  storage:\n    nfs:\n      instances:\n        - path: nfs-path\n          readOnly: true\n          server: nfs-server\n          mountPath: nfs-mount\nrules:\n  storage:\n    nfs:\n      instances:\n        canAdd: false\n
Spec fields s3 Description Specifies S3 buckets to mount into the container running the workload Supported Run:ai workload types Workspace Training Value type itemized Spec S3 fields Description Value type bucket The name of the bucket. string path Local path within the workspace to which the S3 bucket is mapped. string url The URL of the S3 service provider. The default is the URL of the Amazon AWS S3 service. string

                                  Example workload snippet:

                                  defaults:\n  storage:\n    s3:\n      instances:\n        - bucket: bucket-opt-1\n          path: /s3/path\n          accessKeySecret: s3-access-key\n          secretKeyOfAccessKeyId: s3-secret-id\n          secretKeyOfSecretKey: s3-secret-key\n      attributes:\n        url: https://amazonaws.s3.com\n
                                  "},{"location":"platform-admin/workloads/policies/policy-reference/#value-types","title":"Value types","text":"

                                  Each field has a specific value type. The following value types are supported.

Value type Description Supported rule type Defaults
Boolean A binary value that can be either True or False
• canEdit
• required
Defaults example: true/false
String A sequence of characters used to represent text. It can include letters, numbers, symbols, and spaces
• canEdit
• required
• options
Defaults example: abc
Itemized An ordered collection of items (objects). While items in a list may in general be of different types, all items of an itemized policy field are of the same type. For further information, see the Itemized section below the table.
• canAdd
• locked
Defaults example: See below
Integer A whole number without a fractional component
• canEdit
• required
• min
• max
• step
• defaultFrom
Defaults example: 100
Number Capable of having non-integer values
• canEdit
• required
• min
• defaultFrom
Defaults example: 10.3
Quantity Holds a string composed of a number and a unit representing a quantity
• canEdit
• required
• min
• max
• defaultFrom
Defaults example: 5M
Array Set of values that are treated as one, as opposed to Itemized in which each item can be referenced separately.
• canEdit
• required
Defaults example: node-a node-b node-c"},{"location":"platform-admin/workloads/policies/policy-reference/#itemized","title":"Itemized","text":"

Workload fields of type itemized have multiple instances; in comparison to objects, each instance can be referenced by a key field. The key field is defined per itemized field.

                                                Consider the following workload spec:

                                                spec:\n  image: ubuntu\n  compute:\n    extendedResources:\n      - resource: added/cpu\n        quantity: 10\n      - resource: added/memory\n        quantity: 20M\n

In this example, extendedResources has two instances, each with two attributes: resource (the key attribute) and quantity.

In a policy, the defaults and rules for itemized fields have two subsections:

• Instances: default items to be added to the workload, or rules which apply to an instance as a whole.
                                                • Attributes: defaults for attributes within an item or rules which apply to attributes within each item.

                                                Consider the following example:

                                                defaults:\n  compute:\n    extendedResources:\n      instances: \n        - resource: default/cpu\n          quantity: 5\n        - resource: default/memory\n          quantity: 4M\n      attributes:\n        quantity: 3\nrules:\n  compute:\n    extendedResources:\n      instances:\n        locked: \n          - default/cpu\n      attributes:\n        quantity: \n          required: true\n

                                                Assume the following workload submission is requested:

                                                spec:\n  image: ubuntu\n  compute:\n    extendedResources:\n      - resource: default/memory\n        exclude: true\n      - resource: added/cpu\n      - resource: added/memory\n        quantity: 5M\n

The effective policy for the above-mentioned workload results in the following extendedResources instances:

Resource | Source of the instance | Quantity | Source of the attribute quantity
default/cpu | Policy defaults | 5 | The default of this instance in the policy defaults section
added/cpu | Submission request | 3 | The default of the quantity attribute from the attributes section
added/memory | Submission request | 5M | Submission request

                                                Note

The default/memory instance is not populated to the workload because it has been excluded from the workload using \u201cexclude: true\u201d.

A workload submission request cannot exclude the default/cpu resource, as this key is included in the locked rules under the instances section.
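
For example, a submission request resembling the following sketch would be rejected under the policy above, since it attempts to exclude the locked default/cpu instance:

spec:\n  image: ubuntu\n  compute:\n    extendedResources:\n      - resource: default/cpu\n        exclude: true\n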

                                                "},{"location":"platform-admin/workloads/policies/policy-reference/#rule-types","title":"Rule types","text":"Rule types Description Supported value types Rule type example canAdd Whether the submission request can add items to an itemized field other than those listed in the policy defaults for this field. itemized storage: hostPath: instances: canAdd: false locked Set of items that the workload is unable to modify or exclude. In this example, a workload policy default is given to HOME and USER, that the submission request cannot modify or exclude from the workload. itemized storage: hostPath: Instances: locked: - HOME - USER canEdit Whether the submission request can modify the policy default for this field. In this example, it is assumed that the policy has default for imagePullPolicy. As canEdit is set to false, submission requests are not able to alter this default.
                                                • string
                                                • boolean
                                                • integer
                                                • number
                                                • quantity
• array imagePullPolicy: canEdit: false required When set to true, the workload must have a value for this field. The value can be obtained from the policy defaults. If no value is specified in the policy defaults, a value must be specified for this field in the submission request.
                                                  • string
                                                  • boolean
                                                  • integer
                                                  • number
                                                  • quantity
                                                  • array image: required: true min The minimal value for the field.
                                                    • integer
                                                    • number
                                                    • quantity compute: gpuDevicesRequest: min: 3 max The maximal value for the field.
                                                      • integer
                                                      • number
                                                      • quantity compute: gpuMemoryRequest: max: 2G step The allowed gap between values for this field. In this example the allowed values are: 1, 3, 5, 7
                                                        • integer
• number compute: cpuCoreRequest: min: 1 max: 7 step: 2 options Set of allowed values for this field. string image: options: - value: image-1 - value: image-2 defaultFrom Sets a default value for a field, calculated based on the value of another field.
                                                          • integer
                                                          • number
• quantity compute: cpuCoreRequest: defaultFrom: field: compute.cpuCoreLimit factor: 0.5"},{"location":"platform-admin/workloads/policies/policy-reference/#policy-spec-sections","title":"Policy Spec Sections","text":"

                                                            For each field of a specific policy, you can specify both rules and defaults. A policy spec consists of the following sections:

                                                            • Rules
                                                            • Defaults
                                                            • Imposed Assets
                                                            "},{"location":"platform-admin/workloads/policies/policy-reference/#rules","title":"Rules","text":"

                                                            Rules set up constraints on workload policy fields. For example, consider the following policy:

                                                            rules:\n  compute:\n    gpuDevicesRequest: \n      max: 8\n  security:\n    runAsUid: \n      min: 500\n

Such a policy restricts the maximum value of gpuDevicesRequest to 8, and the minimum value of runAsUid, provided in the security section, to 500.
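
For instance, a submission request resembling the following sketch would be rejected under this policy, since it requests more than 8 GPU devices and a UID below 500 (the values are hypothetical and the field paths mirror the policy fields above):

spec:\n  image: ubuntu\n  compute:\n    gpuDevicesRequest: 10\n  security:\n    runAsUid: 400\n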

                                                            "},{"location":"platform-admin/workloads/policies/policy-reference/#defaults","title":"Defaults","text":"

                                                            The defaults section is used for providing defaults for various workload fields. For example, consider the following policy:

                                                            defaults:\n  imagePullPolicy: Always\n  security:\n    runAsNonRoot: true\n    runAsUid: 500\n

                                                            Assume a submission request with the following values:

                                                            • Image: ubuntu
                                                            • runAsUid: 501
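
Expressed as a workload spec, this submission request might look like the following sketch (for illustration; the field names mirror the policy fields):

spec:\n  image: ubuntu\n  security:\n    runAsUid: 501\n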

                                                            The effective workload that runs has the following set of values:

Field | Value | Source
image | ubuntu | Submission request
imagePullPolicy | Always | Policy defaults
security.runAsNonRoot | true | Policy defaults
security.runAsUid | 501 | Submission request
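
Put together, the effective workload spec would resemble the following sketch (for illustration only):

spec:\n  image: ubuntu\n  imagePullPolicy: Always\n  security:\n    runAsNonRoot: true\n    runAsUid: 501\n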

                                                            Note

                                                            It is possible to specify a rule for each field, which states if a submission request is allowed to change the policy default for that given field, for example:

                                                            defaults:\n  imagePullPolicy: Always\n  security:\n    runAsNonRoot: true\n    runAsUid: 500\nrules:\n  security:\n    runAsUid:\n      canEdit: false\n

If this policy is applied, the submission request above fails, as it attempts to change the value of security.runAsUid from 500 (the policy default) to 501 (the value provided in the submission request), which is forbidden because the canEdit rule is set to false for this field.
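
To comply with such a policy, the submission request could simply omit runAsUid so that the policy default of 500 is applied, for example (a minimal sketch):

spec:\n  image: ubuntu\n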

                                                            "},{"location":"platform-admin/workloads/policies/policy-reference/#imposed-assets","title":"Imposed Assets","text":"

Default instances of a storage field can be provided using a data source containing the details of this storage instance. To add such instances to the policy, specify the asset IDs in the imposedAssets section of the policy.

                                                            defaults: null\nrules: null\nimposedAssets:\n  - f12c965b-44e9-4ff6-8b43-01d8f9e630cc\n

Assets that reference credential assets (for example, a private S3 data source that references an AccessKey asset) cannot be used as imposedAssets.

                                                            "},{"location":"platform-admin/workloads/policies/workspaces-policy/","title":"Policies","text":"

                                                            This article explains the procedure to manage workload policies.

                                                            "},{"location":"platform-admin/workloads/policies/workspaces-policy/#workload-policies-table","title":"Workload policies table","text":"

                                                            The Workload policies table can be found under Policies in the Run:ai platform.

                                                            Note

Workload policies are disabled by default. If you cannot see Workload policies in the menu, they must be enabled by your administrator under General settings \u2192 Workloads \u2192 Policies.

                                                            The Workload policies table provides a list of all the policies defined in the platform, and allows you to manage them.

                                                            The Workload policies table consists of the following columns:

                                                            Column Description Policy The policy name which is a combination of the policy scope and the policy type Type The policy type is per Run:ai workload type. This allows administrators to set different policies for each workload type. Status Representation of the policy lifecycle (one of the following - \u201cCreating\u2026\u201d, \u201cUpdating\u2026\u201d, \u201cDeleting\u2026\u201d, Ready or Failed) Scope The scope the policy affects. Click the name of the scope to view the organizational tree diagram. You can only view the parts of the organizational tree for which you have permission to view. Created by The user who created the policy Creation time The timestamp for when the policy was created Last updated The last time the policy was updated"},{"location":"platform-admin/workloads/policies/workspaces-policy/#customizing-the-table-view","title":"Customizing the table view","text":"
                                                            • Filter - Click ADD FILTER, select the column to filter by, and enter the filter values
                                                            • Search - Click SEARCH and type the value to search by
                                                            • Sort - Click each column header to sort by
                                                            • Column selection - Click COLUMNS and select the columns to display in the table
                                                            • Refresh - Click REFRESH to update the table with the latest data
                                                            "},{"location":"platform-admin/workloads/policies/workspaces-policy/#adding-a-policy","title":"Adding a policy","text":"

                                                            To create a new policy:

                                                            1. Click +NEW POLICY
                                                            2. Select a scope
                                                            3. Select the workload type
                                                            4. Click +POLICY YAML
5. In the YAML editor, type or paste a YAML policy with defaults and rules (see the example after this list). You can use the following references and examples:
• Policy YAML reference
• Policy YAML examples
6. Click SAVE POLICY
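
As an example of what you might paste in step 5, the following minimal sketch sets a default image pull policy and locks it against edits (illustrative only; see the Policy YAML reference for the full list of fields and rules):

defaults:\n  imagePullPolicy: Always\nrules:\n  imagePullPolicy:\n    canEdit: false\n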
                                                            "},{"location":"platform-admin/workloads/policies/workspaces-policy/#editing-a-policy","title":"Editing a policy","text":"
                                                            1. Select the policy you want to edit
                                                            2. Click EDIT
                                                            3. Update the policy and click APPLY
                                                            4. Click SAVE POLICY
                                                            "},{"location":"platform-admin/workloads/policies/workspaces-policy/#troubleshooting","title":"Troubleshooting","text":"

                                                            Listed below are issues that might occur when creating or editing a policy via the YAML Editor:

Issue Message Mitigation Cluster connectivity issues There's no communication from cluster \u201ccluster_name\u201d. Actions may be affected, and the data may be stale. Verify that you are on a network that has been allowed access to the cluster. Reach out to your cluster administrator for instructions on verifying the issue. Policy can\u2019t be applied due to a rule that is occupied by a different policy Field \u201cfield_name\u201d already has rules in cluster: \u201ccluster_id\u201d Remove the rule from the new policy or adjust the old policy for the specific rule. Policy is not visible in the UI - Check that the policy hasn\u2019t been deleted. Policy syntax is not valid Add a valid policy YAML;json: unknown field \"field_name\" For correct syntax check the Policy YAML reference or the Policy YAML examples. Policy can\u2019t be saved for some reason The policy couldn't be saved due to a network or other unknown issue. Download your draft and try pasting and saving it again later. Possible cluster connectivity issues. Try updating the policy once again at a different time. Policies were submitted before version 2.18, you upgraded to version 2.18 or above, and wish to submit new policies If you have policies and want to create a new one, first contact Run:ai support to prevent potential conflicts Contact Run:ai support. R&D can migrate your old policies to the new version.

                                                            To view a policy:

1. Select the policy you want to view.
                                                            2. Click VIEW POLICY
                                                            3. In the Policy form per workload section, view the workload rules and defaults:
                                                              • Parameter The workload submission parameter that Rules and Defaults are applied to
                                                              • Type (applicable for data sources only) The data source type (Git, S3, nfs, pvc etc.)
                                                              • Default The default value of the Parameter
                                                              • Rule Set up constraint on workload policy field
                                                              • Source The origin of the applied policy (cluster, department or project)

                                                            Note

                                                            Some of the rules and defaults may be derived from policies of a parent cluster and/or department. You can see the source of each rule in the policy form. For more information, check the Scope of effectiveness documentation

                                                            "},{"location":"platform-admin/workloads/policies/workspaces-policy/#deleting-a-policy","title":"Deleting a policy","text":"
                                                            1. Select the policy you want to delete
                                                            2. Click DELETE
                                                            3. On the dialog, click DELETE to confirm the deletion
                                                            "},{"location":"platform-admin/workloads/policies/workspaces-policy/#using-api","title":"Using API","text":"

                                                            Go to the Policies API reference to view the available actions.

                                                            "}]} \ No newline at end of file diff --git a/v2.20/sitemap.xml b/v2.20/sitemap.xml index 29e440a9cc..27d30950e0 100644 --- a/v2.20/sitemap.xml +++ b/v2.20/sitemap.xml @@ -2,1558 +2,1558 @@ https://docs.run.ai/v2.20/Researcher/overview-researcher/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/use-cases/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/Walkthroughs/quickstart-inference/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/Walkthroughs/quickstart-overview/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/Walkthroughs/quickstart-vscode/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/Walkthroughs/walkthrough-build-ports/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/Walkthroughs/walkthrough-build/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/Walkthroughs/walkthrough-fractions/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/Walkthroughs/walkthrough-overquota/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/Walkthroughs/walkthrough-queue-fairness/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/best-practices/bare-metal-to-docker-images/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/best-practices/convert-to-unattended/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/best-practices/env-variables/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/best-practices/researcher-notifications/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/best-practices/save-dl-checkpoints/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/best-practices/secrets-as-env-var-in-cli/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/Introduction/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-attach/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-bash/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-config/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-delete/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-describe/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-exec/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-list/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-login/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-logout/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-logs/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-port-forwarding/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-resume/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-submit-dist-TF/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-submit-dist-mpi/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-submit-dist-pytorch/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-submit-dist-xgboost/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-submit/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-suspend/ 
- 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-top-node/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-update/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-version/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/runai-whoami/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/cli-examples/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/overview/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_attach/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_cluster/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_cluster_list/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_cluster_set/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_config/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_config_generate/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_config_project/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_config_set/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_describe/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_describe_job/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_describe_node/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_exec/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_kubeconfig/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_kubeconfig_set/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_list/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_list_clusters/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_list_jobs/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_list_nodes/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_list_projects/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_login/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_login_application/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_login_sso/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_login_user/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_logout/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_logs/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_mpi/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_mpi_attach/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_mpi_bash/ - 2025-02-02 + 2025-02-03 
https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_mpi_delete/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_mpi_describe/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_mpi_exec/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_mpi_list/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_mpi_logs/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_mpi_port-forward/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_mpi_resume/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_mpi_submit/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_mpi_suspend/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_node/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_node_list/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_nodepool/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_nodepool_list/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_port-forward/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_project/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_project_list/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_project_set/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_pytorch/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_attach/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_bash/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_delete/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_describe/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_exec/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_list/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_logs/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_port-forward/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_resume/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_submit/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_pytorch_suspend/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_report/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_report_metrics/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_report_metrics_clear/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_report_metrics_config/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_report_metrics_output/ - 
2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_submit/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_attach/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_bash/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_delete/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_describe/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_exec/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_list/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_logs/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_port-forward/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_resume/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_submit/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_tensorflow_suspend/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_training/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_training_attach/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_training_bash/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_training_delete/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_training_describe/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_training_exec/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_training_list/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_training_logs/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_attach/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_bash/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_delete/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_describe/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_exec/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_list/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_logs/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_port-forward/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_resume/ - 2025-02-02 + 2025-02-03 https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_submit/ - 2025-02-02 + 2025-02-03 
[v2.20/sitemap.xml diff, continued: the last-modified date of each page entry changes from 2025-02-02 to 2025-02-03, for the entries from https://docs.run.ai/v2.20/Researcher/cli-reference/new-cli/runai_training_mpi_suspend/ through https://docs.run.ai/v2.20/platform-admin/workloads/policies/overview/ (Researcher CLI reference, scheduling, tools, and workloads pages; admin, developer, home/changelog, and platform-admin pages).]
 https://docs.run.ai/v2.20/platform-admin/workloads/policies/policy-examples/
- 2025-02-02
+ 2025-02-03
 https://docs.run.ai/v2.20/platform-admin/workloads/policies/policy-reference/
- 2025-02-02
+ 2025-02-03
 https://docs.run.ai/v2.20/platform-admin/workloads/policies/workspaces-policy/
- 2025-02-02
+ 2025-02-03
\ No newline at end of file
diff --git a/v2.20/sitemap.xml.gz b/v2.20/sitemap.xml.gz
index 897e40f575bf443f2b9db136b63359ffd9efb367..f09c00710fde20bbef10c95a514464762d332663 100644
GIT binary patch
delta 2827
[base85-encoded binary delta for sitemap.xml.gz omitted]