diff --git a/.github/workflows/e2e-test-mxnet-mnist.yaml b/.github/workflows/e2e-test-mxnet-mnist.yaml deleted file mode 100644 index 16e022b8055..00000000000 --- a/.github/workflows/e2e-test-mxnet-mnist.yaml +++ /dev/null @@ -1,42 +0,0 @@ -name: E2E Test with mxnet-mnist - -on: - pull_request: - paths-ignore: - - "pkg/ui/v1beta1/frontend/**" - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - e2e: - runs-on: ubuntu-22.04 - timeout-minutes: 120 - steps: - - name: Checkout - uses: actions/checkout@v3 - - - name: Setup Test Env - uses: ./.github/workflows/template-setup-e2e-test - with: - kubernetes-version: ${{ matrix.kubernetes-version }} - python-version: "3.9" - - - name: Run e2e test with ${{ matrix.experiments }} experiments - uses: ./.github/workflows/template-e2e-test - with: - experiments: ${{ matrix.experiments }} - # Comma Delimited - trial-images: mxnet-mnist - - strategy: - fail-fast: false - matrix: - kubernetes-version: ["v1.25.12", "v1.26.6", "v1.27.3"] - # Comma Delimited - experiments: - # suggestion-hyperopt - - "long-running-resume,from-volume-resume,median-stop" - # others - - "grid,bayesian-optimization,tpe,multivariate-tpe,cma-es,hyperband" diff --git a/.github/workflows/e2e-test-pytorch-mnist.yaml b/.github/workflows/e2e-test-pytorch-mnist.yaml index 7bcc92215de..1c6db9d0fce 100644 --- a/.github/workflows/e2e-test-pytorch-mnist.yaml +++ b/.github/workflows/e2e-test-pytorch-mnist.yaml @@ -37,5 +37,9 @@ jobs: kubernetes-version: ["v1.25.12", "v1.26.6", "v1.27.3"] # Comma Delimited experiments: + # suggestion-hyperopt + - "long-running-resume,from-volume-resume,median-stop" + # others + - "grid,bayesian-optimization,tpe,multivariate-tpe,cma-es,hyperband" - "file-metrics-collector,pytorchjob-mnist" - "median-stop-with-json-format,file-metrics-collector-with-json-format" diff --git a/.github/workflows/e2e-test-ui-random-search-postgres.yaml b/.github/workflows/e2e-test-ui-random-search-postgres.yaml index 420760d9bdd..09ca97a452f 100644 --- a/.github/workflows/e2e-test-ui-random-search-postgres.yaml +++ b/.github/workflows/e2e-test-ui-random-search-postgres.yaml @@ -25,7 +25,7 @@ jobs: with: experiments: random # Comma Delimited - trial-images: mxnet-mnist + trial-images: pytorch-mnist-cpu katib-ui: true database-type: postgres diff --git a/.github/workflows/publish-trial-images.yaml b/.github/workflows/publish-trial-images.yaml index 7839e2687ce..81ff03a63a9 100644 --- a/.github/workflows/publish-trial-images.yaml +++ b/.github/workflows/publish-trial-images.yaml @@ -22,9 +22,6 @@ jobs: fail-fast: false matrix: include: - - trial-name: mxnet-mnist - platforms: linux/amd64,linux/arm64 - dockerfile: examples/v1beta1/trial-images/mxnet-mnist/Dockerfile - trial-name: pytorch-mnist-cpu platforms: linux/amd64,linux/arm64 dockerfile: examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu diff --git a/docs/images-location.md b/docs/images-location.md index 5afa11d008d..0cbf78d7618 100644 --- a/docs/images-location.md +++ b/docs/images-location.md @@ -238,17 +238,6 @@ The following table shows images for training containers which are used in the Location - - - docker.io/kubeflowkatib/mxnet-mnist - - - MXNet MNIST example with collecting metrics time - - - Dockerfile - - docker.io/kubeflowkatib/pytorch-mnist-cpu diff --git a/examples/v1beta1/README.md b/examples/v1beta1/README.md index 7439d869c13..f971ae29862 100644 --- a/examples/v1beta1/README.md +++ b/examples/v1beta1/README.md @@ -104,8 +104,6 @@ Check the following 
images for the Trial containers: - [Tensorflow MNIST with summaries](./trial-images/tf-mnist-with-summaries) -- [MXNet MNIST](./trial-images/mxnet-mnist) - - [PyTorch MNIST](./trial-images/pytorch-mnist) - [ENAS Keras CNN CIFAR-10](./trial-images/enas-cnn-cifar10) diff --git a/examples/v1beta1/argo/argo-workflow.yaml b/examples/v1beta1/argo/argo-workflow.yaml index 1c581be1d18..b658943984d 100644 --- a/examples/v1beta1/argo/argo-workflow.yaml +++ b/examples/v1beta1/argo/argo-workflow.yaml @@ -13,11 +13,9 @@ metadata: name: katib-argo-workflow spec: objective: - type: maximize - goal: 0.99 - objectiveMetricName: Validation-accuracy - additionalMetricNames: - - Train-accuracy + type: minimize + goal: 0.001 + objectiveMetricName: loss algorithm: algorithmName: random parallelTrialCount: 2 @@ -50,22 +48,22 @@ spec: - name: hp-workflow steps: - - name: data-preprocessing - template: gen-num-examples + template: gen-epochs - - name: model-training template: model-training arguments: parameters: - - name: num-examples + - name: epochs value: "{{steps.data-preprocessing.outputs.result}}" - - name: gen-num-examples + - name: gen-epochs script: image: python:alpine3.6 command: - python source: | import random - print(60000//random.randint(10, 100)) + print(60000//random.randint(3000, 30000)) - name: model-training metadata: @@ -73,12 +71,13 @@ spec: katib.kubeflow.org/model-training: "true" inputs: parameters: - - name: num-examples + - name: epochs container: name: model-training - image: docker.io/kubeflowkatib/mxnet-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - - "/opt/mxnet-mnist/mnist.py" + - "/opt/pytorch-mnist/mnist.py" - "--lr=${trialParameters.learningRate}" - - "--num-examples={{inputs.parameters.num-examples}}" + - "--epochs={{inputs.parameters.epochs}}" + - "--batch-size=16" diff --git a/examples/v1beta1/early-stopping/median-stop.yaml b/examples/v1beta1/early-stopping/median-stop.yaml index 010112a6eeb..3e4659672a5 100644 --- a/examples/v1beta1/early-stopping/median-stop.yaml +++ b/examples/v1beta1/early-stopping/median-stop.yaml @@ -8,11 +8,9 @@ metadata: name: median-stop spec: objective: - type: maximize - goal: 0.99 - objectiveMetricName: Validation-accuracy - additionalMetricNames: - - Train-accuracy + type: minimize + goal: 0.001 + objectiveMetricName: loss algorithm: algorithmName: random earlyStopping: @@ -30,12 +28,12 @@ spec: parameterType: double feasibleSpace: min: "0.01" - max: "0.5" - - name: num-epochs - parameterType: int + max: "0.05" + - name: momentum + parameterType: double feasibleSpace: - min: "3" - max: "4" + min: "0.5" + max: "0.9" trialTemplate: retain: true primaryContainerName: training-container @@ -43,9 +41,9 @@ spec: - name: learningRate description: Learning rate for the training model reference: lr - - name: numberEpochs - description: Number of epochs to train the model - reference: num-epochs + - name: momentum + description: Momentum for the training model + reference: momentum trialSpec: apiVersion: batch/v1 kind: Job @@ -54,11 +52,12 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" + - "--batch-size=16" - "--lr=${trialParameters.learningRate}" - - "--num-epochs=${trialParameters.numberEpochs}" + - "--momentum=${trialParameters.momentum}" restartPolicy: Never diff 
--git a/examples/v1beta1/hp-tuning/bayesian-optimization.yaml b/examples/v1beta1/hp-tuning/bayesian-optimization.yaml index 2362349e88b..80bfd7566f8 100644 --- a/examples/v1beta1/hp-tuning/bayesian-optimization.yaml +++ b/examples/v1beta1/hp-tuning/bayesian-optimization.yaml @@ -6,11 +6,9 @@ metadata: name: bayesian-optimization spec: objective: - type: maximize - goal: 0.99 - objectiveMetricName: Validation-accuracy - additionalMetricNames: - - Train-accuracy + type: minimize + goal: 0.001 + objectiveMetricName: loss algorithm: algorithmName: bayesianoptimization algorithmSettings: @@ -24,31 +22,21 @@ spec: parameterType: double feasibleSpace: min: "0.01" - max: "0.03" - - name: num-layers - parameterType: int - feasibleSpace: - min: "2" - max: "5" - - name: optimizer - parameterType: categorical + max: "0.05" + - name: momentum + parameterType: double feasibleSpace: - list: - - sgd - - adam - - ftrl + min: "0.5" + max: "0.9" trialTemplate: primaryContainerName: training-container trialParameters: - name: learningRate description: Learning rate for the training model reference: lr - - name: numberLayers - description: Number of training model layers - reference: num-layers - - name: optimizer - description: Training model optimizer (sdg, adam or ftrl) - reference: optimizer + - name: momentum + description: Momentum for the training model + reference: momentum trialSpec: apiVersion: batch/v1 kind: Job @@ -57,12 +45,12 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" + - "--batch-size=16" - "--lr=${trialParameters.learningRate}" - - "--num-layers=${trialParameters.numberLayers}" - - "--optimizer=${trialParameters.optimizer}" + - "--momentum=${trialParameters.momentum}" restartPolicy: Never diff --git a/examples/v1beta1/hp-tuning/cma-es.yaml b/examples/v1beta1/hp-tuning/cma-es.yaml index 7f75488e434..f913b80b366 100644 --- a/examples/v1beta1/hp-tuning/cma-es.yaml +++ b/examples/v1beta1/hp-tuning/cma-es.yaml @@ -6,11 +6,9 @@ metadata: name: cmaes spec: objective: - type: maximize - goal: 0.99 - objectiveMetricName: Validation-accuracy - additionalMetricNames: - - Train-accuracy + type: minimize + goal: 0.001 + objectiveMetricName: loss algorithm: algorithmName: cmaes algorithmSettings: @@ -24,31 +22,21 @@ spec: parameterType: double feasibleSpace: min: "0.01" - max: "0.03" - - name: num-layers - parameterType: int - feasibleSpace: - min: "2" - max: "5" - - name: optimizer - parameterType: categorical + max: "0.05" + - name: momentum + parameterType: double feasibleSpace: - list: - - sgd - - adam - - ftrl + min: "0.5" + max: "0.9" trialTemplate: primaryContainerName: training-container trialParameters: - name: learningRate description: Learning rate for the training model reference: lr - - name: numberLayers - description: Number of training model layers - reference: num-layers - - name: optimizer - description: Training model optimizer (sdg, adam or ftrl) - reference: optimizer + - name: momentum + description: Momentum for the training model + reference: momentum trialSpec: apiVersion: batch/v1 kind: Job @@ -57,12 +45,12 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - - "/opt/mxnet-mnist/mnist.py" - - 
"--batch-size=64" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" + - "--batch-size=16" - "--lr=${trialParameters.learningRate}" - - "--num-layers=${trialParameters.numberLayers}" - - "--optimizer=${trialParameters.optimizer}" + - "--momentum=${trialParameters.momentum}" restartPolicy: Never diff --git a/examples/v1beta1/hp-tuning/grid.yaml b/examples/v1beta1/hp-tuning/grid.yaml index af5cc7a800d..e739245fa87 100644 --- a/examples/v1beta1/hp-tuning/grid.yaml +++ b/examples/v1beta1/hp-tuning/grid.yaml @@ -6,11 +6,9 @@ metadata: name: grid spec: objective: - type: maximize - goal: 0.99 - objectiveMetricName: Validation-accuracy - additionalMetricNames: - - Train-accuracy + type: minimize + goal: 0.001 + objectiveMetricName: loss algorithm: algorithmName: grid parallelTrialCount: 3 @@ -20,33 +18,24 @@ spec: - name: lr parameterType: double feasibleSpace: - min: "0.001" - max: "0.01" - step: "0.001" - - name: num-layers - parameterType: int - feasibleSpace: - min: "2" - max: "5" - - name: optimizer - parameterType: categorical + min: "0.01" + step: "0.005" + max: "0.05" + - name: momentum + parameterType: double feasibleSpace: - list: - - sgd - - adam - - ftrl + min: "0.5" + step: "0.1" + max: "0.9" trialTemplate: primaryContainerName: training-container trialParameters: - name: learningRate description: Learning rate for the training model reference: lr - - name: numberLayers - description: Number of training model layers - reference: num-layers - - name: optimizer - description: Training model optimizer (sdg, adam or ftrl) - reference: optimizer + - name: momentum + description: Momentum for the training model + reference: momentum trialSpec: apiVersion: batch/v1 kind: Job @@ -55,12 +44,12 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" + - "--batch-size=16" - "--lr=${trialParameters.learningRate}" - - "--num-layers=${trialParameters.numberLayers}" - - "--optimizer=${trialParameters.optimizer}" + - "--momentum=${trialParameters.momentum}" restartPolicy: Never diff --git a/examples/v1beta1/hp-tuning/hyperband.yaml b/examples/v1beta1/hp-tuning/hyperband.yaml index 5315a6a31bd..80c1109c575 100644 --- a/examples/v1beta1/hp-tuning/hyperband.yaml +++ b/examples/v1beta1/hp-tuning/hyperband.yaml @@ -8,11 +8,9 @@ spec: parallelTrialCount: 2 maxTrialCount: 2 objective: - type: maximize - goal: 0.99 - objectiveMetricName: Validation-accuracy - additionalMetricNames: - - Train-accuracy + type: minimize + goal: 0.001 + objectiveMetricName: loss algorithm: algorithmName: hyperband algorithmSettings: @@ -28,19 +26,12 @@ spec: parameterType: double feasibleSpace: min: "0.01" - max: "0.03" - - name: num-layers - parameterType: int - feasibleSpace: - min: "2" - max: "5" - - name: optimizer - parameterType: categorical + max: "0.05" + - name: momentum + parameterType: double feasibleSpace: - list: - - sgd - - adam - - ftrl + min: "0.5" + max: "0.9" - name: num-epochs parameterType: int feasibleSpace: @@ -52,12 +43,9 @@ spec: - name: learningRate description: Learning rate for the training model reference: lr - - name: numberLayers - description: Number of training model layers - reference: num-layers - - name: optimizer - description: Training model optimizer (sdg, adam or ftrl) - reference: optimizer + - name: momentum + description: Momentum for the training model + 
reference: momentum - name: numberEpochs description: Number of epochs to train the model reference: num-epochs @@ -69,13 +57,12 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=32" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=${trialParameters.numberEpochs}" + - "--batch-size=16" - "--lr=${trialParameters.learningRate}" - - "--num-layers=${trialParameters.numberLayers}" - - "--optimizer=${trialParameters.optimizer}" - - "--num-epochs=${trialParameters.numberEpochs}" + - "--momentum=${trialParameters.momentum}" restartPolicy: Never diff --git a/examples/v1beta1/hp-tuning/multivariate-tpe.yaml b/examples/v1beta1/hp-tuning/multivariate-tpe.yaml index a24ff247b39..4217ce55ada 100644 --- a/examples/v1beta1/hp-tuning/multivariate-tpe.yaml +++ b/examples/v1beta1/hp-tuning/multivariate-tpe.yaml @@ -6,11 +6,9 @@ metadata: name: multivariate-tpe spec: objective: - type: maximize - goal: 0.99 - objectiveMetricName: Validation-accuracy - additionalMetricNames: - - Train-accuracy + type: minimize + goal: 0.001 + objectiveMetricName: loss algorithm: algorithmName: multivariate-tpe parallelTrialCount: 3 @@ -21,31 +19,21 @@ spec: parameterType: double feasibleSpace: min: "0.01" - max: "0.03" - - name: num-layers - parameterType: int - feasibleSpace: - min: "2" - max: "5" - - name: optimizer - parameterType: categorical + max: "0.05" + - name: momentum + parameterType: double feasibleSpace: - list: - - sgd - - adam - - ftrl + min: "0.5" + max: "0.9" trialTemplate: primaryContainerName: training-container trialParameters: - name: learningRate description: Learning rate for the training model reference: lr - - name: numberLayers - description: Number of training model layers - reference: num-layers - - name: optimizer - description: Training model optimizer (sdg, adam or ftrl) - reference: optimizer + - name: momentum + description: Momentum for the training model + reference: momentum trialSpec: apiVersion: batch/v1 kind: Job @@ -54,12 +42,12 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" + - "--batch-size=16" - "--lr=${trialParameters.learningRate}" - - "--num-layers=${trialParameters.numberLayers}" - - "--optimizer=${trialParameters.optimizer}" + - "--momentum=${trialParameters.momentum}" restartPolicy: Never diff --git a/examples/v1beta1/hp-tuning/random.yaml b/examples/v1beta1/hp-tuning/random.yaml index 0f2c1178833..0c9705d64f5 100644 --- a/examples/v1beta1/hp-tuning/random.yaml +++ b/examples/v1beta1/hp-tuning/random.yaml @@ -6,11 +6,9 @@ metadata: name: random spec: objective: - type: maximize - goal: 0.99 - objectiveMetricName: Validation-accuracy - additionalMetricNames: - - Train-accuracy + type: minimize + goal: 0.001 + objectiveMetricName: loss algorithm: algorithmName: random parallelTrialCount: 3 @@ -21,31 +19,21 @@ spec: parameterType: double feasibleSpace: min: "0.01" - max: "0.03" - - name: num-layers - parameterType: int - feasibleSpace: - min: "2" - max: "5" - - name: optimizer - parameterType: categorical + max: "0.05" + - name: momentum + parameterType: double feasibleSpace: - list: - - sgd - - adam - - ftrl + min: "0.5" + max: "0.9" trialTemplate: 
primaryContainerName: training-container trialParameters: - name: learningRate description: Learning rate for the training model reference: lr - - name: numberLayers - description: Number of training model layers - reference: num-layers - - name: optimizer - description: Training model optimizer (sdg, adam or ftrl) - reference: optimizer + - name: momentum + description: Momentum for the training model + reference: momentum trialSpec: apiVersion: batch/v1 kind: Job @@ -54,14 +42,14 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" + - "--batch-size=16" - "--lr=${trialParameters.learningRate}" - - "--num-layers=${trialParameters.numberLayers}" - - "--optimizer=${trialParameters.optimizer}" + - "--momentum=${trialParameters.momentum}" resources: limits: memory: "1Gi" diff --git a/examples/v1beta1/hp-tuning/sobol.yaml b/examples/v1beta1/hp-tuning/sobol.yaml index 2a1ee1ef008..5acaf903912 100644 --- a/examples/v1beta1/hp-tuning/sobol.yaml +++ b/examples/v1beta1/hp-tuning/sobol.yaml @@ -6,11 +6,9 @@ metadata: name: sobol spec: objective: - type: maximize - goal: 0.99 - objectiveMetricName: Validation-accuracy - additionalMetricNames: - - Train-accuracy + type: minimize + goal: 0.001 + objectiveMetricName: loss algorithm: algorithmName: sobol parallelTrialCount: 3 @@ -21,31 +19,21 @@ spec: parameterType: double feasibleSpace: min: "0.01" - max: "0.03" - - name: num-layers - parameterType: int - feasibleSpace: - min: "2" - max: "5" - - name: optimizer - parameterType: categorical + max: "0.05" + - name: momentum + parameterType: double feasibleSpace: - list: - - sgd - - adam - - ftrl + min: "0.5" + max: "0.9" trialTemplate: primaryContainerName: training-container trialParameters: - name: learningRate description: Learning rate for the training model reference: lr - - name: numberLayers - description: Number of training model layers - reference: num-layers - - name: optimizer - description: Training model optimizer (sdg, adam or ftrl) - reference: optimizer + - name: momentum + description: Momentum for the training model + reference: momentum trialSpec: apiVersion: batch/v1 kind: Job @@ -54,12 +42,12 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" + - "--batch-size=16" - "--lr=${trialParameters.learningRate}" - - "--num-layers=${trialParameters.numberLayers}" - - "--optimizer=${trialParameters.optimizer}" + - "--momentum=${trialParameters.momentum}" restartPolicy: Never diff --git a/examples/v1beta1/hp-tuning/tpe.yaml b/examples/v1beta1/hp-tuning/tpe.yaml index dc697388765..98adc16baf3 100644 --- a/examples/v1beta1/hp-tuning/tpe.yaml +++ b/examples/v1beta1/hp-tuning/tpe.yaml @@ -6,11 +6,9 @@ metadata: name: tpe spec: objective: - type: maximize - goal: 0.99 - objectiveMetricName: Validation-accuracy - additionalMetricNames: - - Train-accuracy + type: minimize + goal: 0.001 + objectiveMetricName: loss algorithm: algorithmName: tpe parallelTrialCount: 3 @@ -21,31 +19,21 @@ spec: parameterType: double feasibleSpace: min: "0.01" - max: "0.03" - - name: num-layers - parameterType: int - feasibleSpace: - min: "2" - max: "5" - - 
name: optimizer - parameterType: categorical + max: "0.05" + - name: momentum + parameterType: double feasibleSpace: - list: - - sgd - - adam - - ftrl + min: "0.5" + max: "0.9" trialTemplate: primaryContainerName: training-container trialParameters: - name: learningRate description: Learning rate for the training model reference: lr - - name: numberLayers - description: Number of training model layers - reference: num-layers - - name: optimizer - description: Training model optimizer (sdg, adam or ftrl) - reference: optimizer + - name: momentum + description: Momentum for the training model + reference: momentum trialSpec: apiVersion: batch/v1 kind: Job @@ -54,12 +42,12 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" + - "--batch-size=16" - "--lr=${trialParameters.learningRate}" - - "--num-layers=${trialParameters.numberLayers}" - - "--optimizer=${trialParameters.optimizer}" + - "--momentum=${trialParameters.momentum}" restartPolicy: Never diff --git a/examples/v1beta1/kind-cluster/README.md b/examples/v1beta1/kind-cluster/README.md index 5241aed3393..011a2059010 100644 --- a/examples/v1beta1/kind-cluster/README.md +++ b/examples/v1beta1/kind-cluster/README.md @@ -46,14 +46,13 @@ using `kubectl`: kubectl create -f https://raw.githubusercontent.com/kubeflow/katib/master/examples/v1beta1/hp-tuning/random.yaml ``` -This example uses a MXNet neural network to train an image classification model +This example uses a PyTorch neural network to train an image classification model using the MNIST dataset. You can check the training container source code -[here](../trial-images/mxnet-mnist). +[here](../trial-images/pytorch-mnist). The Experiment runs twelve training jobs (Trials) and tunes the following hyperparameters: - Learning Rate (`lr`). -- Number of layers (`num-layers`). -- Neural network optimizer (`optimizer`). +- Momentum (`momentum`). 
After creating above example, check the [Experiment](https://www.kubeflow.org/docs/components/katib/overview/#experiment) status: @@ -92,8 +91,7 @@ You can get the best hyperparameters with the following command: $ kubectl get experiment random -n kubeflow -o jsonpath='{range .status.currentOptimalTrial.parameterAssignments[*]}{.name}: {.value}{"\n"}{end}' lr: 0.028162244250364066 -num-layers: 5 -optimizer: sgd +momentum: 0.583672196492823 ``` To view created Experiment in Katib UI, follow diff --git a/examples/v1beta1/kubeflow-pipelines/early-stopping.ipynb b/examples/v1beta1/kubeflow-pipelines/early-stopping.ipynb index 0d3353b49e6..422cc1ff90a 100644 --- a/examples/v1beta1/kubeflow-pipelines/early-stopping.ipynb +++ b/examples/v1beta1/kubeflow-pipelines/early-stopping.ipynb @@ -156,12 +156,9 @@ "\n", "# Objective specification.\n", "objective=V1beta1ObjectiveSpec(\n", - " type=\"maximize\",\n", - " goal= 0.99,\n", - " objective_metric_name=\"Validation-accuracy\",\n", - " additional_metric_names=[\n", - " \"Train-accuracy\"\n", - " ]\n", + " type=\"minimize\",\n", + " goal= 0.001,\n", + " objective_metric_name=\"loss\",\n", ")\n", "\n", "# Algorithm specification.\n", @@ -180,7 +177,6 @@ " ]\n", ")\n", "\n", - "\n", "# Experiment search space.\n", "# In this example we tune learning rate, number of layer and optimizer.\n", "# Learning rate has bad feasible space to show more early stopped Trials.\n", @@ -194,22 +190,11 @@ " ),\n", " ),\n", " V1beta1ParameterSpec(\n", - " name=\"num-layers\",\n", - " parameter_type=\"int\",\n", - " feasible_space=V1beta1FeasibleSpace(\n", - " min=\"2\",\n", - " max=\"5\"\n", - " ),\n", - " ),\n", - " V1beta1ParameterSpec(\n", - " name=\"optimizer\",\n", - " parameter_type=\"categorical\",\n", + " name=\"momentum\",\n", + " parameter_type=\"double\",\n", " feasible_space=V1beta1FeasibleSpace(\n", - " list=[\n", - " \"sgd\", \n", - " \"adam\",\n", - " \"ftrl\"\n", - " ]\n", + " min=\"0.5\",\n", + " max=\"0.9\"\n", " ),\n", " ),\n", "]\n" @@ -245,14 +230,14 @@ " \"containers\": [\n", " {\n", " \"name\": \"training-container\",\n", - " \"image\": \"docker.io/kubeflowkatib/mxnet-mnist:v0.13.0\",\n", + " \"image\": \"docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0\",\n", " \"command\": [\n", " \"python3\",\n", - " \"/opt/mxnet-mnist/mnist.py\",\n", - " \"--batch-size=64\",\n", + " \"/opt/pytorch-mnist/mnist.py\",\n", + " \"--epochs=1\",\n", + " \"--batch-size=16\",\n", " \"--lr=${trialParameters.learningRate}\",\n", - " \"--num-layers=${trialParameters.numberLayers}\",\n", - " \"--optimizer=${trialParameters.optimizer}\"\n", + " \"--momentum=${trialParameters.momentum}\",\n", " ]\n", " }\n", " ],\n", @@ -274,14 +259,9 @@ " reference=\"lr\"\n", " ),\n", " V1beta1TrialParameterSpec(\n", - " name=\"numberLayers\",\n", - " description=\"Number of training model layers\",\n", - " reference=\"num-layers\"\n", - " ),\n", - " V1beta1TrialParameterSpec(\n", - " name=\"optimizer\",\n", - " description=\"Training model optimizer (sdg, adam or ftrl)\",\n", - " reference=\"optimizer\"\n", + " name=\"momentum\",\n", + " description=\"Momentum for the training model\",\n", + " reference=\"momentum\"\n", " ),\n", " ],\n", " trial_spec=trial_spec\n", diff --git a/examples/v1beta1/metrics-collector/metrics-collection-strategy.yaml b/examples/v1beta1/metrics-collector/metrics-collection-strategy.yaml index 5facadc0ae8..0d7beaf8a70 100644 --- a/examples/v1beta1/metrics-collector/metrics-collection-strategy.yaml +++ 
b/examples/v1beta1/metrics-collector/metrics-collection-strategy.yaml @@ -6,16 +6,16 @@ metadata: name: metrics-collection-strategy spec: objective: - type: maximize - goal: 0.99 - objectiveMetricName: Validation-accuracy + type: minimize + goal: 0.001 + objectiveMetricName: loss additionalMetricNames: - - Train-accuracy + - accuracy metricStrategies: - - name: Train-accuracy - value: "latest" - - name: Validation-accuracy + - name: accuracy value: "max" + - name: loss + value: "min" algorithm: algorithmName: tpe parallelTrialCount: 3 @@ -26,31 +26,21 @@ spec: parameterType: double feasibleSpace: min: "0.01" - max: "0.03" - - name: num-layers - parameterType: int - feasibleSpace: - min: "2" - max: "5" - - name: optimizer - parameterType: categorical + max: "0.05" + - name: momentum + parameterType: double feasibleSpace: - list: - - sgd - - adam - - ftrl + min: "0.5" + max: "0.9" trialTemplate: primaryContainerName: training-container trialParameters: - name: learningRate description: Learning rate for the training model reference: lr - - name: numberLayers - description: Number of training model layers - reference: num-layers - - name: optimizer - description: Training model optimizer (sdg, adam or ftrl) - reference: optimizer + - name: momentum + description: Momentum for the training model + reference: momentum trialSpec: apiVersion: batch/v1 kind: Job @@ -59,12 +49,12 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" + - "--batch-size=16" - "--lr=${trialParameters.learningRate}" - - "--num-layers=${trialParameters.numberLayers}" - - "--optimizer=${trialParameters.optimizer}" + - "--momentum=${trialParameters.momentum}" restartPolicy: Never diff --git a/examples/v1beta1/resume-experiment/from-volume-resume.yaml b/examples/v1beta1/resume-experiment/from-volume-resume.yaml index 082e09c0969..71ab478866d 100644 --- a/examples/v1beta1/resume-experiment/from-volume-resume.yaml +++ b/examples/v1beta1/resume-experiment/from-volume-resume.yaml @@ -6,11 +6,9 @@ metadata: name: from-volume-resume spec: objective: - type: maximize - goal: 0.99 - objectiveMetricName: Validation-accuracy - additionalMetricNames: - - Train-accuracy + type: minimize + goal: 0.001 + objectiveMetricName: loss algorithm: algorithmName: random parallelTrialCount: 3 @@ -22,31 +20,21 @@ spec: parameterType: double feasibleSpace: min: "0.01" - max: "0.03" - - name: num-layers - parameterType: int - feasibleSpace: - min: "2" - max: "5" - - name: optimizer - parameterType: categorical + max: "0.05" + - name: momentum + parameterType: double feasibleSpace: - list: - - sgd - - adam - - ftrl + min: "0.5" + max: "0.9" trialTemplate: primaryContainerName: training-container trialParameters: - name: learningRate description: Learning rate for the training model reference: lr - - name: numberLayers - description: Number of training model layers - reference: num-layers - - name: optimizer - description: Training model optimizer (sdg, adam or ftrl) - reference: optimizer + - name: momentum + description: Momentum for the training model + reference: momentum trialSpec: apiVersion: batch/v1 kind: Job @@ -55,12 +43,12 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - - 
"/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" + - "--batch-size=16" - "--lr=${trialParameters.learningRate}" - - "--num-layers=${trialParameters.numberLayers}" - - "--optimizer=${trialParameters.optimizer}" + - "--momentum=${trialParameters.momentum}" restartPolicy: Never diff --git a/examples/v1beta1/resume-experiment/long-running-resume.yaml b/examples/v1beta1/resume-experiment/long-running-resume.yaml index 3a7e10dba7c..271cc6d0358 100644 --- a/examples/v1beta1/resume-experiment/long-running-resume.yaml +++ b/examples/v1beta1/resume-experiment/long-running-resume.yaml @@ -6,11 +6,9 @@ metadata: name: long-running-resume spec: objective: - type: maximize - goal: 0.99 - objectiveMetricName: Validation-accuracy - additionalMetricNames: - - Train-accuracy + type: minimize + goal: 0.001 + objectiveMetricName: loss algorithm: algorithmName: random parallelTrialCount: 3 @@ -22,31 +20,21 @@ spec: parameterType: double feasibleSpace: min: "0.01" - max: "0.03" - - name: num-layers - parameterType: int - feasibleSpace: - min: "2" - max: "5" - - name: optimizer - parameterType: categorical + max: "0.05" + - name: momentum + parameterType: double feasibleSpace: - list: - - sgd - - adam - - ftrl + min: "0.5" + max: "0.9" trialTemplate: primaryContainerName: training-container trialParameters: - name: learningRate description: Learning rate for the training model reference: lr - - name: numberLayers - description: Number of training model layers - reference: num-layers - - name: optimizer - description: Training model optimizer (sdg, adam or ftrl) - reference: optimizer + - name: momentum + description: Momentum for the training model + reference: momentum trialSpec: apiVersion: batch/v1 kind: Job @@ -55,12 +43,12 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" + - "--batch-size=16" - "--lr=${trialParameters.learningRate}" - - "--num-layers=${trialParameters.numberLayers}" - - "--optimizer=${trialParameters.optimizer}" + - "--momentum=${trialParameters.momentum}" restartPolicy: Never diff --git a/examples/v1beta1/sdk/cmaes-and-resume-policies.ipynb b/examples/v1beta1/sdk/cmaes-and-resume-policies.ipynb index c901086c1af..d6876f2b864 100644 --- a/examples/v1beta1/sdk/cmaes-and-resume-policies.ipynb +++ b/examples/v1beta1/sdk/cmaes-and-resume-policies.ipynb @@ -93,10 +93,9 @@ "\n", "# Objective specification.\n", "objective_spec=V1beta1ObjectiveSpec(\n", - " type=\"maximize\",\n", - " goal= 0.99,\n", - " objective_metric_name=\"Validation-accuracy\",\n", - " additional_metric_names=[\"Train-accuracy\"]\n", + " type=\"minimize\",\n", + " goal= 0.001,\n", + " objective_metric_name=\"loss\",\n", ")\n", "\n", "# Experiment search space. 
In this example we tune learning rate, number of layer and optimizer.\n", @@ -110,24 +109,15 @@ " ),\n", " ),\n", " V1beta1ParameterSpec(\n", - " name=\"num-layers\",\n", - " parameter_type=\"int\",\n", - " feasible_space=V1beta1FeasibleSpace(\n", - " min=\"2\",\n", - " max=\"5\"\n", - " ),\n", - " ),\n", - " V1beta1ParameterSpec(\n", - " name=\"optimizer\",\n", - " parameter_type=\"categorical\",\n", + " name=\"momentum\",\n", + " parameter_type=\"double\",\n", " feasible_space=V1beta1FeasibleSpace(\n", - " list=[\"sgd\", \"adam\", \"ftrl\"]\n", + " min=\"0.5\",\n", + " max=\"0.9\"\n", " ),\n", " ),\n", "]\n", "\n", - "\n", - "\n", "# JSON template specification for the Trial's Worker Kubernetes Job.\n", "trial_spec={\n", " \"apiVersion\": \"batch/v1\",\n", @@ -143,15 +133,14 @@ " \"containers\": [\n", " {\n", " \"name\": \"training-container\",\n", - " \"image\": \"docker.io/kubeflowkatib/mxnet-mnist:v0.14.0\",\n", + " \"image\": \"docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0\",\n", " \"command\": [\n", " \"python3\",\n", - " \"/opt/mxnet-mnist/mnist.py\",\n", + " \"/opt/pytorch-mnist/mnist.py\",\n", + " \"--epochs=1\",\n", " \"--batch-size=64\",\n", - " \"--num-epochs=1\",\n", " \"--lr=${trialParameters.learningRate}\",\n", - " \"--num-layers=${trialParameters.numberLayers}\",\n", - " \"--optimizer=${trialParameters.optimizer}\"\n", + " \"--momentum=${trialParameters.momentum}\",\n", " ]\n", " }\n", " ],\n", @@ -171,14 +160,9 @@ " reference=\"lr\"\n", " ),\n", " V1beta1TrialParameterSpec(\n", - " name=\"numberLayers\",\n", - " description=\"Number of training model layers\",\n", - " reference=\"num-layers\"\n", - " ),\n", - " V1beta1TrialParameterSpec(\n", - " name=\"optimizer\",\n", - " description=\"Training model optimizer (sdg, adam or ftrl)\",\n", - " reference=\"optimizer\"\n", + " name=\"momentum\",\n", + " description=\"Momentum for the training model\",\n", + " reference=\"momentum\"\n", " ),\n", " ],\n", " trial_spec=trial_spec\n", @@ -735,8 +719,7 @@ " 'message': 'Trial has succeeded',\n", " 'reason': 'TrialSucceeded',\n", " 'status': 'True',\n", - " 'type': 'Succeeded'}\n", - "\n" + " 'type': 'Succeeded'}\n" ] } ], diff --git a/examples/v1beta1/tekton/pipeline-run.yaml b/examples/v1beta1/tekton/pipeline-run.yaml index 1c2de0a45d7..fb608c76a19 100644 --- a/examples/v1beta1/tekton/pipeline-run.yaml +++ b/examples/v1beta1/tekton/pipeline-run.yaml @@ -13,11 +13,9 @@ metadata: name: tekton-pipeline-run spec: objective: - type: maximize - goal: 0.99 - objectiveMetricName: Validation-accuracy - additionalMetricNames: - - Train-accuracy + type: minimize + goal: 0.001 + objectiveMetricName: loss algorithm: algorithmName: random parallelTrialCount: 2 @@ -45,53 +43,55 @@ spec: kind: PipelineRun spec: params: - - name: lr - value: ${trialParameters.learningRate} - - name: num-examples-init + - name: learningRate + description: Learning rate for the training model + reference: lr + - name: epochs-init value: "60000" pipelineSpec: params: - name: lr description: Learning rate for the training model - - name: num-examples-init + - name: epochs-init description: Initial value for number of training examples tasks: - name: data-preprocessing params: - - name: num-examples-pre - value: $(params.num-examples-init) + - name: epochs-pre + value: $(params.epochs-init) taskSpec: params: - - name: num-examples-pre + - name: epochs-pre description: Number of training examples before optimization results: - - name: num-examples-post + - name: epochs-post description: Number of training examples 
after optimization steps: - - name: num-examples-optimize + - name: epochs-optimize image: python:alpine3.6 command: - sh - -c args: - - python3 -c "import random; print($(params.num-examples-pre)//random.randint(10,100),end='')" | tee $(results.num-examples-post.path) + - python3 -c "import random; print($(params.epochs-pre)//random.randint(3000,30000),end='')" | tee $(results.epochs-post.path) - name: model-training params: - name: lr value: $(params.lr) - - name: num-examples - value: $(tasks.data-preprocessing.results.num-examples-post) + - name: epochs + value: $(tasks.data-preprocessing.results.epochs-post) taskSpec: params: - name: lr description: Learning rate for the training model - - name: num-examples - description: Number of training examples + - name: epochs + description: Number of epochs steps: - name: model-training - image: docker.io/kubeflowkatib/mxnet-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--num-examples=$(params.num-examples)" - - "--lr=$(params.lr)" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=$(params.epochs)" + - "--batch-size=16" + - "--lr=$(trialParameters.lr)" diff --git a/examples/v1beta1/trial-images/mxnet-mnist/Dockerfile b/examples/v1beta1/trial-images/mxnet-mnist/Dockerfile deleted file mode 100644 index f10c8c75e77..00000000000 --- a/examples/v1beta1/trial-images/mxnet-mnist/Dockerfile +++ /dev/null @@ -1,27 +0,0 @@ -FROM python:3.10-slim - -ARG TARGETARCH -ENV LD_LIBRARY_PATH "${LD_LIBRARY_PATH}:/opt/arm/armpl_22.0.2_gcc-11.2/lib" - -COPY examples/v1beta1/trial-images/mxnet-mnist /opt/mxnet-mnist - -WORKDIR /opt/mxnet-mnist - -RUN apt-get -y update \ - && apt-get -y install libgomp1 \ - && if [ "${TARGETARCH}" = "arm64" ]; then \ - apt-get -y install wget; \ - elif [ "${TARGETARCH}" = "amd64" ]; then \ - apt-get -y install libquadmath0; \ - fi \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -RUN if [ "${TARGETARCH}" = "arm64" ]; then \ - /opt/mxnet-mnist/install-arm-performance-libraries.sh; \ - fi -RUN pip install --prefer-binary -r requirements.txt -RUN chgrp -R 0 /opt/mxnet-mnist \ - && chmod -R g+rwX /opt/mxnet-mnist - -ENTRYPOINT ["python3", "/opt/mxnet-mnist/mnist.py"] diff --git a/examples/v1beta1/trial-images/mxnet-mnist/README.md b/examples/v1beta1/trial-images/mxnet-mnist/README.md deleted file mode 100644 index 71d6b8f8533..00000000000 --- a/examples/v1beta1/trial-images/mxnet-mnist/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# MXNet MNIST Image Classification Example - -This is MXNet MNIST image classification training container with recording time -of the metrics. It uses only simple multilayer perceptron network (mlp). - -If you want to read more about this example, visit the official -[incubator-mxnet](https://github.com/apache/incubator-mxnet/tree/1cf2fe5f8753042951bc0aacb6c95ddd3a904395/example/image-classification) -GitHub repository. - -Katib uses this training container in some Experiments, for instance in the -[random search](../../hp-tuning/random.yaml#L55-L64). 
diff --git a/examples/v1beta1/trial-images/mxnet-mnist/common/__init__.py b/examples/v1beta1/trial-images/mxnet-mnist/common/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/examples/v1beta1/trial-images/mxnet-mnist/common/fit.py b/examples/v1beta1/trial-images/mxnet-mnist/common/fit.py deleted file mode 100644 index 6818559cedb..00000000000 --- a/examples/v1beta1/trial-images/mxnet-mnist/common/fit.py +++ /dev/null @@ -1,343 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" example train fit utility """ -import logging -import os -import time -import re -import math -import mxnet as mx - - -def get_epoch_size(args, kv): - return math.ceil(int(args.num_examples / kv.num_workers) / args.batch_size) - - -def _get_lr_scheduler(args, kv): - if 'lr_factor' not in args or args.lr_factor >= 1: - return (args.lr, None) - epoch_size = get_epoch_size(args, kv) - begin_epoch = args.load_epoch if args.load_epoch else 0 - if 'pow' in args.lr_step_epochs: - lr = args.lr - max_up = args.num_epochs * epoch_size - pwr = float(re.sub('pow[- ]*', '', args.lr_step_epochs)) - poly_sched = mx.lr_scheduler.PolyScheduler(max_up, lr, pwr) - return (lr, poly_sched) - step_epochs = [int(l) for l in args.lr_step_epochs.split(',')] - lr = args.lr - for s in step_epochs: - if begin_epoch >= s: - lr *= args.lr_factor - if lr != args.lr: - logging.info('Adjust learning rate to %e for epoch %d', - lr, begin_epoch) - - steps = [epoch_size * (x - begin_epoch) - for x in step_epochs if x - begin_epoch > 0] - if steps: - return (lr, mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=args.lr_factor, - base_lr=args.lr)) - else: - return (lr, None) - - -def _load_model(args, rank=0): - if 'load_epoch' not in args or args.load_epoch is None: - return (None, None, None) - assert args.model_prefix is not None - model_prefix = args.model_prefix - if rank > 0 and os.path.exists("%s-%d-symbol.json" % (model_prefix, rank)): - model_prefix += "-%d" % (rank) - sym, arg_params, aux_params = mx.model.load_checkpoint( - model_prefix, args.load_epoch) - logging.info('Loaded model %s_%04d.params', model_prefix, args.load_epoch) - return (sym, arg_params, aux_params) - - -def _save_model(args, rank=0): - if args.model_prefix is None: - return None - return mx.callback.do_checkpoint(args.model_prefix if rank == 0 else "%s-%d" % ( - args.model_prefix, rank), period=args.save_period) - - -def add_fit_args(parser): - """ - parser : argparse.ArgumentParser - return a parser added with args required by fit - """ - train = parser.add_argument_group('Training', 'model training') - train.add_argument('--network', type=str, - help='the neural network to use') - train.add_argument('--num-layers', type=int, - help='number of layers in the neural network, \ - required by 
some networks such as resnet') - train.add_argument('--gpus', type=str, - help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu') - train.add_argument('--kv-store', type=str, default='device', - help='key-value store type') - train.add_argument('--num-epochs', type=int, default=100, - help='max num of epochs') - train.add_argument('--lr', type=float, default=0.1, - help='initial learning rate') - train.add_argument('--lr-factor', type=float, default=0.1, - help='the ratio to reduce lr on each step') - train.add_argument('--lr-step-epochs', type=str, - help='the epochs to reduce the lr, e.g. 30,60') - train.add_argument('--initializer', type=str, default='default', - help='the initializer type') - train.add_argument('--optimizer', type=str, default='sgd', - help='the optimizer type') - train.add_argument('--mom', type=float, default=0.9, - help='momentum for sgd') - train.add_argument('--wd', type=float, default=0.0001, - help='weight decay for sgd') - train.add_argument('--batch-size', type=int, default=128, - help='the batch size') - train.add_argument('--disp-batches', type=int, default=20, - help='show progress for every n batches') - train.add_argument('--model-prefix', type=str, - help='model prefix') - train.add_argument('--save-period', type=int, default=1, help='params saving period') - parser.add_argument('--monitor', dest='monitor', type=int, default=0, - help='log network parameters every N iters if larger than 0') - train.add_argument('--load-epoch', type=int, - help='load the model on an epoch using the model-load-prefix') - train.add_argument('--top-k', type=int, default=0, - help='report the top-k accuracy. 0 means no report.') - train.add_argument('--loss', type=str, default='', - help='show the cross-entropy or nll loss. ce strands for cross-entropy, nll-loss stands for likelihood loss') - train.add_argument('--test-io', type=int, default=0, - help='1 means test reading speed without training') - train.add_argument('--dtype', type=str, default='float32', - help='precision: float32 or float16') - train.add_argument('--gc-type', type=str, default='none', - help='type of gradient compression to use, \ - takes `2bit` or `none` for now') - train.add_argument('--gc-threshold', type=float, default=0.5, - help='threshold for 2bit gradient compression') - # additional parameters for large batch sgd - train.add_argument('--macrobatch-size', type=int, default=0, - help='distributed effective batch size') - train.add_argument('--warmup-epochs', type=int, default=5, - help='the epochs to ramp-up lr to scaled large-batch value') - train.add_argument('--warmup-strategy', type=str, default='linear', - help='the ramping-up strategy for large batch sgd') - train.add_argument('--profile-worker-suffix', type=str, default='', - help='profile workers actions into this file. 
During distributed training\ - filename saved will be rank1_ followed by this suffix') - train.add_argument('--profile-server-suffix', type=str, default='', - help='profile server actions into a file with name like rank1_ followed by this suffix \ - during distributed training') - train.add_argument('--use-imagenet-data-augmentation', type=int, default=0, - help='enable data augmentation of ImageNet data, default disabled') - return train - - -def fit(args, network, data_loader, **kwargs): - """ - train a model - args : argparse returns - network : the symbol definition of the nerual network - data_loader : function that returns the train and val data iterators - """ - # kvstore - kv = mx.kvstore.create(args.kv_store) - if args.gc_type != 'none': - kv.set_gradient_compression({'type': args.gc_type, - 'threshold': args.gc_threshold}) - if args.profile_server_suffix: - mx.profiler.set_config(filename=args.profile_server_suffix, profile_all=True, profile_process='server') - mx.profiler.set_state(state='run', profile_process='server') - - if args.profile_worker_suffix: - if kv.num_workers > 1: - filename = 'rank' + str(kv.rank) + '_' + args.profile_worker_suffix - else: - filename = args.profile_worker_suffix - mx.profiler.set_config(filename=filename, profile_all=True, profile_process='worker') - mx.profiler.set_state(state='run', profile_process='worker') - - # logging - head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' - logging.basicConfig(level=logging.DEBUG, format=head) - logging.info('start with arguments %s', args) - - epoch_size = get_epoch_size(args, kv) - - # data iterators - (train, val) = data_loader(args, kv) - if 'dist' in args.kv_store and not 'async' in args.kv_store: - logging.info('Resizing training data to %d batches per machine', epoch_size) - # resize train iter to ensure each machine has same number of batches per epoch - # if not, dist_sync can hang at the end with one machine waiting for other machines - train = mx.io.ResizeIter(train, epoch_size) - - if args.test_io: - tic = time.time() - for i, batch in enumerate(train): - if isinstance(batch, list): - for b in batch: - for j in b.data: - j.wait_to_read() - else: - for j in batch.data: - j.wait_to_read() - if (i + 1) % args.disp_batches == 0: - logging.info('Batch [%d]\tSpeed: %.2f samples/sec', i, - args.disp_batches * args.batch_size / (time.time() - tic)) - tic = time.time() - return - - # load model - if 'arg_params' in kwargs and 'aux_params' in kwargs: - arg_params = kwargs['arg_params'] - aux_params = kwargs['aux_params'] - else: - sym, arg_params, aux_params = _load_model(args, kv.rank) - if sym is not None: - assert sym.tojson() == network.tojson() - - # save model - checkpoint = _save_model(args, kv.rank) - - # devices for training - devs = mx.cpu() if args.gpus is None or args.gpus == "" else [ - mx.gpu(int(i)) for i in args.gpus.split(',')] - - # learning rate - lr, lr_scheduler = _get_lr_scheduler(args, kv) - - # create model - model = mx.mod.Module( - context=devs, - symbol=network - ) - - lr_scheduler = lr_scheduler - optimizer_params = { - 'learning_rate': lr, - 'wd': args.wd, - 'lr_scheduler': lr_scheduler, - 'multi_precision': True} - - # Only a limited number of optimizers have 'momentum' property - has_momentum = {'sgd', 'dcasgd', 'nag', 'signum', 'lbsgd'} - if args.optimizer in has_momentum: - optimizer_params['momentum'] = args.mom - - monitor = mx.mon.Monitor( - args.monitor, pattern=".*") if args.monitor > 0 else None - - # A limited number of optimizers have a warmup period - 
has_warmup = {'lbsgd', 'lbnag'} - if args.optimizer in has_warmup: - nworkers = kv.num_workers - if epoch_size < 1: - epoch_size = 1 - macrobatch_size = args.macrobatch_size - if macrobatch_size < args.batch_size * nworkers: - macrobatch_size = args.batch_size * nworkers - #batch_scale = round(float(macrobatch_size) / args.batch_size / nworkers +0.4999) - batch_scale = math.ceil( - float(macrobatch_size) / args.batch_size / nworkers) - optimizer_params['updates_per_epoch'] = epoch_size - optimizer_params['begin_epoch'] = args.load_epoch if args.load_epoch else 0 - optimizer_params['batch_scale'] = batch_scale - optimizer_params['warmup_strategy'] = args.warmup_strategy - optimizer_params['warmup_epochs'] = args.warmup_epochs - optimizer_params['num_epochs'] = args.num_epochs - - if args.initializer == 'default': - if args.network == 'alexnet': - # AlexNet will not converge using Xavier - initializer = mx.init.Normal() - # VGG will not trend to converge using Xavier-Gaussian - elif args.network and 'vgg' in args.network: - initializer = mx.init.Xavier() - else: - initializer = mx.init.Xavier( - rnd_type='gaussian', factor_type="in", magnitude=2) - # initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), - elif args.initializer == 'xavier': - initializer = mx.init.Xavier() - elif args.initializer == 'msra': - initializer = mx.init.MSRAPrelu() - elif args.initializer == 'orthogonal': - initializer = mx.init.Orthogonal() - elif args.initializer == 'normal': - initializer = mx.init.Normal() - elif args.initializer == 'uniform': - initializer = mx.init.Uniform() - elif args.initializer == 'one': - initializer = mx.init.One() - elif args.initializer == 'zero': - initializer = mx.init.Zero() - - # evaluation metrices - eval_metrics = ['accuracy'] - if args.top_k > 0: - eval_metrics.append(mx.metric.create( - 'top_k_accuracy', top_k=args.top_k)) - - supported_loss = ['ce', 'nll_loss'] - if len(args.loss) > 0: - # ce or nll loss is only applicable to softmax output - loss_type_list = args.loss.split(',') - if 'softmax_output' in network.list_outputs(): - for loss_type in loss_type_list: - loss_type = loss_type.strip() - if loss_type == 'nll': - loss_type = 'nll_loss' - if loss_type not in supported_loss: - logging.warning(loss_type + ' is not an valid loss type, only cross-entropy or ' - 'negative likelihood loss is supported!') - else: - eval_metrics.append(mx.metric.create(loss_type)) - else: - logging.warning("The output is not softmax_output, loss argument will be skipped!") - - # callbacks that run after each batch - batch_end_callbacks = [mx.callback.Speedometer( - args.batch_size, args.disp_batches)] - if 'batch_end_callback' in kwargs: - cbs = kwargs['batch_end_callback'] - batch_end_callbacks += cbs if isinstance(cbs, list) else [cbs] - - # run - model.fit(train, - begin_epoch=args.load_epoch if args.load_epoch else 0, - num_epoch=args.num_epochs, - eval_data=val, - eval_metric=eval_metrics, - kvstore=kv, - optimizer=args.optimizer, - optimizer_params=optimizer_params, - initializer=initializer, - arg_params=arg_params, - aux_params=aux_params, - batch_end_callback=batch_end_callbacks, - epoch_end_callback=checkpoint, - allow_missing=True, - monitor=monitor) - - if args.profile_server_suffix: - mx.profiler.set_state(state='run', profile_process='server') - if args.profile_worker_suffix: - mx.profiler.set_state(state='run', profile_process='worker') diff --git a/examples/v1beta1/trial-images/mxnet-mnist/common/utils.py b/examples/v1beta1/trial-images/mxnet-mnist/common/utils.py 
deleted file mode 100644 index 3a0f64cc95b..00000000000 --- a/examples/v1beta1/trial-images/mxnet-mnist/common/utils.py +++ /dev/null @@ -1,48 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os -import errno - -import mxnet as mx - - -def download_file(url, local_fname=None, force_write=False): - # requests is not default installed - import requests - if local_fname is None: - local_fname = url.split('/')[-1] - if not force_write and os.path.exists(local_fname): - return local_fname - - dir_name = os.path.dirname(local_fname) - - if dir_name != "": - if not os.path.exists(dir_name): - try: # try to create the directory if it doesn't exists - os.makedirs(dir_name) - except OSError as exc: - if exc.errno != errno.EEXIST: - raise - - r = requests.get(url, stream=True) - assert r.status_code == 200, "failed to open %s" % url - with open(local_fname, 'wb') as f: - for chunk in r.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - f.write(chunk) - return local_fname diff --git a/examples/v1beta1/trial-images/mxnet-mnist/install-arm-performance-libraries.sh b/examples/v1beta1/trial-images/mxnet-mnist/install-arm-performance-libraries.sh deleted file mode 100755 index 8216a5f53aa..00000000000 --- a/examples/v1beta1/trial-images/mxnet-mnist/install-arm-performance-libraries.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2022 The Kubeflow Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -o errexit -set -o nounset -set -o pipefail -cd "$(dirname "$0")" - -# Download Arm Performance Libraries for Ubuntu 22.04 -# Ref: https://developer.arm.com/downloads/-/arm-performance-libraries -echo "Downloading Arm Performance Libraries for Ubuntu 22.04..." -wget -qO - \ - "https://developer.arm.com/-/media/Files/downloads/hpc/arm-performance-libraries/23-04-1/ubuntu-22/arm-performance-libraries_23.04.1_Ubuntu-22.04_gcc-11.3.tar?rev=207c1f7aaa16400e94eb9a980494a6eb&revision=207c1f7a-aa16-400e-94eb-9a980494a6eb" \ - | tar -xf - - -# Install Arm Performance Libraries -echo "Installing Arm Performance Libraries for Ubuntu 22.04..." -./arm-performance-libraries_23.04.1_Ubuntu-22.04/arm-performance-libraries_23.04.1_Ubuntu-22.04.sh -a - -# Clean up -echo "Removing installer..." 
-rm -rf ./arm-performance-libraries_23.04.1_Ubuntu-22.04 diff --git a/examples/v1beta1/trial-images/mxnet-mnist/mnist.py b/examples/v1beta1/trial-images/mxnet-mnist/mnist.py deleted file mode 100644 index 111de1fb950..00000000000 --- a/examples/v1beta1/trial-images/mxnet-mnist/mnist.py +++ /dev/null @@ -1,86 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Train mnist, see more explanation at https://mxnet.io/tutorials/python/mnist.html -""" -import os -import argparse -import logging -import mxnet as mx -import numpy as np -import gzip -import struct -from common import fit -from common import utils -# This example only for mlp network -from symbols import mlp - -# Use this format (%Y-%m-%dT%H:%M:%SZ) to record timestamp of the metrics -logging.basicConfig( - format="%(asctime)s %(levelname)-8s %(message)s", - datefmt="%Y-%m-%dT%H:%M:%SZ", - level=logging.DEBUG) - - -def get_mnist_iter(args, kv): - """ - Create data iterator with NDArrayIter - """ - mnist = mx.test_utils.get_mnist() - - # Get MNIST data. - train_data = mx.io.NDArrayIter( - mnist['train_data'], mnist['train_label'], args.batch_size, shuffle=True) - val_data = mx.io.NDArrayIter( - mnist['test_data'], mnist['test_label'], args.batch_size) - - return (train_data, val_data) - - -if __name__ == '__main__': - # parse args - parser = argparse.ArgumentParser(description="train mnist", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--num-classes', type=int, default=10, - help='the number of classes') - parser.add_argument('--num-examples', type=int, default=60000, - help='the number of training examples') - - parser.add_argument('--add_stn', action="store_true", default=False, - help='Add Spatial Transformer Network Layer (lenet only)') - parser.add_argument('--image_shape', default='1, 28, 28', help='shape of training images') - - fit.add_fit_args(parser) - parser.set_defaults( - # network - network='mlp', - # train - gpus=None, - batch_size=64, - disp_batches=100, - num_epochs=10, - lr=.05, - lr_step_epochs='10' - ) - args = parser.parse_args() - - # load mlp network - sym = mlp.get_symbol(**vars(args)) - - # train - fit.fit(args, sym, get_mnist_iter) diff --git a/examples/v1beta1/trial-images/mxnet-mnist/requirements.txt b/examples/v1beta1/trial-images/mxnet-mnist/requirements.txt deleted file mode 100644 index fb439db250f..00000000000 --- a/examples/v1beta1/trial-images/mxnet-mnist/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -mxnet==1.9.1 -# This is a workaround to avoid the following error. 
-# AttributeError: module 'numpy' has no attribute 'bool' -# See more: https://github.com/numpy/numpy/pull/22607 -numpy==1.23.5 diff --git a/examples/v1beta1/trial-images/mxnet-mnist/symbols/__init__.py b/examples/v1beta1/trial-images/mxnet-mnist/symbols/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/examples/v1beta1/trial-images/mxnet-mnist/symbols/mlp.py b/examples/v1beta1/trial-images/mxnet-mnist/symbols/mlp.py deleted file mode 100644 index f6f6f0eba32..00000000000 --- a/examples/v1beta1/trial-images/mxnet-mnist/symbols/mlp.py +++ /dev/null @@ -1,33 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -a simple multilayer perceptron -""" -import mxnet as mx - - -def get_symbol(num_classes=10, **kwargs): - data = mx.symbol.Variable('data') - data = mx.sym.Flatten(data=data) - fc1 = mx.symbol.FullyConnected(data=data, name='fc1', num_hidden=128) - act1 = mx.symbol.Activation(data=fc1, name='relu1', act_type="relu") - fc2 = mx.symbol.FullyConnected(data=act1, name='fc2', num_hidden=64) - act2 = mx.symbol.Activation(data=fc2, name='relu2', act_type="relu") - fc3 = mx.symbol.FullyConnected(data=act2, name='fc3', num_hidden=num_classes) - mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax') - return mlp diff --git a/examples/v1beta1/trial-template/trial-metadata-substitution.yaml b/examples/v1beta1/trial-template/trial-metadata-substitution.yaml index c183e5fb5a6..fe6e360b420 100644 --- a/examples/v1beta1/trial-template/trial-metadata-substitution.yaml +++ b/examples/v1beta1/trial-template/trial-metadata-substitution.yaml @@ -7,11 +7,9 @@ metadata: name: trial-metadata-substitution spec: objective: - type: maximize - goal: 0.99 - objectiveMetricName: Validation-accuracy - additionalMetricNames: - - Train-accuracy + type: minimize + goal: 0.001 + objectiveMetricName: loss algorithm: algorithmName: random parallelTrialCount: 3 @@ -22,13 +20,21 @@ spec: parameterType: double feasibleSpace: min: "0.01" - max: "0.03" + max: "0.05" + - name: momentum + parameterType: double + feasibleSpace: + min: "0.5" + max: "0.9" trialTemplate: primaryContainerName: training-container trialParameters: - name: learningRate description: Learning rate for the training model reference: lr + - name: momentum + description: Momentum for the training model + reference: momentum - name: trialName description: Name of the current trial's job reference: ${trialSpec.Name} @@ -60,12 +66,14 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" + - "--batch-size=16" - "--lr=${trialParameters.learningRate}" + - 
"--momentum=${trialParameters.momentum}" env: - name: TRIAL_NAME value: ${trialParameters.trialName} diff --git a/hack/boilerplate/update-boilerplate.sh b/hack/boilerplate/update-boilerplate.sh index aa4ec011a9f..87c62b84068 100755 --- a/hack/boilerplate/update-boilerplate.sh +++ b/hack/boilerplate/update-boilerplate.sh @@ -57,7 +57,6 @@ find_python_files=$( find ./cmd ./pkg ./hack ./test ./examples -name "*.py" \ ! -path "./pkg/apis/manager/*" \ ! -path "*__init__.py" \ - ! -path "./examples/v1beta1/trial-images/mxnet-mnist/*" ) for i in ${find_python_files}; do diff --git a/manifests/v1beta1/components/controller/trial-templates.yaml b/manifests/v1beta1/components/controller/trial-templates.yaml index 916dd3c85b6..58a030b23b3 100644 --- a/manifests/v1beta1/components/controller/trial-templates.yaml +++ b/manifests/v1beta1/components/controller/trial-templates.yaml @@ -15,14 +15,14 @@ data: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" + - "--batch-size=16" - "--lr=${trialParameters.learningRate}" - - "--num-layers=${trialParameters.numberLayers}" - - "--optimizer=${trialParameters.optimizer}" + - "--momentum=${trialParameters.momentum}" restartPolicy: Never # For ConfigMap templates double quotes must set in commands to correct parse JSON parameters in Trial Template (e.g nn_config, architecture) enasCPUTemplate: |- diff --git a/pkg/controller.v1beta1/experiment/experiment_controller_test.go b/pkg/controller.v1beta1/experiment/experiment_controller_test.go index b43f45d7f5c..38f59fddccc 100644 --- a/pkg/controller.v1beta1/experiment/experiment_controller_test.go +++ b/pkg/controller.v1beta1/experiment/experiment_controller_test.go @@ -477,12 +477,14 @@ func newFakeInstance() *experimentsv1beta1.Experiment { Containers: []corev1.Container{ { Name: primaryContainer, - Image: "docker.io/kubeflowkatib/mxnet-mnist", + Image: "docker.io/kubeflowkatib/pytorch-mnist-cpu", Command: []string{ "python3", - "/opt/mxnet-mnist/mnist.py", + "/opt/pytorch-mnist/mnist.py", + "--epochs=1", + "--batch-size=16", "--lr=${trialParameters.learningRate}", - "--num-layers=${trialParameters.numberLayers}", + "--momentum=${trialParameters.momentum}", }, }, }, @@ -611,12 +613,14 @@ func newFakeBatchJob() *batchv1.Job { Containers: []corev1.Container{ { Name: primaryContainer, - Image: "docker.io/kubeflowkatib/mxnet-mnist", + Image: "docker.io/kubeflowkatib/pytorch-mnist-cpu", Command: []string{ "python3", - "/opt/mxnet-mnist/mnist.py", - "--lr=0.01", - "--num-layers=5", + "/opt/pytorch-mnist/mnist.py", + "--epochs=1", + "--batch-size=16", + "--lr=${trialParameters.learningRate}", + "--momentum=${trialParameters.momentum}", }, }, }, diff --git a/pkg/controller.v1beta1/experiment/manifest/generator_test.go b/pkg/controller.v1beta1/experiment/manifest/generator_test.go index fa3c3f6ff09..3adeb017f74 100644 --- a/pkg/controller.v1beta1/experiment/manifest/generator_test.go +++ b/pkg/controller.v1beta1/experiment/manifest/generator_test.go @@ -61,12 +61,14 @@ func TestGetRunSpecWithHP(t *testing.T) { Containers: []v1.Container{ { Name: "training-container", - Image: "docker.io/kubeflowkatib/mxnet-mnist", + Image: "docker.io/kubeflowkatib/pytorch-mnist-cpu", Command: []string{ "python3", - "/opt/mxnet-mnist/mnist.py", + "/opt/pytorch-mnist/mnist.py", + "--epochs=1", + "--batch-size=16", 
"--lr=0.05", - "--num-layers=5", + "--momentum=0.9", }, Env: []v1.EnvVar{ {Name: consts.TrialTemplateMetaKeyOfName, Value: "trial-name"}, @@ -176,12 +178,14 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist + image: docker.io/kubeflowkatib/pytorch-mnist-cpu command: - "python3" - - "/opt/mxnet-mnist/mnist.py" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" + - "--batch-size=16" - "--lr=${trialParameters.learningRate}" - - "--num-layers=${trialParameters.numberLayers}"` + - "--momentum=${trialParameters.momentum}"` invalidTrialSpec := `apiVersion: batch/v1 kind: Job @@ -190,12 +194,14 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist + image: docker.io/kubeflowkatib/pytorch-mnist-cpu command: - python3 - - /opt/mxnet-mnist/mnist.py + - /opt/pytorch-mnist/mnist.py + - --epochs=1 + - --batch-size=16 - --lr=${trialParameters.learningRate} - - --num-layers=${trialParameters.numberLayers} + - --momentum=${trialParameters.momentum} - --invalidParameter={'num_layers': 2, 'input_sizes': [32, 32, 3]}` validGetConfigMap1 := c.EXPECT().GetConfigMap(gomock.Any(), gomock.Any()).Return( @@ -228,12 +234,14 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist + image: docker.io/kubeflowkatib/pytorch-mnist-cpu command: - "python3" - - "/opt/mxnet-mnist/mnist.py" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" + - "--batch-size=16" - "--lr=0.05" - - "--num-layers=5"` + - "--momentum=0.9"` expectedRunSpec, err := util.ConvertStringToUnstructured(expectedStr) if err != nil { @@ -347,12 +355,14 @@ func newFakeInstance() *experimentsv1beta1.Experiment { Containers: []v1.Container{ { Name: "training-container", - Image: "docker.io/kubeflowkatib/mxnet-mnist", + Image: "docker.io/kubeflowkatib/pytorch-mnist-cpu", Command: []string{ "python3", - "/opt/mxnet-mnist/mnist.py", + "/opt/pytorch-mnist/mnist.py", + "--epochs=1", + "--batch-size=16", "--lr=${trialParameters.learningRate}", - "--num-layers=${trialParameters.numberLayers}", + "--momentum=${trialParameters.momentum}", }, Env: []v1.EnvVar{ {Name: consts.TrialTemplateMetaKeyOfName, Value: "${trialParameters.trialName}"}, @@ -381,9 +391,9 @@ func newFakeInstance() *experimentsv1beta1.Experiment { Reference: "lr", }, { - Name: "numberLayers", - Description: "Number of layers", - Reference: "num-layers", + Name: "momentum", + Description: "Momentum for the training model", + Reference: "momentum", }, { Name: "trialName", @@ -418,8 +428,8 @@ func newFakeParameterAssignment() []commonapiv1beta1.ParameterAssignment { Value: "0.05", }, { - Name: "num-layers", - Value: "5", + Name: "momentum", + Value: "0.9", }, } } diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go index 6e04813aaa0..dec9bee4851 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_test.go +++ b/pkg/controller.v1beta1/trial/trial_controller_test.go @@ -388,12 +388,14 @@ func newFakeTrialBatchJob() *trialsv1beta1.Trial { Containers: []corev1.Container{ { Name: primaryContainer, - Image: "docker.io/kubeflowkatib/mxnet-mnist", + Image: "docker.io/kubeflowkatib/pytorch-mnist-cpu", Command: []string{ "python3", - "/opt/mxnet-mnist/mnist.py", + "/opt/pytorch-mnist/mnist.py", + "--epochs=1", + "--batch-size=16", "--lr=0.01", - "--num-layers=5", + "--momentum=0.9", }, }, }, diff --git a/pkg/ui/v1beta1/frontend/cypress/fixtures/trial-template.json 
b/pkg/ui/v1beta1/frontend/cypress/fixtures/trial-template.json index ee6d19a6630..eb515c650fc 100644 --- a/pkg/ui/v1beta1/frontend/cypress/fixtures/trial-template.json +++ b/pkg/ui/v1beta1/frontend/cypress/fixtures/trial-template.json @@ -8,7 +8,7 @@ "Templates": [ { "Path": "defaultTrialTemplate.yaml", - "Yaml": "apiVersion: batch/v1\nkind: Job\nspec:\n template:\n spec:\n containers:\n - name: training-container\n image: docker.io/kubeflowkatib/mxnet-mnist:v1beta1-45c5727\n command:\n - \"python3\"\n - \"/opt/mxnet-mnist/mnist.py\"\n - \"--batch-size=64\"\n - \"--lr=${trialParameters.learningRate}\"\n - \"--num-layers=${trialParameters.numberLayers}\"\n - \"--optimizer=${trialParameters.optimizer}\"\n restartPolicy: Never" + "Yaml": "apiVersion: batch/v1\nkind: Job\nspec:\n template:\n spec:\n containers:\n - name: training-container\n image: docker.io/kubeflowkatib/pytorch-mnist:v1beta1-45c5727\n command:\n - \"python3\"\n - \"/opt/pytorch-mnist/mnist.py\"\n - \"--epochs=1\"\n - \"--lr=${trialParameters.learningRate}\"\n - \"--momentum=${trialParameters.momentum}\"\n restartPolicy: Never" }, { "Path": "enasCPUTemplate", diff --git a/pkg/webhook/v1beta1/experiment/validator/validator_test.go b/pkg/webhook/v1beta1/experiment/validator/validator_test.go index cef44ca737d..f17b886abe6 100644 --- a/pkg/webhook/v1beta1/experiment/validator/validator_test.go +++ b/pkg/webhook/v1beta1/experiment/validator/validator_test.go @@ -1273,10 +1273,10 @@ func newFakeInstance() *experimentsv1beta1.Experiment { }, }, { - Name: "num-layers", + Name: "momentum", ParameterType: experimentsv1beta1.ParameterTypeCategorical, FeasibleSpace: experimentsv1beta1.FeasibleSpace{ - List: []string{"1", "2", "3"}, + List: []string{"0.95", "0.85", "0.75"}, }, }, }, @@ -1298,12 +1298,14 @@ func newFakeBatchJob() *batchv1.Job { Containers: []v1.Container{ { Name: "training-container", - Image: "docker.io/kubeflowkatib/mxnet-mnist", + Image: "docker.io/kubeflowkatib/pytorch-mnist-cpu", Command: []string{ "python3", - "/opt/mxnet-mnist/mnist.py", + "/opt/pytorch-mnist/mnist.py", + "--epochs=1", + "--batch-size=16", "--lr=${trialParameters.learningRate}", - "--num-layers=${trialParameters.numberLayers}", + "--momentum=${trialParameters.momentum}", }, }, }, @@ -1321,9 +1323,9 @@ func newFakeTrialParamters() []experimentsv1beta1.TrialParameterSpec { Reference: "lr", }, { - Name: "numberLayers", - Description: "Number of layers", - Reference: "num-layers", + Name: "momentum", + Description: "Momentum for the training model", + Reference: "momentum", }, } } diff --git a/scripts/v1beta1/build.sh b/scripts/v1beta1/build.sh index 97d4b9ed039..f8cba66c34c 100755 --- a/scripts/v1beta1/build.sh +++ b/scripts/v1beta1/build.sh @@ -119,9 +119,6 @@ docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/darts-cnn-cifar10 echo -e "\nBuilding PyTorch CIFAR-10 CNN training container example for DARTS with GPU support...\n" docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/darts-cnn-cifar10-gpu:${TAG}" -f examples/${VERSION}/trial-images/darts-cnn-cifar10/Dockerfile.gpu . -echo -e "\nBuilding mxnet mnist training container example...\n" -docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/mxnet-mnist:${TAG}" -f examples/${VERSION}/trial-images/mxnet-mnist/Dockerfile . - echo -e "\nBuilding PyTorch mnist training container example with CPU support...\n" docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/pytorch-mnist-cpu:${TAG}" -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile.cpu .
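Note: every trial command in this change swaps /opt/mxnet-mnist/mnist.py (tuned via lr, num-layers and optimizer against a maximized Validation-accuracy) for /opt/pytorch-mnist/mnist.py with --epochs=1, --batch-size=16, --lr and --momentum against a minimized loss objective. For orientation, the entry point under examples/v1beta1/trial-images/pytorch-mnist is assumed to look roughly like the sketch below; the flag names are taken from the commands in this diff, while the network, dataset handling and exact log format are assumptions rather than the real script.

# Hypothetical sketch of the trial entry point the new commands assume
# (examples/v1beta1/trial-images/pytorch-mnist); only the flags exercised in
# this diff are modeled: --epochs, --batch-size, --lr, --momentum.
import argparse
import logging

import torch
import torch.nn.functional as F
from torch import nn, optim
from torchvision import datasets, transforms

# Timestamped "<metric>=<value>" lines are what the stdout metrics collector
# would parse; the exact format of the real script is an assumption here.
logging.basicConfig(
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%SZ",
    level=logging.INFO,
)


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        return self.fc2(F.relu(self.fc1(torch.flatten(x, 1))))


def main():
    parser = argparse.ArgumentParser(description="PyTorch MNIST trial (sketch)")
    parser.add_argument("--epochs", type=int, default=1)
    parser.add_argument("--batch-size", type=int, default=16)
    parser.add_argument("--lr", type=float, default=0.01)
    parser.add_argument("--momentum", type=float, default=0.5)
    args = parser.parse_args()

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST("/tmp/data", train=True, download=True,
                       transform=transforms.ToTensor()),
        batch_size=args.batch_size, shuffle=True)

    model = Net()
    # lr and momentum are the two hyperparameters the updated experiments tune.
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        for images, labels in train_loader:
            optimizer.zero_grad()
            loss = F.cross_entropy(model(images), labels)
            loss.backward()
            optimizer.step()
        # "loss" matches the objectiveMetricName used throughout this change.
        logging.info("epoch=%d loss=%.4f", epoch, loss.item())


if __name__ == "__main__":
    main()

Pinning --epochs=1 and --batch-size=16 in every manifest presumably keeps each trial short enough for CI, leaving lr and momentum as the only tunable parameters.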
diff --git a/scripts/v1beta1/push.sh b/scripts/v1beta1/push.sh index 9a6c70c546f..474797abe83 100755 --- a/scripts/v1beta1/push.sh +++ b/scripts/v1beta1/push.sh @@ -86,9 +86,6 @@ docker push "${REGISTRY}/earlystopping-medianstop:${TAG}" # Training container images echo -e "\nPushing training container images..." -echo -e "\nPushing mxnet mnist training container example...\n" -docker push "${REGISTRY}/mxnet-mnist:${TAG}" - echo -e "\nPushing Tensorflow with summaries mnist training container example...\n" docker push "${REGISTRY}/tf-mnist-with-summaries:${TAG}" diff --git a/scripts/v1beta1/update-images.sh b/scripts/v1beta1/update-images.sh index 21230db18a3..d7805f7af5d 100755 --- a/scripts/v1beta1/update-images.sh +++ b/scripts/v1beta1/update-images.sh @@ -82,7 +82,6 @@ done # Katib Trial training container images. # Postfixes for the each Trial image. -MXNET_MNIST="mxnet-mnist" PYTORCH_MNIST_CPU="pytorch-mnist-cpu" PYTORCH_MNIST_GPU="pytorch-mnist-gpu" TF_MNIST_WITH_SUMMARIES="tf-mnist-with-summaries" @@ -93,7 +92,6 @@ DARTS_CPU="darts-cnn-cifar10-cpu" SIMPLE_PBT="simple-pbt" echo -e "Update Katib Trial training container images\n" -update_yaml_files "./" "${OLD_PREFIX}${MXNET_MNIST}:.*" "${NEW_PREFIX}${MXNET_MNIST}:${TAG}" update_yaml_files "./" "${OLD_PREFIX}${PYTORCH_MNIST_CPU}:.*" "${NEW_PREFIX}${PYTORCH_MNIST_CPU}:${TAG}" update_yaml_files "./" "${OLD_PREFIX}${PYTORCH_MNIST_GPU}:.*" "${NEW_PREFIX}${PYTORCH_MNIST_GPU}:${TAG}" update_yaml_files "./" "${OLD_PREFIX}${TF_MNIST_WITH_SUMMARIES}:.*" "${NEW_PREFIX}${TF_MNIST_WITH_SUMMARIES}:${TAG}" diff --git a/test/e2e/v1beta1/hack/aws/argo_workflow.py b/test/e2e/v1beta1/hack/aws/argo_workflow.py index ac5d97fd643..b38b5beeb78 100644 --- a/test/e2e/v1beta1/hack/aws/argo_workflow.py +++ b/test/e2e/v1beta1/hack/aws/argo_workflow.py @@ -55,7 +55,6 @@ "suggestion-enas": "cmd/suggestion/nas/enas/v1beta1/Dockerfile", "suggestion-darts": "cmd/suggestion/nas/darts/v1beta1/Dockerfile", "earlystopping-medianstop": "cmd/earlystopping/medianstop/v1beta1/Dockerfile", - "trial-mxnet-mnist": "examples/v1beta1/trial-images/mxnet-mnist/Dockerfile", "trial-pytorch-mnist": "examples/v1beta1/trial-images/pytorch-mnist/Dockerfile", "trial-tf-mnist-with-summaries": "examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile", "trial-enas-cnn-cifar10-gpu": "examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu", diff --git a/test/e2e/v1beta1/testdata/invalid-experiment.yaml b/test/e2e/v1beta1/testdata/invalid-experiment.yaml index 28018af61c1..4cc1d9ebb67 100644 --- a/test/e2e/v1beta1/testdata/invalid-experiment.yaml +++ b/test/e2e/v1beta1/testdata/invalid-experiment.yaml @@ -7,11 +7,9 @@ spec: maxTrialCount: 13 maxFailedTrialCount: 3 objective: - type: maximize - goal: 0.99 - objectiveMetricName: Validation-accuracy - additionalMetricNames: - - Train-accuracy + type: minimize + goal: 0.001 + objectiveMetricName: loss algorithm: algorithmName: invalid-algorithm # Invalid Algorithm to check that validation webhook is working parameters: @@ -19,31 +17,21 @@ spec: parameterType: double feasibleSpace: min: "0.01" - max: "0.03" - - name: num-layers - parameterType: int - feasibleSpace: - min: "2" - max: "5" - - name: optimizer - parameterType: categorical + max: "0.05" + - name: momentum + parameterType: double feasibleSpace: - list: - - sgd - - adam - - ftrl + min: "0.5" + max: "0.9" trialTemplate: primaryContainerName: training-container trialParameters: - name: learningRate description: Learning rate for the training model reference: lr - - name: 
numberLayers - description: Number of training model layers - reference: num-layers - - name: optimizer - description: Training model optimizer (sdg, adam or ftrl) - reference: optimizer + - name: momentum + description: Momentum for the training model + reference: momentum trialSpec: apiVersion: batch/v1 kind: Job @@ -52,12 +40,12 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" + - "--batch-size=16" - "--lr=${trialParameters.learningRate}" - - "--num-layers=${trialParameters.numberLayers}" - - "--optimizer=${trialParameters.optimizer}" + - "--momentum=${trialParameters.momentum}" restartPolicy: Never diff --git a/test/e2e/v1beta1/testdata/valid-experiment.yaml b/test/e2e/v1beta1/testdata/valid-experiment.yaml index 25937bf2cfe..1ae8cc1d811 100644 --- a/test/e2e/v1beta1/testdata/valid-experiment.yaml +++ b/test/e2e/v1beta1/testdata/valid-experiment.yaml @@ -7,11 +7,9 @@ spec: maxTrialCount: 13 maxFailedTrialCount: 3 objective: - type: maximize - goal: 0.99 - objectiveMetricName: Validation-accuracy - additionalMetricNames: - - Train-accuracy + type: minimize + goal: 0.001 + objectiveMetricName: loss algorithm: algorithmName: random parameters: @@ -19,31 +17,21 @@ spec: parameterType: double feasibleSpace: min: "0.01" - max: "0.03" - - name: num-layers - parameterType: int - feasibleSpace: - min: "2" - max: "5" - - name: optimizer - parameterType: categorical + max: "0.05" + - name: momentum + parameterType: double feasibleSpace: - list: - - sgd - - adam - - ftrl + min: "0.5" + max: "0.9" trialTemplate: primaryContainerName: training-container trialParameters: - name: learningRate description: Learning rate for the training model reference: lr - - name: numberLayers - description: Number of training model layers - reference: num-layers - - name: optimizer - description: Training model optimizer (sdg, adam or ftrl) - reference: optimizer + - name: momentum + description: Momentum for the training model + reference: momentum trialSpec: apiVersion: batch/v1 kind: Job @@ -52,12 +40,12 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/mxnet-mnist:latest + image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest command: - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" + - "--batch-size=16" - "--lr=${trialParameters.learningRate}" - - "--num-layers=${trialParameters.numberLayers}" - - "--optimizer=${trialParameters.optimizer}" + - "--momentum=${trialParameters.momentum}" restartPolicy: Never
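The reworked valid/invalid experiment fixtures exercise the same plumbing as the controller and generator tests earlier in this diff: a suggested (lr, momentum) pair is substituted into the trial command through the trialParameters references. A rough, self-contained illustration of that substitution follows, written in plain Python for readability; Katib's actual implementation performs the replacement on the unstructured trial spec in Go.

# Rough illustration (not Katib's actual Go implementation) of how one suggested
# (lr, momentum) pair flows through the trialParameters references into the
# trial command shown in the fixtures above.
def resolve(command, trial_parameters, assignments):
    """Substitute ${trialParameters.<name>} placeholders with suggested values."""
    resolved = []
    for arg in command:
        for name, reference in trial_parameters.items():
            arg = arg.replace("${trialParameters.%s}" % name, assignments[reference])
        resolved.append(arg)
    return resolved


print(resolve(
    command=["python3", "/opt/pytorch-mnist/mnist.py", "--epochs=1", "--batch-size=16",
             "--lr=${trialParameters.learningRate}", "--momentum=${trialParameters.momentum}"],
    trial_parameters={"learningRate": "lr", "momentum": "momentum"},  # trialParameters name -> parameter reference
    assignments={"lr": "0.03", "momentum": "0.7"},                    # one sample from the random suggestion
))
# ['python3', '/opt/pytorch-mnist/mnist.py', '--epochs=1', '--batch-size=16', '--lr=0.03', '--momentum=0.7']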