diff --git a/.github/workflows/e2e-test-mxnet-mnist.yaml b/.github/workflows/e2e-test-mxnet-mnist.yaml
deleted file mode 100644
index 16e022b8055..00000000000
--- a/.github/workflows/e2e-test-mxnet-mnist.yaml
+++ /dev/null
@@ -1,42 +0,0 @@
-name: E2E Test with mxnet-mnist
-
-on:
- pull_request:
- paths-ignore:
- - "pkg/ui/v1beta1/frontend/**"
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-jobs:
- e2e:
- runs-on: ubuntu-22.04
- timeout-minutes: 120
- steps:
- - name: Checkout
- uses: actions/checkout@v3
-
- - name: Setup Test Env
- uses: ./.github/workflows/template-setup-e2e-test
- with:
- kubernetes-version: ${{ matrix.kubernetes-version }}
- python-version: "3.9"
-
- - name: Run e2e test with ${{ matrix.experiments }} experiments
- uses: ./.github/workflows/template-e2e-test
- with:
- experiments: ${{ matrix.experiments }}
- # Comma Delimited
- trial-images: mxnet-mnist
-
- strategy:
- fail-fast: false
- matrix:
- kubernetes-version: ["v1.25.12", "v1.26.6", "v1.27.3"]
- # Comma Delimited
- experiments:
- # suggestion-hyperopt
- - "long-running-resume,from-volume-resume,median-stop"
- # others
- - "grid,bayesian-optimization,tpe,multivariate-tpe,cma-es,hyperband"
diff --git a/.github/workflows/e2e-test-pytorch-mnist.yaml b/.github/workflows/e2e-test-pytorch-mnist.yaml
index 7bcc92215de..1c6db9d0fce 100644
--- a/.github/workflows/e2e-test-pytorch-mnist.yaml
+++ b/.github/workflows/e2e-test-pytorch-mnist.yaml
@@ -37,5 +37,9 @@ jobs:
kubernetes-version: ["v1.25.12", "v1.26.6", "v1.27.3"]
# Comma Delimited
experiments:
+ # suggestion-hyperopt
+ - "long-running-resume,from-volume-resume,median-stop"
+ # others
+ - "grid,bayesian-optimization,tpe,multivariate-tpe,cma-es,hyperband"
- "file-metrics-collector,pytorchjob-mnist"
- "median-stop-with-json-format,file-metrics-collector-with-json-format"
diff --git a/.github/workflows/e2e-test-ui-random-search-postgres.yaml b/.github/workflows/e2e-test-ui-random-search-postgres.yaml
index 420760d9bdd..09ca97a452f 100644
--- a/.github/workflows/e2e-test-ui-random-search-postgres.yaml
+++ b/.github/workflows/e2e-test-ui-random-search-postgres.yaml
@@ -25,7 +25,7 @@ jobs:
with:
experiments: random
# Comma Delimited
- trial-images: mxnet-mnist
+ trial-images: pytorch-mnist-cpu
katib-ui: true
database-type: postgres
diff --git a/.github/workflows/publish-trial-images.yaml b/.github/workflows/publish-trial-images.yaml
index 7839e2687ce..81ff03a63a9 100644
--- a/.github/workflows/publish-trial-images.yaml
+++ b/.github/workflows/publish-trial-images.yaml
@@ -22,9 +22,6 @@ jobs:
fail-fast: false
matrix:
include:
- - trial-name: mxnet-mnist
- platforms: linux/amd64,linux/arm64
- dockerfile: examples/v1beta1/trial-images/mxnet-mnist/Dockerfile
- trial-name: pytorch-mnist-cpu
platforms: linux/amd64,linux/arm64
dockerfile: examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu
diff --git a/docs/images-location.md b/docs/images-location.md
index 5afa11d008d..0cbf78d7618 100644
--- a/docs/images-location.md
+++ b/docs/images-location.md
@@ -238,17 +238,6 @@ The following table shows images for training containers which are used in the
Location
- docker.io/kubeflowkatib/mxnet-mnist
- docker.io/kubeflowkatib/pytorch-mnist-cpu
diff --git a/examples/v1beta1/README.md b/examples/v1beta1/README.md
index 7439d869c13..f971ae29862 100644
--- a/examples/v1beta1/README.md
+++ b/examples/v1beta1/README.md
@@ -104,8 +104,6 @@ Check the following images for the Trial containers:
- [Tensorflow MNIST with summaries](./trial-images/tf-mnist-with-summaries)
-- [MXNet MNIST](./trial-images/mxnet-mnist)
-
- [PyTorch MNIST](./trial-images/pytorch-mnist)
- [ENAS Keras CNN CIFAR-10](./trial-images/enas-cnn-cifar10)
diff --git a/examples/v1beta1/argo/argo-workflow.yaml b/examples/v1beta1/argo/argo-workflow.yaml
index 1c581be1d18..b658943984d 100644
--- a/examples/v1beta1/argo/argo-workflow.yaml
+++ b/examples/v1beta1/argo/argo-workflow.yaml
@@ -13,11 +13,9 @@ metadata:
name: katib-argo-workflow
spec:
objective:
- type: maximize
- goal: 0.99
- objectiveMetricName: Validation-accuracy
- additionalMetricNames:
- - Train-accuracy
+ type: minimize
+ goal: 0.001
+ objectiveMetricName: loss
algorithm:
algorithmName: random
parallelTrialCount: 2
@@ -50,22 +48,22 @@ spec:
- name: hp-workflow
steps:
- - name: data-preprocessing
- template: gen-num-examples
+ template: gen-epochs
- - name: model-training
template: model-training
arguments:
parameters:
- - name: num-examples
+ - name: epochs
value: "{{steps.data-preprocessing.outputs.result}}"
- - name: gen-num-examples
+ - name: gen-epochs
script:
image: python:alpine3.6
command:
- python
source: |
import random
- print(60000//random.randint(10, 100))
+ print(60000//random.randint(3000, 30000))
- name: model-training
metadata:
@@ -73,12 +71,13 @@ spec:
katib.kubeflow.org/model-training: "true"
inputs:
parameters:
- - name: num-examples
+ - name: epochs
container:
name: model-training
- image: docker.io/kubeflowkatib/mxnet-mnist:latest
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
command:
- "python3"
- - "/opt/mxnet-mnist/mnist.py"
+ - "/opt/pytorch-mnist/mnist.py"
- "--lr=${trialParameters.learningRate}"
- - "--num-examples={{inputs.parameters.num-examples}}"
+ - "--epochs={{inputs.parameters.epochs}}"
+ - "--batch-size=16"
diff --git a/examples/v1beta1/early-stopping/median-stop.yaml b/examples/v1beta1/early-stopping/median-stop.yaml
index 010112a6eeb..3e4659672a5 100644
--- a/examples/v1beta1/early-stopping/median-stop.yaml
+++ b/examples/v1beta1/early-stopping/median-stop.yaml
@@ -8,11 +8,9 @@ metadata:
name: median-stop
spec:
objective:
- type: maximize
- goal: 0.99
- objectiveMetricName: Validation-accuracy
- additionalMetricNames:
- - Train-accuracy
+ type: minimize
+ goal: 0.001
+ objectiveMetricName: loss
algorithm:
algorithmName: random
earlyStopping:
@@ -30,12 +28,12 @@ spec:
parameterType: double
feasibleSpace:
min: "0.01"
- max: "0.5"
- - name: num-epochs
- parameterType: int
+ max: "0.05"
+ - name: momentum
+ parameterType: double
feasibleSpace:
- min: "3"
- max: "4"
+ min: "0.5"
+ max: "0.9"
trialTemplate:
retain: true
primaryContainerName: training-container
@@ -43,9 +41,9 @@ spec:
- name: learningRate
description: Learning rate for the training model
reference: lr
- - name: numberEpochs
- description: Number of epochs to train the model
- reference: num-epochs
+ - name: momentum
+ description: Momentum for the training model
+ reference: momentum
trialSpec:
apiVersion: batch/v1
kind: Job
@@ -54,11 +52,12 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/mxnet-mnist:latest
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
command:
- "python3"
- - "/opt/mxnet-mnist/mnist.py"
- - "--batch-size=64"
+ - "/opt/pytorch-mnist/mnist.py"
+ - "--epochs=1"
+ - "--batch-size=16"
- "--lr=${trialParameters.learningRate}"
- - "--num-epochs=${trialParameters.numberEpochs}"
+ - "--momentum=${trialParameters.momentum}"
restartPolicy: Never
diff --git a/examples/v1beta1/hp-tuning/bayesian-optimization.yaml b/examples/v1beta1/hp-tuning/bayesian-optimization.yaml
index 2362349e88b..80bfd7566f8 100644
--- a/examples/v1beta1/hp-tuning/bayesian-optimization.yaml
+++ b/examples/v1beta1/hp-tuning/bayesian-optimization.yaml
@@ -6,11 +6,9 @@ metadata:
name: bayesian-optimization
spec:
objective:
- type: maximize
- goal: 0.99
- objectiveMetricName: Validation-accuracy
- additionalMetricNames:
- - Train-accuracy
+ type: minimize
+ goal: 0.001
+ objectiveMetricName: loss
algorithm:
algorithmName: bayesianoptimization
algorithmSettings:
@@ -24,31 +22,21 @@ spec:
parameterType: double
feasibleSpace:
min: "0.01"
- max: "0.03"
- - name: num-layers
- parameterType: int
- feasibleSpace:
- min: "2"
- max: "5"
- - name: optimizer
- parameterType: categorical
+ max: "0.05"
+ - name: momentum
+ parameterType: double
feasibleSpace:
- list:
- - sgd
- - adam
- - ftrl
+ min: "0.5"
+ max: "0.9"
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
reference: lr
- - name: numberLayers
- description: Number of training model layers
- reference: num-layers
- - name: optimizer
- description: Training model optimizer (sdg, adam or ftrl)
- reference: optimizer
+ - name: momentum
+ description: Momentum for the training model
+ reference: momentum
trialSpec:
apiVersion: batch/v1
kind: Job
@@ -57,12 +45,12 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/mxnet-mnist:latest
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
command:
- "python3"
- - "/opt/mxnet-mnist/mnist.py"
- - "--batch-size=64"
+ - "/opt/pytorch-mnist/mnist.py"
+ - "--epochs=1"
+ - "--batch-size=16"
- "--lr=${trialParameters.learningRate}"
- - "--num-layers=${trialParameters.numberLayers}"
- - "--optimizer=${trialParameters.optimizer}"
+ - "--momentum=${trialParameters.momentum}"
restartPolicy: Never
diff --git a/examples/v1beta1/hp-tuning/cma-es.yaml b/examples/v1beta1/hp-tuning/cma-es.yaml
index 7f75488e434..f913b80b366 100644
--- a/examples/v1beta1/hp-tuning/cma-es.yaml
+++ b/examples/v1beta1/hp-tuning/cma-es.yaml
@@ -6,11 +6,9 @@ metadata:
name: cmaes
spec:
objective:
- type: maximize
- goal: 0.99
- objectiveMetricName: Validation-accuracy
- additionalMetricNames:
- - Train-accuracy
+ type: minimize
+ goal: 0.001
+ objectiveMetricName: loss
algorithm:
algorithmName: cmaes
algorithmSettings:
@@ -24,31 +22,21 @@ spec:
parameterType: double
feasibleSpace:
min: "0.01"
- max: "0.03"
- - name: num-layers
- parameterType: int
- feasibleSpace:
- min: "2"
- max: "5"
- - name: optimizer
- parameterType: categorical
+ max: "0.05"
+ - name: momentum
+ parameterType: double
feasibleSpace:
- list:
- - sgd
- - adam
- - ftrl
+ min: "0.5"
+ max: "0.9"
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
reference: lr
- - name: numberLayers
- description: Number of training model layers
- reference: num-layers
- - name: optimizer
- description: Training model optimizer (sdg, adam or ftrl)
- reference: optimizer
+ - name: momentum
+ description: Momentum for the training model
+ reference: momentum
trialSpec:
apiVersion: batch/v1
kind: Job
@@ -57,12 +45,12 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/mxnet-mnist:latest
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
command:
- "python3"
- - "/opt/mxnet-mnist/mnist.py"
- - "--batch-size=64"
+ - "/opt/pytorch-mnist/mnist.py"
+ - "--epochs=1"
+ - "--batch-size=16"
- "--lr=${trialParameters.learningRate}"
- - "--num-layers=${trialParameters.numberLayers}"
- - "--optimizer=${trialParameters.optimizer}"
+ - "--momentum=${trialParameters.momentum}"
restartPolicy: Never
diff --git a/examples/v1beta1/hp-tuning/grid.yaml b/examples/v1beta1/hp-tuning/grid.yaml
index af5cc7a800d..e739245fa87 100644
--- a/examples/v1beta1/hp-tuning/grid.yaml
+++ b/examples/v1beta1/hp-tuning/grid.yaml
@@ -6,11 +6,9 @@ metadata:
name: grid
spec:
objective:
- type: maximize
- goal: 0.99
- objectiveMetricName: Validation-accuracy
- additionalMetricNames:
- - Train-accuracy
+ type: minimize
+ goal: 0.001
+ objectiveMetricName: loss
algorithm:
algorithmName: grid
parallelTrialCount: 3
@@ -20,33 +18,24 @@ spec:
- name: lr
parameterType: double
feasibleSpace:
- min: "0.001"
- max: "0.01"
- step: "0.001"
- - name: num-layers
- parameterType: int
- feasibleSpace:
- min: "2"
- max: "5"
- - name: optimizer
- parameterType: categorical
+ min: "0.01"
+ step: "0.005"
+ max: "0.05"
+ - name: momentum
+ parameterType: double
feasibleSpace:
- list:
- - sgd
- - adam
- - ftrl
+ min: "0.5"
+ step: "0.1"
+ max: "0.9"
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
reference: lr
- - name: numberLayers
- description: Number of training model layers
- reference: num-layers
- - name: optimizer
- description: Training model optimizer (sdg, adam or ftrl)
- reference: optimizer
+ - name: momentum
+ description: Momentum for the training model
+ reference: momentum
trialSpec:
apiVersion: batch/v1
kind: Job
@@ -55,12 +44,12 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/mxnet-mnist:latest
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
command:
- "python3"
- - "/opt/mxnet-mnist/mnist.py"
- - "--batch-size=64"
+ - "/opt/pytorch-mnist/mnist.py"
+ - "--epochs=1"
+ - "--batch-size=16"
- "--lr=${trialParameters.learningRate}"
- - "--num-layers=${trialParameters.numberLayers}"
- - "--optimizer=${trialParameters.optimizer}"
+ - "--momentum=${trialParameters.momentum}"
restartPolicy: Never
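With the new feasible spaces, the grid suggestion walks 9 learning-rate values and 5 momentum values. A small sketch of the resulting search-space size, assuming inclusive endpoints as the `min`/`step`/`max` settings above intend:

```python
# Count the grid points implied by the new feasible spaces
# (a sketch, not part of the Experiment manifest).
def count_points(lo: float, hi: float, step: float) -> int:
    n, x = 0, lo
    while x <= hi + 1e-9:  # small tolerance for float accumulation
        n += 1
        x += step
    return n

lr_points = count_points(0.01, 0.05, 0.005)    # 0.01, 0.015, ..., 0.05 -> 9
momentum_points = count_points(0.5, 0.9, 0.1)  # 0.5, 0.6, ..., 0.9    -> 5
print(lr_points * momentum_points)             # 45 candidate combinations
```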
diff --git a/examples/v1beta1/hp-tuning/hyperband.yaml b/examples/v1beta1/hp-tuning/hyperband.yaml
index 5315a6a31bd..80c1109c575 100644
--- a/examples/v1beta1/hp-tuning/hyperband.yaml
+++ b/examples/v1beta1/hp-tuning/hyperband.yaml
@@ -8,11 +8,9 @@ spec:
parallelTrialCount: 2
maxTrialCount: 2
objective:
- type: maximize
- goal: 0.99
- objectiveMetricName: Validation-accuracy
- additionalMetricNames:
- - Train-accuracy
+ type: minimize
+ goal: 0.001
+ objectiveMetricName: loss
algorithm:
algorithmName: hyperband
algorithmSettings:
@@ -28,19 +26,12 @@ spec:
parameterType: double
feasibleSpace:
min: "0.01"
- max: "0.03"
- - name: num-layers
- parameterType: int
- feasibleSpace:
- min: "2"
- max: "5"
- - name: optimizer
- parameterType: categorical
+ max: "0.05"
+ - name: momentum
+ parameterType: double
feasibleSpace:
- list:
- - sgd
- - adam
- - ftrl
+ min: "0.5"
+ max: "0.9"
- name: num-epochs
parameterType: int
feasibleSpace:
@@ -52,12 +43,9 @@ spec:
- name: learningRate
description: Learning rate for the training model
reference: lr
- - name: numberLayers
- description: Number of training model layers
- reference: num-layers
- - name: optimizer
- description: Training model optimizer (sdg, adam or ftrl)
- reference: optimizer
+ - name: momentum
+ description: Momentum for the training model
+ reference: momentum
- name: numberEpochs
description: Number of epochs to train the model
reference: num-epochs
@@ -69,13 +57,12 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/mxnet-mnist:latest
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
command:
- "python3"
- - "/opt/mxnet-mnist/mnist.py"
- - "--batch-size=32"
+ - "/opt/pytorch-mnist/mnist.py"
+ - "--epochs=${trialParameters.numberEpochs}"
+ - "--batch-size=16"
- "--lr=${trialParameters.learningRate}"
- - "--num-layers=${trialParameters.numberLayers}"
- - "--optimizer=${trialParameters.optimizer}"
- - "--num-epochs=${trialParameters.numberEpochs}"
+ - "--momentum=${trialParameters.momentum}"
restartPolicy: Never
diff --git a/examples/v1beta1/hp-tuning/multivariate-tpe.yaml b/examples/v1beta1/hp-tuning/multivariate-tpe.yaml
index a24ff247b39..4217ce55ada 100644
--- a/examples/v1beta1/hp-tuning/multivariate-tpe.yaml
+++ b/examples/v1beta1/hp-tuning/multivariate-tpe.yaml
@@ -6,11 +6,9 @@ metadata:
name: multivariate-tpe
spec:
objective:
- type: maximize
- goal: 0.99
- objectiveMetricName: Validation-accuracy
- additionalMetricNames:
- - Train-accuracy
+ type: minimize
+ goal: 0.001
+ objectiveMetricName: loss
algorithm:
algorithmName: multivariate-tpe
parallelTrialCount: 3
@@ -21,31 +19,21 @@ spec:
parameterType: double
feasibleSpace:
min: "0.01"
- max: "0.03"
- - name: num-layers
- parameterType: int
- feasibleSpace:
- min: "2"
- max: "5"
- - name: optimizer
- parameterType: categorical
+ max: "0.05"
+ - name: momentum
+ parameterType: double
feasibleSpace:
- list:
- - sgd
- - adam
- - ftrl
+ min: "0.5"
+ max: "0.9"
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
reference: lr
- - name: numberLayers
- description: Number of training model layers
- reference: num-layers
- - name: optimizer
- description: Training model optimizer (sdg, adam or ftrl)
- reference: optimizer
+ - name: momentum
+ description: Momentum for the training model
+ reference: momentum
trialSpec:
apiVersion: batch/v1
kind: Job
@@ -54,12 +42,12 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/mxnet-mnist:latest
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
command:
- "python3"
- - "/opt/mxnet-mnist/mnist.py"
- - "--batch-size=64"
+ - "/opt/pytorch-mnist/mnist.py"
+ - "--epochs=1"
+ - "--batch-size=16"
- "--lr=${trialParameters.learningRate}"
- - "--num-layers=${trialParameters.numberLayers}"
- - "--optimizer=${trialParameters.optimizer}"
+ - "--momentum=${trialParameters.momentum}"
restartPolicy: Never
diff --git a/examples/v1beta1/hp-tuning/random.yaml b/examples/v1beta1/hp-tuning/random.yaml
index 0f2c1178833..0c9705d64f5 100644
--- a/examples/v1beta1/hp-tuning/random.yaml
+++ b/examples/v1beta1/hp-tuning/random.yaml
@@ -6,11 +6,9 @@ metadata:
name: random
spec:
objective:
- type: maximize
- goal: 0.99
- objectiveMetricName: Validation-accuracy
- additionalMetricNames:
- - Train-accuracy
+ type: minimize
+ goal: 0.001
+ objectiveMetricName: loss
algorithm:
algorithmName: random
parallelTrialCount: 3
@@ -21,31 +19,21 @@ spec:
parameterType: double
feasibleSpace:
min: "0.01"
- max: "0.03"
- - name: num-layers
- parameterType: int
- feasibleSpace:
- min: "2"
- max: "5"
- - name: optimizer
- parameterType: categorical
+ max: "0.05"
+ - name: momentum
+ parameterType: double
feasibleSpace:
- list:
- - sgd
- - adam
- - ftrl
+ min: "0.5"
+ max: "0.9"
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
reference: lr
- - name: numberLayers
- description: Number of training model layers
- reference: num-layers
- - name: optimizer
- description: Training model optimizer (sdg, adam or ftrl)
- reference: optimizer
+ - name: momentum
+ description: Momentum for the training model
+ reference: momentum
trialSpec:
apiVersion: batch/v1
kind: Job
@@ -54,14 +42,14 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/mxnet-mnist:latest
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
command:
- "python3"
- - "/opt/mxnet-mnist/mnist.py"
- - "--batch-size=64"
+ - "/opt/pytorch-mnist/mnist.py"
+ - "--epochs=1"
+ - "--batch-size=16"
- "--lr=${trialParameters.learningRate}"
- - "--num-layers=${trialParameters.numberLayers}"
- - "--optimizer=${trialParameters.optimizer}"
+ - "--momentum=${trialParameters.momentum}"
resources:
limits:
memory: "1Gi"
diff --git a/examples/v1beta1/hp-tuning/sobol.yaml b/examples/v1beta1/hp-tuning/sobol.yaml
index 2a1ee1ef008..5acaf903912 100644
--- a/examples/v1beta1/hp-tuning/sobol.yaml
+++ b/examples/v1beta1/hp-tuning/sobol.yaml
@@ -6,11 +6,9 @@ metadata:
name: sobol
spec:
objective:
- type: maximize
- goal: 0.99
- objectiveMetricName: Validation-accuracy
- additionalMetricNames:
- - Train-accuracy
+ type: minimize
+ goal: 0.001
+ objectiveMetricName: loss
algorithm:
algorithmName: sobol
parallelTrialCount: 3
@@ -21,31 +19,21 @@ spec:
parameterType: double
feasibleSpace:
min: "0.01"
- max: "0.03"
- - name: num-layers
- parameterType: int
- feasibleSpace:
- min: "2"
- max: "5"
- - name: optimizer
- parameterType: categorical
+ max: "0.05"
+ - name: momentum
+ parameterType: double
feasibleSpace:
- list:
- - sgd
- - adam
- - ftrl
+ min: "0.5"
+ max: "0.9"
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
reference: lr
- - name: numberLayers
- description: Number of training model layers
- reference: num-layers
- - name: optimizer
- description: Training model optimizer (sdg, adam or ftrl)
- reference: optimizer
+ - name: momentum
+ description: Momentum for the training model
+ reference: momentum
trialSpec:
apiVersion: batch/v1
kind: Job
@@ -54,12 +42,12 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/mxnet-mnist:latest
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
command:
- "python3"
- - "/opt/mxnet-mnist/mnist.py"
- - "--batch-size=64"
+ - "/opt/pytorch-mnist/mnist.py"
+ - "--epochs=1"
+ - "--batch-size=16"
- "--lr=${trialParameters.learningRate}"
- - "--num-layers=${trialParameters.numberLayers}"
- - "--optimizer=${trialParameters.optimizer}"
+ - "--momentum=${trialParameters.momentum}"
restartPolicy: Never
diff --git a/examples/v1beta1/hp-tuning/tpe.yaml b/examples/v1beta1/hp-tuning/tpe.yaml
index dc697388765..98adc16baf3 100644
--- a/examples/v1beta1/hp-tuning/tpe.yaml
+++ b/examples/v1beta1/hp-tuning/tpe.yaml
@@ -6,11 +6,9 @@ metadata:
name: tpe
spec:
objective:
- type: maximize
- goal: 0.99
- objectiveMetricName: Validation-accuracy
- additionalMetricNames:
- - Train-accuracy
+ type: minimize
+ goal: 0.001
+ objectiveMetricName: loss
algorithm:
algorithmName: tpe
parallelTrialCount: 3
@@ -21,31 +19,21 @@ spec:
parameterType: double
feasibleSpace:
min: "0.01"
- max: "0.03"
- - name: num-layers
- parameterType: int
- feasibleSpace:
- min: "2"
- max: "5"
- - name: optimizer
- parameterType: categorical
+ max: "0.05"
+ - name: momentum
+ parameterType: double
feasibleSpace:
- list:
- - sgd
- - adam
- - ftrl
+ min: "0.5"
+ max: "0.9"
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
reference: lr
- - name: numberLayers
- description: Number of training model layers
- reference: num-layers
- - name: optimizer
- description: Training model optimizer (sdg, adam or ftrl)
- reference: optimizer
+ - name: momentum
+ description: Momentum for the training model
+ reference: momentum
trialSpec:
apiVersion: batch/v1
kind: Job
@@ -54,12 +42,12 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/mxnet-mnist:latest
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
command:
- "python3"
- - "/opt/mxnet-mnist/mnist.py"
- - "--batch-size=64"
+ - "/opt/pytorch-mnist/mnist.py"
+ - "--epochs=1"
+ - "--batch-size=16"
- "--lr=${trialParameters.learningRate}"
- - "--num-layers=${trialParameters.numberLayers}"
- - "--optimizer=${trialParameters.optimizer}"
+ - "--momentum=${trialParameters.momentum}"
restartPolicy: Never
diff --git a/examples/v1beta1/kind-cluster/README.md b/examples/v1beta1/kind-cluster/README.md
index 5241aed3393..011a2059010 100644
--- a/examples/v1beta1/kind-cluster/README.md
+++ b/examples/v1beta1/kind-cluster/README.md
@@ -46,14 +46,13 @@ using `kubectl`:
kubectl create -f https://raw.githubusercontent.com/kubeflow/katib/master/examples/v1beta1/hp-tuning/random.yaml
```
-This example uses a MXNet neural network to train an image classification model
+This example uses a PyTorch neural network to train an image classification model
using the MNIST dataset. You can check the training container source code
-[here](../trial-images/mxnet-mnist).
+[here](../trial-images/pytorch-mnist).
The Experiment runs twelve training jobs (Trials) and tunes the following hyperparameters:
- Learning Rate (`lr`).
-- Number of layers (`num-layers`).
-- Neural network optimizer (`optimizer`).
+- Momentum (`momentum`).
After creating above example, check the
[Experiment](https://www.kubeflow.org/docs/components/katib/overview/#experiment) status:
@@ -92,8 +91,7 @@ You can get the best hyperparameters with the following command:
$ kubectl get experiment random -n kubeflow -o jsonpath='{range .status.currentOptimalTrial.parameterAssignments[*]}{.name}: {.value}{"\n"}{end}'
lr: 0.028162244250364066
-num-layers: 5
-optimizer: sgd
+momentum: 0.583672196492823
```
To view created Experiment in Katib UI, follow
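The pytorch-mnist training script itself is outside this change, but the flags the Experiments now pass (`--lr`, `--momentum`, `--epochs`, `--batch-size`) map onto a conventional SGD setup. A minimal sketch under that assumption (not the actual `/opt/pytorch-mnist/mnist.py`):

```python
import argparse

import torch
import torch.nn as nn
import torch.nn.functional as F


def main():
    # Flags matching what the trial templates pass on the command line.
    parser = argparse.ArgumentParser()
    parser.add_argument("--lr", type=float, default=0.01)
    parser.add_argument("--momentum", type=float, default=0.5)
    parser.add_argument("--epochs", type=int, default=1)
    parser.add_argument("--batch-size", type=int, default=16)
    args = parser.parse_args()

    model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    # Dummy batch standing in for an MNIST DataLoader.
    images = torch.randn(args.batch_size, 1, 28, 28)
    labels = torch.randint(0, 10, (args.batch_size,))

    for _ in range(args.epochs):
        optimizer.zero_grad()
        loss = F.cross_entropy(model(images), labels)
        loss.backward()
        optimizer.step()
        # Katib's metrics collector reads the objective from the Trial logs.
        print(f"loss={loss.item():.4f}")


if __name__ == "__main__":
    main()
```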
diff --git a/examples/v1beta1/kubeflow-pipelines/early-stopping.ipynb b/examples/v1beta1/kubeflow-pipelines/early-stopping.ipynb
index 0d3353b49e6..422cc1ff90a 100644
--- a/examples/v1beta1/kubeflow-pipelines/early-stopping.ipynb
+++ b/examples/v1beta1/kubeflow-pipelines/early-stopping.ipynb
@@ -156,12 +156,9 @@
"\n",
"# Objective specification.\n",
"objective=V1beta1ObjectiveSpec(\n",
- " type=\"maximize\",\n",
- " goal= 0.99,\n",
- " objective_metric_name=\"Validation-accuracy\",\n",
- " additional_metric_names=[\n",
- " \"Train-accuracy\"\n",
- " ]\n",
+ " type=\"minimize\",\n",
+ " goal= 0.001,\n",
+ " objective_metric_name=\"loss\",\n",
")\n",
"\n",
"# Algorithm specification.\n",
@@ -180,7 +177,6 @@
" ]\n",
")\n",
"\n",
- "\n",
"# Experiment search space.\n",
"# In this example we tune learning rate, number of layer and optimizer.\n",
"# Learning rate has bad feasible space to show more early stopped Trials.\n",
@@ -194,22 +190,11 @@
" ),\n",
" ),\n",
" V1beta1ParameterSpec(\n",
- " name=\"num-layers\",\n",
- " parameter_type=\"int\",\n",
- " feasible_space=V1beta1FeasibleSpace(\n",
- " min=\"2\",\n",
- " max=\"5\"\n",
- " ),\n",
- " ),\n",
- " V1beta1ParameterSpec(\n",
- " name=\"optimizer\",\n",
- " parameter_type=\"categorical\",\n",
+ " name=\"momentum\",\n",
+ " parameter_type=\"double\",\n",
" feasible_space=V1beta1FeasibleSpace(\n",
- " list=[\n",
- " \"sgd\", \n",
- " \"adam\",\n",
- " \"ftrl\"\n",
- " ]\n",
+ " min=\"0.5\",\n",
+ " max=\"0.9\"\n",
" ),\n",
" ),\n",
"]\n"
@@ -245,14 +230,14 @@
" \"containers\": [\n",
" {\n",
" \"name\": \"training-container\",\n",
- " \"image\": \"docker.io/kubeflowkatib/mxnet-mnist:v0.13.0\",\n",
+ " \"image\": \"docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0\",\n",
" \"command\": [\n",
" \"python3\",\n",
- " \"/opt/mxnet-mnist/mnist.py\",\n",
- " \"--batch-size=64\",\n",
+ " \"/opt/pytorch-mnist/mnist.py\",\n",
+ " \"--epochs=1\",\n",
+ " \"--batch-size=16\",\n",
" \"--lr=${trialParameters.learningRate}\",\n",
- " \"--num-layers=${trialParameters.numberLayers}\",\n",
- " \"--optimizer=${trialParameters.optimizer}\"\n",
+ " \"--momentum=${trialParameters.momentum}\",\n",
" ]\n",
" }\n",
" ],\n",
@@ -274,14 +259,9 @@
" reference=\"lr\"\n",
" ),\n",
" V1beta1TrialParameterSpec(\n",
- " name=\"numberLayers\",\n",
- " description=\"Number of training model layers\",\n",
- " reference=\"num-layers\"\n",
- " ),\n",
- " V1beta1TrialParameterSpec(\n",
- " name=\"optimizer\",\n",
- " description=\"Training model optimizer (sdg, adam or ftrl)\",\n",
- " reference=\"optimizer\"\n",
+ " name=\"momentum\",\n",
+ " description=\"Momentum for the training model\",\n",
+ " reference=\"momentum\"\n",
" ),\n",
" ],\n",
" trial_spec=trial_spec\n",
diff --git a/examples/v1beta1/metrics-collector/metrics-collection-strategy.yaml b/examples/v1beta1/metrics-collector/metrics-collection-strategy.yaml
index 5facadc0ae8..0d7beaf8a70 100644
--- a/examples/v1beta1/metrics-collector/metrics-collection-strategy.yaml
+++ b/examples/v1beta1/metrics-collector/metrics-collection-strategy.yaml
@@ -6,16 +6,16 @@ metadata:
name: metrics-collection-strategy
spec:
objective:
- type: maximize
- goal: 0.99
- objectiveMetricName: Validation-accuracy
+ type: minimize
+ goal: 0.001
+ objectiveMetricName: loss
additionalMetricNames:
- - Train-accuracy
+ - accuracy
metricStrategies:
- - name: Train-accuracy
- value: "latest"
- - name: Validation-accuracy
+ - name: accuracy
value: "max"
+ - name: loss
+ value: "min"
algorithm:
algorithmName: tpe
parallelTrialCount: 3
@@ -26,31 +26,21 @@ spec:
parameterType: double
feasibleSpace:
min: "0.01"
- max: "0.03"
- - name: num-layers
- parameterType: int
- feasibleSpace:
- min: "2"
- max: "5"
- - name: optimizer
- parameterType: categorical
+ max: "0.05"
+ - name: momentum
+ parameterType: double
feasibleSpace:
- list:
- - sgd
- - adam
- - ftrl
+ min: "0.5"
+ max: "0.9"
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
reference: lr
- - name: numberLayers
- description: Number of training model layers
- reference: num-layers
- - name: optimizer
- description: Training model optimizer (sdg, adam or ftrl)
- reference: optimizer
+ - name: momentum
+ description: Momentum for the training model
+ reference: momentum
trialSpec:
apiVersion: batch/v1
kind: Job
@@ -59,12 +49,12 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/mxnet-mnist:latest
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
command:
- "python3"
- - "/opt/mxnet-mnist/mnist.py"
- - "--batch-size=64"
+ - "/opt/pytorch-mnist/mnist.py"
+ - "--epochs=1"
+ - "--batch-size=16"
- "--lr=${trialParameters.learningRate}"
- - "--num-layers=${trialParameters.numberLayers}"
- - "--optimizer=${trialParameters.optimizer}"
+ - "--momentum=${trialParameters.momentum}"
restartPolicy: Never
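The `metricStrategies` above only change how Katib aggregates the reported values (`min` for the `loss` objective, `max` for the additional `accuracy` metric); the Trial still has to print both. A minimal sketch of log output the default StdOut metrics collector can pick up, assuming the usual `name=value` lines with a timestamped logger:

```python
import logging
import random

# Timestamp format used by the Katib example trainers so each metric
# value carries a time (a sketch, not the actual trial code).
logging.basicConfig(
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%SZ",
    level=logging.INFO,
)

for epoch in range(3):
    loss = random.uniform(0.05, 0.5)
    accuracy = 1.0 - loss / 2
    # One "name=value" pair per line; "loss" is aggregated with "min",
    # "accuracy" with "max", per the metricStrategies above.
    logging.info("loss=%.4f", loss)
    logging.info("accuracy=%.4f", accuracy)
```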
diff --git a/examples/v1beta1/resume-experiment/from-volume-resume.yaml b/examples/v1beta1/resume-experiment/from-volume-resume.yaml
index 082e09c0969..71ab478866d 100644
--- a/examples/v1beta1/resume-experiment/from-volume-resume.yaml
+++ b/examples/v1beta1/resume-experiment/from-volume-resume.yaml
@@ -6,11 +6,9 @@ metadata:
name: from-volume-resume
spec:
objective:
- type: maximize
- goal: 0.99
- objectiveMetricName: Validation-accuracy
- additionalMetricNames:
- - Train-accuracy
+ type: minimize
+ goal: 0.001
+ objectiveMetricName: loss
algorithm:
algorithmName: random
parallelTrialCount: 3
@@ -22,31 +20,21 @@ spec:
parameterType: double
feasibleSpace:
min: "0.01"
- max: "0.03"
- - name: num-layers
- parameterType: int
- feasibleSpace:
- min: "2"
- max: "5"
- - name: optimizer
- parameterType: categorical
+ max: "0.05"
+ - name: momentum
+ parameterType: double
feasibleSpace:
- list:
- - sgd
- - adam
- - ftrl
+ min: "0.5"
+ max: "0.9"
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
reference: lr
- - name: numberLayers
- description: Number of training model layers
- reference: num-layers
- - name: optimizer
- description: Training model optimizer (sdg, adam or ftrl)
- reference: optimizer
+ - name: momentum
+ description: Momentum for the training model
+ reference: momentum
trialSpec:
apiVersion: batch/v1
kind: Job
@@ -55,12 +43,12 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/mxnet-mnist:latest
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
command:
- "python3"
- - "/opt/mxnet-mnist/mnist.py"
- - "--batch-size=64"
+ - "/opt/pytorch-mnist/mnist.py"
+ - "--epochs=1"
+ - "--batch-size=16"
- "--lr=${trialParameters.learningRate}"
- - "--num-layers=${trialParameters.numberLayers}"
- - "--optimizer=${trialParameters.optimizer}"
+ - "--momentum=${trialParameters.momentum}"
restartPolicy: Never
diff --git a/examples/v1beta1/resume-experiment/long-running-resume.yaml b/examples/v1beta1/resume-experiment/long-running-resume.yaml
index 3a7e10dba7c..271cc6d0358 100644
--- a/examples/v1beta1/resume-experiment/long-running-resume.yaml
+++ b/examples/v1beta1/resume-experiment/long-running-resume.yaml
@@ -6,11 +6,9 @@ metadata:
name: long-running-resume
spec:
objective:
- type: maximize
- goal: 0.99
- objectiveMetricName: Validation-accuracy
- additionalMetricNames:
- - Train-accuracy
+ type: minimize
+ goal: 0.001
+ objectiveMetricName: loss
algorithm:
algorithmName: random
parallelTrialCount: 3
@@ -22,31 +20,21 @@ spec:
parameterType: double
feasibleSpace:
min: "0.01"
- max: "0.03"
- - name: num-layers
- parameterType: int
- feasibleSpace:
- min: "2"
- max: "5"
- - name: optimizer
- parameterType: categorical
+ max: "0.05"
+ - name: momentum
+ parameterType: double
feasibleSpace:
- list:
- - sgd
- - adam
- - ftrl
+ min: "0.5"
+ max: "0.9"
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
reference: lr
- - name: numberLayers
- description: Number of training model layers
- reference: num-layers
- - name: optimizer
- description: Training model optimizer (sdg, adam or ftrl)
- reference: optimizer
+ - name: momentum
+ description: Momentum for the training model
+ reference: momentum
trialSpec:
apiVersion: batch/v1
kind: Job
@@ -55,12 +43,12 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/mxnet-mnist:latest
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
command:
- "python3"
- - "/opt/mxnet-mnist/mnist.py"
- - "--batch-size=64"
+ - "/opt/pytorch-mnist/mnist.py"
+ - "--epochs=1"
+ - "--batch-size=16"
- "--lr=${trialParameters.learningRate}"
- - "--num-layers=${trialParameters.numberLayers}"
- - "--optimizer=${trialParameters.optimizer}"
+ - "--momentum=${trialParameters.momentum}"
restartPolicy: Never
diff --git a/examples/v1beta1/sdk/cmaes-and-resume-policies.ipynb b/examples/v1beta1/sdk/cmaes-and-resume-policies.ipynb
index c901086c1af..d6876f2b864 100644
--- a/examples/v1beta1/sdk/cmaes-and-resume-policies.ipynb
+++ b/examples/v1beta1/sdk/cmaes-and-resume-policies.ipynb
@@ -93,10 +93,9 @@
"\n",
"# Objective specification.\n",
"objective_spec=V1beta1ObjectiveSpec(\n",
- " type=\"maximize\",\n",
- " goal= 0.99,\n",
- " objective_metric_name=\"Validation-accuracy\",\n",
- " additional_metric_names=[\"Train-accuracy\"]\n",
+ " type=\"minimize\",\n",
+ " goal= 0.001,\n",
+ " objective_metric_name=\"loss\",\n",
")\n",
"\n",
"# Experiment search space. In this example we tune learning rate, number of layer and optimizer.\n",
@@ -110,24 +109,15 @@
" ),\n",
" ),\n",
" V1beta1ParameterSpec(\n",
- " name=\"num-layers\",\n",
- " parameter_type=\"int\",\n",
- " feasible_space=V1beta1FeasibleSpace(\n",
- " min=\"2\",\n",
- " max=\"5\"\n",
- " ),\n",
- " ),\n",
- " V1beta1ParameterSpec(\n",
- " name=\"optimizer\",\n",
- " parameter_type=\"categorical\",\n",
+ " name=\"momentum\",\n",
+ " parameter_type=\"double\",\n",
" feasible_space=V1beta1FeasibleSpace(\n",
- " list=[\"sgd\", \"adam\", \"ftrl\"]\n",
+ " min=\"0.5\",\n",
+ " max=\"0.9\"\n",
" ),\n",
" ),\n",
"]\n",
"\n",
- "\n",
- "\n",
"# JSON template specification for the Trial's Worker Kubernetes Job.\n",
"trial_spec={\n",
" \"apiVersion\": \"batch/v1\",\n",
@@ -143,15 +133,14 @@
" \"containers\": [\n",
" {\n",
" \"name\": \"training-container\",\n",
- " \"image\": \"docker.io/kubeflowkatib/mxnet-mnist:v0.14.0\",\n",
+ " \"image\": \"docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0\",\n",
" \"command\": [\n",
" \"python3\",\n",
- " \"/opt/mxnet-mnist/mnist.py\",\n",
+ " \"/opt/pytorch-mnist/mnist.py\",\n",
+ " \"--epochs=1\",\n",
" \"--batch-size=64\",\n",
- " \"--num-epochs=1\",\n",
" \"--lr=${trialParameters.learningRate}\",\n",
- " \"--num-layers=${trialParameters.numberLayers}\",\n",
- " \"--optimizer=${trialParameters.optimizer}\"\n",
+ " \"--momentum=${trialParameters.momentum}\",\n",
" ]\n",
" }\n",
" ],\n",
@@ -171,14 +160,9 @@
" reference=\"lr\"\n",
" ),\n",
" V1beta1TrialParameterSpec(\n",
- " name=\"numberLayers\",\n",
- " description=\"Number of training model layers\",\n",
- " reference=\"num-layers\"\n",
- " ),\n",
- " V1beta1TrialParameterSpec(\n",
- " name=\"optimizer\",\n",
- " description=\"Training model optimizer (sdg, adam or ftrl)\",\n",
- " reference=\"optimizer\"\n",
+ " name=\"momentum\",\n",
+ " description=\"Momentum for the training model\",\n",
+ " reference=\"momentum\"\n",
" ),\n",
" ],\n",
" trial_spec=trial_spec\n",
@@ -735,8 +719,7 @@
" 'message': 'Trial has succeeded',\n",
" 'reason': 'TrialSucceeded',\n",
" 'status': 'True',\n",
- " 'type': 'Succeeded'}\n",
- "\n"
+ " 'type': 'Succeeded'}\n"
]
}
],
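Downstream of these cells the notebook wires the objective, search space, and Trial template into a `V1beta1Experiment` and submits it through the SDK. A compressed sketch of that wiring, assuming the surrounding cells bind `objective_spec`, `parameters`, and a `V1beta1TrialTemplate` named `trial_template`, and with an illustrative Experiment name:

```python
from kubeflow.katib import (
    KatibClient,
    V1beta1AlgorithmSpec,
    V1beta1Experiment,
    V1beta1ExperimentSpec,
)
from kubernetes.client import V1ObjectMeta

experiment = V1beta1Experiment(
    api_version="kubeflow.org/v1beta1",
    kind="Experiment",
    metadata=V1ObjectMeta(name="cmaes-example", namespace="kubeflow"),
    spec=V1beta1ExperimentSpec(
        max_trial_count=3,
        parallel_trial_count=2,
        algorithm=V1beta1AlgorithmSpec(algorithm_name="cmaes"),
        objective=objective_spec,       # defined in the cells above
        parameters=parameters,          # lr + momentum search space above
        trial_template=trial_template,  # wraps trial_spec above
    ),
)

# Submit the Experiment and let Katib create the Trials.
KatibClient().create_experiment(experiment, namespace="kubeflow")
```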
diff --git a/examples/v1beta1/tekton/pipeline-run.yaml b/examples/v1beta1/tekton/pipeline-run.yaml
index 1c2de0a45d7..fb608c76a19 100644
--- a/examples/v1beta1/tekton/pipeline-run.yaml
+++ b/examples/v1beta1/tekton/pipeline-run.yaml
@@ -13,11 +13,9 @@ metadata:
name: tekton-pipeline-run
spec:
objective:
- type: maximize
- goal: 0.99
- objectiveMetricName: Validation-accuracy
- additionalMetricNames:
- - Train-accuracy
+ type: minimize
+ goal: 0.001
+ objectiveMetricName: loss
algorithm:
algorithmName: random
parallelTrialCount: 2
@@ -45,53 +43,55 @@ spec:
kind: PipelineRun
spec:
params:
- name: lr
value: ${trialParameters.learningRate}
- - name: num-examples-init
+ - name: epochs-init
value: "60000"
pipelineSpec:
params:
- name: lr
description: Learning rate for the training model
- - name: num-examples-init
+ - name: epochs-init
description: Initial value for number of training examples
tasks:
- name: data-preprocessing
params:
- - name: num-examples-pre
- value: $(params.num-examples-init)
+ - name: epochs-pre
+ value: $(params.epochs-init)
taskSpec:
params:
- - name: num-examples-pre
+ - name: epochs-pre
description: Number of training examples before optimization
results:
- - name: num-examples-post
+ - name: epochs-post
- description: Number of training examples after optimization
+ description: Number of training epochs after optimization
steps:
- - name: num-examples-optimize
+ - name: epochs-optimize
image: python:alpine3.6
command:
- sh
- -c
args:
- - python3 -c "import random; print($(params.num-examples-pre)//random.randint(10,100),end='')" | tee $(results.num-examples-post.path)
+ - python3 -c "import random; print($(params.epochs-pre)//random.randint(3000,30000),end='')" | tee $(results.epochs-post.path)
- name: model-training
params:
- name: lr
value: $(params.lr)
- - name: num-examples
- value: $(tasks.data-preprocessing.results.num-examples-post)
+ - name: epochs
+ value: $(tasks.data-preprocessing.results.epochs-post)
taskSpec:
params:
- name: lr
description: Learning rate for the training model
- - name: num-examples
- description: Number of training examples
+ - name: epochs
+ description: Number of epochs
steps:
- name: model-training
- image: docker.io/kubeflowkatib/mxnet-mnist:latest
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
command:
- "python3"
- - "/opt/mxnet-mnist/mnist.py"
- - "--num-examples=$(params.num-examples)"
- - "--lr=$(params.lr)"
+ - "/opt/pytorch-mnist/mnist.py"
+ - "--epochs=$(params.epochs)"
+ - "--batch-size=16"
+ - "--lr=$(trialParameters.lr)"
diff --git a/examples/v1beta1/trial-images/mxnet-mnist/Dockerfile b/examples/v1beta1/trial-images/mxnet-mnist/Dockerfile
deleted file mode 100644
index f10c8c75e77..00000000000
--- a/examples/v1beta1/trial-images/mxnet-mnist/Dockerfile
+++ /dev/null
@@ -1,27 +0,0 @@
-FROM python:3.10-slim
-
-ARG TARGETARCH
-ENV LD_LIBRARY_PATH "${LD_LIBRARY_PATH}:/opt/arm/armpl_22.0.2_gcc-11.2/lib"
-
-COPY examples/v1beta1/trial-images/mxnet-mnist /opt/mxnet-mnist
-
-WORKDIR /opt/mxnet-mnist
-
-RUN apt-get -y update \
- && apt-get -y install libgomp1 \
- && if [ "${TARGETARCH}" = "arm64" ]; then \
- apt-get -y install wget; \
- elif [ "${TARGETARCH}" = "amd64" ]; then \
- apt-get -y install libquadmath0; \
- fi \
- && apt-get clean \
- && rm -rf /var/lib/apt/lists/*
-
-RUN if [ "${TARGETARCH}" = "arm64" ]; then \
- /opt/mxnet-mnist/install-arm-performance-libraries.sh; \
- fi
-RUN pip install --prefer-binary -r requirements.txt
-RUN chgrp -R 0 /opt/mxnet-mnist \
- && chmod -R g+rwX /opt/mxnet-mnist
-
-ENTRYPOINT ["python3", "/opt/mxnet-mnist/mnist.py"]
diff --git a/examples/v1beta1/trial-images/mxnet-mnist/README.md b/examples/v1beta1/trial-images/mxnet-mnist/README.md
deleted file mode 100644
index 71d6b8f8533..00000000000
--- a/examples/v1beta1/trial-images/mxnet-mnist/README.md
+++ /dev/null
@@ -1,11 +0,0 @@
-# MXNet MNIST Image Classification Example
-
-This is MXNet MNIST image classification training container with recording time
-of the metrics. It uses only simple multilayer perceptron network (mlp).
-
-If you want to read more about this example, visit the official
-[incubator-mxnet](https://github.com/apache/incubator-mxnet/tree/1cf2fe5f8753042951bc0aacb6c95ddd3a904395/example/image-classification)
-GitHub repository.
-
-Katib uses this training container in some Experiments, for instance in the
-[random search](../../hp-tuning/random.yaml#L55-L64).
diff --git a/examples/v1beta1/trial-images/mxnet-mnist/common/__init__.py b/examples/v1beta1/trial-images/mxnet-mnist/common/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/examples/v1beta1/trial-images/mxnet-mnist/common/fit.py b/examples/v1beta1/trial-images/mxnet-mnist/common/fit.py
deleted file mode 100644
index 6818559cedb..00000000000
--- a/examples/v1beta1/trial-images/mxnet-mnist/common/fit.py
+++ /dev/null
@@ -1,343 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" example train fit utility """
-import logging
-import os
-import time
-import re
-import math
-import mxnet as mx
-
-
-def get_epoch_size(args, kv):
- return math.ceil(int(args.num_examples / kv.num_workers) / args.batch_size)
-
-
-def _get_lr_scheduler(args, kv):
- if 'lr_factor' not in args or args.lr_factor >= 1:
- return (args.lr, None)
- epoch_size = get_epoch_size(args, kv)
- begin_epoch = args.load_epoch if args.load_epoch else 0
- if 'pow' in args.lr_step_epochs:
- lr = args.lr
- max_up = args.num_epochs * epoch_size
- pwr = float(re.sub('pow[- ]*', '', args.lr_step_epochs))
- poly_sched = mx.lr_scheduler.PolyScheduler(max_up, lr, pwr)
- return (lr, poly_sched)
- step_epochs = [int(l) for l in args.lr_step_epochs.split(',')]
- lr = args.lr
- for s in step_epochs:
- if begin_epoch >= s:
- lr *= args.lr_factor
- if lr != args.lr:
- logging.info('Adjust learning rate to %e for epoch %d',
- lr, begin_epoch)
-
- steps = [epoch_size * (x - begin_epoch)
- for x in step_epochs if x - begin_epoch > 0]
- if steps:
- return (lr, mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=args.lr_factor,
- base_lr=args.lr))
- else:
- return (lr, None)
-
-
-def _load_model(args, rank=0):
- if 'load_epoch' not in args or args.load_epoch is None:
- return (None, None, None)
- assert args.model_prefix is not None
- model_prefix = args.model_prefix
- if rank > 0 and os.path.exists("%s-%d-symbol.json" % (model_prefix, rank)):
- model_prefix += "-%d" % (rank)
- sym, arg_params, aux_params = mx.model.load_checkpoint(
- model_prefix, args.load_epoch)
- logging.info('Loaded model %s_%04d.params', model_prefix, args.load_epoch)
- return (sym, arg_params, aux_params)
-
-
-def _save_model(args, rank=0):
- if args.model_prefix is None:
- return None
- return mx.callback.do_checkpoint(args.model_prefix if rank == 0 else "%s-%d" % (
- args.model_prefix, rank), period=args.save_period)
-
-
-def add_fit_args(parser):
- """
- parser : argparse.ArgumentParser
- return a parser added with args required by fit
- """
- train = parser.add_argument_group('Training', 'model training')
- train.add_argument('--network', type=str,
- help='the neural network to use')
- train.add_argument('--num-layers', type=int,
- help='number of layers in the neural network, \
- required by some networks such as resnet')
- train.add_argument('--gpus', type=str,
- help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu')
- train.add_argument('--kv-store', type=str, default='device',
- help='key-value store type')
- train.add_argument('--num-epochs', type=int, default=100,
- help='max num of epochs')
- train.add_argument('--lr', type=float, default=0.1,
- help='initial learning rate')
- train.add_argument('--lr-factor', type=float, default=0.1,
- help='the ratio to reduce lr on each step')
- train.add_argument('--lr-step-epochs', type=str,
- help='the epochs to reduce the lr, e.g. 30,60')
- train.add_argument('--initializer', type=str, default='default',
- help='the initializer type')
- train.add_argument('--optimizer', type=str, default='sgd',
- help='the optimizer type')
- train.add_argument('--mom', type=float, default=0.9,
- help='momentum for sgd')
- train.add_argument('--wd', type=float, default=0.0001,
- help='weight decay for sgd')
- train.add_argument('--batch-size', type=int, default=128,
- help='the batch size')
- train.add_argument('--disp-batches', type=int, default=20,
- help='show progress for every n batches')
- train.add_argument('--model-prefix', type=str,
- help='model prefix')
- train.add_argument('--save-period', type=int, default=1, help='params saving period')
- parser.add_argument('--monitor', dest='monitor', type=int, default=0,
- help='log network parameters every N iters if larger than 0')
- train.add_argument('--load-epoch', type=int,
- help='load the model on an epoch using the model-load-prefix')
- train.add_argument('--top-k', type=int, default=0,
- help='report the top-k accuracy. 0 means no report.')
- train.add_argument('--loss', type=str, default='',
- help='show the cross-entropy or nll loss. ce strands for cross-entropy, nll-loss stands for likelihood loss')
- train.add_argument('--test-io', type=int, default=0,
- help='1 means test reading speed without training')
- train.add_argument('--dtype', type=str, default='float32',
- help='precision: float32 or float16')
- train.add_argument('--gc-type', type=str, default='none',
- help='type of gradient compression to use, \
- takes `2bit` or `none` for now')
- train.add_argument('--gc-threshold', type=float, default=0.5,
- help='threshold for 2bit gradient compression')
- # additional parameters for large batch sgd
- train.add_argument('--macrobatch-size', type=int, default=0,
- help='distributed effective batch size')
- train.add_argument('--warmup-epochs', type=int, default=5,
- help='the epochs to ramp-up lr to scaled large-batch value')
- train.add_argument('--warmup-strategy', type=str, default='linear',
- help='the ramping-up strategy for large batch sgd')
- train.add_argument('--profile-worker-suffix', type=str, default='',
- help='profile workers actions into this file. During distributed training\
- filename saved will be rank1_ followed by this suffix')
- train.add_argument('--profile-server-suffix', type=str, default='',
- help='profile server actions into a file with name like rank1_ followed by this suffix \
- during distributed training')
- train.add_argument('--use-imagenet-data-augmentation', type=int, default=0,
- help='enable data augmentation of ImageNet data, default disabled')
- return train
-
-
-def fit(args, network, data_loader, **kwargs):
- """
- train a model
- args : argparse returns
- network : the symbol definition of the nerual network
- data_loader : function that returns the train and val data iterators
- """
- # kvstore
- kv = mx.kvstore.create(args.kv_store)
- if args.gc_type != 'none':
- kv.set_gradient_compression({'type': args.gc_type,
- 'threshold': args.gc_threshold})
- if args.profile_server_suffix:
- mx.profiler.set_config(filename=args.profile_server_suffix, profile_all=True, profile_process='server')
- mx.profiler.set_state(state='run', profile_process='server')
-
- if args.profile_worker_suffix:
- if kv.num_workers > 1:
- filename = 'rank' + str(kv.rank) + '_' + args.profile_worker_suffix
- else:
- filename = args.profile_worker_suffix
- mx.profiler.set_config(filename=filename, profile_all=True, profile_process='worker')
- mx.profiler.set_state(state='run', profile_process='worker')
-
- # logging
- head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s'
- logging.basicConfig(level=logging.DEBUG, format=head)
- logging.info('start with arguments %s', args)
-
- epoch_size = get_epoch_size(args, kv)
-
- # data iterators
- (train, val) = data_loader(args, kv)
- if 'dist' in args.kv_store and not 'async' in args.kv_store:
- logging.info('Resizing training data to %d batches per machine', epoch_size)
- # resize train iter to ensure each machine has same number of batches per epoch
- # if not, dist_sync can hang at the end with one machine waiting for other machines
- train = mx.io.ResizeIter(train, epoch_size)
-
- if args.test_io:
- tic = time.time()
- for i, batch in enumerate(train):
- if isinstance(batch, list):
- for b in batch:
- for j in b.data:
- j.wait_to_read()
- else:
- for j in batch.data:
- j.wait_to_read()
- if (i + 1) % args.disp_batches == 0:
- logging.info('Batch [%d]\tSpeed: %.2f samples/sec', i,
- args.disp_batches * args.batch_size / (time.time() - tic))
- tic = time.time()
- return
-
- # load model
- if 'arg_params' in kwargs and 'aux_params' in kwargs:
- arg_params = kwargs['arg_params']
- aux_params = kwargs['aux_params']
- else:
- sym, arg_params, aux_params = _load_model(args, kv.rank)
- if sym is not None:
- assert sym.tojson() == network.tojson()
-
- # save model
- checkpoint = _save_model(args, kv.rank)
-
- # devices for training
- devs = mx.cpu() if args.gpus is None or args.gpus == "" else [
- mx.gpu(int(i)) for i in args.gpus.split(',')]
-
- # learning rate
- lr, lr_scheduler = _get_lr_scheduler(args, kv)
-
- # create model
- model = mx.mod.Module(
- context=devs,
- symbol=network
- )
-
- lr_scheduler = lr_scheduler
- optimizer_params = {
- 'learning_rate': lr,
- 'wd': args.wd,
- 'lr_scheduler': lr_scheduler,
- 'multi_precision': True}
-
- # Only a limited number of optimizers have 'momentum' property
- has_momentum = {'sgd', 'dcasgd', 'nag', 'signum', 'lbsgd'}
- if args.optimizer in has_momentum:
- optimizer_params['momentum'] = args.mom
-
- monitor = mx.mon.Monitor(
- args.monitor, pattern=".*") if args.monitor > 0 else None
-
- # A limited number of optimizers have a warmup period
- has_warmup = {'lbsgd', 'lbnag'}
- if args.optimizer in has_warmup:
- nworkers = kv.num_workers
- if epoch_size < 1:
- epoch_size = 1
- macrobatch_size = args.macrobatch_size
- if macrobatch_size < args.batch_size * nworkers:
- macrobatch_size = args.batch_size * nworkers
- #batch_scale = round(float(macrobatch_size) / args.batch_size / nworkers +0.4999)
- batch_scale = math.ceil(
- float(macrobatch_size) / args.batch_size / nworkers)
- optimizer_params['updates_per_epoch'] = epoch_size
- optimizer_params['begin_epoch'] = args.load_epoch if args.load_epoch else 0
- optimizer_params['batch_scale'] = batch_scale
- optimizer_params['warmup_strategy'] = args.warmup_strategy
- optimizer_params['warmup_epochs'] = args.warmup_epochs
- optimizer_params['num_epochs'] = args.num_epochs
-
- if args.initializer == 'default':
- if args.network == 'alexnet':
- # AlexNet will not converge using Xavier
- initializer = mx.init.Normal()
- # VGG will not trend to converge using Xavier-Gaussian
- elif args.network and 'vgg' in args.network:
- initializer = mx.init.Xavier()
- else:
- initializer = mx.init.Xavier(
- rnd_type='gaussian', factor_type="in", magnitude=2)
- # initializer = mx.init.Xavier(factor_type="in", magnitude=2.34),
- elif args.initializer == 'xavier':
- initializer = mx.init.Xavier()
- elif args.initializer == 'msra':
- initializer = mx.init.MSRAPrelu()
- elif args.initializer == 'orthogonal':
- initializer = mx.init.Orthogonal()
- elif args.initializer == 'normal':
- initializer = mx.init.Normal()
- elif args.initializer == 'uniform':
- initializer = mx.init.Uniform()
- elif args.initializer == 'one':
- initializer = mx.init.One()
- elif args.initializer == 'zero':
- initializer = mx.init.Zero()
-
- # evaluation metrices
- eval_metrics = ['accuracy']
- if args.top_k > 0:
- eval_metrics.append(mx.metric.create(
- 'top_k_accuracy', top_k=args.top_k))
-
- supported_loss = ['ce', 'nll_loss']
- if len(args.loss) > 0:
- # ce or nll loss is only applicable to softmax output
- loss_type_list = args.loss.split(',')
- if 'softmax_output' in network.list_outputs():
- for loss_type in loss_type_list:
- loss_type = loss_type.strip()
- if loss_type == 'nll':
- loss_type = 'nll_loss'
- if loss_type not in supported_loss:
- logging.warning(loss_type + ' is not an valid loss type, only cross-entropy or '
- 'negative likelihood loss is supported!')
- else:
- eval_metrics.append(mx.metric.create(loss_type))
- else:
- logging.warning("The output is not softmax_output, loss argument will be skipped!")
-
- # callbacks that run after each batch
- batch_end_callbacks = [mx.callback.Speedometer(
- args.batch_size, args.disp_batches)]
- if 'batch_end_callback' in kwargs:
- cbs = kwargs['batch_end_callback']
- batch_end_callbacks += cbs if isinstance(cbs, list) else [cbs]
-
- # run
- model.fit(train,
- begin_epoch=args.load_epoch if args.load_epoch else 0,
- num_epoch=args.num_epochs,
- eval_data=val,
- eval_metric=eval_metrics,
- kvstore=kv,
- optimizer=args.optimizer,
- optimizer_params=optimizer_params,
- initializer=initializer,
- arg_params=arg_params,
- aux_params=aux_params,
- batch_end_callback=batch_end_callbacks,
- epoch_end_callback=checkpoint,
- allow_missing=True,
- monitor=monitor)
-
- if args.profile_server_suffix:
- mx.profiler.set_state(state='run', profile_process='server')
- if args.profile_worker_suffix:
- mx.profiler.set_state(state='run', profile_process='worker')
diff --git a/examples/v1beta1/trial-images/mxnet-mnist/common/utils.py b/examples/v1beta1/trial-images/mxnet-mnist/common/utils.py
deleted file mode 100644
index 3a0f64cc95b..00000000000
--- a/examples/v1beta1/trial-images/mxnet-mnist/common/utils.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import os
-import errno
-
-import mxnet as mx
-
-
-def download_file(url, local_fname=None, force_write=False):
- # requests is not default installed
- import requests
- if local_fname is None:
- local_fname = url.split('/')[-1]
- if not force_write and os.path.exists(local_fname):
- return local_fname
-
- dir_name = os.path.dirname(local_fname)
-
- if dir_name != "":
- if not os.path.exists(dir_name):
- try: # try to create the directory if it doesn't exists
- os.makedirs(dir_name)
- except OSError as exc:
- if exc.errno != errno.EEXIST:
- raise
-
- r = requests.get(url, stream=True)
- assert r.status_code == 200, "failed to open %s" % url
- with open(local_fname, 'wb') as f:
- for chunk in r.iter_content(chunk_size=1024):
- if chunk: # filter out keep-alive new chunks
- f.write(chunk)
- return local_fname
diff --git a/examples/v1beta1/trial-images/mxnet-mnist/install-arm-performance-libraries.sh b/examples/v1beta1/trial-images/mxnet-mnist/install-arm-performance-libraries.sh
deleted file mode 100755
index 8216a5f53aa..00000000000
--- a/examples/v1beta1/trial-images/mxnet-mnist/install-arm-performance-libraries.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright 2022 The Kubeflow Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -o errexit
-set -o nounset
-set -o pipefail
-cd "$(dirname "$0")"
-
-# Download Arm Performance Libraries for Ubuntu 22.04
-# Ref: https://developer.arm.com/downloads/-/arm-performance-libraries
-echo "Downloading Arm Performance Libraries for Ubuntu 22.04..."
-wget -qO - \
- "https://developer.arm.com/-/media/Files/downloads/hpc/arm-performance-libraries/23-04-1/ubuntu-22/arm-performance-libraries_23.04.1_Ubuntu-22.04_gcc-11.3.tar?rev=207c1f7aaa16400e94eb9a980494a6eb&revision=207c1f7a-aa16-400e-94eb-9a980494a6eb" \
- | tar -xf -
-
-# Install Arm Performance Libraries
-echo "Installing Arm Performance Libraries for Ubuntu 22.04..."
-./arm-performance-libraries_23.04.1_Ubuntu-22.04/arm-performance-libraries_23.04.1_Ubuntu-22.04.sh -a
-
-# Clean up
-echo "Removing installer..."
-rm -rf ./arm-performance-libraries_23.04.1_Ubuntu-22.04
diff --git a/examples/v1beta1/trial-images/mxnet-mnist/mnist.py b/examples/v1beta1/trial-images/mxnet-mnist/mnist.py
deleted file mode 100644
index 111de1fb950..00000000000
--- a/examples/v1beta1/trial-images/mxnet-mnist/mnist.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-Train mnist, see more explanation at https://mxnet.io/tutorials/python/mnist.html
-"""
-import os
-import argparse
-import logging
-import mxnet as mx
-import numpy as np
-import gzip
-import struct
-from common import fit
-from common import utils
-# This example only for mlp network
-from symbols import mlp
-
-# Use this format (%Y-%m-%dT%H:%M:%SZ) to record timestamp of the metrics
-logging.basicConfig(
- format="%(asctime)s %(levelname)-8s %(message)s",
- datefmt="%Y-%m-%dT%H:%M:%SZ",
- level=logging.DEBUG)
-
-
-def get_mnist_iter(args, kv):
- """
- Create data iterator with NDArrayIter
- """
- mnist = mx.test_utils.get_mnist()
-
- # Get MNIST data.
- train_data = mx.io.NDArrayIter(
- mnist['train_data'], mnist['train_label'], args.batch_size, shuffle=True)
- val_data = mx.io.NDArrayIter(
- mnist['test_data'], mnist['test_label'], args.batch_size)
-
- return (train_data, val_data)
-
-
-if __name__ == '__main__':
- # parse args
- parser = argparse.ArgumentParser(description="train mnist",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--num-classes', type=int, default=10,
- help='the number of classes')
- parser.add_argument('--num-examples', type=int, default=60000,
- help='the number of training examples')
-
- parser.add_argument('--add_stn', action="store_true", default=False,
- help='Add Spatial Transformer Network Layer (lenet only)')
- parser.add_argument('--image_shape', default='1, 28, 28', help='shape of training images')
-
- fit.add_fit_args(parser)
- parser.set_defaults(
- # network
- network='mlp',
- # train
- gpus=None,
- batch_size=64,
- disp_batches=100,
- num_epochs=10,
- lr=.05,
- lr_step_epochs='10'
- )
- args = parser.parse_args()
-
- # load mlp network
- sym = mlp.get_symbol(**vars(args))
-
- # train
- fit.fit(args, sym, get_mnist_iter)
diff --git a/examples/v1beta1/trial-images/mxnet-mnist/requirements.txt b/examples/v1beta1/trial-images/mxnet-mnist/requirements.txt
deleted file mode 100644
index fb439db250f..00000000000
--- a/examples/v1beta1/trial-images/mxnet-mnist/requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-mxnet==1.9.1
-# This is a workaround to avoid the following error.
-# AttributeError: module 'numpy' has no attribute 'bool'
-# See more: https://github.com/numpy/numpy/pull/22607
-numpy==1.23.5
diff --git a/examples/v1beta1/trial-images/mxnet-mnist/symbols/__init__.py b/examples/v1beta1/trial-images/mxnet-mnist/symbols/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/examples/v1beta1/trial-images/mxnet-mnist/symbols/mlp.py b/examples/v1beta1/trial-images/mxnet-mnist/symbols/mlp.py
deleted file mode 100644
index f6f6f0eba32..00000000000
--- a/examples/v1beta1/trial-images/mxnet-mnist/symbols/mlp.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-a simple multilayer perceptron
-"""
-import mxnet as mx
-
-
-def get_symbol(num_classes=10, **kwargs):
- data = mx.symbol.Variable('data')
- data = mx.sym.Flatten(data=data)
- fc1 = mx.symbol.FullyConnected(data=data, name='fc1', num_hidden=128)
- act1 = mx.symbol.Activation(data=fc1, name='relu1', act_type="relu")
- fc2 = mx.symbol.FullyConnected(data=act1, name='fc2', num_hidden=64)
- act2 = mx.symbol.Activation(data=fc2, name='relu2', act_type="relu")
- fc3 = mx.symbol.FullyConnected(data=act2, name='fc3', num_hidden=num_classes)
- mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax')
- return mlp
diff --git a/examples/v1beta1/trial-template/trial-metadata-substitution.yaml b/examples/v1beta1/trial-template/trial-metadata-substitution.yaml
index c183e5fb5a6..fe6e360b420 100644
--- a/examples/v1beta1/trial-template/trial-metadata-substitution.yaml
+++ b/examples/v1beta1/trial-template/trial-metadata-substitution.yaml
@@ -7,11 +7,9 @@ metadata:
name: trial-metadata-substitution
spec:
objective:
- type: maximize
- goal: 0.99
- objectiveMetricName: Validation-accuracy
- additionalMetricNames:
- - Train-accuracy
+ type: minimize
+ goal: 0.001
+ objectiveMetricName: loss
algorithm:
algorithmName: random
parallelTrialCount: 3
@@ -22,13 +20,21 @@ spec:
parameterType: double
feasibleSpace:
min: "0.01"
- max: "0.03"
+ max: "0.05"
+ - name: momentum
+ parameterType: double
+ feasibleSpace:
+ min: "0.5"
+ max: "0.9"
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
reference: lr
+ - name: momentum
+ description: Momentum for the training model
+ reference: momentum
- name: trialName
description: Name of the current trial's job
reference: ${trialSpec.Name}
@@ -60,12 +66,14 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/mxnet-mnist:latest
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
command:
- "python3"
- - "/opt/mxnet-mnist/mnist.py"
- - "--batch-size=64"
+ - "/opt/pytorch-mnist/mnist.py"
+ - "--epochs=1"
+ - "--batch-size=16"
- "--lr=${trialParameters.learningRate}"
+ - "--momentum=${trialParameters.momentum}"
env:
- name: TRIAL_NAME
value: ${trialParameters.trialName}
diff --git a/hack/boilerplate/update-boilerplate.sh b/hack/boilerplate/update-boilerplate.sh
index aa4ec011a9f..87c62b84068 100755
--- a/hack/boilerplate/update-boilerplate.sh
+++ b/hack/boilerplate/update-boilerplate.sh
@@ -57,7 +57,6 @@ find_python_files=$(
find ./cmd ./pkg ./hack ./test ./examples -name "*.py" \
! -path "./pkg/apis/manager/*" \
! -path "*__init__.py" \
- ! -path "./examples/v1beta1/trial-images/mxnet-mnist/*"
)
for i in ${find_python_files}; do
diff --git a/manifests/v1beta1/components/controller/trial-templates.yaml b/manifests/v1beta1/components/controller/trial-templates.yaml
index 916dd3c85b6..58a030b23b3 100644
--- a/manifests/v1beta1/components/controller/trial-templates.yaml
+++ b/manifests/v1beta1/components/controller/trial-templates.yaml
@@ -15,14 +15,14 @@ data:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/mxnet-mnist:latest
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
command:
- "python3"
- - "/opt/mxnet-mnist/mnist.py"
- - "--batch-size=64"
+ - "/opt/pytorch-mnist/mnist.py"
+ - "--epochs=1"
+ - "--batch-size=16"
- "--lr=${trialParameters.learningRate}"
- - "--num-layers=${trialParameters.numberLayers}"
- - "--optimizer=${trialParameters.optimizer}"
+ - "--momentum=${trialParameters.momentum}"
restartPolicy: Never
# For ConfigMap templates double quotes must set in commands to correct parse JSON parameters in Trial Template (e.g nn_config, architecture)
enasCPUTemplate: |-
diff --git a/pkg/controller.v1beta1/experiment/experiment_controller_test.go b/pkg/controller.v1beta1/experiment/experiment_controller_test.go
index b43f45d7f5c..38f59fddccc 100644
--- a/pkg/controller.v1beta1/experiment/experiment_controller_test.go
+++ b/pkg/controller.v1beta1/experiment/experiment_controller_test.go
@@ -477,12 +477,14 @@ func newFakeInstance() *experimentsv1beta1.Experiment {
Containers: []corev1.Container{
{
Name: primaryContainer,
- Image: "docker.io/kubeflowkatib/mxnet-mnist",
+ Image: "docker.io/kubeflowkatib/pytorch-mnist-cpu",
Command: []string{
"python3",
- "/opt/mxnet-mnist/mnist.py",
+ "/opt/pytorch-mnist/mnist.py",
+ "--epochs=1",
+ "--batch-size=16",
"--lr=${trialParameters.learningRate}",
- "--num-layers=${trialParameters.numberLayers}",
+ "--momentum=${trialParameters.momentum}",
},
},
},
@@ -611,12 +613,14 @@ func newFakeBatchJob() *batchv1.Job {
Containers: []corev1.Container{
{
Name: primaryContainer,
- Image: "docker.io/kubeflowkatib/mxnet-mnist",
+ Image: "docker.io/kubeflowkatib/pytorch-mnist-cpu",
Command: []string{
"python3",
- "/opt/mxnet-mnist/mnist.py",
- "--lr=0.01",
- "--num-layers=5",
+ "/opt/pytorch-mnist/mnist.py",
+ "--epochs=1",
+ "--batch-size=16",
+ "--lr=${trialParameters.learningRate}",
+ "--momentum=${trialParameters.momentum}",
},
},
},
diff --git a/pkg/controller.v1beta1/experiment/manifest/generator_test.go b/pkg/controller.v1beta1/experiment/manifest/generator_test.go
index fa3c3f6ff09..3adeb017f74 100644
--- a/pkg/controller.v1beta1/experiment/manifest/generator_test.go
+++ b/pkg/controller.v1beta1/experiment/manifest/generator_test.go
@@ -61,12 +61,14 @@ func TestGetRunSpecWithHP(t *testing.T) {
Containers: []v1.Container{
{
Name: "training-container",
- Image: "docker.io/kubeflowkatib/mxnet-mnist",
+ Image: "docker.io/kubeflowkatib/pytorch-mnist-cpu",
Command: []string{
"python3",
- "/opt/mxnet-mnist/mnist.py",
+ "/opt/pytorch-mnist/mnist.py",
+ "--epochs=1",
+ "--batch-size=16",
"--lr=0.05",
- "--num-layers=5",
+ "--momentum=0.9",
},
Env: []v1.EnvVar{
{Name: consts.TrialTemplateMetaKeyOfName, Value: "trial-name"},
@@ -176,12 +178,14 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/mxnet-mnist
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu
command:
- "python3"
- - "/opt/mxnet-mnist/mnist.py"
+ - "/opt/pytorch-mnist/mnist.py"
+ - "--epochs=1"
+ - "--batch-size=16"
- "--lr=${trialParameters.learningRate}"
- - "--num-layers=${trialParameters.numberLayers}"`
+ - "--momentum=${trialParameters.momentum}"`
invalidTrialSpec := `apiVersion: batch/v1
kind: Job
@@ -190,12 +194,14 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/mxnet-mnist
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu
command:
- python3
- - /opt/mxnet-mnist/mnist.py
+ - /opt/pytorch-mnist/mnist.py
+ - --epochs=1
+ - --batch-size=16
- --lr=${trialParameters.learningRate}
- - --num-layers=${trialParameters.numberLayers}
+ - --momentum=${trialParameters.momentum}
- --invalidParameter={'num_layers': 2, 'input_sizes': [32, 32, 3]}`
validGetConfigMap1 := c.EXPECT().GetConfigMap(gomock.Any(), gomock.Any()).Return(
@@ -228,12 +234,14 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/mxnet-mnist
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu
command:
- "python3"
- - "/opt/mxnet-mnist/mnist.py"
+ - "/opt/pytorch-mnist/mnist.py"
+ - "--epochs=1"
+ - "--batch-size=16"
- "--lr=0.05"
- - "--num-layers=5"`
+ - "--momentum=0.9"`
expectedRunSpec, err := util.ConvertStringToUnstructured(expectedStr)
if err != nil {
@@ -347,12 +355,14 @@ func newFakeInstance() *experimentsv1beta1.Experiment {
Containers: []v1.Container{
{
Name: "training-container",
- Image: "docker.io/kubeflowkatib/mxnet-mnist",
+ Image: "docker.io/kubeflowkatib/pytorch-mnist-cpu",
Command: []string{
"python3",
- "/opt/mxnet-mnist/mnist.py",
+ "/opt/pytorch-mnist/mnist.py",
+ "--epochs=1",
+ "--batch-size=16",
"--lr=${trialParameters.learningRate}",
- "--num-layers=${trialParameters.numberLayers}",
+ "--momentum=${trialParameters.momentum}",
},
Env: []v1.EnvVar{
{Name: consts.TrialTemplateMetaKeyOfName, Value: "${trialParameters.trialName}"},
@@ -381,9 +391,9 @@ func newFakeInstance() *experimentsv1beta1.Experiment {
Reference: "lr",
},
{
- Name: "numberLayers",
- Description: "Number of layers",
- Reference: "num-layers",
+ Name: "momentum",
+ Description: "Momentum for the training model",
+ Reference: "momentum",
},
{
Name: "trialName",
@@ -418,8 +428,8 @@ func newFakeParameterAssignment() []commonapiv1beta1.ParameterAssignment {
Value: "0.05",
},
{
- Name: "num-layers",
- Value: "5",
+ Name: "momentum",
+ Value: "0.9",
},
}
}
diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go
index 6e04813aaa0..dec9bee4851 100644
--- a/pkg/controller.v1beta1/trial/trial_controller_test.go
+++ b/pkg/controller.v1beta1/trial/trial_controller_test.go
@@ -388,12 +388,14 @@ func newFakeTrialBatchJob() *trialsv1beta1.Trial {
Containers: []corev1.Container{
{
Name: primaryContainer,
- Image: "docker.io/kubeflowkatib/mxnet-mnist",
+ Image: "docker.io/kubeflowkatib/pytorch-mnist-cpu",
Command: []string{
"python3",
- "/opt/mxnet-mnist/mnist.py",
+ "/opt/pytorch-mnist/mnist.py",
+ "--epochs=1",
+ "--batch-size=16",
"--lr=0.01",
- "--num-layers=5",
+ "--momentum=0.9",
},
},
},
diff --git a/pkg/ui/v1beta1/frontend/cypress/fixtures/trial-template.json b/pkg/ui/v1beta1/frontend/cypress/fixtures/trial-template.json
index ee6d19a6630..eb515c650fc 100644
--- a/pkg/ui/v1beta1/frontend/cypress/fixtures/trial-template.json
+++ b/pkg/ui/v1beta1/frontend/cypress/fixtures/trial-template.json
@@ -8,7 +8,7 @@
"Templates": [
{
"Path": "defaultTrialTemplate.yaml",
- "Yaml": "apiVersion: batch/v1\nkind: Job\nspec:\n template:\n spec:\n containers:\n - name: training-container\n image: docker.io/kubeflowkatib/mxnet-mnist:v1beta1-45c5727\n command:\n - \"python3\"\n - \"/opt/mxnet-mnist/mnist.py\"\n - \"--batch-size=64\"\n - \"--lr=${trialParameters.learningRate}\"\n - \"--num-layers=${trialParameters.numberLayers}\"\n - \"--optimizer=${trialParameters.optimizer}\"\n restartPolicy: Never"
+ "Yaml": "apiVersion: batch/v1\nkind: Job\nspec:\n template:\n spec:\n containers:\n - name: training-container\n image: docker.io/kubeflowkatib/pytorch-mnist:v1beta1-45c5727\n command:\n - \"python3\"\n - \"/opt/pytorch-mnist/mnist.py\"\n - \"--epochs=1\"\n - \"--lr=${trialParameters.learningRate}\"\n - \"--momentum=${trialParameters.momentum}\"\n restartPolicy: Never"
},
{
"Path": "enasCPUTemplate",
diff --git a/pkg/webhook/v1beta1/experiment/validator/validator_test.go b/pkg/webhook/v1beta1/experiment/validator/validator_test.go
index cef44ca737d..f17b886abe6 100644
--- a/pkg/webhook/v1beta1/experiment/validator/validator_test.go
+++ b/pkg/webhook/v1beta1/experiment/validator/validator_test.go
@@ -1273,10 +1273,10 @@ func newFakeInstance() *experimentsv1beta1.Experiment {
},
},
{
- Name: "num-layers",
+ Name: "momentum",
ParameterType: experimentsv1beta1.ParameterTypeCategorical,
FeasibleSpace: experimentsv1beta1.FeasibleSpace{
- List: []string{"1", "2", "3"},
+ List: []string{"0.95", "0.85", "0.75"},
},
},
},
@@ -1298,12 +1298,14 @@ func newFakeBatchJob() *batchv1.Job {
Containers: []v1.Container{
{
Name: "training-container",
- Image: "docker.io/kubeflowkatib/mxnet-mnist",
+ Image: "docker.io/kubeflowkatib/pytorch-mnist-cpu",
Command: []string{
"python3",
- "/opt/mxnet-mnist/mnist.py",
+ "--epochs=1",
+ "--batch-size=16",
+ "/opt/pytorch-mnist/mnist.py",
"--lr=${trialParameters.learningRate}",
- "--num-layers=${trialParameters.numberLayers}",
+ "--momentum=${trialParameters.momentum}",
},
},
},
@@ -1321,9 +1323,9 @@ func newFakeTrialParamters() []experimentsv1beta1.TrialParameterSpec {
Reference: "lr",
},
{
- Name: "numberLayers",
- Description: "Number of layers",
- Reference: "num-layers",
+ Name: "momentum",
+ Description: "Momentum for the training model",
+ Reference: "momentum",
},
}
}
diff --git a/scripts/v1beta1/build.sh b/scripts/v1beta1/build.sh
index 97d4b9ed039..f8cba66c34c 100755
--- a/scripts/v1beta1/build.sh
+++ b/scripts/v1beta1/build.sh
@@ -119,9 +119,6 @@ docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/darts-cnn-cifar10
echo -e "\nBuilding PyTorch CIFAR-10 CNN training container example for DARTS with GPU support...\n"
docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/darts-cnn-cifar10-gpu:${TAG}" -f examples/${VERSION}/trial-images/darts-cnn-cifar10/Dockerfile.gpu .
-echo -e "\nBuilding mxnet mnist training container example...\n"
-docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/mxnet-mnist:${TAG}" -f examples/${VERSION}/trial-images/mxnet-mnist/Dockerfile .
-
echo -e "\nBuilding PyTorch mnist training container example with CPU support...\n"
docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/pytorch-mnist-cpu:${TAG}" -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile.cpu .
diff --git a/scripts/v1beta1/push.sh b/scripts/v1beta1/push.sh
index 9a6c70c546f..474797abe83 100755
--- a/scripts/v1beta1/push.sh
+++ b/scripts/v1beta1/push.sh
@@ -86,9 +86,6 @@ docker push "${REGISTRY}/earlystopping-medianstop:${TAG}"
# Training container images
echo -e "\nPushing training container images..."
-echo -e "\nPushing mxnet mnist training container example...\n"
-docker push "${REGISTRY}/mxnet-mnist:${TAG}"
-
echo -e "\nPushing Tensorflow with summaries mnist training container example...\n"
docker push "${REGISTRY}/tf-mnist-with-summaries:${TAG}"
diff --git a/scripts/v1beta1/update-images.sh b/scripts/v1beta1/update-images.sh
index 21230db18a3..d7805f7af5d 100755
--- a/scripts/v1beta1/update-images.sh
+++ b/scripts/v1beta1/update-images.sh
@@ -82,7 +82,6 @@ done
# Katib Trial training container images.
# Postfixes for the each Trial image.
-MXNET_MNIST="mxnet-mnist"
PYTORCH_MNIST_CPU="pytorch-mnist-cpu"
PYTORCH_MNIST_GPU="pytorch-mnist-gpu"
TF_MNIST_WITH_SUMMARIES="tf-mnist-with-summaries"
@@ -93,7 +92,6 @@ DARTS_CPU="darts-cnn-cifar10-cpu"
SIMPLE_PBT="simple-pbt"
echo -e "Update Katib Trial training container images\n"
-update_yaml_files "./" "${OLD_PREFIX}${MXNET_MNIST}:.*" "${NEW_PREFIX}${MXNET_MNIST}:${TAG}"
update_yaml_files "./" "${OLD_PREFIX}${PYTORCH_MNIST_CPU}:.*" "${NEW_PREFIX}${PYTORCH_MNIST_CPU}:${TAG}"
update_yaml_files "./" "${OLD_PREFIX}${PYTORCH_MNIST_GPU}:.*" "${NEW_PREFIX}${PYTORCH_MNIST_GPU}:${TAG}"
update_yaml_files "./" "${OLD_PREFIX}${TF_MNIST_WITH_SUMMARIES}:.*" "${NEW_PREFIX}${TF_MNIST_WITH_SUMMARIES}:${TAG}"
diff --git a/test/e2e/v1beta1/hack/aws/argo_workflow.py b/test/e2e/v1beta1/hack/aws/argo_workflow.py
index ac5d97fd643..b38b5beeb78 100644
--- a/test/e2e/v1beta1/hack/aws/argo_workflow.py
+++ b/test/e2e/v1beta1/hack/aws/argo_workflow.py
@@ -55,7 +55,6 @@
"suggestion-enas": "cmd/suggestion/nas/enas/v1beta1/Dockerfile",
"suggestion-darts": "cmd/suggestion/nas/darts/v1beta1/Dockerfile",
"earlystopping-medianstop": "cmd/earlystopping/medianstop/v1beta1/Dockerfile",
- "trial-mxnet-mnist": "examples/v1beta1/trial-images/mxnet-mnist/Dockerfile",
"trial-pytorch-mnist": "examples/v1beta1/trial-images/pytorch-mnist/Dockerfile",
"trial-tf-mnist-with-summaries": "examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile",
"trial-enas-cnn-cifar10-gpu": "examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu",
diff --git a/test/e2e/v1beta1/testdata/invalid-experiment.yaml b/test/e2e/v1beta1/testdata/invalid-experiment.yaml
index 28018af61c1..4cc1d9ebb67 100644
--- a/test/e2e/v1beta1/testdata/invalid-experiment.yaml
+++ b/test/e2e/v1beta1/testdata/invalid-experiment.yaml
@@ -7,11 +7,9 @@ spec:
maxTrialCount: 13
maxFailedTrialCount: 3
objective:
- type: maximize
- goal: 0.99
- objectiveMetricName: Validation-accuracy
- additionalMetricNames:
- - Train-accuracy
+ type: minimize
+ goal: 0.001
+ objectiveMetricName: loss
algorithm:
algorithmName: invalid-algorithm # Invalid Algorithm to check that validation webhook is working
parameters:
@@ -19,31 +17,21 @@ spec:
parameterType: double
feasibleSpace:
min: "0.01"
- max: "0.03"
- - name: num-layers
- parameterType: int
- feasibleSpace:
- min: "2"
- max: "5"
- - name: optimizer
- parameterType: categorical
+ max: "0.05"
+ - name: momentum
+ parameterType: double
feasibleSpace:
- list:
- - sgd
- - adam
- - ftrl
+ min: "0.5"
+ max: "0.9"
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
reference: lr
- - name: numberLayers
- description: Number of training model layers
- reference: num-layers
- - name: optimizer
- description: Training model optimizer (sdg, adam or ftrl)
- reference: optimizer
+ - name: momentum
+ description: Momentum for the training model
+ reference: momentum
trialSpec:
apiVersion: batch/v1
kind: Job
@@ -52,12 +40,12 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/mxnet-mnist:latest
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
command:
- "python3"
- - "/opt/mxnet-mnist/mnist.py"
- - "--batch-size=64"
+ - "/opt/pytorch-mnist/mnist.py"
+ - "--epochs=1"
+ - "--batch-size=16"
- "--lr=${trialParameters.learningRate}"
- - "--num-layers=${trialParameters.numberLayers}"
- - "--optimizer=${trialParameters.optimizer}"
+ - "--momentum=${trialParameters.momentum}"
restartPolicy: Never
diff --git a/test/e2e/v1beta1/testdata/valid-experiment.yaml b/test/e2e/v1beta1/testdata/valid-experiment.yaml
index 25937bf2cfe..1ae8cc1d811 100644
--- a/test/e2e/v1beta1/testdata/valid-experiment.yaml
+++ b/test/e2e/v1beta1/testdata/valid-experiment.yaml
@@ -7,11 +7,9 @@ spec:
maxTrialCount: 13
maxFailedTrialCount: 3
objective:
- type: maximize
- goal: 0.99
- objectiveMetricName: Validation-accuracy
- additionalMetricNames:
- - Train-accuracy
+ type: minimize
+ goal: 0.001
+ objectiveMetricName: loss
algorithm:
algorithmName: random
parameters:
@@ -19,31 +17,21 @@ spec:
parameterType: double
feasibleSpace:
min: "0.01"
- max: "0.03"
- - name: num-layers
- parameterType: int
- feasibleSpace:
- min: "2"
- max: "5"
- - name: optimizer
- parameterType: categorical
+ max: "0.05"
+ - name: momentum
+ parameterType: double
feasibleSpace:
- list:
- - sgd
- - adam
- - ftrl
+ min: "0.5"
+ max: "0.9"
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
reference: lr
- - name: numberLayers
- description: Number of training model layers
- reference: num-layers
- - name: optimizer
- description: Training model optimizer (sdg, adam or ftrl)
- reference: optimizer
+ - name: momentum
+ description: Momentum for the training model
+ reference: momentum
trialSpec:
apiVersion: batch/v1
kind: Job
@@ -52,12 +40,12 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/mxnet-mnist:latest
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
command:
- "python3"
- - "/opt/mxnet-mnist/mnist.py"
- - "--batch-size=64"
+ - "/opt/pytorch-mnist/mnist.py"
+ - "--epochs=1"
+ - "--batch-size=16"
- "--lr=${trialParameters.learningRate}"
- - "--num-layers=${trialParameters.numberLayers}"
- - "--optimizer=${trialParameters.optimizer}"
+ - "--momentum=${trialParameters.momentum}"
restartPolicy: Never