From 6fa91e3176e831622c8a1432cc5eac349d6f00f3 Mon Sep 17 00:00:00 2001
From: KUAN-HSUN-LI
Date: Mon, 6 Sep 2021 17:02:52 +0800
Subject: [PATCH] SUBMARINE-1008. Submarine python code formatter and linter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What is this PR for?
Currently, the Submarine Python code formatter does not work well. This PR does the following:
1. Replace `yapf` with `black`
2. Upgrade `isort`
3. Use the `flake8` linter
4. Format the code in dev-support, submarine-sdk, and website
5. Only run the lint check on dev-support and submarine-sdk, since the Python code in website has its own style (imports are not at the top of the file)
6. Update the SDK development guide

### What type of PR is it?
[Improvement]

### Todos

### What is the Jira issue?
https://issues.apache.org/jira/browse/SUBMARINE-1008

### How should this be tested?
I have updated the CI in GitHub Actions.

### Screenshots (if appropriate)
Original `dev-support/style-check/python/lint.sh` results. I have fixed all of the errors shown in the images below.
![Screenshot 2021-09-06 094648](https://user-images.githubusercontent.com/38066413/132158081-bdcfe866-08b6-44fa-a5d2-23f9cec3aaa9.png)
![image](https://user-images.githubusercontent.com/38066413/132158044-ae18abed-469f-4cf9-a866-b0144a8f2698.png)

### Questions:
* Do the license files need updating? No
* Are there breaking changes for older versions? Yes
* Does this need new documentation? Yes

Author: KUAN-HSUN-LI

Signed-off-by: Kevin

Closes #736 from KUAN-HSUN-LI/SUBMARINE-1008 and squashes the following commits:

157ac0e3 [KUAN-HSUN-LI] SUBMARINE-1008. Remove useless file and update the 0.6.0 docs
633cdc1a [KUAN-HSUN-LI] add License
643bb946 [KUAN-HSUN-LI] SUBMARINE-1008. Update development guide
0638f676 [KUAN-HSUN-LI] SUBMARINE-1008. Update linter in ci
8892b263 [KUAN-HSUN-LI] SUBMARINE-1008. Update develop guide
4e086e86 [KUAN-HSUN-LI] SUBMARINE-1008. Format with isort and black. Check through flake8
7f679fdf [KUAN-HSUN-LI] SUBMARINE-1008.
Setup black, isort and flake8
---
 .flake8                                       | 19 +
 .github/workflows/python.yml                  | 4 +-
 dev-support/cicd/merge_submarine_pr.py        | 147 +--
 dev-support/database/init-database.py         | 9 +-
 .../mnist-pytorch/DDP/mnist_distributed.py    | 198 ++--
 .../mnist_keras_distributed.py                | 110 ++-
 .../mnist_keras_distributed.py                | 107 +-
 .../mnist_keras_distributed.py                | 77 +-
 dev-support/examples/nn-pytorch/model.py      | 11 +-
 dev-support/examples/quickstart/train.py      | 106 +-
 dev-support/examples/tracking/tracking.py     | 17 +-
 .../spark-script/pyspark-yarn.py              | 14 +-
 .../submarine/image_classification.py         | 479 +++++----
 .../submarine/mnist_distributed.py            | 142 +--
 .../submarine/mnist_distributed_tf2.py        | 70 +-
 .../submarine/pytorch_mnist_distributed.py    | 103 +-
 dev-support/misc/flask/server.py              | 6 +-
 .../style-check/python}/auto-format.sh        | 10 +-
 .../style-check/python}/lint-requirements.txt | 8 +-
 .../style-check/python}/lint.sh               | 27 +-
 .../scripts/combine-docker-daemons.py         | 22 +-
 .../submarine-installer/scripts/xmlcombine.py | 4 +-
 pyproject.toml                                | 23 +
 .../pysubmarine/example/deepfm_example.ipynb  | 2 +-
 .../example/pytorch/afm/run_afm.py            | 24 +-
 .../example/pytorch/deepfm/run_deepfm.py      | 24 +-
 .../example/submarine_experiment_sdk.ipynb    | 36 +-
 .../example/tensorflow/ccpm/run_ccpm.py       | 14 +-
 .../example/tensorflow/deepfm/run_deepfm.py   | 14 +-
 .../example/tensorflow/fm/run_fm.py           | 14 +-
 submarine-sdk/pysubmarine/example/tracking.py | 4 +-
 submarine-sdk/pysubmarine/setup.py            | 56 +-
 .../pysubmarine/submarine/__init__.py         | 15 +-
 .../submarine/entities/_submarine_object.py   | 17 +-
 .../pysubmarine/submarine/exceptions.py       | 8 +-
 .../submarine/experiment/__init__.py          | 12 +-
 .../submarine/experiment/api/__init__.py      | 4 +-
 .../experiment/api/experiment_api.py          | 413 ++++----
 .../experiment/api/experiment_client.py       | 24 +-
 .../submarine/experiment/api_client.py        | 423 ++++----
 .../submarine/experiment/configuration.py     | 61 +-
 .../submarine/experiment/exceptions.py        | 15 +-
 .../submarine/experiment/models/code_spec.py  | 30 +-
 .../experiment/models/environment_spec.py     | 50 +-
 .../experiment/models/experiment_meta.py      | 50 +-
 .../experiment/models/experiment_spec.py      | 39 +-
 .../experiment/models/experiment_task_spec.py | 70 +-
 .../experiment/models/json_response.py        | 40 +-
 .../experiment/models/kernel_spec.py          | 40 +-
 .../pysubmarine/submarine/experiment/rest.py  | 325 ++++---
 .../submarine/ml/abstract_model.py            | 4 +-
 .../ml/pytorch/input/libsvm_dataset.py        | 54 +-
 .../submarine/ml/pytorch/layers/core.py       | 44 +-
 .../pysubmarine/submarine/ml/pytorch/loss.py  | 10 +-
 .../submarine/ml/pytorch/metric.py            | 12 +-
 .../ml/pytorch/model/base_pytorch_model.py    | 74 +-
 .../ml/pytorch/model/ctr/__init__.py          | 4 +-
 .../submarine/ml/pytorch/model/ctr/afm.py     | 59 +-
 .../submarine/ml/pytorch/model/ctr/deepfm.py  | 47 +-
 .../submarine/ml/pytorch/optimizer.py         | 8 +-
 .../submarine/ml/pytorch/parameters.py        | 29 +-
 .../submarine/ml/tensorflow/input/input.py    | 28 +-
 .../submarine/ml/tensorflow/layers/core.py    | 134 +--
 .../submarine/ml/tensorflow/model/__init__.py | 2 +-
 .../ml/tensorflow/model/base_tf_model.py      | 58 +-
 .../submarine/ml/tensorflow/model/ccpm.py     | 43 +-
 .../submarine/ml/tensorflow/model/deepfm.py   | 19 +-
 .../submarine/ml/tensorflow/model/fm.py       | 10 +-
 .../submarine/ml/tensorflow/model/nfm.py      | 17 +-
 .../submarine/ml/tensorflow/optimizer.py      | 22 +-
 .../submarine/ml/tensorflow/parameters.py     | 13 +-
 .../pysubmarine/submarine/models/client.py    | 30 +-
 .../pysubmarine/submarine/models/utils.py     | 4 +-
 .../submarine/store/database/db_types.py      | 8 +-
 .../submarine/store/database/models.py        | 46 +-
 .../submarine/store/sqlalchemy_store.py       | 34 +-
 .../submarine/tracking/__init__.py            | 8 +-
 .../pysubmarine/submarine/tracking/client.py  | 8 +-
 .../pysubmarine/submarine/tracking/fluent.py  | 5 +-
 .../pysubmarine/submarine/tracking/utils.py   | 4 +-
 .../pysubmarine/submarine/utils/__init__.py   | 7 +-
 .../pysubmarine/submarine/utils/env.py        | 8 +-
 .../pysubmarine/submarine/utils/fileio.py     | 20 +-
 .../submarine/utils/pytorch_utils.py          | 6 +-
 .../pysubmarine/submarine/utils/rest_utils.py | 36 +-
 .../pysubmarine/submarine/utils/tf_utils.py   | 98 +-
 .../pysubmarine/submarine/utils/validation.py | 44 +-
 .../experiment/test_experiment_client.py      | 49 +-
 .../tests/ml/pytorch/model/conftest.py        | 31 +-
 .../tests/ml/pytorch/test_loss_pytorch.py     | 6 +-
 .../tests/ml/pytorch/test_metric_pytorch.py   | 6 +-
 .../ml/pytorch/test_optimizer_pytorch.py      | 6 +-
 .../tests/ml/tensorflow/model/conftest.py     | 14 +-
 .../ml/tensorflow/model/test_base_tf_model.py | 7 +-
 .../tests/ml/tensorflow/test_optimizer.py     | 7 +-
 .../pysubmarine/tests/models/pytorch.py       | 2 -
 .../pysubmarine/tests/models/test_model.py    | 20 +-
 .../tests/models/test_model_e2e.py            | 15 +-
 .../tests/store/test_sqlalchemy_store.py      | 11 +-
 .../tests/tracking/test_tracking.py           | 11 +-
 .../pysubmarine/tests/tracking/test_utils.py  | 16 +-
 .../pysubmarine/tests/utils/test_env.py       | 29 +-
 .../tests/utils/test_rest_utils.py            | 42 +-
 .../pysubmarine/tests/utils/test_tf_utils.py  | 32 +-
 .../tests/utils/test_validation.py            | 10 +-
 .../submarine-sdk/pysubmarine/development.md  | 20 +-
 .../with-cifar10-models/cifar10_tutorial.py   | 148 ++-
 .../cifar10_estimator_tf_1.13.1/cifar10.py    | 169 ++--
 .../cifar10_main.py                           | 918 +++++++++---------
 .../cifar10_model.py                          | 96 +-
 .../cifar10_utils.py                          | 247 ++---
 .../generate_cifar10_tfrecords.py             | 133 ++-
 .../cifar10_estimator_tf_1.13.1/model_base.py | 372 ++++---
 .../submarine-sdk/pysubmarine/development.md  | 20 +-
 .../with-cifar10-models/cifar10_tutorial.py   | 148 ++-
 .../cifar10_estimator_tf_1.13.1/cifar10.py    | 169 ++--
 .../cifar10_main.py                           | 918 +++++++++---------
 .../cifar10_model.py                          | 96 +-
 .../cifar10_utils.py                          | 247 ++---
 .../generate_cifar10_tfrecords.py             | 133 ++-
 .../cifar10_estimator_tf_1.13.1/model_base.py | 372 ++++---
 121 files changed, 4640 insertions(+), 4474 deletions(-)
 create mode 100644 .flake8
 rename {submarine-sdk/pysubmarine/github-actions => dev-support/style-check/python}/auto-format.sh (85%)
 rename {submarine-sdk/pysubmarine/github-actions => dev-support/style-check/python}/lint-requirements.txt (91%)
 rename {submarine-sdk/pysubmarine/github-actions => dev-support/style-check/python}/lint.sh (54%)
 create mode 100644 pyproject.toml

diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000000..b916a3e6d2
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,19 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +[flake8] +max-line-length = 100 \ No newline at end of file diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index fbc79e2c60..9ffdb4de10 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -40,9 +40,9 @@ jobs: pip install --no-cache-dir torch==1.5.0 pip install --no-cache-dir ./submarine-sdk/pysubmarine/. pip install -r ./submarine-sdk/pysubmarine/github-actions/test-requirements.txt - pip install -r ./submarine-sdk/pysubmarine/github-actions/lint-requirements.txt + pip install -r ./dev-support/style-check/python/lint-requirements.txt - name: Check python sdk code style - run: ./submarine-sdk/pysubmarine/github-actions/lint.sh + run: ./dev-support/style-check/python/lint.sh - name: Run unit test run: pytest --cov=submarine -vs -m "not e2e" integration: diff --git a/dev-support/cicd/merge_submarine_pr.py b/dev-support/cicd/merge_submarine_pr.py index 5a8c3b7a58..7f5d1c49ac 100755 --- a/dev-support/cicd/merge_submarine_pr.py +++ b/dev-support/cicd/merge_submarine_pr.py @@ -29,10 +29,12 @@ import re import subprocess import sys + import urllib2 try: import jira.client + JIRA_IMPORTED = True except ImportError: JIRA_IMPORTED = False @@ -61,19 +63,19 @@ def get_json(url): try: return json.load(urllib2.urlopen(url)) - except urllib2.HTTPError as e: - print "Unable to fetch URL, exiting: %s" % url + except urllib2.HTTPError: + print("Unable to fetch URL, exiting: %s" % url) sys.exit(-1) def fail(msg): - print msg + print(msg) clean_up() sys.exit(-1) def run_cmd(cmd): - print cmd + print(cmd) if isinstance(cmd, list): return subprocess.check_output(cmd) else: @@ -81,7 +83,7 @@ def run_cmd(cmd): def continue_maybe(prompt): - result = raw_input("\n%s (y/n): " % prompt) + result = input("\n%s (y/n): " % prompt) if result.lower() != "y": fail("Okay, exiting") @@ -90,13 +92,13 @@ def continue_maybe(prompt): def clean_up(): - print "Restoring head pointer to %s" % original_head + print("Restoring head pointer to %s" % original_head) run_cmd("git checkout %s" % original_head) branches = run_cmd("git branch").replace(" ", "").split("\n") for branch in filter(lambda x: x.startswith(BRANCH_PREFIX), branches): - print "Deleting local branch %s" % branch + print("Deleting local branch %s" % branch) run_cmd("git branch -D %s" % branch) @@ -110,7 +112,7 @@ def merge_pr(pr_num, target_ref): had_conflicts = False try: - run_cmd(['git', 'merge', pr_branch_name, '--squash']) + run_cmd(["git", "merge", pr_branch_name, "--squash"]) except Exception as e: msg = "Error merging: %s\nWould you like to manually fix-up this merge?" 
% e continue_maybe(msg) @@ -118,15 +120,17 @@ def merge_pr(pr_num, target_ref): continue_maybe(msg) had_conflicts = True - commit_authors = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, - '--pretty=format:%an <%ae>']).split("\n") - commit_date = run_cmd(['git', 'log', '%s' % pr_branch_name, '-1', - '--pretty=format:%ad']) - distinct_authors = sorted(set(commit_authors), - key=lambda x: commit_authors.count(x), reverse=True) + commit_authors = run_cmd( + ["git", "log", "HEAD..%s" % pr_branch_name, "--pretty=format:%an <%ae>"] + ).split("\n") + commit_date = run_cmd(["git", "log", "%s" % pr_branch_name, "-1", "--pretty=format:%ad"]) + distinct_authors = sorted( + set(commit_authors), key=lambda x: commit_authors.count(x), reverse=True + ) primary_author = distinct_authors[0] - commits = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, - '--pretty=format:%h [%an] %s']).split("\n\n") + commits = run_cmd( + ["git", "log", "HEAD..%s" % pr_branch_name, "--pretty=format:%h [%an] %s"] + ).split("\n\n") merge_message_flags = [] @@ -134,7 +138,7 @@ def merge_pr(pr_num, target_ref): if body is not None: # We remove @ symbols from the body to avoid triggering e-mails # to people every time someone creates a public fork of Submarine. - if isinstance(body, unicode): + if isinstance(body, re.UNICODE): merge_message_flags += ["-m", body.encode("utf-8").replace("@", "")] else: merge_message_flags += ["-m", body.replace("@", "")] @@ -145,27 +149,37 @@ def merge_pr(pr_num, target_ref): committer_name = run_cmd("git config --get user.name").strip() committer_email = run_cmd("git config --get user.email").strip() - merge_message_flags += ["-m", "\n" + "Signed-off-by: %s <%s>" % (committer_name, committer_email)] + merge_message_flags += [ + "-m", + "\n" + "Signed-off-by: %s <%s>" % (committer_name, committer_email), + ] if had_conflicts: message = "This patch had conflicts when merged, resolved by\nCommitter: %s <%s>" % ( - committer_name, committer_email) + committer_name, + committer_email, + ) merge_message_flags += ["-m", message] # The string "Closes #%s" string is required for GitHub to correctly close the PR merge_message_flags += [ "-m", - "Closes #%s from %s and squashes the following commits:" % (pr_num, pr_repo_desc)] + "Closes #%s from %s and squashes the following commits:" % (pr_num, pr_repo_desc), + ] for c in commits: merge_message_flags += ["-m", c] - run_cmd(['git', 'commit', '--author="%s"' % primary_author, '--date="%s"' % commit_date] + merge_message_flags) + run_cmd( + ["git", "commit", '--author="%s"' % primary_author, '--date="%s"' % commit_date] + + merge_message_flags + ) - continue_maybe("Merge complete (local ref %s). Push to %s?" % ( - target_branch_name, PUSH_REMOTE_NAME)) + continue_maybe( + "Merge complete (local ref %s). Push to %s?" % (target_branch_name, PUSH_REMOTE_NAME) + ) try: - run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, target_branch_name, target_ref)) + run_cmd("git push %s %s:%s" % (PUSH_REMOTE_NAME, target_branch_name, target_ref)) except Exception as e: clean_up() fail("Exception while pushing: %s" % e) @@ -178,7 +192,7 @@ def merge_pr(pr_num, target_ref): def cherry_pick(pr_num, merge_hash, default_branch): - pick_ref = raw_input("Enter a branch name [%s]: " % default_branch) + pick_ref = input("Enter a branch name [%s]: " % default_branch) if pick_ref == "": pick_ref = default_branch @@ -195,11 +209,12 @@ def cherry_pick(pr_num, merge_hash, default_branch): msg = "Okay, please fix any conflicts and finish the cherry-pick. Finished?" 
continue_maybe(msg) - continue_maybe("Pick complete (local ref %s). Push to %s?" % ( - pick_branch_name, PUSH_REMOTE_NAME)) + continue_maybe( + "Pick complete (local ref %s). Push to %s?" % (pick_branch_name, PUSH_REMOTE_NAME) + ) try: - run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, pick_branch_name, pick_ref)) + run_cmd("git push %s %s:%s" % (PUSH_REMOTE_NAME, pick_branch_name, pick_ref)) except Exception as e: clean_up() fail("Exception while pushing: %s" % e) @@ -222,10 +237,11 @@ def fix_version_from_branch(branch, versions): def resolve_jira_issue(merge_branches, comment, default_jira_id=""): - asf_jira = jira.client.JIRA({'server': JIRA_API_BASE}, - basic_auth=(JIRA_USERNAME, JIRA_PASSWORD)) + asf_jira = jira.client.JIRA( + {"server": JIRA_API_BASE}, basic_auth=(JIRA_USERNAME, JIRA_PASSWORD) + ) - jira_id = raw_input("Enter a JIRA id [%s]: " % default_jira_id) + jira_id = input("Enter a JIRA id [%s]: " % default_jira_id) if jira_id == "": jira_id = default_jira_id @@ -244,15 +260,23 @@ def resolve_jira_issue(merge_branches, comment, default_jira_id=""): if cur_status == "Resolved" or cur_status == "Closed": fail("JIRA issue %s already has status '%s'" % (jira_id, cur_status)) - print ("=== JIRA %s ===" % jira_id) - print ("summary\t\t%s\nassignee\t%s\nstatus\t\t%s\nurl\t\t%s/%s\n" % ( - cur_summary, cur_assignee, cur_status, JIRA_BASE, jira_id)) + print("=== JIRA %s ===" % jira_id) + print( + "summary\t\t%s\nassignee\t%s\nstatus\t\t%s\nurl\t\t%s/%s\n" + % ( + cur_summary, + cur_assignee, + cur_status, + JIRA_BASE, + jira_id, + ) + ) versions = asf_jira.project_versions("SUBMARINE") versions = sorted(versions, key=lambda x: x.name, reverse=True) - versions = filter(lambda x: x.raw['released'] is False, versions) + versions = filter(lambda x: x.raw["released"] is False, versions) # Consider only x.y.z versions - versions = filter(lambda x: re.match('\d+\.\d+\.\d+', x.name), versions) + versions = filter(lambda x: re.match(r"\d+\.\d+\.\d+", x.name), versions) default_fix_versions = map(lambda x: fix_version_from_branch(x, versions).name, merge_branches) for v in default_fix_versions: @@ -267,7 +291,7 @@ def resolve_jira_issue(merge_branches, comment, default_jira_id=""): default_fix_versions = filter(lambda x: x != v, default_fix_versions) default_fix_versions = ",".join(default_fix_versions) - fix_versions = raw_input("Enter comma-separated fix version(s) [%s]: " % default_fix_versions) + fix_versions = input("Enter comma-separated fix version(s) [%s]: " % default_fix_versions) if fix_versions == "": fix_versions = default_fix_versions fix_versions = fix_versions.replace(" ", "").split(",") @@ -277,11 +301,12 @@ def get_version_json(version_str): jira_fix_versions = map(lambda v: get_version_json(v), fix_versions) - resolve = filter(lambda a: a['name'] == "Resolve Issue", asf_jira.transitions(jira_id))[0] + resolve = filter(lambda a: a["name"] == "Resolve Issue", asf_jira.transitions(jira_id))[0] asf_jira.transition_issue( - jira_id, resolve["id"], fixVersions=jira_fix_versions, comment=comment) + jira_id, resolve["id"], fixVersions=jira_fix_versions, comment=comment + ) - print "Successfully resolved %s with fixVersions=%s!" % (jira_id, fix_versions) + print("Successfully resolved %s with fixVersions=%s!" 
% (jira_id, fix_versions)) def resolve_jira_issues(title, merge_branches, comment): @@ -293,13 +318,13 @@ def resolve_jira_issues(title, merge_branches, comment): resolve_jira_issue(merge_branches, comment, jira_id) -#branches = get_json("%s/branches" % GITHUB_API_BASE) -#branch_names = filter(lambda x: x.startswith("branch-"), [x['name'] for x in branches]) +# branches = get_json("%s/branches" % GITHUB_API_BASE) +# branch_names = filter(lambda x: x.startswith("branch-"), [x['name'] for x in branches]) # Assumes branch names can be sorted lexicographically -#latest_branch = sorted(branch_names, reverse=True)[0] +# latest_branch = sorted(branch_names, reverse=True)[0] latest_branch = "master" -pr_num = raw_input("Which pull request would you like to merge? (e.g. 23): ") +pr_num = input("Which pull request would you like to merge? (e.g. 23): ") pr = get_json("%s/pulls/%s" % (GITHUB_API_BASE, pr_num)) pr_events = get_json("%s/issues/%s/events" % (GITHUB_API_BASE, pr_num)) @@ -313,31 +338,33 @@ def resolve_jira_issues(title, merge_branches, comment): # Merged pull requests don't appear as merged in the GitHub API; # Instead, they're closed by asfgit. -merge_commits = \ - [e for e in pr_events if e["actor"]["login"] == "asfgit" and e["event"] == "closed"] +merge_commits = [e for e in pr_events if e["actor"]["login"] == "asfgit" and e["event"] == "closed"] if merge_commits: merge_hash = merge_commits[0]["commit_id"] message = get_json("%s/commits/%s" % (GITHUB_API_BASE, merge_hash))["commit"]["message"] - print "Pull request %s has already been merged, assuming you want to backport" % pr_num - commit_is_downloaded = run_cmd(['git', 'rev-parse', '--quiet', '--verify', - "%s^{commit}" % merge_hash]).strip() != "" + print("Pull request %s has already been merged, assuming you want to backport" % pr_num) + commit_is_downloaded = ( + run_cmd(["git", "rev-parse", "--quiet", "--verify", "%s^{commit}" % merge_hash]).strip() + != "" + ) if not commit_is_downloaded: fail("Couldn't find any merge commit for #%s, you may need to update HEAD." % pr_num) - print "Found commit %s:\n%s" % (merge_hash, message) + print("Found commit %s:\n%s" % (merge_hash, message)) cherry_pick(pr_num, merge_hash, latest_branch) sys.exit(0) if not bool(pr["mergeable"]): - msg = "Pull request %s is not mergeable in its current form.\n" % pr_num + \ - "Continue? (experts only!)" + msg = ( + "Pull request %s is not mergeable in its current form.\n" % pr_num + + "Continue? (experts only!)" + ) continue_maybe(msg) -print ("\n=== Pull Request #%s ===" % pr_num) -print ("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" % ( - title, pr_repo_desc, target_ref, url)) +print("\n=== Pull Request #%s ===" % pr_num) +print("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" % (title, pr_repo_desc, target_ref, url)) continue_maybe("Proceed with merging pull request #%s?" % pr_num) merged_refs = [target_ref] @@ -345,7 +372,7 @@ def resolve_jira_issues(title, merge_branches, comment): merge_hash = merge_pr(pr_num, target_ref) pick_prompt = "Would you like to pick %s into another branch?" 
% merge_hash -while raw_input("\n%s (y/n): " % pick_prompt).lower() == "y": +while input("\n%s (y/n): " % pick_prompt).lower() == "y": merged_refs = merged_refs + [cherry_pick(pr_num, merge_hash, latest_branch)] if JIRA_IMPORTED: @@ -354,8 +381,8 @@ def resolve_jira_issues(title, merge_branches, comment): jira_comment = "Issue resolved by pull request %s\n[%s/%s]" % (pr_num, GITHUB_BASE, pr_num) resolve_jira_issues(title, merged_refs, jira_comment) else: - print "JIRA_USERNAME and JIRA_PASSWORD not set" - print "Exiting without trying to close the associated JIRA." + print("JIRA_USERNAME and JIRA_PASSWORD not set") + print("Exiting without trying to close the associated JIRA.") else: - print "Could not find jira library. Run 'sudo pip install jira' to install." - print "Exiting without trying to close the associated JIRA." + print("Could not find jira library. Run 'sudo pip install jira' to install.") + print("Exiting without trying to close the associated JIRA.") diff --git a/dev-support/database/init-database.py b/dev-support/database/init-database.py index f98d509abd..aa1ac69060 100644 --- a/dev-support/database/init-database.py +++ b/dev-support/database/init-database.py @@ -18,8 +18,7 @@ import mysql.connector -conn = mysql.connector.connect( - user='root', password='password', host='127.0.0.1') +conn = mysql.connector.connect(user="root", password="password", host="127.0.0.1") cursor = conn.cursor(buffered=True) @@ -35,10 +34,12 @@ def commit(sql): # Commit your changes in the database conn.commit() - except: + except mysql.connector.Error as err: + print("Something went wrong: {}".format(err)) # Rolling back in case of error conn.rollback() + def commit_from_file(file_path): with open(file_path) as f: for result in cursor.execute(f.read(), multi=True): @@ -76,4 +77,4 @@ def commit_from_file(file_path): commit("GRANT ALL PRIVILEGES ON *.* TO 'metastore'@'%';") commit("use metastore;") commit_from_file("./dev-support/database/metastore.sql") -commit("show tables;") \ No newline at end of file +commit("show tables;") diff --git a/dev-support/examples/mnist-pytorch/DDP/mnist_distributed.py b/dev-support/examples/mnist-pytorch/DDP/mnist_distributed.py index fd24a4ea2b..76161bcbc4 100644 --- a/dev-support/examples/mnist-pytorch/DDP/mnist_distributed.py +++ b/dev-support/examples/mnist-pytorch/DDP/mnist_distributed.py @@ -16,29 +16,32 @@ """ from __future__ import print_function -from submarine import ModelsClient import argparse import os -from tensorboardX import SummaryWriter -from torchvision import datasets, transforms import torch import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F import torch.optim as optim +from tensorboardX import SummaryWriter +from torchvision import datasets, transforms + +from submarine import ModelsClient + +WORLD_SIZE = int(os.environ.get("WORLD_SIZE", 1)) +rank = int(os.environ.get("RANK", 0)) + +print("WORLD={} , RANK={}".format(WORLD_SIZE, rank)) -WORLD_SIZE = int(os.environ.get('WORLD_SIZE', 1)) -rank = int(os.environ.get('RANK', 0)) -print('WORLD={} , RANK={}'.format(WORLD_SIZE,rank)) class Net(nn.Module): def __init__(self): super(Net, self).__init__() self.conv1 = nn.Conv2d(1, 20, 5, 1) self.conv2 = nn.Conv2d(20, 50, 5, 1) - self.fc1 = nn.Linear(4*4*50, 500) + self.fc1 = nn.Linear(4 * 4 * 50, 500) self.fc2 = nn.Linear(500, 10) def forward(self, x): @@ -46,11 +49,12 @@ def forward(self, x): x = F.max_pool2d(x, 2, 2) x = F.relu(self.conv2(x)) x = F.max_pool2d(x, 2, 2) - x = x.view(-1, 4*4*50) + x = x.view(-1, 4 * 4 * 
50) x = F.relu(self.fc1(x)) x = self.fc2(x) return F.log_softmax(x, dim=1) - + + def train(args, model, device, train_loader, optimizer, epoch, writer, periscope): model.train() for batch_idx, (data, target) in enumerate(train_loader): @@ -61,12 +65,19 @@ def train(args, model, device, train_loader, optimizer, epoch, writer, periscope loss.backward() optimizer.step() if batch_idx % args.log_interval == 0: - print('Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}'.format( - epoch, batch_idx * len(data), len(train_loader.dataset), - 100. * batch_idx / len(train_loader), loss.item())) + print( + "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format( + epoch, + batch_idx * len(data), + len(train_loader.dataset), + 100.0 * batch_idx / len(train_loader), + loss.item(), + ) + ) niter = epoch * len(train_loader) + batch_idx - writer.add_scalar('loss', loss.item(), niter) - periscope.log_metric('loss', loss.item(), niter) + writer.add_scalar("loss", loss.item(), niter) + periscope.log_metric("loss", loss.item(), niter) + def test(args, model, device, test_loader, writer, epoch, periscope): model.eval() @@ -76,14 +87,15 @@ def test(args, model, device, test_loader, writer, epoch, periscope): for data, target in test_loader: data, target = data.to(device), target.to(device) output = model(data) - test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss - pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability + test_loss += F.nll_loss(output, target, reduction="sum").item() # sum up batch loss + pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability correct += pred.eq(target.view_as(pred)).sum().item() test_loss /= len(test_loader.dataset) - print('\naccuracy={:.4f}\n'.format(float(correct) / len(test_loader.dataset))) - writer.add_scalar('accuracy', float(correct) / len(test_loader.dataset), epoch) - periscope.log_metric('accuracy', float(correct) / len(test_loader.dataset), epoch) + print("\naccuracy={:.4f}\n".format(float(correct) / len(test_loader.dataset))) + writer.add_scalar("accuracy", float(correct) / len(test_loader.dataset), epoch) + periscope.log_metric("accuracy", float(correct) / len(test_loader.dataset), epoch) + def should_distribute(): return dist.is_available() and WORLD_SIZE > 1 @@ -93,39 +105,63 @@ def is_distributed(): return dist.is_available() and dist.is_initialized() -if __name__ == '__main__': +if __name__ == "__main__": # Training settings - parser = argparse.ArgumentParser(description='PyTorch MNIST Example') - parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') - parser.add_argument('--epochs', type=int, default=5, metavar='N', - help='number of epochs to train (default: 5)') - parser.add_argument('--lr', type=float, default=0.01, metavar='LR', - help='learning rate (default: 0.01)') - parser.add_argument('--momentum', type=float, default=0.5, metavar='M', - help='SGD momentum (default: 0.5)') - parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - parser.add_argument('--seed', type=int, default=1, metavar='S', - help='random seed (default: 1)') - parser.add_argument('--log-interval', type=int, default=10, metavar='N', - help='how many batches to wait before logging training status') - parser.add_argument('--save-model', action='store_true', 
default=False, - help='For Saving the current Model') - parser.add_argument('--dir', default='logs', metavar='L', - help='directory where summary logs are stored') + parser = argparse.ArgumentParser(description="PyTorch MNIST Example") + parser.add_argument( + "--batch-size", + type=int, + default=64, + metavar="N", + help="input batch size for training (default: 64)", + ) + parser.add_argument( + "--test-batch-size", + type=int, + default=1000, + metavar="N", + help="input batch size for testing (default: 1000)", + ) + parser.add_argument( + "--epochs", type=int, default=5, metavar="N", help="number of epochs to train (default: 5)" + ) + parser.add_argument( + "--lr", type=float, default=0.01, metavar="LR", help="learning rate (default: 0.01)" + ) + parser.add_argument( + "--momentum", type=float, default=0.5, metavar="M", help="SGD momentum (default: 0.5)" + ) + parser.add_argument( + "--no-cuda", action="store_true", default=False, help="disables CUDA training" + ) + parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)") + parser.add_argument( + "--log-interval", + type=int, + default=10, + metavar="N", + help="how many batches to wait before logging training status", + ) + parser.add_argument( + "--save-model", action="store_true", default=False, help="For Saving the current Model" + ) + parser.add_argument( + "--dir", default="logs", metavar="L", help="directory where summary logs are stored" + ) if dist.is_available(): - parser.add_argument('--backend', type=str, help='Distributed backend', - choices=[dist.Backend.GLOO, dist.Backend.NCCL, dist.Backend.MPI], - default=dist.Backend.GLOO) + parser.add_argument( + "--backend", + type=str, + help="Distributed backend", + choices=[dist.Backend.GLOO, dist.Backend.NCCL, dist.Backend.MPI], + default=dist.Backend.GLOO, + ) args = parser.parse_args() use_cuda = not args.no_cuda and torch.cuda.is_available() if use_cuda: - print('Using CUDA') - else : - print('Not Using CUDA') + print("Using CUDA") + else: + print("Not Using CUDA") writer = SummaryWriter(args.dir) @@ -134,44 +170,52 @@ def is_distributed(): device = torch.device("cuda" if use_cuda else "cpu") if should_distribute(): - print('Using distributed PyTorch with {} backend'.format(args.backend)) - dist.init_process_group( - backend=args.backend, - world_size=WORLD_SIZE, - rank=rank) - - kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} - - train_dataset = datasets.FashionMNIST('../data', train=True, download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) + print("Using distributed PyTorch with {} backend".format(args.backend)) + dist.init_process_group(backend=args.backend, world_size=WORLD_SIZE, rank=rank) + + kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {} + + train_dataset = datasets.FashionMNIST( + "../data", + train=True, + download=True, + transform=transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] + ), + ) train_sampler = torch.utils.data.distributed.DistributedSampler( - train_dataset, - num_replicas = WORLD_SIZE, - rank=rank + train_dataset, num_replicas=WORLD_SIZE, rank=rank ) train_loader = torch.utils.data.DataLoader( - dataset = train_dataset, - batch_size = args.batch_size, - shuffle = False, - sampler = train_sampler, - **kwargs) + dataset=train_dataset, + batch_size=args.batch_size, + shuffle=False, + sampler=train_sampler, + **kwargs + ) test_loader = torch.utils.data.DataLoader( - 
datasets.FashionMNIST('../data', train=False, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])), - batch_size=args.test_batch_size, shuffle=False, **kwargs) + datasets.FashionMNIST( + "../data", + train=False, + transform=transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] + ), + ), + batch_size=args.test_batch_size, + shuffle=False, + **kwargs + ) model = Net().to(device) if is_distributed(): - Distributor = nn.parallel.DistributedDataParallel if use_cuda \ + Distributor = ( + nn.parallel.DistributedDataParallel + if use_cuda else nn.parallel.DistributedDataParallelCPU + ) model = Distributor(model) optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) @@ -182,10 +226,10 @@ def is_distributed(): for epoch in range(1, args.epochs + 1): train(args, model, device, train_loader, optimizer, epoch, writer, periscope) test(args, model, device, test_loader, writer, epoch, periscope) - if (args.save_model): - torch.save(model.state_dict(),"mnist_cnn.pt") + if args.save_model: + torch.save(model.state_dict(), "mnist_cnn.pt") """ Reference: https://github.com/kubeflow/pytorch-operator/blob/master/examples/mnist/mnist.py -""" \ No newline at end of file +""" diff --git a/dev-support/examples/mnist-tensorflow/MirroredStrategy/mnist_keras_distributed.py b/dev-support/examples/mnist-tensorflow/MirroredStrategy/mnist_keras_distributed.py index 7f4ae6a715..7cc71b4bd5 100644 --- a/dev-support/examples/mnist-tensorflow/MirroredStrategy/mnist_keras_distributed.py +++ b/dev-support/examples/mnist-tensorflow/MirroredStrategy/mnist_keras_distributed.py @@ -15,97 +15,105 @@ under the License. """ -from submarine import ModelsClient -import tensorflow_datasets as tfds -import tensorflow as tf import os -import tensorboard -datasets, info = tfds.load(name='mnist', with_info=True, as_supervised=True) -mnist_train, mnist_test = datasets['train'], datasets['test'] +import tensorflow as tf +import tensorflow_datasets as tfds + +from submarine import ModelsClient + +datasets, info = tfds.load(name="mnist", with_info=True, as_supervised=True) +mnist_train, mnist_test = datasets["train"], datasets["test"] strategy = tf.distribute.MirroredStrategy() -print('Number of devices: {}'.format(strategy.num_replicas_in_sync)) +print("Number of devices: {}".format(strategy.num_replicas_in_sync)) # You can also do info.splits.total_num_examples to get the total # number of examples in the dataset. 
-num_train_examples = info.splits['train'].num_examples -num_test_examples = info.splits['test'].num_examples +num_train_examples = info.splits["train"].num_examples +num_test_examples = info.splits["test"].num_examples BUFFER_SIZE = 10000 BATCH_SIZE_PER_REPLICA = 64 BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync + def scale(image, label): - image = tf.cast(image, tf.float32) - image /= 255 + image = tf.cast(image, tf.float32) + image /= 255 + + return image, label - return image, label train_dataset = mnist_train.map(scale).cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE) eval_dataset = mnist_test.map(scale).batch(BATCH_SIZE) with strategy.scope(): - model = tf.keras.Sequential([ - tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)), - tf.keras.layers.MaxPooling2D(), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(64, activation='relu'), - tf.keras.layers.Dense(10) - ]) - - model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), - optimizer=tf.keras.optimizers.Adam(), - metrics=['accuracy']) + model = tf.keras.Sequential( + [ + tf.keras.layers.Conv2D(32, 3, activation="relu", input_shape=(28, 28, 1)), + tf.keras.layers.MaxPooling2D(), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(64, activation="relu"), + tf.keras.layers.Dense(10), + ] + ) + + model.compile( + loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=tf.keras.optimizers.Adam(), + metrics=["accuracy"], + ) # Define the checkpoint directory to store the checkpoints. -checkpoint_dir = './training_checkpoints' +checkpoint_dir = "./training_checkpoints" # Define the name of the checkpoint files. checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") + # Define a function for decaying the learning rate. # You can define any decay function you need. def decay(epoch): - if epoch < 3: - return 1e-3 - elif epoch >= 3 and epoch < 7: - return 1e-4 - else: - return 1e-5 + if epoch < 3: + return 1e-3 + elif epoch >= 3 and epoch < 7: + return 1e-4 + else: + return 1e-5 + # Define a callback for printing the learning rate at the end of each epoch. class PrintLR(tf.keras.callbacks.Callback): - def on_epoch_end(self, epoch, logs=None): - print('\nLearning rate for epoch {} is {}'.format(epoch + 1, - model.optimizer.lr.numpy())) - modelClient.log_metric("lr", model.optimizer.lr.numpy()) + def on_epoch_end(self, epoch, logs=None): + print("\nLearning rate for epoch {} is {}".format(epoch + 1, model.optimizer.lr.numpy())) + modelClient.log_metric("lr", model.optimizer.lr.numpy()) + # Put all the callbacks together. 
callbacks = [ - tf.keras.callbacks.TensorBoard(log_dir='./logs'), - tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, - save_weights_only=True), + tf.keras.callbacks.TensorBoard(log_dir="./logs"), + tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True), tf.keras.callbacks.LearningRateScheduler(decay), - PrintLR() + PrintLR(), ] if __name__ == "__main__": - modelClient = ModelsClient() - with modelClient.start() as run: - EPOCHS = 5 - hist = model.fit(train_dataset, epochs=EPOCHS, callbacks=callbacks) - for i in range(EPOCHS): - modelClient.log_metric("val_loss", hist.history['loss'][i]) - modelClient.log_metric("Val_accuracy", hist.history['accuracy'][i]) - model.load_weights(tf.train.latest_checkpoint(checkpoint_dir)) - eval_loss, eval_acc = model.evaluate(eval_dataset) - print('Eval loss: {}, Eval accuracy: {}'.format(eval_loss, eval_acc)) - modelClient.log_param("loss", eval_loss) - modelClient.log_param("acc", eval_acc) + modelClient = ModelsClient() + with modelClient.start() as run: + EPOCHS = 5 + hist = model.fit(train_dataset, epochs=EPOCHS, callbacks=callbacks) + for i in range(EPOCHS): + modelClient.log_metric("val_loss", hist.history["loss"][i]) + modelClient.log_metric("Val_accuracy", hist.history["accuracy"][i]) + model.load_weights(tf.train.latest_checkpoint(checkpoint_dir)) + eval_loss, eval_acc = model.evaluate(eval_dataset) + print("Eval loss: {}, Eval accuracy: {}".format(eval_loss, eval_acc)) + modelClient.log_param("loss", eval_loss) + modelClient.log_param("acc", eval_acc) """Reference: https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy -""" \ No newline at end of file +""" diff --git a/dev-support/examples/mnist-tensorflow/MultiWorkerMirroredStrategy/mnist_keras_distributed.py b/dev-support/examples/mnist-tensorflow/MultiWorkerMirroredStrategy/mnist_keras_distributed.py index 4b3c0326d5..21df78b00a 100644 --- a/dev-support/examples/mnist-tensorflow/MultiWorkerMirroredStrategy/mnist_keras_distributed.py +++ b/dev-support/examples/mnist-tensorflow/MultiWorkerMirroredStrategy/mnist_keras_distributed.py @@ -14,79 +14,94 @@ specific language governing permissions and limitations under the License. """ -from submarine import ModelsClient import json import os -import sys + import tensorflow as tf -import numpy as np import tensorflow_datasets as tfds +from submarine import ModelsClient + BUFFER_SIZE = 10000 BATCH_SIZE = 32 strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + def make_datasets_unbatched(): - #Scaling MNIST data from (0, 255] to (0., 1.] - def scale(image, label): - image = tf.cast(image, tf.float32) - image /= 255 - return image, label + # Scaling MNIST data from (0, 255] to (0., 1.] 
+ def scale(image, label): + image = tf.cast(image, tf.float32) + image /= 255 + return image, label - datasets, info = tfds.load(name='mnist', with_info=True, as_supervised=True) + datasets, info = tfds.load(name="mnist", with_info=True, as_supervised=True) + + return ( + datasets["train"] + .map(scale, num_parallel_calls=tf.data.experimental.AUTOTUNE) + .cache() + .shuffle(BUFFER_SIZE) + ) - return datasets['train'].map(scale, num_parallel_calls=tf.data.experimental.AUTOTUNE).cache().shuffle(BUFFER_SIZE) def build_and_compile_cnn_model(): - model = tf.keras.Sequential([ - tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)), - tf.keras.layers.MaxPooling2D(), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(64, activation='relu'), - tf.keras.layers.Dense(10) - ]) - model.compile( - loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), - optimizer=tf.keras.optimizers.SGD(learning_rate=0.001), - metrics=['accuracy']) - return model - -tf_config = json.loads(os.environ['TF_CONFIG']) -NUM_WORKERS = len(tf_config['cluster']['worker']) - -#Here the batch size scales up by number of workers since -#`tf.data.Dataset.batch` expects the global batch size. Previously we used 64, -#and now this becomes 128. + model = tf.keras.Sequential( + [ + tf.keras.layers.Conv2D(32, 3, activation="relu", input_shape=(28, 28, 1)), + tf.keras.layers.MaxPooling2D(), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(64, activation="relu"), + tf.keras.layers.Dense(10), + ] + ) + model.compile( + loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=tf.keras.optimizers.SGD(learning_rate=0.001), + metrics=["accuracy"], + ) + return model + + +tf_config = json.loads(os.environ["TF_CONFIG"]) +NUM_WORKERS = len(tf_config["cluster"]["worker"]) + +# Here the batch size scales up by number of workers since +# `tf.data.Dataset.batch` expects the global batch size. Previously we used 64, +# and now this becomes 128. GLOBAL_BATCH_SIZE = 64 * NUM_WORKERS -#Creation of dataset needs to be after MultiWorkerMirroredStrategy object -#is instantiated. +# Creation of dataset needs to be after MultiWorkerMirroredStrategy object +# is instantiated. train_datasets = make_datasets_unbatched().batch(GLOBAL_BATCH_SIZE) -#next three line is the key point to fix this problem +# next three line is the key point to fix this problem options = tf.data.Options() -options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA # AutoShardPolicy.OFF can work too. +options.experimental_distribute.auto_shard_policy = ( + tf.data.experimental.AutoShardPolicy.DATA +) # AutoShardPolicy.OFF can work too. train_datasets_no_auto_shard = train_datasets.with_options(options) with strategy.scope(): - #Model building/compiling need to be within `strategy.scope()`. - multi_worker_model = build_and_compile_cnn_model() + # Model building/compiling need to be within `strategy.scope()`. + multi_worker_model = build_and_compile_cnn_model() -#Keras' `model.fit()` trains the model with specified number of epochs and -#number of steps per epoch. Note that the numbers here are for demonstration -#purposes only and may not sufficiently produce a model with good quality. +# Keras' `model.fit()` trains the model with specified number of epochs and +# number of steps per epoch. Note that the numbers here are for demonstration +# purposes only and may not sufficiently produce a model with good quality. 
-#attention: x=train_datasets_no_auto_shard , not x = train_datasets +# attention: x=train_datasets_no_auto_shard , not x = train_datasets if __name__ == "__main__": - modelClient = ModelsClient() - with modelClient.start() as run: - EPOCHS = 5 - hist = multi_worker_model.fit(x=train_datasets_no_auto_shard, epochs=EPOCHS, steps_per_epoch=5) - for i in range(EPOCHS): - modelClient.log_metric("val_loss", hist.history['loss'][i]) - modelClient.log_metric("Val_accuracy", hist.history['accuracy'][i]) + modelClient = ModelsClient() + with modelClient.start() as run: + EPOCHS = 5 + hist = multi_worker_model.fit( + x=train_datasets_no_auto_shard, epochs=EPOCHS, steps_per_epoch=5 + ) + for i in range(EPOCHS): + modelClient.log_metric("val_loss", hist.history["loss"][i]) + modelClient.log_metric("Val_accuracy", hist.history["accuracy"][i]) """Reference diff --git a/dev-support/examples/mnist-tensorflow/ParameterServerStrategy/mnist_keras_distributed.py b/dev-support/examples/mnist-tensorflow/ParameterServerStrategy/mnist_keras_distributed.py index 3a9cbd8867..b65a584fbc 100644 --- a/dev-support/examples/mnist-tensorflow/ParameterServerStrategy/mnist_keras_distributed.py +++ b/dev-support/examples/mnist-tensorflow/ParameterServerStrategy/mnist_keras_distributed.py @@ -14,56 +14,54 @@ specific language governing permissions and limitations under the License. """ +import json import os -import random + import tensorflow as tf -import json -from tensorflow.keras.layers.experimental import preprocessing -import tensorflow_datasets as tfds -import tensorboard + +from submarine import ModelsClient print(tf.__version__) -TF_CONFIG = os.environ.get('TF_CONFIG', '') -NUM_PS = len(json.loads(TF_CONFIG)['cluster']['ps']) +TF_CONFIG = os.environ.get("TF_CONFIG", "") +NUM_PS = len(json.loads(TF_CONFIG)["cluster"]["ps"]) cluster_resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver() -variable_partitioner = ( - tf.distribute.experimental.partitioners.MinSizePartitioner( - min_shard_bytes=(256 << 10), - max_shards=NUM_PS)) +variable_partitioner = tf.distribute.experimental.partitioners.MinSizePartitioner( + min_shard_bytes=(256 << 10), max_shards=NUM_PS +) strategy = tf.distribute.experimental.ParameterServerStrategy( - cluster_resolver, - variable_partitioner=variable_partitioner) + cluster_resolver, variable_partitioner=variable_partitioner +) + def dataset_fn(input_context): - global_batch_size = 64 - batch_size = input_context.get_per_replica_batch_size(global_batch_size) + global_batch_size = 64 + batch_size = input_context.get_per_replica_batch_size(global_batch_size) + + x = tf.random.uniform((10, 10)) + y = tf.random.uniform((10,)) - x = tf.random.uniform((10, 10)) - y = tf.random.uniform((10,)) + dataset = tf.data.Dataset.from_tensor_slices((x, y)).shuffle(10).repeat() + dataset = dataset.shard(input_context.num_input_pipelines, input_context.input_pipeline_id) + dataset = dataset.batch(batch_size) + dataset = dataset.prefetch(2) - dataset = tf.data.Dataset.from_tensor_slices((x, y)).shuffle(10).repeat() - dataset = dataset.shard( - input_context.num_input_pipelines, - input_context.input_pipeline_id) - dataset = dataset.batch(batch_size) - dataset = dataset.prefetch(2) + return dataset - return dataset dc = tf.keras.utils.experimental.DatasetCreator(dataset_fn) with strategy.scope(): - model = tf.keras.models.Sequential([tf.keras.layers.Dense(10)]) + model = tf.keras.models.Sequential([tf.keras.layers.Dense(10)]) -model.compile(tf.keras.optimizers.SGD(), loss='mse', steps_per_execution=10) 
+model.compile(tf.keras.optimizers.SGD(), loss="mse", steps_per_execution=10) -working_dir = '/tmp/my_working_dir' -log_dir = os.path.join(working_dir, 'log') -ckpt_filepath = os.path.join(working_dir, 'ckpt') -backup_dir = os.path.join(working_dir, 'backup') +working_dir = "/tmp/my_working_dir" +log_dir = os.path.join(working_dir, "log") +ckpt_filepath = os.path.join(working_dir, "ckpt") +backup_dir = os.path.join(working_dir, "backup") callbacks = [ tf.keras.callbacks.TensorBoard(log_dir=log_dir), @@ -71,16 +69,19 @@ def dataset_fn(input_context): tf.keras.callbacks.experimental.BackupAndRestore(backup_dir=backup_dir), ] +# Define the checkpoint directory to store the checkpoints. +checkpoint_dir = "./training_checkpoints" + model.fit(dc, epochs=5, steps_per_epoch=20, callbacks=callbacks) if __name__ == "__main__": - modelClient = ModelsClient() - with modelClient.start() as run: - EPOCHS = 5 - hist = model.fit(dc, epochs=EPOCHS, steps_per_epoch=20, callbacks=callbacks) - for i in range(EPOCHS): - modelClient.log_metric("val_loss", hist.history['loss'][i]) - modelClient.log_metric("Val_accuracy", hist.history['accuracy'][i]) - model.load_weights(tf.train.latest_checkpoint(checkpoint_dir)) + modelClient = ModelsClient() + with modelClient.start() as run: + EPOCHS = 5 + hist = model.fit(dc, epochs=EPOCHS, steps_per_epoch=20, callbacks=callbacks) + for i in range(EPOCHS): + modelClient.log_metric("val_loss", hist.history["loss"][i]) + modelClient.log_metric("Val_accuracy", hist.history["accuracy"][i]) + model.load_weights(tf.train.latest_checkpoint(checkpoint_dir)) """ Reference: diff --git a/dev-support/examples/nn-pytorch/model.py b/dev-support/examples/nn-pytorch/model.py index 10ef94b3f8..48558a3984 100644 --- a/dev-support/examples/nn-pytorch/model.py +++ b/dev-support/examples/nn-pytorch/model.py @@ -14,9 +14,8 @@ specific language governing permissions and limitations under the License. """ -from submarine import ModelsClient -import numpy as np import torch + from submarine import ModelsClient @@ -29,7 +28,13 @@ def forward(self, x): y_pred = self.linear(x) return y_pred + if __name__ == "__main__": client = ModelsClient() net = LinearNNModel() - client.save_model(model_type = "pytorch", model = net, artifact_path="pytorch-nn-model", registered_model_name="simple-nn-model") + client.save_model( + model_type="pytorch", + model=net, + artifact_path="pytorch-nn-model", + registered_model_name="simple-nn-model", + ) diff --git a/dev-support/examples/quickstart/train.py b/dev-support/examples/quickstart/train.py index e33de6815b..c9476bb349 100644 --- a/dev-support/examples/quickstart/train.py +++ b/dev-support/examples/quickstart/train.py @@ -16,71 +16,73 @@ An example of multi-worker training with Keras model using Strategy API. https://github.com/kubeflow/tf-operator/blob/master/examples/v1/distribution_strategy/keras-API/multi_worker_strategy-with-keras.py """ -import tensorflow_datasets as tfds import tensorflow as tf +import tensorflow_datasets as tfds from tensorflow.keras import layers, models + from submarine import ModelsClient + def make_datasets_unbatched(): - BUFFER_SIZE = 10000 + BUFFER_SIZE = 10000 - # Scaling MNIST data from (0, 255] to (0., 1.] - def scale(image, label): - image = tf.cast(image, tf.float32) - image /= 255 - return image, label + # Scaling MNIST data from (0, 255] to (0., 1.] 
+ def scale(image, label): + image = tf.cast(image, tf.float32) + image /= 255 + return image, label - datasets, _ = tfds.load(name='mnist', with_info=True, as_supervised=True) + datasets, _ = tfds.load(name="mnist", with_info=True, as_supervised=True) - return datasets['train'].map(scale).cache().shuffle(BUFFER_SIZE) + return datasets["train"].map(scale).cache().shuffle(BUFFER_SIZE) def build_and_compile_cnn_model(): - model = models.Sequential() - model.add( - layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1))) - model.add(layers.MaxPooling2D((2, 2))) - model.add(layers.Conv2D(64, (3, 3), activation='relu')) - model.add(layers.MaxPooling2D((2, 2))) - model.add(layers.Conv2D(64, (3, 3), activation='relu')) - model.add(layers.Flatten()) - model.add(layers.Dense(64, activation='relu')) - model.add(layers.Dense(10, activation='softmax')) + model = models.Sequential() + model.add(layers.Conv2D(32, (3, 3), activation="relu", input_shape=(28, 28, 1))) + model.add(layers.MaxPooling2D((2, 2))) + model.add(layers.Conv2D(64, (3, 3), activation="relu")) + model.add(layers.MaxPooling2D((2, 2))) + model.add(layers.Conv2D(64, (3, 3), activation="relu")) + model.add(layers.Flatten()) + model.add(layers.Dense(64, activation="relu")) + model.add(layers.Dense(10, activation="softmax")) + + model.summary() - model.summary() + model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]) - model.compile(optimizer='adam', - loss='sparse_categorical_crossentropy', - metrics=['accuracy']) + return model - return model def main(): - strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy( - communication=tf.distribute.experimental.CollectiveCommunication.AUTO) - - BATCH_SIZE_PER_REPLICA = 4 - BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync - - with strategy.scope(): - ds_train = make_datasets_unbatched().batch(BATCH_SIZE).repeat() - options = tf.data.Options() - options.experimental_distribute.auto_shard_policy = \ - tf.data.experimental.AutoShardPolicy.DATA - ds_train = ds_train.with_options(options) - # Model building/compiling need to be within `strategy.scope()`. - multi_worker_model = build_and_compile_cnn_model() - - class MyCallback(tf.keras.callbacks.Callback): - def on_epoch_end(self, epoch, logs=None): - # monitor the loss and accuracy - print(logs) - modelClient.log_metrics({"loss": logs["loss"], "accuracy": logs["accuracy"]}, epoch) - - with modelClient.start() as run: - multi_worker_model.fit(ds_train, epochs=10, steps_per_epoch=70, callbacks=[MyCallback()]) - - -if __name__ == '__main__': - modelClient = ModelsClient() - main() \ No newline at end of file + strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy( + communication=tf.distribute.experimental.CollectiveCommunication.AUTO + ) + + BATCH_SIZE_PER_REPLICA = 4 + BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync + + with strategy.scope(): + ds_train = make_datasets_unbatched().batch(BATCH_SIZE).repeat() + options = tf.data.Options() + options.experimental_distribute.auto_shard_policy = ( + tf.data.experimental.AutoShardPolicy.DATA + ) + ds_train = ds_train.with_options(options) + # Model building/compiling need to be within `strategy.scope()`. 
+ multi_worker_model = build_and_compile_cnn_model() + + class MyCallback(tf.keras.callbacks.Callback): + def on_epoch_end(self, epoch, logs=None): + # monitor the loss and accuracy + print(logs) + modelClient.log_metrics({"loss": logs["loss"], "accuracy": logs["accuracy"]}, epoch) + + with modelClient.start(): + multi_worker_model.fit(ds_train, epochs=10, steps_per_epoch=70, callbacks=[MyCallback()]) + + +if __name__ == "__main__": + modelClient = ModelsClient() + main() diff --git a/dev-support/examples/tracking/tracking.py b/dev-support/examples/tracking/tracking.py index 61c9ad8aa4..4f84251b5a 100644 --- a/dev-support/examples/tracking/tracking.py +++ b/dev-support/examples/tracking/tracking.py @@ -15,15 +15,16 @@ under the License. """ -from submarine import ModelsClient import random import time +from submarine import ModelsClient + if __name__ == "__main__": - modelClient = ModelsClient() - with modelClient.start() as run: - modelClient.log_param("learning_rate", random.random()) - for i in range(100): - time.sleep(1) - modelClient.log_metric("mse", random.random() * 100, i) - modelClient.log_metric("acc", random.random(), i) \ No newline at end of file + modelClient = ModelsClient() + with modelClient.start() as run: + modelClient.log_param("learning_rate", random.random()) + for i in range(100): + time.sleep(1) + modelClient.log_metric("mse", random.random() * 100, i) + modelClient.log_metric("acc", random.random(), i) diff --git a/dev-support/mini-submarine/spark-script/pyspark-yarn.py b/dev-support/mini-submarine/spark-script/pyspark-yarn.py index a28a5a206f..ab8affa543 100755 --- a/dev-support/mini-submarine/spark-script/pyspark-yarn.py +++ b/dev-support/mini-submarine/spark-script/pyspark-yarn.py @@ -14,19 +14,19 @@ # limitations under the License. 
-from pyspark import SparkConf -from pyspark import SparkContext +from pyspark import SparkConf, SparkContext conf = SparkConf() -conf.setMaster('yarn-client') -conf.setAppName('spark-yarn') +conf.setMaster("yarn-client") +conf.setAppName("spark-yarn") sc = SparkContext(conf=conf) def mod(x): - #import time - #time.sleep(120) + # import time + # time.sleep(120) return x + rdd = sc.parallelize(range(1000)).map(mod).take(10) -print rdd +print(rdd) diff --git a/dev-support/mini-submarine/submarine/image_classification.py b/dev-support/mini-submarine/submarine/image_classification.py index 14eb68cd79..fad373f181 100644 --- a/dev-support/mini-submarine/submarine/image_classification.py +++ b/dev-support/mini-submarine/submarine/image_classification.py @@ -16,103 +16,148 @@ from __future__ import division import argparse -import time -import os import logging +import os import random import tarfile +import time import mxnet as mx -from mxnet import gluon -from mxnet import profiler -from mxnet.gluon import nn -from mxnet.gluon.model_zoo import vision as models -from mxnet.gluon.data.vision import ImageFolderDataset -from mxnet.gluon.data import DataLoader -from mxnet.contrib.io import DataLoaderIter -from mxnet import autograd as ag -from mxnet.test_utils import get_mnist_iterator, get_cifar10 -from mxnet.metric import Accuracy, TopKAccuracy, CompositeEvalMetric import numpy as np +from mxnet import autograd as ag +from mxnet import gluon, profiler +from mxnet.contrib.io import DataLoaderIter +from mxnet.gluon.data import DataLoader +from mxnet.gluon.data.vision import ImageFolderDataset +from mxnet.gluon.model_zoo import vision as models +from mxnet.metric import Accuracy, CompositeEvalMetric, TopKAccuracy +from mxnet.test_utils import get_cifar10, get_mnist_iterator # logging logging.basicConfig(level=logging.INFO) -fh = logging.FileHandler('image-classification.log') +fh = logging.FileHandler("image-classification.log") logger = logging.getLogger() logger.addHandler(fh) -formatter = logging.Formatter('%(message)s') +formatter = logging.Formatter("%(message)s") fh.setFormatter(formatter) fh.setLevel(logging.DEBUG) -logging.debug('\n%s', '-' * 100) -formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') +logging.debug("\n%s", "-" * 100) +formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s") fh.setFormatter(formatter) # CLI -parser = argparse.ArgumentParser(description='Train a model for image classification.') -parser.add_argument('--dataset', type=str, default='cifar10', - help='dataset to use. options are mnist, cifar10, caltech101, imagenet and dummy.') -parser.add_argument('--data-dir', type=str, default='', - help='training directory of imagenet images, contains train/val subdirs.') -parser.add_argument('--num-worker', '-j', dest='num_workers', default=4, type=int, - help='number of workers for dataloader') -parser.add_argument('--batch-size', type=int, default=32, - help='training batch size per device (CPU/GPU).') -parser.add_argument('--gpus', type=str, default='', - help='ordinates of gpus to use, can be "0,1,2" or empty for cpu only.') -parser.add_argument('--epochs', type=int, default=120, - help='number of training epochs.') -parser.add_argument('--lr', type=float, default=0.1, - help='learning rate. default is 0.1.') -parser.add_argument('--momentum', type=float, default=0.9, - help='momentum value for optimizer, default is 0.9.') -parser.add_argument('--wd', type=float, default=0.0001, - help='weight decay rate. 
default is 0.0001.') -parser.add_argument('--seed', type=int, default=123, - help='random seed to use. Default=123.') -parser.add_argument('--mode', type=str, - help='mode in which to train the model. options are symbolic, imperative, hybrid') -parser.add_argument('--model', type=str, required=True, - help='type of model to use. see vision_model for options.') -parser.add_argument('--use_thumbnail', action='store_true', - help='use thumbnail or not in resnet. default is false.') -parser.add_argument('--batch-norm', action='store_true', - help='enable batch normalization or not in vgg. default is false.') -parser.add_argument('--use-pretrained', action='store_true', - help='enable using pretrained model from gluon.') -parser.add_argument('--prefix', default='', type=str, - help='path to checkpoint prefix, default is current working dir') -parser.add_argument('--start-epoch', default=0, type=int, - help='starting epoch, 0 for fresh training, > 0 to resume') -parser.add_argument('--resume', type=str, default='', - help='path to saved weight where you want resume') -parser.add_argument('--lr-factor', default=0.1, type=float, - help='learning rate decay ratio') -parser.add_argument('--lr-steps', default='30,60,90', type=str, - help='list of learning rate decay epochs as in str') -parser.add_argument('--dtype', default='float32', type=str, - help='data type, float32 or float16 if applicable') -parser.add_argument('--save-frequency', default=10, type=int, - help='epoch frequence to save model, best model will always be saved') -parser.add_argument('--kvstore', type=str, default='device', - help='kvstore to use for trainer/module.') -parser.add_argument('--log-interval', type=int, default=50, - help='Number of batches to wait before logging.') -parser.add_argument('--profile', action='store_true', - help='Option to turn on memory profiling for front-end, '\ - 'and prints out the memory usage by python function at the end.') -parser.add_argument('--builtin-profiler', type=int, default=0, help='Enable built-in profiler (0=off, 1=on)') +parser = argparse.ArgumentParser(description="Train a model for image classification.") +parser.add_argument( + "--dataset", + type=str, + default="cifar10", + help="dataset to use. options are mnist, cifar10, caltech101, imagenet and dummy.", +) +parser.add_argument( + "--data-dir", + type=str, + default="", + help="training directory of imagenet images, contains train/val subdirs.", +) +parser.add_argument( + "--num-worker", + "-j", + dest="num_workers", + default=4, + type=int, + help="number of workers for dataloader", +) +parser.add_argument( + "--batch-size", type=int, default=32, help="training batch size per device (CPU/GPU)." +) +parser.add_argument( + "--gpus", + type=str, + default="", + help='ordinates of gpus to use, can be "0,1,2" or empty for cpu only.', +) +parser.add_argument("--epochs", type=int, default=120, help="number of training epochs.") +parser.add_argument("--lr", type=float, default=0.1, help="learning rate. default is 0.1.") +parser.add_argument( + "--momentum", type=float, default=0.9, help="momentum value for optimizer, default is 0.9." +) +parser.add_argument( + "--wd", type=float, default=0.0001, help="weight decay rate. default is 0.0001." +) +parser.add_argument("--seed", type=int, default=123, help="random seed to use. Default=123.") +parser.add_argument( + "--mode", + type=str, + help="mode in which to train the model. 
options are symbolic, imperative, hybrid", +) +parser.add_argument( + "--model", type=str, required=True, help="type of model to use. see vision_model for options." +) +parser.add_argument( + "--use_thumbnail", action="store_true", help="use thumbnail or not in resnet. default is false." +) +parser.add_argument( + "--batch-norm", + action="store_true", + help="enable batch normalization or not in vgg. default is false.", +) +parser.add_argument( + "--use-pretrained", action="store_true", help="enable using pretrained model from gluon." +) +parser.add_argument( + "--prefix", + default="", + type=str, + help="path to checkpoint prefix, default is current working dir", +) +parser.add_argument( + "--start-epoch", default=0, type=int, help="starting epoch, 0 for fresh training, > 0 to resume" +) +parser.add_argument( + "--resume", type=str, default="", help="path to saved weight where you want resume" +) +parser.add_argument("--lr-factor", default=0.1, type=float, help="learning rate decay ratio") +parser.add_argument( + "--lr-steps", default="30,60,90", type=str, help="list of learning rate decay epochs as in str" +) +parser.add_argument( + "--dtype", default="float32", type=str, help="data type, float32 or float16 if applicable" +) +parser.add_argument( + "--save-frequency", + default=10, + type=int, + help="epoch frequence to save model, best model will always be saved", +) +parser.add_argument( + "--kvstore", type=str, default="device", help="kvstore to use for trainer/module." +) +parser.add_argument( + "--log-interval", type=int, default=50, help="Number of batches to wait before logging." +) +parser.add_argument( + "--profile", + action="store_true", + help=( + "Option to turn on memory profiling for front-end, " + "and prints out the memory usage by python function at the end." 
+ ), +) +parser.add_argument( + "--builtin-profiler", type=int, default=0, help="Enable built-in profiler (0=off, 1=on)" +) opt = parser.parse_args() # global variables -logger.info('Starting new image-classification task:, %s',opt) +logger.info("Starting new image-classification task:, %s", opt) mx.random.seed(opt.seed) model_name = opt.model -dataset_classes = {'mnist': 10, 'cifar10': 10, 'caltech101':101, 'imagenet': 1000, 'dummy': 1000} +dataset_classes = {"mnist": 10, "cifar10": 10, "caltech101": 101, "imagenet": 1000, "dummy": 1000} batch_size, dataset, classes = opt.batch_size, opt.dataset, dataset_classes[opt.dataset] -context = [mx.gpu(int(i)) for i in opt.gpus.split(',')] if opt.gpus.strip() else [mx.cpu()] +context = [mx.gpu(int(i)) for i in opt.gpus.split(",")] if opt.gpus.strip() else [mx.cpu()] num_gpus = len(context) batch_size *= max(1, num_gpus) -lr_steps = [int(x) for x in opt.lr_steps.split(',') if x.strip()] +lr_steps = [int(x) for x in opt.lr_steps.split(",") if x.strip()] metric = CompositeEvalMetric([Accuracy(), TopKAccuracy(5)]) kv = mx.kv.create(opt.kvstore) @@ -121,30 +166,35 @@ def get_cifar10_iterator(batch_size, data_shape, resize=-1, num_parts=1, part_in get_cifar10() train = mx.io.ImageRecordIter( - path_imgrec = "data/cifar/train.rec", - resize = resize, - data_shape = data_shape, - batch_size = batch_size, - rand_crop = True, - rand_mirror = True, + path_imgrec="data/cifar/train.rec", + resize=resize, + data_shape=data_shape, + batch_size=batch_size, + rand_crop=True, + rand_mirror=True, num_parts=num_parts, - part_index=part_index) + part_index=part_index, + ) val = mx.io.ImageRecordIter( - path_imgrec = "data/cifar/test.rec", - resize = resize, - rand_crop = False, - rand_mirror = False, - data_shape = data_shape, - batch_size = batch_size, + path_imgrec="data/cifar/test.rec", + resize=resize, + rand_crop=False, + rand_mirror=False, + data_shape=data_shape, + batch_size=batch_size, num_parts=num_parts, - part_index=part_index) + part_index=part_index, + ) return train, val -def get_imagenet_transforms(data_shape=224, dtype='float32'): + +def get_imagenet_transforms(data_shape=224, dtype="float32"): def train_transform(image, label): - image, _ = mx.image.random_size_crop(image, (data_shape, data_shape), 0.08, (3/4., 4/3.)) + image, _ = mx.image.random_size_crop( + image, (data_shape, data_shape), 0.08, (3 / 4.0, 4 / 3.0) + ) image = mx.nd.image.random_flip_left_right(image) image = mx.nd.image.to_tensor(image) image = mx.nd.image.normalize(image, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) @@ -156,25 +206,32 @@ def val_transform(image, label): image = mx.nd.image.to_tensor(image) image = mx.nd.image.normalize(image, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) return mx.nd.cast(image, dtype), label + return train_transform, val_transform -def get_imagenet_iterator(root, batch_size, num_workers, data_shape=224, dtype='float32'): + +def get_imagenet_iterator(root, batch_size, num_workers, data_shape=224, dtype="float32"): """Dataset loader with preprocessing.""" - train_dir = os.path.join(root, 'train') + train_dir = os.path.join(root, "train") train_transform, val_transform = get_imagenet_transforms(data_shape, dtype) logging.info("Loading image folder %s, this may take a bit long...", train_dir) train_dataset = ImageFolderDataset(train_dir, transform=train_transform) - train_data = DataLoader(train_dataset, batch_size, shuffle=True, - last_batch='discard', num_workers=num_workers) - val_dir = os.path.join(root, 'val') - if not 
os.path.isdir(os.path.expanduser(os.path.join(root, 'val', 'n01440764'))): - user_warning = 'Make sure validation images are stored in one subdir per category, a helper script is available at https://git.io/vNQv1' + train_data = DataLoader( + train_dataset, batch_size, shuffle=True, last_batch="discard", num_workers=num_workers + ) + val_dir = os.path.join(root, "val") + if not os.path.isdir(os.path.expanduser(os.path.join(root, "val", "n01440764"))): + user_warning = ( + "Make sure validation images are stored in one subdir per category, a helper script is" + " available at https://git.io/vNQv1" + ) raise ValueError(user_warning) logging.info("Loading image folder %s, this may take a bit long...", val_dir) val_dataset = ImageFolderDataset(val_dir, transform=val_transform) - val_data = DataLoader(val_dataset, batch_size, last_batch='keep', num_workers=num_workers) + val_data = DataLoader(val_dataset, batch_size, last_batch="keep", num_workers=num_workers) return DataLoaderIter(train_data, dtype), DataLoaderIter(val_data, dtype) + def get_caltech101_data(): url = "https://s3.us-east-2.amazonaws.com/mxnet-public/101_ObjectCategories.tar.gz" dataset_name = "101_ObjectCategories" @@ -182,16 +239,18 @@ def get_caltech101_data(): if not os.path.isdir(data_folder): os.makedirs(data_folder) tar_path = mx.gluon.utils.download(url, path=data_folder) - if (not os.path.isdir(os.path.join(data_folder, "101_ObjectCategories")) or - not os.path.isdir(os.path.join(data_folder, "101_ObjectCategories_test"))): + if not os.path.isdir(os.path.join(data_folder, "101_ObjectCategories")) or not os.path.isdir( + os.path.join(data_folder, "101_ObjectCategories_test") + ): tar = tarfile.open(tar_path, "r:gz") tar.extractall(data_folder) tar.close() - print('Data extracted') + print("Data extracted") training_path = os.path.join(data_folder, dataset_name) testing_path = os.path.join(data_folder, "{}_test".format(dataset_name)) return training_path, testing_path + def get_caltech101_iterator(batch_size, num_workers, dtype): def transform(image, label): # resize the shorter edge to 224, the longer edge will be greater or equal to 224 @@ -210,15 +269,17 @@ def transform(image, label): test_data = DataLoader(dataset_test, batch_size, shuffle=False, num_workers=num_workers) return DataLoaderIter(train_data), DataLoaderIter(test_data) + class DummyIter(mx.io.DataIter): - def __init__(self, batch_size, data_shape, batches = 100): + def __init__(self, batch_size, data_shape, batches=100): super(DummyIter, self).__init__(batch_size) self.data_shape = (batch_size,) + data_shape self.label_shape = (batch_size,) - self.provide_data = [('data', self.data_shape)] - self.provide_label = [('softmax_label', self.label_shape)] - self.batch = mx.io.DataBatch(data=[mx.nd.zeros(self.data_shape)], - label=[mx.nd.zeros(self.label_shape)]) + self.provide_data = [("data", self.data_shape)] + self.provide_label = [("softmax_label", self.label_shape)] + self.batch = mx.io.DataBatch( + data=[mx.nd.zeros(self.data_shape)], label=[mx.nd.zeros(self.label_shape)] + ) self._batches = 0 self.batches = batches @@ -230,19 +291,25 @@ def next(self): self._batches = 0 raise StopIteration + def dummy_iterator(batch_size, data_shape): return DummyIter(batch_size, data_shape), DummyIter(batch_size, data_shape) + class ImagePairIter(mx.io.DataIter): - def __init__(self, path, data_shape, label_shape, batch_size=64, flag=0, input_aug=None, target_aug=None): + def __init__( + self, path, data_shape, label_shape, batch_size=64, flag=0, input_aug=None, 
target_aug=None + ): + def is_image_file(fn): + return any(fn.endswith(ext) for ext in [".png", ".jpg", ".jpeg"]) + super(ImagePairIter, self).__init__(batch_size) self.data_shape = (batch_size,) + data_shape self.label_shape = (batch_size,) + label_shape self.input_aug = input_aug self.target_aug = target_aug - self.provide_data = [('data', self.data_shape)] - self.provide_label = [('label', self.label_shape)] - is_image_file = lambda fn: any(fn.endswith(ext) for ext in [".png", ".jpg", ".jpeg"]) + self.provide_data = [("data", self.data_shape)] + self.provide_label = [("label", self.label_shape)] self.filenames = [os.path.join(path, x) for x in os.listdir(path) if is_image_file(x)] self.count = 0 self.flag = flag @@ -250,13 +317,14 @@ def __init__(self, path, data_shape, label_shape, batch_size=64, flag=0, input_a def next(self): from PIL import Image + if self.count + self.batch_size <= len(self.filenames): data = [] label = [] for i in range(self.batch_size): fn = self.filenames[self.count] self.count += 1 - image = Image.open(fn).convert('YCbCr').split()[0] + image = Image.open(fn).convert("YCbCr").split()[0] if image.size[0] > image.size[1]: image = image.transpose(Image.TRANSPOSE) image = mx.nd.expand_dims(mx.nd.array(image), axis=2) @@ -270,8 +338,8 @@ def next(self): data = mx.nd.concat(*[mx.nd.expand_dims(d, axis=0) for d in data], dim=0) label = mx.nd.concat(*[mx.nd.expand_dims(d, axis=0) for d in label], dim=0) - data = [mx.nd.transpose(data, axes=(0, 3, 1, 2)).astype('float32')/255] - label = [mx.nd.transpose(label, axes=(0, 3, 1, 2)).astype('float32')/255] + data = [mx.nd.transpose(data, axes=(0, 3, 1, 2)).astype("float32") / 255] + label = [mx.nd.transpose(label, axes=(0, 3, 1, 2)).astype("float32") / 255] return mx.io.DataBatch(data=data, label=label) else: @@ -281,79 +349,93 @@ def reset(self): self.count = 0 random.shuffle(self.filenames) + def get_model(model, ctx, opt): """Model initialization.""" - kwargs = {'ctx': ctx, 'pretrained': opt.use_pretrained, 'classes': classes} - if model.startswith('resnet'): - kwargs['thumbnail'] = opt.use_thumbnail - elif model.startswith('vgg'): - kwargs['batch_norm'] = opt.batch_norm + kwargs = {"ctx": ctx, "pretrained": opt.use_pretrained, "classes": classes} + if model.startswith("resnet"): + kwargs["thumbnail"] = opt.use_thumbnail + elif model.startswith("vgg"): + kwargs["batch_norm"] = opt.batch_norm net = models.get_model(model, **kwargs) if opt.resume: net.load_parameters(opt.resume) elif not opt.use_pretrained: - if model in ['alexnet']: + if model in ["alexnet"]: net.initialize(mx.init.Normal()) else: net.initialize(mx.init.Xavier(magnitude=2)) net.cast(opt.dtype) return net + net = get_model(opt.model, context, opt) + def get_data_iters(dataset, batch_size, opt): """get dataset iterators""" - if dataset == 'mnist': - train_data, val_data = get_mnist_iterator(batch_size, (1, 28, 28), - num_parts=kv.num_workers, part_index=kv.rank) - elif dataset == 'cifar10': - train_data, val_data = get_cifar10_iterator(batch_size, (3, 32, 32), - num_parts=kv.num_workers, part_index=kv.rank) - elif dataset == 'imagenet': - shape_dim = 299 if model_name == 'inceptionv3' else 224 + if dataset == "mnist": + train_data, val_data = get_mnist_iterator( + batch_size, (1, 28, 28), num_parts=kv.num_workers, part_index=kv.rank + ) + elif dataset == "cifar10": + train_data, val_data = get_cifar10_iterator( + batch_size, (3, 32, 32), num_parts=kv.num_workers, part_index=kv.rank + ) + elif dataset == "imagenet": + shape_dim = 299 if model_name == 
"inceptionv3" else 224 if not opt.data_dir: - raise ValueError('Dir containing raw images in train/val is required for imagenet.' - 'Please specify "--data-dir"') - - train_data, val_data = get_imagenet_iterator(opt.data_dir, batch_size, - opt.num_workers, shape_dim, opt.dtype) - elif dataset == 'caltech101': + raise ValueError( + "Dir containing raw images in train/val is required for imagenet." + 'Please specify "--data-dir"' + ) + + train_data, val_data = get_imagenet_iterator( + opt.data_dir, batch_size, opt.num_workers, shape_dim, opt.dtype + ) + elif dataset == "caltech101": train_data, val_data = get_caltech101_iterator(batch_size, opt.num_workers, opt.dtype) - elif dataset == 'dummy': - shape_dim = 299 if model_name == 'inceptionv3' else 224 + elif dataset == "dummy": + shape_dim = 299 if model_name == "inceptionv3" else 224 train_data, val_data = dummy_iterator(batch_size, (3, shape_dim, shape_dim)) return train_data, val_data + def test(ctx, val_data): metric.reset() val_data.reset() for batch in val_data: - data = gluon.utils.split_and_load(batch.data[0].astype(opt.dtype, copy=False), - ctx_list=ctx, batch_axis=0) - label = gluon.utils.split_and_load(batch.label[0].astype(opt.dtype, copy=False), - ctx_list=ctx, batch_axis=0) + data = gluon.utils.split_and_load( + batch.data[0].astype(opt.dtype, copy=False), ctx_list=ctx, batch_axis=0 + ) + label = gluon.utils.split_and_load( + batch.label[0].astype(opt.dtype, copy=False), ctx_list=ctx, batch_axis=0 + ) outputs = [net(X) for X in data] metric.update(label, outputs) return metric.get() + def update_learning_rate(lr, trainer, epoch, ratio, steps): """Set the learning rate to the initial value decayed by ratio every N epochs.""" new_lr = lr * (ratio ** int(np.sum(np.array(steps) < epoch))) trainer.set_learning_rate(new_lr) return trainer + def save_checkpoint(epoch, top1, best_acc): if opt.save_frequency and (epoch + 1) % opt.save_frequency == 0: - fname = os.path.join(opt.prefix, '%s_%d_acc_%.4f.params' % (opt.model, epoch, top1)) + fname = os.path.join(opt.prefix, "%s_%d_acc_%.4f.params" % (opt.model, epoch, top1)) net.save_parameters(fname) - logger.info('[Epoch %d] Saving checkpoint to %s with Accuracy: %.4f', epoch, fname, top1) + logger.info("[Epoch %d] Saving checkpoint to %s with Accuracy: %.4f", epoch, fname, top1) if top1 > best_acc[0]: best_acc[0] = top1 - fname = os.path.join(opt.prefix, '%s_best.params' % (opt.model)) + fname = os.path.join(opt.prefix, "%s_best.params" % (opt.model)) net.save_parameters(fname) - logger.info('[Epoch %d] Saving checkpoint to %s with Accuracy: %.4f', epoch, fname, top1) + logger.info("[Epoch %d] Saving checkpoint to %s with Accuracy: %.4f", epoch, fname, top1) + def train(opt, ctx): if isinstance(ctx, mx.Context): @@ -361,12 +443,17 @@ def train(opt, ctx): train_data, val_data = get_data_iters(dataset, batch_size, opt) net.collect_params().reset_ctx(ctx) - trainer = gluon.Trainer(net.collect_params(), 'sgd', - optimizer_params={'learning_rate': opt.lr, - 'wd': opt.wd, - 'momentum': opt.momentum, - 'multi_precision': True}, - kvstore=kv) + trainer = gluon.Trainer( + net.collect_params(), + "sgd", + optimizer_params={ + "learning_rate": opt.lr, + "wd": opt.wd, + "momentum": opt.momentum, + "multi_precision": True, + }, + kvstore=kv, + ) loss = gluon.loss.SoftmaxCrossEntropyLoss() total_time = 0 @@ -379,8 +466,12 @@ def train(opt, ctx): metric.reset() btic = time.time() for i, batch in enumerate(train_data): - data = gluon.utils.split_and_load(batch.data[0].astype(opt.dtype), ctx_list=ctx, 
batch_axis=0) - label = gluon.utils.split_and_load(batch.label[0].astype(opt.dtype), ctx_list=ctx, batch_axis=0) + data = gluon.utils.split_and_load( + batch.data[0].astype(opt.dtype), ctx_list=ctx, batch_axis=0 + ) + label = gluon.utils.split_and_load( + batch.label[0].astype(opt.dtype), ctx_list=ctx, batch_axis=0 + ) outputs = [] Ls = [] with ag.record(): @@ -394,72 +485,96 @@ def train(opt, ctx): ag.backward(Ls) trainer.step(batch.data[0].shape[0]) metric.update(label, outputs) - if opt.log_interval and not (i+1)%opt.log_interval: + if opt.log_interval and not (i + 1) % opt.log_interval: name, acc = metric.get() - logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f, %s=%f'%( - epoch, i, batch_size/(time.time()-btic), name[0], acc[0], name[1], acc[1])) + logger.info( + "Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f, %s=%f" + % ( + epoch, + i, + batch_size / (time.time() - btic), + name[0], + acc[0], + name[1], + acc[1], + ) + ) btic = time.time() - epoch_time = time.time()-tic + epoch_time = time.time() - tic # First epoch will usually be much slower than the subsequent epics, # so don't factor into the average if num_epochs > 0: - total_time = total_time + epoch_time + total_time = total_time + epoch_time num_epochs = num_epochs + 1 name, acc = metric.get() - logger.info('[Epoch %d] training: %s=%f, %s=%f'%(epoch, name[0], acc[0], name[1], acc[1])) - logger.info('[Epoch %d] time cost: %f'%(epoch, epoch_time)) + logger.info("[Epoch %d] training: %s=%f, %s=%f" % (epoch, name[0], acc[0], name[1], acc[1])) + logger.info("[Epoch %d] time cost: %f" % (epoch, epoch_time)) name, val_acc = test(ctx, val_data) - logger.info('[Epoch %d] validation: %s=%f, %s=%f'%(epoch, name[0], val_acc[0], name[1], val_acc[1])) + logger.info( + "[Epoch %d] validation: %s=%f, %s=%f" + % (epoch, name[0], val_acc[0], name[1], val_acc[1]) + ) # save model if meet requirements save_checkpoint(epoch, val_acc[0], best_acc) if num_epochs > 1: - print('Average epoch time: {}'.format(float(total_time)/(num_epochs - 1))) + print("Average epoch time: {}".format(float(total_time) / (num_epochs - 1))) + def main(): if opt.builtin_profiler > 0: profiler.set_config(profile_all=True, aggregate_stats=True) - profiler.set_state('run') - if opt.mode == 'symbolic': - data = mx.sym.var('data') - if opt.dtype == 'float16': + profiler.set_state("run") + if opt.mode == "symbolic": + data = mx.sym.var("data") + if opt.dtype == "float16": data = mx.sym.Cast(data=data, dtype=np.float16) out = net(data) - if opt.dtype == 'float16': + if opt.dtype == "float16": out = mx.sym.Cast(data=out, dtype=np.float32) - softmax = mx.sym.SoftmaxOutput(out, name='softmax') + softmax = mx.sym.SoftmaxOutput(out, name="softmax") mod = mx.mod.Module(softmax, context=context) train_data, val_data = get_data_iters(dataset, batch_size, opt) - mod.fit(train_data, - eval_data=val_data, - num_epoch=opt.epochs, - kvstore=kv, - batch_end_callback = mx.callback.Speedometer(batch_size, max(1, opt.log_interval)), - epoch_end_callback = mx.callback.do_checkpoint('image-classifier-%s'% opt.model), - optimizer = 'sgd', - optimizer_params = {'learning_rate': opt.lr, 'wd': opt.wd, 'momentum': opt.momentum, 'multi_precision': True}, - initializer = mx.init.Xavier(magnitude=2)) - mod.save_parameters('image-classifier-%s-%d-final.params'%(opt.model, opt.epochs)) + mod.fit( + train_data, + eval_data=val_data, + num_epoch=opt.epochs, + kvstore=kv, + batch_end_callback=mx.callback.Speedometer(batch_size, max(1, opt.log_interval)), + 
epoch_end_callback=mx.callback.do_checkpoint("image-classifier-%s" % opt.model), + optimizer="sgd", + optimizer_params={ + "learning_rate": opt.lr, + "wd": opt.wd, + "momentum": opt.momentum, + "multi_precision": True, + }, + initializer=mx.init.Xavier(magnitude=2), + ) + mod.save_parameters("image-classifier-%s-%d-final.params" % (opt.model, opt.epochs)) else: - if opt.mode == 'hybrid': + if opt.mode == "hybrid": net.hybridize() train(opt, context) if opt.builtin_profiler > 0: - profiler.set_state('stop') + profiler.set_state("stop") print(profiler.dumps()) -if __name__ == '__main__': + +if __name__ == "__main__": if opt.profile: - import hotshot, hotshot.stats - prof = hotshot.Profile('image-classifier-%s-%s.prof'%(opt.model, opt.mode)) + import hotshot + import hotshot.stats + + prof = hotshot.Profile("image-classifier-%s-%s.prof" % (opt.model, opt.mode)) prof.runcall(main) prof.close() - stats = hotshot.stats.load('image-classifier-%s-%s.prof'%(opt.model, opt.mode)) + stats = hotshot.stats.load("image-classifier-%s-%s.prof" % (opt.model, opt.mode)) stats.strip_dirs() - stats.sort_stats('cumtime', 'calls') + stats.sort_stats("cumtime", "calls") stats.print_stats() else: main() diff --git a/dev-support/mini-submarine/submarine/mnist_distributed.py b/dev-support/mini-submarine/submarine/mnist_distributed.py index dfa0beb5b2..8ad3c9e081 100644 --- a/dev-support/mini-submarine/submarine/mnist_distributed.py +++ b/dev-support/mini-submarine/submarine/mnist_distributed.py @@ -18,17 +18,13 @@ This example was adapted from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/tutorials/mnist/mnist_deep.py. -Each worker reads the full MNIST dataset and asynchronously trains a CNN with dropout and using the Adam optimizer, -updating the model parameters on shared parameter servers. +Each worker reads the full MNIST dataset and asynchronously trains a CNN with dropout and +using the Adam optimizer, updating the model parameters on shared parameter servers. The current training accuracy is printed out after every 100 steps. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.examples.tutorials.mnist import input_data +from __future__ import absolute_import, division, print_function import json import logging @@ -37,25 +33,28 @@ import tensorboard.program as tb_program import tensorflow as tf +from tensorflow.examples.tutorials.mnist import input_data + # import submarine # Environment variable containing port to launch TensorBoard on, set by TonY. 
-TB_PORT_ENV_VAR = 'TB_PORT' +TB_PORT_ENV_VAR = "TB_PORT" # mnist data url -tf.flags.DEFINE_string('mnist_data_url', '', - 'Url for mnist handwritten digits dataset') +tf.flags.DEFINE_string("mnist_data_url", "", "Url for mnist handwritten digits dataset") # Input/output directories -tf.flags.DEFINE_string('data_dir', '/tmp/tensorflow/mnist/input_data', - 'Directory for storing input data') -tf.flags.DEFINE_string('working_dir', '/tmp/tensorflow/mnist/working_dir', - 'Directory under which events and output will be ' - 'stored (in separate subdirectories).') +tf.flags.DEFINE_string( + "data_dir", "/tmp/tensorflow/mnist/input_data", "Directory for storing input data" +) +tf.flags.DEFINE_string( + "working_dir", + "/tmp/tensorflow/mnist/working_dir", + "Directory under which events and output will be stored (in separate subdirectories).", +) # Training parameters -tf.flags.DEFINE_integer("steps", 1500, - "The number of training steps to execute.") +tf.flags.DEFINE_integer("steps", 1500, "The number of training steps to execute.") tf.flags.DEFINE_integer("batch_size", 64, "The batch size per step.") FLAGS = tf.flags.FLAGS @@ -77,32 +76,32 @@ def deepnn(x): # Reshape to use within a convolutional neural net. # Last dimension is for "features" - there is only one here, since images are # grayscale -- it would be 3 for an RGB image, 4 for RGBA, etc. - with tf.name_scope('reshape'): + with tf.name_scope("reshape"): x_image = tf.reshape(x, [-1, 28, 28, 1]) # First convolutional layer - maps one grayscale image to 32 feature maps. - with tf.name_scope('conv1'): + with tf.name_scope("conv1"): W_conv1 = weight_variable([5, 5, 1, 32]) b_conv1 = bias_variable([32]) h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1) # Pooling layer - downsamples by 2X. - with tf.name_scope('pool1'): + with tf.name_scope("pool1"): h_pool1 = max_pool_2x2(h_conv1) # Second convolutional layer -- maps 32 feature maps to 64. - with tf.name_scope('conv2'): + with tf.name_scope("conv2"): W_conv2 = weight_variable([5, 5, 32, 64]) b_conv2 = bias_variable([64]) h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2) # Second pooling layer. - with tf.name_scope('pool2'): + with tf.name_scope("pool2"): h_pool2 = max_pool_2x2(h_conv2) # Fully connected layer 1 -- after 2 round of downsampling, our 28x28 image # is down to 7x7x64 feature maps -- maps this to 1024 features. - with tf.name_scope('fc1'): + with tf.name_scope("fc1"): W_fc1 = weight_variable([7 * 7 * 64, 1024]) b_fc1 = bias_variable([1024]) @@ -111,12 +110,12 @@ def deepnn(x): # Dropout - controls the complexity of the model, prevents co-adaptation of # features. 
- with tf.name_scope('dropout'): + with tf.name_scope("dropout"): keep_prob = tf.placeholder(tf.float32) h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob) # Map the 1024 features to 10 classes, one for each digit - with tf.name_scope('fc2'): + with tf.name_scope("fc2"): W_fc2 = weight_variable([1024, 10]) b_fc2 = bias_variable([10]) @@ -126,13 +125,12 @@ def deepnn(x): def conv2d(x, W): """conv2d returns a 2d convolution layer with full stride.""" - return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME') + return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding="SAME") def max_pool_2x2(x): """max_pool_2x2 downsamples a feature map by 2X.""" - return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], - strides=[1, 2, 2, 1], padding='SAME') + return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME") def weight_variable(shape): @@ -158,23 +156,21 @@ def create_model(): # Build the graph for the deep net y_conv, keep_prob = deepnn(x) - with tf.name_scope('loss'): - cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, - logits=y_conv) + with tf.name_scope("loss"): + cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y_conv) cross_entropy = tf.reduce_mean(cross_entropy) global_step = tf.train.get_or_create_global_step() - with tf.name_scope('adam_optimizer'): - train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy, - global_step=global_step) + with tf.name_scope("adam_optimizer"): + train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy, global_step=global_step) - with tf.name_scope('accuracy'): + with tf.name_scope("accuracy"): correct_prediction = tf.equal(tf.argmax(y_conv, 1), y_) correct_prediction = tf.cast(correct_prediction, tf.float32) accuracy = tf.reduce_mean(correct_prediction) - tf.summary.scalar('cross_entropy_loss', cross_entropy) - tf.summary.scalar('accuracy', accuracy) + tf.summary.scalar("cross_entropy_loss", cross_entropy) + tf.summary.scalar("accuracy", accuracy) merged = tf.summary.merge_all() @@ -194,8 +190,8 @@ def main(_): cluster_spec_str = os.environ["CLUSTER_SPEC"] cluster_spec = json.loads(cluster_spec_str) - ps_hosts = cluster_spec['ps'] - worker_hosts = cluster_spec['worker'] + ps_hosts = cluster_spec["ps"] + worker_hosts = cluster_spec["worker"] # Create a cluster from the parameter server and worker hosts. cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts}) @@ -209,13 +205,14 @@ def main(_): server.join() elif job_name == "worker": # Create our model graph. Assigns ops to the local worker by default. - with tf.device(tf.train.replica_device_setter( - worker_device="/job:worker/task:%d" % task_index, - cluster=cluster)): - features, labels, keep_prob, global_step, train_step, accuracy, \ - merged = create_model() - - if task_index is 0: # chief worker + with tf.device( + tf.train.replica_device_setter( + worker_device="/job:worker/task:%d" % task_index, cluster=cluster + ) + ): + features, labels, keep_prob, global_step, train_step, accuracy, merged = create_model() + + if task_index == 0: # chief worker tf.gfile.MakeDirs(FLAGS.working_dir) start_tensorboard(FLAGS.working_dir) @@ -227,26 +224,31 @@ def main(_): # asynchronous training so there is no need for the workers to # communicate. 
config_proto = tf.ConfigProto( - device_filters=['/job:ps', '/job:worker/task:%d' % task_index]) - - with tf.train.MonitoredTrainingSession(master=server.target, - is_chief=(task_index == 0), - checkpoint_dir=FLAGS.working_dir, - hooks=hooks, - config=config_proto) as sess: + device_filters=["/job:ps", "/job:worker/task:%d" % task_index] + ) + + with tf.train.MonitoredTrainingSession( + master=server.target, + is_chief=(task_index == 0), + checkpoint_dir=FLAGS.working_dir, + hooks=hooks, + config=config_proto, + ) as sess: # Import data - logging.info('Extracting and loading input data...') - # Use a different data dir name to workaround "file already exists issue" when downloading dataset in the same single node - if FLAGS.mnist_data_url == '': - logging.info('Getting mnist data from default url') + logging.info("Extracting and loading input data...") + # Use a different data dir name to workaround "file already exists issue" + # when downloading dataset in the same single node + if FLAGS.mnist_data_url == "": + logging.info("Getting mnist data from default url") mnist = input_data.read_data_sets(FLAGS.data_dir + str(task_index)) else: - logging.info('Getting mnist data from ' + FLAGS.mnist_data_url) - mnist = input_data.read_data_sets(FLAGS.data_dir + str(task_index), - source_url=FLAGS.mnist_data_url) + logging.info("Getting mnist data from " + FLAGS.mnist_data_url) + mnist = input_data.read_data_sets( + FLAGS.data_dir + str(task_index), source_url=FLAGS.mnist_data_url + ) # Train - logging.info('Starting training') + logging.info("Starting training") i = 0 while not sess.should_stop(): # Before use submarine-sdk, start Mysql server first @@ -255,21 +257,21 @@ def main(_): if i % 100 == 0: step, _, train_accuracy = sess.run( [global_step, train_step, accuracy], - feed_dict={features: batch[0], labels: batch[1], - keep_prob: 1.0}) - logging.info('Step %d, training accuracy: %g' % ( - step, train_accuracy)) + feed_dict={features: batch[0], labels: batch[1], keep_prob: 1.0}, + ) + logging.info("Step %d, training accuracy: %g" % (step, train_accuracy)) # Before use submarine-sdk, start Mysql server first # submarine.log_metric("accuracy", train_accuracy, i) else: - sess.run([global_step, train_step], - feed_dict={features: batch[0], labels: batch[1], - keep_prob: 0.5}) + sess.run( + [global_step, train_step], + feed_dict={features: batch[0], labels: batch[1], keep_prob: 0.5}, + ) i += 1 - logging.info('Done training!') + logging.info("Done training!") sys.exit() -if __name__ == '__main__': +if __name__ == "__main__": tf.app.run() diff --git a/dev-support/mini-submarine/submarine/mnist_distributed_tf2.py b/dev-support/mini-submarine/submarine/mnist_distributed_tf2.py index 2f880e6f5b..aa8bab1bc5 100644 --- a/dev-support/mini-submarine/submarine/mnist_distributed_tf2.py +++ b/dev-support/mini-submarine/submarine/mnist_distributed_tf2.py @@ -13,8 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os import json +import os + import tensorflow as tf import tensorflow_datasets as tfds @@ -22,25 +23,28 @@ BUFFER_SIZE = 10000 LEARNING_RATE = 1e-4 + def get_task_name(): cluster_spec = os.environ.get("CLUSTER_SPEC", None) - task_name = '' + task_name = "" if cluster_spec: cluster_spec = json.loads(cluster_spec) job_index = os.environ["TASK_INDEX"] job_name = os.environ["JOB_NAME"] - task_name = job_name + '_' + job_index + task_name = job_name + "_" + job_index return task_name + def input_fn(mode, input_context=None): - datasets, info = tfds.load(name='mnist', - data_dir='/tmp/' + get_task_name() + '/data', - with_info=True, - as_supervised=True) + datasets, info = tfds.load( + name="mnist", + data_dir="/tmp/" + get_task_name() + "/data", + with_info=True, + as_supervised=True, + ) - mnist_dataset = (datasets['train'] if mode == tf.estimator.ModeKeys.TRAIN - else datasets['test']) + mnist_dataset = datasets["train"] if mode == tf.estimator.ModeKeys.TRAIN else datasets["test"] def scale(image, label): image = tf.cast(image, tf.float32) @@ -49,50 +53,48 @@ def scale(image, label): if input_context: mnist_dataset = mnist_dataset.shard( - input_context.num_input_pipelines, - input_context.input_pipeline_id) + input_context.num_input_pipelines, input_context.input_pipeline_id + ) return mnist_dataset.map(scale).cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE) + def model_fn(features, labels, mode): - model = tf.keras.Sequential([ - tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)), - tf.keras.layers.MaxPooling2D(), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(64, activation='relu'), - tf.keras.layers.Dense(10) - ]) + model = tf.keras.Sequential( + [ + tf.keras.layers.Conv2D(32, 3, activation="relu", input_shape=(28, 28, 1)), + tf.keras.layers.MaxPooling2D(), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(64, activation="relu"), + tf.keras.layers.Dense(10), + ] + ) logits = model(features, training=False) if mode == tf.estimator.ModeKeys.PREDICT: - predictions = {'logits': logits} + predictions = {"logits": logits} return tf.estimator.EstimatorSpec(labels=labels, predictions=predictions) - optimizer = tf.compat.v1.train.GradientDescentOptimizer( - learning_rate=LEARNING_RATE) + optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE) loss = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.NONE)(labels, logits) - loss = tf.reduce_sum(loss) * (1. 
/ BATCH_SIZE) + from_logits=True, reduction=tf.keras.losses.Reduction.NONE + )(labels, logits) + loss = tf.reduce_sum(loss) * (1.0 / BATCH_SIZE) if mode == tf.estimator.ModeKeys.EVAL: return tf.estimator.EstimatorSpec(mode, loss=loss) return tf.estimator.EstimatorSpec( mode=mode, loss=loss, - train_op=optimizer.minimize( - loss, tf.compat.v1.train.get_or_create_global_step())) + train_op=optimizer.minimize(loss, tf.compat.v1.train.get_or_create_global_step()), + ) + -if __name__ == '__main__': +if __name__ == "__main__": strategy = tf.distribute.experimental.ParameterServerStrategy() config = tf.estimator.RunConfig(train_distribute=strategy) - estimator = tf.estimator.Estimator( - model_fn=model_fn, - model_dir='/tmp/model', - config=config) + estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir="/tmp/model", config=config) train_spec = tf.estimator.TrainSpec(input_fn) eval_spec = tf.estimator.EvalSpec(input_fn) - tf.estimator.train_and_evaluate( - estimator, - train_spec, - eval_spec) + tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) diff --git a/dev-support/mini-submarine/submarine/pytorch_mnist_distributed.py b/dev-support/mini-submarine/submarine/pytorch_mnist_distributed.py index 667603aefb..63c1ecf140 100644 --- a/dev-support/mini-submarine/submarine/pytorch_mnist_distributed.py +++ b/dev-support/mini-submarine/submarine/pytorch_mnist_distributed.py @@ -18,8 +18,8 @@ https://pytorch.org/tutorials/intermediate/dist_tuto.html https://github.com/narumiruna/pytorch-distributed-example/blob/master/mnist/main.py -Each worker reads the full MNIST dataset and asynchronously trains a CNN with dropout and using the Adam optimizer, -updating the model parameters on shared parameter servers. +Each worker reads the full MNIST dataset and asynchronously trains a CNN with dropout and +using the Adam optimizer, updating the model parameters on shared parameter servers. The current training accuracy is printed out after every 100 steps. 
""" @@ -27,18 +27,17 @@ from __future__ import division, print_function import argparse - import os + import torch import torch.nn.functional as F from torch import distributed, nn -from torch.utils import data +from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler from torchvision import datasets, transforms class AverageMeter(object): - def __init__(self): self.sum = 0 self.count = 0 @@ -53,7 +52,6 @@ def average(self): class AccuracyMeter(object): - def __init__(self): self.correct = 0 self.count = 0 @@ -71,7 +69,6 @@ def accuracy(self): class Trainer(object): - def __init__(self, net, optimizer, train_loader, test_loader, device): self.net = net self.optimizer = optimizer @@ -130,8 +127,7 @@ def average_gradients(self): tensor = p.grad.data.cpu() - distributed.all_reduce( - tensor, op=distributed.reduce_op.SUM, group=group) + distributed.all_reduce(tensor, op=distributed.reduce_op.SUM, group=group) tensor /= float(world_size) @@ -139,7 +135,6 @@ def average_gradients(self): class Net(nn.Module): - def __init__(self): super(Net, self).__init__() self.fc = nn.Linear(784, 10) @@ -149,31 +144,32 @@ def forward(self, x): def get_dataloader(root, batch_size): - transform = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.13066047740239478,), (0.3081078087569972,)) - ]) - - train_set = datasets.MNIST( - root, train=True, transform=transform, download=True) + transform = transforms.Compose( + # https://github.com/psf/black/issues/2434 + # fmt: off + [transforms.ToTensor(), + transforms.Normalize((0.13066047740239478,), (0.3081078087569972,))] + # fmt: on + ) + + train_set = datasets.MNIST(root, train=True, transform=transform, download=True) sampler = DistributedSampler(train_set) - train_loader = data.DataLoader( - train_set, - batch_size=batch_size, - shuffle=(sampler is None), - sampler=sampler) + train_loader = DataLoader( + train_set, batch_size=batch_size, shuffle=(sampler is None), sampler=sampler + ) - test_loader = data.DataLoader( + test_loader = DataLoader( datasets.MNIST(root, train=False, transform=transform, download=True), batch_size=batch_size, - shuffle=False) + shuffle=False, + ) return train_loader, test_loader def solve(args): - device = torch.device('cuda' if args.cuda else 'cpu') + device = torch.device("cuda" if args.cuda else "cpu") net = Net().to(device) @@ -188,9 +184,11 @@ def solve(args): test_loss, test_acc = trainer.evaluate() print( - 'Epoch: {}/{},'.format(epoch, args.epochs), - 'train loss: {:.6f}, train acc: {:.6f}, test loss: {:.6f}, test acc: {:.6f}.'. 
- format(train_loss, train_acc, test_loss, test_acc)) + "Epoch: {}/{},".format(epoch, args.epochs), + "train loss: {:.6f}, train acc: {:.6f}, test loss: {:.6f}, test acc: {:.6f}.".format( + train_loss, train_acc, test_loss, test_acc + ), + ) def init_process(args): @@ -198,38 +196,39 @@ def init_process(args): backend=args.backend, init_method=args.init_method, rank=args.rank, - world_size=args.world_size) + world_size=args.world_size, + ) def main(): parser = argparse.ArgumentParser() + parser.add_argument("--backend", type=str, default="tcp", help="Name of the backend to use.") parser.add_argument( - '--backend', - type=str, - default='tcp', - help='Name of the backend to use.') - parser.add_argument( - '--init-method', - '-i', + "--init-method", + "-i", type=str, - default=os.environ.get('INIT_METHOD', 'tcp://127.0.0.1:23456'), - help='URL specifying how to initialize the package.') + default=os.environ.get("INIT_METHOD", "tcp://127.0.0.1:23456"), + help="URL specifying how to initialize the package.", + ) parser.add_argument( - '--rank', '-r', + "--rank", + "-r", type=int, - default=int(os.environ.get('RANK')), - help='Rank of the current process.') + default=int(os.environ.get("RANK")), + help="Rank of the current process.", + ) parser.add_argument( - '--world-size', - '-s', + "--world-size", + "-s", type=int, - default=int(os.environ.get('WORLD')), - help='Number of processes participating in the job.') - parser.add_argument('--epochs', type=int, default=20) - parser.add_argument('--no-cuda', action='store_true') - parser.add_argument('--learning-rate', '-lr', type=float, default=1e-3) - parser.add_argument('--root', type=str, default='data') - parser.add_argument('--batch-size', type=int, default=128) + default=int(os.environ.get("WORLD")), + help="Number of processes participating in the job.", + ) + parser.add_argument("--epochs", type=int, default=20) + parser.add_argument("--no-cuda", action="store_true") + parser.add_argument("--learning-rate", "-lr", type=float, default=1e-3) + parser.add_argument("--root", type=str, default="data") + parser.add_argument("--batch-size", type=int, default=128) args = parser.parse_args() args.cuda = torch.cuda.is_available() and not args.no_cuda print(args) @@ -238,5 +237,5 @@ def main(): solve(args) -if __name__ == '__main__': - main() \ No newline at end of file +if __name__ == "__main__": + main() diff --git a/dev-support/misc/flask/server.py b/dev-support/misc/flask/server.py index 439337cf1f..4c36c9c375 100644 --- a/dev-support/misc/flask/server.py +++ b/dev-support/misc/flask/server.py @@ -19,10 +19,12 @@ app = Flask(__name__) + @app.route("/") def hello(): - return "Hello, World!" + return "Hello, World!" + @app.route("/invocations") def invocation(): - return "Invocation" \ No newline at end of file + return "Invocation" diff --git a/submarine-sdk/pysubmarine/github-actions/auto-format.sh b/dev-support/style-check/python/auto-format.sh similarity index 85% rename from submarine-sdk/pysubmarine/github-actions/auto-format.sh rename to dev-support/style-check/python/auto-format.sh index b3327bbe13..f84269167e 100755 --- a/submarine-sdk/pysubmarine/github-actions/auto-format.sh +++ b/dev-support/style-check/python/auto-format.sh @@ -18,11 +18,11 @@ set -euxo pipefail FWDIR="$(cd "$(dirname "$0")"; pwd)" cd "$FWDIR" -cd .. 
+cd ../../../ -# Autoformat code -yapf -i submarine/**/*.py tests/**/*.py # Sort imports -isort submarine/**/*.py tests/**/*.py +isort submarine-sdk/ dev-support/ website/ +# Autoformat code +black submarine-sdk/ dev-support/ website/ --experimental-string-processing -set +euxo pipefail +set +euxo pipefail \ No newline at end of file diff --git a/submarine-sdk/pysubmarine/github-actions/lint-requirements.txt b/dev-support/style-check/python/lint-requirements.txt similarity index 91% rename from submarine-sdk/pysubmarine/github-actions/lint-requirements.txt rename to dev-support/style-check/python/lint-requirements.txt index 38e66c9325..ea33540217 100644 --- a/submarine-sdk/pysubmarine/github-actions/lint-requirements.txt +++ b/dev-support/style-check/python/lint-requirements.txt @@ -13,8 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -pep8==1.7.1 -pylint==2.4.4 -pycodestyle==2.5.0 -yapf==0.30.0 -isort==4.3.21 +black[jupyter]==21.8b0 +flake8==3.9.2 +isort==5.9.3 diff --git a/submarine-sdk/pysubmarine/github-actions/lint.sh b/dev-support/style-check/python/lint.sh similarity index 54% rename from submarine-sdk/pysubmarine/github-actions/lint.sh rename to dev-support/style-check/python/lint.sh index 9e17c8e5ff..db1f3afdf6 100755 --- a/submarine-sdk/pysubmarine/github-actions/lint.sh +++ b/dev-support/style-check/python/lint.sh @@ -18,26 +18,13 @@ set -euxo pipefail FWDIR="$(cd "$(dirname "$0")"; pwd)" cd "$FWDIR" -cd .. +cd ../../../ -pycodestyle --max-line-length=100 -- submarine tests -pylint --ignore experiment --msg-template="{path} ({line},{column}): [{msg_id} {symbol}] {msg}" --rcfile=pylintrc -- submarine tests -./github-actions/auto-format.sh - -GIT_STATUS="$(git status --porcelain)" -# Only check the files in ./pysubmarine -GIT_DIFF="$(git diff .)" -if [ "$GIT_STATUS" ]; then - echo "Code is not formatted by yapf and isort. 
Please run ./github-actions/auto-format.sh" - echo "Git status is" - echo "------------------------------------------------------------------" - echo "$GIT_STATUS" - echo "Git diff ./pysubmarine is" - echo "------------------------------------------------------------------" - echo "$GIT_DIFF" - exit 1 -else - echo "Test successful" -fi +# Check imports +isort -c submarine-sdk/ dev-support/ website/ +# Check code format +black submarine-sdk/ dev-support/ website/ --check --experimental-string-processing +# Check lint: Not checking the code in website +flake8 submarine-sdk/ dev-support/ set +euxo pipefail diff --git a/dev-support/submarine-installer/scripts/combine-docker-daemons.py b/dev-support/submarine-installer/scripts/combine-docker-daemons.py index 5d5ff58d4e..60da4b8ede 100644 --- a/dev-support/submarine-installer/scripts/combine-docker-daemons.py +++ b/dev-support/submarine-installer/scripts/combine-docker-daemons.py @@ -17,18 +17,20 @@ import json import sys + def combineJsons(jsonFile1, jsonFile2, outputFile): - dict1 = json.load(open(jsonFile1)) - dict2 = json.load(open(jsonFile2)) - dict3 = dict(dict1.items() + dict2.items()) + dict1 = json.load(open(jsonFile1)) + dict2 = json.load(open(jsonFile2)) + dict3 = dict(dict1.items() + dict2.items()) + + with open(outputFile, "w") as output: + json.dump(dict3, output, indent=2, sort_keys=True) - with open(outputFile, 'w') as output: - json.dump(dict3, output, indent=2, sort_keys=True) + return True - return True -if __name__ == '__main__': - if (len(sys.argv) < 4): - raise Exception,u"3 arguments needed" +if __name__ == "__main__": + if len(sys.argv) < 4: + raise Exception(u"3 arguments needed") - print(combineJsons(sys.argv[1], sys.argv[2], sys.argv[3])) + print(combineJsons(sys.argv[1], sys.argv[2], sys.argv[3])) diff --git a/dev-support/submarine-installer/scripts/xmlcombine.py b/dev-support/submarine-installer/scripts/xmlcombine.py index 2b049d0206..c9a30ab917 100644 --- a/dev-support/submarine-installer/scripts/xmlcombine.py +++ b/dev-support/submarine-installer/scripts/xmlcombine.py @@ -17,6 +17,7 @@ import sys from xml.etree import ElementTree + def run(files): first = None for filename in files: @@ -26,7 +27,8 @@ def run(files): else: first.extend(data) if first is not None: - print ElementTree.tostring(first) + print(ElementTree.tostring(first)) + if __name__ == "__main__": run(sys.argv[1:]) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..e37013320e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,23 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +[tool.isort] +profile = "black" +line_length = 100 +[tool.black] +max-line-length = 100 +line-length = 100 \ No newline at end of file diff --git a/submarine-sdk/pysubmarine/example/deepfm_example.ipynb b/submarine-sdk/pysubmarine/example/deepfm_example.ipynb index 232baf03f5..c09b0e8d51 100644 --- a/submarine-sdk/pysubmarine/example/deepfm_example.ipynb +++ b/submarine-sdk/pysubmarine/example/deepfm_example.ipynb @@ -184,7 +184,7 @@ } ], "source": [ - "model = DeepFM(json_path='deepfm.json')\n", + "model = DeepFM(json_path=\"deepfm.json\")\n", "model.train()\n", "result = model.evaluate()\n", "print(\"Model metrics : \", result)" diff --git a/submarine-sdk/pysubmarine/example/pytorch/afm/run_afm.py b/submarine-sdk/pysubmarine/example/pytorch/afm/run_afm.py index e6c28d6c39..c086c8c659 100644 --- a/submarine-sdk/pysubmarine/example/pytorch/afm/run_afm.py +++ b/submarine-sdk/pysubmarine/example/pytorch/afm/run_afm.py @@ -14,28 +14,28 @@ # limitations under the License. -from submarine.ml.pytorch.model.ctr import AFM - import argparse -if __name__ == '__main__': +from submarine.ml.pytorch.model.ctr import AFM + +if __name__ == "__main__": parser = argparse.ArgumentParser() + parser.add_argument("-conf", help="a JSON configuration file for AFM", type=str) parser.add_argument( - "-conf", help="a JSON configuration file for AFM", type=str) - parser.add_argument("-task_type", default='train', - help="train or evaluate, by default is train") + "-task_type", default="train", help="train or evaluate, by default is train" + ) args = parser.parse_args() trainer = AFM(json_path=args.conf) - if args.task_type == 'train': + if args.task_type == "train": trainer.fit() - print('[Train Done]') - elif args.task_type == 'evaluate': + print("[Train Done]") + elif args.task_type == "evaluate": score = trainer.evaluate() - print(f'Eval score: {score}') - elif args.task_type == 'predict': + print(f"Eval score: {score}") + elif args.task_type == "predict": pred = trainer.predict() - print('Predict:', pred) + print("Predict:", pred) else: assert False, args.task_type diff --git a/submarine-sdk/pysubmarine/example/pytorch/deepfm/run_deepfm.py b/submarine-sdk/pysubmarine/example/pytorch/deepfm/run_deepfm.py index 8bff12e0e9..6d3d622304 100644 --- a/submarine-sdk/pysubmarine/example/pytorch/deepfm/run_deepfm.py +++ b/submarine-sdk/pysubmarine/example/pytorch/deepfm/run_deepfm.py @@ -14,28 +14,28 @@ # limitations under the License. 
-from submarine.ml.pytorch.model.ctr import DeepFM - import argparse -if __name__ == '__main__': +from submarine.ml.pytorch.model.ctr import DeepFM + +if __name__ == "__main__": parser = argparse.ArgumentParser() + parser.add_argument("-conf", help="a JSON configuration file for DeepFM", type=str) parser.add_argument( - "-conf", help="a JSON configuration file for DeepFM", type=str) - parser.add_argument("-task_type", default='train', - help="train or evaluate, by default is train") + "-task_type", default="train", help="train or evaluate, by default is train" + ) args = parser.parse_args() trainer = DeepFM(json_path=args.conf) - if args.task_type == 'train': + if args.task_type == "train": trainer.fit() - print('[Train Done]') - elif args.task_type == 'evaluate': + print("[Train Done]") + elif args.task_type == "evaluate": score = trainer.evaluate() - print(f'Eval score: {score}') - elif args.task_type == 'predict': + print(f"Eval score: {score}") + elif args.task_type == "predict": pred = trainer.predict() - print('Predict:', pred) + print("Predict:", pred) else: assert False, args.task_type diff --git a/submarine-sdk/pysubmarine/example/submarine_experiment_sdk.ipynb b/submarine-sdk/pysubmarine/example/submarine_experiment_sdk.ipynb index fedc05c593..7d51538d59 100644 --- a/submarine-sdk/pysubmarine/example/submarine_experiment_sdk.ipynb +++ b/submarine-sdk/pysubmarine/example/submarine_experiment_sdk.ipynb @@ -84,23 +84,25 @@ }, "outputs": [], "source": [ - "environment = EnvironmentSpec(image='apache/submarine:tf-dist-mnist-test-1.0')\n", - "experiment_meta = ExperimentMeta(name='mnist-dist',\n", - " namespace='default',\n", - " framework='Tensorflow',\n", - " cmd='python /var/tf_dist_mnist/dist_mnist.py --train_steps=100',\n", - " env_vars={'ENV1': 'ENV1'})\n", + "environment = EnvironmentSpec(image=\"apache/submarine:tf-dist-mnist-test-1.0\")\n", + "experiment_meta = ExperimentMeta(\n", + " name=\"mnist-dist\",\n", + " namespace=\"default\",\n", + " framework=\"Tensorflow\",\n", + " cmd=\"python /var/tf_dist_mnist/dist_mnist.py --train_steps=100\",\n", + " env_vars={\"ENV1\": \"ENV1\"},\n", + ")\n", "\n", - "worker_spec = ExperimentTaskSpec(resources='cpu=1,memory=1024M',\n", - " replicas=1)\n", - "ps_spec = ExperimentTaskSpec(resources='cpu=1,memory=1024M',\n", - " replicas=1)\n", - "code_spec = CodeSpec(sync_mode='git', url='https://github.com/apache/submarine.git')\n", + "worker_spec = ExperimentTaskSpec(resources=\"cpu=1,memory=1024M\", replicas=1)\n", + "ps_spec = ExperimentTaskSpec(resources=\"cpu=1,memory=1024M\", replicas=1)\n", + "code_spec = CodeSpec(sync_mode=\"git\", url=\"https://github.com/apache/submarine.git\")\n", "\n", - "experiment_spec = ExperimentSpec(meta=experiment_meta,\n", - " environment=environment,\n", - " code=code_spec,\n", - " spec={'Ps' : ps_spec,'Worker': worker_spec})\n" + "experiment_spec = ExperimentSpec(\n", + " meta=experiment_meta,\n", + " environment=environment,\n", + " code=code_spec,\n", + " spec={\"Ps\": ps_spec, \"Worker\": worker_spec},\n", + ")" ] }, { @@ -160,7 +162,7 @@ } ], "source": [ - "id = experiment['experimentId']\n", + "id = experiment[\"experimentId\"]\n", "submarine_client.get_experiment(id)" ] }, @@ -191,7 +193,7 @@ } ], "source": [ - "status = 'running'\n", + "status = \"running\"\n", "submarine_client.list_experiments(status=status)" ] }, diff --git a/submarine-sdk/pysubmarine/example/tensorflow/ccpm/run_ccpm.py b/submarine-sdk/pysubmarine/example/tensorflow/ccpm/run_ccpm.py index 88acfca58c..1d21bbcb24 100644 --- 
a/submarine-sdk/pysubmarine/example/tensorflow/ccpm/run_ccpm.py +++ b/submarine-sdk/pysubmarine/example/tensorflow/ccpm/run_ccpm.py @@ -13,22 +13,24 @@ # See the License for the specific language governing permissions and # limitations under the License. -from submarine.ml.tensorflow.model import CCPM import argparse -if __name__ == '__main__': +from submarine.ml.tensorflow.model import CCPM + +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-conf", help="a JSON configuration file for CCPM", type=str) - parser.add_argument("-task_type", default='train', - help="train or evaluate, by default is train") + parser.add_argument( + "-task_type", default="train", help="train or evaluate, by default is train" + ) args = parser.parse_args() json_path = args.conf task_type = args.task_type model = CCPM(json_path=json_path) - if task_type == 'train': + if task_type == "train": model.train() - if task_type == 'evaluate': + if task_type == "evaluate": result = model.evaluate() print("Model metrics : ", result) diff --git a/submarine-sdk/pysubmarine/example/tensorflow/deepfm/run_deepfm.py b/submarine-sdk/pysubmarine/example/tensorflow/deepfm/run_deepfm.py index ba4285c450..d1c4df65ca 100644 --- a/submarine-sdk/pysubmarine/example/tensorflow/deepfm/run_deepfm.py +++ b/submarine-sdk/pysubmarine/example/tensorflow/deepfm/run_deepfm.py @@ -13,22 +13,24 @@ # See the License for the specific language governing permissions and # limitations under the License. -from submarine.ml.tensorflow.model import DeepFM import argparse -if __name__ == '__main__': +from submarine.ml.tensorflow.model import DeepFM + +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-conf", help="a JSON configuration file for DeepFM", type=str) - parser.add_argument("-task_type", default='train', - help="train or evaluate, by default is train") + parser.add_argument( + "-task_type", default="train", help="train or evaluate, by default is train" + ) args = parser.parse_args() json_path = args.conf task_type = args.task_type model = DeepFM(json_path=json_path) - if task_type == 'train': + if task_type == "train": model.train() - if task_type == 'evaluate': + if task_type == "evaluate": result = model.evaluate() print("Model metrics : ", result) diff --git a/submarine-sdk/pysubmarine/example/tensorflow/fm/run_fm.py b/submarine-sdk/pysubmarine/example/tensorflow/fm/run_fm.py index 77ed952afa..a4f447ccfe 100644 --- a/submarine-sdk/pysubmarine/example/tensorflow/fm/run_fm.py +++ b/submarine-sdk/pysubmarine/example/tensorflow/fm/run_fm.py @@ -13,22 +13,24 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from submarine.ml.tensorflow.model import FM import argparse -if __name__ == '__main__': +from submarine.ml.tensorflow.model import FM + +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-conf", help="a JSON configuration file for FM", type=str) - parser.add_argument("-task_type", default='train', - help="train or evaluate, by default is train") + parser.add_argument( + "-task_type", default="train", help="train or evaluate, by default is train" + ) args = parser.parse_args() json_path = args.conf task_type = args.task_type model = FM(json_path=json_path) - if task_type == 'train': + if task_type == "train": model.train() - if task_type == 'evaluate': + if task_type == "evaluate": result = model.evaluate() print("Model metrics : ", result) diff --git a/submarine-sdk/pysubmarine/example/tracking.py b/submarine-sdk/pysubmarine/example/tracking.py index 071b9c52d9..b5d7400a24 100644 --- a/submarine-sdk/pysubmarine/example/tracking.py +++ b/submarine-sdk/pysubmarine/example/tracking.py @@ -15,15 +15,15 @@ import numpy as np from sklearn.linear_model import LogisticRegression + import submarine if __name__ == "__main__": X = np.array([-2, -1, 0, 1, 2, 1]).reshape(-1, 1) y = np.array([0, 0, 1, 1, 1, 0]) - lr = LogisticRegression(solver='liblinear', max_iter=100) + lr = LogisticRegression(solver="liblinear", max_iter=100) submarine.log_param("max_iter", 100) lr.fit(X, y) score = lr.score(X, y) print("Score: %s" % score) submarine.log_metric("score", score) - diff --git a/submarine-sdk/pysubmarine/setup.py b/submarine-sdk/pysubmarine/setup.py index 1b7bd5a98f..deaa521c26 100644 --- a/submarine-sdk/pysubmarine/setup.py +++ b/submarine-sdk/pysubmarine/setup.py @@ -13,46 +13,46 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from setuptools import setup, find_packages +from setuptools import find_packages, setup -with open('README.md') as f: +with open("README.md") as f: long_description = f.read() setup( - name='apache-submarine', - version='0.6.0-SNAPSHOT', + name="apache-submarine", + version="0.6.0-SNAPSHOT", description="A python SDK for submarine", long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/apache/submarine", - packages=find_packages(exclude=['tests', 'tests.*']), + packages=find_packages(exclude=["tests", "tests.*"]), install_requires=[ - 'six>=1.10.0', - 'numpy==1.18.5', - 'pandas', - 'sqlalchemy', - 'sqlparse', - 'pymysql', - 'requests==2.26.0', - 'urllib3>=1.15.1', - 'certifi>=14.05.14', - 'python-dateutil>=2.5.3', - 'pyarrow==0.17.0', - 'mlflow>=1.15.0', - 'boto3>=1.17.58', + "six>=1.10.0", + "numpy==1.18.5", + "pandas", + "sqlalchemy", + "sqlparse", + "pymysql", + "requests==2.26.0", + "urllib3>=1.15.1", + "certifi>=14.05.14", + "python-dateutil>=2.5.3", + "pyarrow==0.17.0", + "mlflow>=1.15.0", + "boto3>=1.17.58", ], extras_require={ - 'tf': ['tensorflow>=1.14.0,<2.0.0'], - 'tf-latest': ['tensorflow'], - 'pytorch': ['torch>=1.5.0', 'torchvision>=0.6.0'], + "tf": ["tensorflow>=1.14.0,<2.0.0"], + "tf-latest": ["tensorflow"], + "pytorch": ["torch>=1.5.0", "torchvision>=0.6.0"], }, classifiers=[ - 'Intended Audience :: Developers', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', + "Intended Audience :: Developers", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", ], - license='Apache License, Version 2.0', - maintainer='Apache Submarine Community', - maintainer_email='dev@submarine.apache.org', + license="Apache License, Version 2.0", + maintainer="Apache Submarine Community", + maintainer_email="dev@submarine.apache.org", ) diff --git a/submarine-sdk/pysubmarine/submarine/__init__.py b/submarine-sdk/pysubmarine/submarine/__init__.py index c16c541f29..979e996830 100644 --- a/submarine-sdk/pysubmarine/submarine/__init__.py +++ b/submarine-sdk/pysubmarine/submarine/__init__.py @@ -23,10 +23,11 @@ set_tracking_uri = tracking.set_tracking_uri get_tracking_uri = tracking.get_tracking_uri -__all__ = ["log_metric", - "log_param", - "set_tracking_uri", - "get_tracking_uri", - "ExperimentClient", - "ModelsClient" - ] +__all__ = [ + "log_metric", + "log_param", + "set_tracking_uri", + "get_tracking_uri", + "ExperimentClient", + "ModelsClient", +] diff --git a/submarine-sdk/pysubmarine/submarine/entities/_submarine_object.py b/submarine-sdk/pysubmarine/submarine/entities/_submarine_object.py index 92b0f3556f..ffcea7fc29 100644 --- a/submarine-sdk/pysubmarine/submarine/entities/_submarine_object.py +++ b/submarine-sdk/pysubmarine/submarine/entities/_submarine_object.py @@ -17,7 +17,6 @@ class _SubmarineObject: - def __iter__(self): # Iterate through list of properties and yield as key -> value for prop in self._properties(): @@ -25,16 +24,11 @@ def __iter__(self): @classmethod def _properties(cls): - return sorted( - [p for p in cls.__dict__ if isinstance(getattr(cls, p), property)]) + return sorted([p for p in cls.__dict__ if isinstance(getattr(cls, p), property)]) @classmethod def from_dictionary(cls, the_dict): - filtered_dict = { - key: value - for key, value in the_dict.items() - if key in cls._properties() - } + filtered_dict = {key: value for key, value in the_dict.items() if key in 
cls._properties()} return cls(**filtered_dict) def __repr__(self): @@ -50,17 +44,14 @@ def get_classname(obj): class _SubmarineObjectPrinter: - def __init__(self): super(_SubmarineObjectPrinter, self).__init__() self.printer = pprint.PrettyPrinter() def to_string(self, obj): if isinstance(obj, _SubmarineObject): - return "<%s: %s>" % (get_classname(obj), - self._entity_to_string(obj)) + return "<%s: %s>" % (get_classname(obj), self._entity_to_string(obj)) return self.printer.pformat(obj) def _entity_to_string(self, entity): - return ", ".join( - ["%s=%s" % (key, self.to_string(value)) for key, value in entity]) + return ", ".join(["%s=%s" % (key, self.to_string(value)) for key, value in entity]) diff --git a/submarine-sdk/pysubmarine/submarine/exceptions.py b/submarine-sdk/pysubmarine/submarine/exceptions.py index 10598cc518..d6c5c4d34f 100644 --- a/submarine-sdk/pysubmarine/submarine/exceptions.py +++ b/submarine-sdk/pysubmarine/submarine/exceptions.py @@ -31,8 +31,10 @@ class RestException(SubmarineException): """Exception thrown on non 200-level responses from the REST API""" def __init__(self, json): - error_code = json.get('error_code') - message = "%s: %s" % (error_code, json['message'] if 'message' in json - else "Response: " + str(json)) + error_code = json.get("error_code") + message = "%s: %s" % ( + error_code, + json["message"] if "message" in json else "Response: " + str(json), + ) super(RestException, self).__init__(message) self.json = json diff --git a/submarine-sdk/pysubmarine/submarine/experiment/__init__.py b/submarine-sdk/pysubmarine/submarine/experiment/__init__.py index 1fc462299e..618170f081 100644 --- a/submarine-sdk/pysubmarine/submarine/experiment/__init__.py +++ b/submarine-sdk/pysubmarine/submarine/experiment/__init__.py @@ -30,12 +30,18 @@ # import apis into sdk package from submarine.experiment.api.experiment_api import ExperimentApi + # import ApiClient from submarine.experiment.api_client import ApiClient from submarine.experiment.configuration import Configuration -from submarine.experiment.exceptions import (ApiException, ApiKeyError, - ApiTypeError, ApiValueError, - OpenApiException) +from submarine.experiment.exceptions import ( + ApiException, + ApiKeyError, + ApiTypeError, + ApiValueError, + OpenApiException, +) + # import models into sdk package from submarine.experiment.models.code_spec import CodeSpec from submarine.experiment.models.environment_spec import EnvironmentSpec diff --git a/submarine-sdk/pysubmarine/submarine/experiment/api/__init__.py b/submarine-sdk/pysubmarine/submarine/experiment/api/__init__.py index 62dc60eae7..8f94cbba1f 100644 --- a/submarine-sdk/pysubmarine/submarine/experiment/api/__init__.py +++ b/submarine-sdk/pysubmarine/submarine/experiment/api/__init__.py @@ -15,7 +15,7 @@ from __future__ import absolute_import -# flake8: noqa - # import apis into api package from submarine.experiment.api.experiment_api import ExperimentApi + +# flake8: noqa diff --git a/submarine-sdk/pysubmarine/submarine/experiment/api/experiment_api.py b/submarine-sdk/pysubmarine/submarine/experiment/api/experiment_api.py index 46336e917b..e2a9fb8e8d 100644 --- a/submarine-sdk/pysubmarine/submarine/experiment/api/experiment_api.py +++ b/submarine-sdk/pysubmarine/submarine/experiment/api/experiment_api.py @@ -34,10 +34,7 @@ import six from submarine.experiment.api_client import ApiClient -from submarine.experiment.exceptions import ( # noqa: F401 - ApiTypeError, - ApiValueError -) +from submarine.experiment.exceptions import ApiTypeError, ApiValueError # 
noqa: F401 class ExperimentApi(object): @@ -73,7 +70,7 @@ def create_experiment(self, **kwargs): # noqa: E501 If the method is called asynchronously, returns the request thread. """ - kwargs['_return_http_data_only'] = True + kwargs["_return_http_data_only"] = True return self.create_experiment_with_http_info(**kwargs) # noqa: E501 def create_experiment_with_http_info(self, **kwargs): # noqa: E501 @@ -102,26 +99,18 @@ def create_experiment_with_http_info(self, **kwargs): # noqa: E501 local_var_params = locals() - all_params = [ - 'experiment_spec' - ] + all_params = ["experiment_spec"] all_params.extend( - [ - 'async_req', - '_return_http_data_only', - '_preload_content', - '_request_timeout' - ] + ["async_req", "_return_http_data_only", "_preload_content", "_request_timeout"] ) - for key, val in six.iteritems(local_var_params['kwargs']): + for key, val in six.iteritems(local_var_params["kwargs"]): if key not in all_params: raise ApiTypeError( - "Got an unexpected keyword argument '%s'" - " to method create_experiment" % key + "Got an unexpected keyword argument '%s' to method create_experiment" % key ) local_var_params[key] = val - del local_var_params['kwargs'] + del local_var_params["kwargs"] collection_formats = {} @@ -135,34 +124,38 @@ def create_experiment_with_http_info(self, **kwargs): # noqa: E501 local_var_files = {} body_params = None - if 'experiment_spec' in local_var_params: - body_params = local_var_params['experiment_spec'] + if "experiment_spec" in local_var_params: + body_params = local_var_params["experiment_spec"] # HTTP header `Accept` - header_params['Accept'] = self.api_client.select_header_accept( - ['application/json; charset=utf-8']) # noqa: E501 + header_params["Accept"] = self.api_client.select_header_accept( + ["application/json; charset=utf-8"] + ) # noqa: E501 # HTTP header `Content-Type` - header_params['Content-Type'] = self.api_client.select_header_content_type( # noqa: E501 - ['application/yaml', 'application/json']) # noqa: E501 + header_params["Content-Type"] = self.api_client.select_header_content_type( # noqa: E501 + ["application/yaml", "application/json"] + ) # noqa: E501 # Authentication setting auth_settings = [] # noqa: E501 return self.api_client.call_api( - '/v1/experiment', 'POST', + "/v1/experiment", + "POST", path_params, query_params, header_params, body=body_params, post_params=form_params, files=local_var_files, - response_type='JsonResponse', # noqa: E501 + response_type="JsonResponse", # noqa: E501 auth_settings=auth_settings, - async_req=local_var_params.get('async_req'), - _return_http_data_only=local_var_params.get('_return_http_data_only'), # noqa: E501 - _preload_content=local_var_params.get('_preload_content', True), - _request_timeout=local_var_params.get('_request_timeout'), - collection_formats=collection_formats) + async_req=local_var_params.get("async_req"), + _return_http_data_only=local_var_params.get("_return_http_data_only"), # noqa: E501 + _preload_content=local_var_params.get("_preload_content", True), + _request_timeout=local_var_params.get("_request_timeout"), + collection_formats=collection_formats, + ) def delete_experiment(self, id, **kwargs): # noqa: E501 """Delete the experiment # noqa: E501 @@ -185,7 +178,7 @@ def delete_experiment(self, id, **kwargs): # noqa: E501 If the method is called asynchronously, returns the request thread. 
""" - kwargs['_return_http_data_only'] = True + kwargs["_return_http_data_only"] = True return self.delete_experiment_with_http_info(id, **kwargs) # noqa: E501 def delete_experiment_with_http_info(self, id, **kwargs): # noqa: E501 @@ -214,36 +207,31 @@ def delete_experiment_with_http_info(self, id, **kwargs): # noqa: E501 local_var_params = locals() - all_params = [ - 'id' - ] + all_params = ["id"] all_params.extend( - [ - 'async_req', - '_return_http_data_only', - '_preload_content', - '_request_timeout' - ] + ["async_req", "_return_http_data_only", "_preload_content", "_request_timeout"] ) - for key, val in six.iteritems(local_var_params['kwargs']): + for key, val in six.iteritems(local_var_params["kwargs"]): if key not in all_params: raise ApiTypeError( - "Got an unexpected keyword argument '%s'" - " to method delete_experiment" % key + "Got an unexpected keyword argument '%s' to method delete_experiment" % key ) local_var_params[key] = val - del local_var_params['kwargs'] + del local_var_params["kwargs"] # verify the required parameter 'id' is set - if self.api_client.client_side_validation and ('id' not in local_var_params or # noqa: E501 - local_var_params['id'] is None): # noqa: E501 - raise ApiValueError("Missing the required parameter `id` when calling `delete_experiment`") # noqa: E501 + if self.api_client.client_side_validation and ( + "id" not in local_var_params or local_var_params["id"] is None # noqa: E501 + ): # noqa: E501 + raise ApiValueError( + "Missing the required parameter `id` when calling `delete_experiment`" + ) # noqa: E501 collection_formats = {} path_params = {} - if 'id' in local_var_params: - path_params['id'] = local_var_params['id'] # noqa: E501 + if "id" in local_var_params: + path_params["id"] = local_var_params["id"] # noqa: E501 query_params = [] @@ -254,27 +242,30 @@ def delete_experiment_with_http_info(self, id, **kwargs): # noqa: E501 body_params = None # HTTP header `Accept` - header_params['Accept'] = self.api_client.select_header_accept( - ['application/json; charset=utf-8']) # noqa: E501 + header_params["Accept"] = self.api_client.select_header_accept( + ["application/json; charset=utf-8"] + ) # noqa: E501 # Authentication setting auth_settings = [] # noqa: E501 return self.api_client.call_api( - '/v1/experiment/{id}', 'DELETE', + "/v1/experiment/{id}", + "DELETE", path_params, query_params, header_params, body=body_params, post_params=form_params, files=local_var_files, - response_type='JsonResponse', # noqa: E501 + response_type="JsonResponse", # noqa: E501 auth_settings=auth_settings, - async_req=local_var_params.get('async_req'), - _return_http_data_only=local_var_params.get('_return_http_data_only'), # noqa: E501 - _preload_content=local_var_params.get('_preload_content', True), - _request_timeout=local_var_params.get('_request_timeout'), - collection_formats=collection_formats) + async_req=local_var_params.get("async_req"), + _return_http_data_only=local_var_params.get("_return_http_data_only"), # noqa: E501 + _preload_content=local_var_params.get("_preload_content", True), + _request_timeout=local_var_params.get("_request_timeout"), + collection_formats=collection_formats, + ) def get_experiment(self, id, **kwargs): # noqa: E501 """Get the experiment's detailed info by id # noqa: E501 @@ -297,7 +288,7 @@ def get_experiment(self, id, **kwargs): # noqa: E501 If the method is called asynchronously, returns the request thread. 
""" - kwargs['_return_http_data_only'] = True + kwargs["_return_http_data_only"] = True return self.get_experiment_with_http_info(id, **kwargs) # noqa: E501 def get_experiment_with_http_info(self, id, **kwargs): # noqa: E501 @@ -326,36 +317,31 @@ def get_experiment_with_http_info(self, id, **kwargs): # noqa: E501 local_var_params = locals() - all_params = [ - 'id' - ] + all_params = ["id"] all_params.extend( - [ - 'async_req', - '_return_http_data_only', - '_preload_content', - '_request_timeout' - ] + ["async_req", "_return_http_data_only", "_preload_content", "_request_timeout"] ) - for key, val in six.iteritems(local_var_params['kwargs']): + for key, val in six.iteritems(local_var_params["kwargs"]): if key not in all_params: raise ApiTypeError( - "Got an unexpected keyword argument '%s'" - " to method get_experiment" % key + "Got an unexpected keyword argument '%s' to method get_experiment" % key ) local_var_params[key] = val - del local_var_params['kwargs'] + del local_var_params["kwargs"] # verify the required parameter 'id' is set - if self.api_client.client_side_validation and ('id' not in local_var_params or # noqa: E501 - local_var_params['id'] is None): # noqa: E501 - raise ApiValueError("Missing the required parameter `id` when calling `get_experiment`") # noqa: E501 + if self.api_client.client_side_validation and ( + "id" not in local_var_params or local_var_params["id"] is None # noqa: E501 + ): # noqa: E501 + raise ApiValueError( + "Missing the required parameter `id` when calling `get_experiment`" + ) # noqa: E501 collection_formats = {} path_params = {} - if 'id' in local_var_params: - path_params['id'] = local_var_params['id'] # noqa: E501 + if "id" in local_var_params: + path_params["id"] = local_var_params["id"] # noqa: E501 query_params = [] @@ -366,27 +352,30 @@ def get_experiment_with_http_info(self, id, **kwargs): # noqa: E501 body_params = None # HTTP header `Accept` - header_params['Accept'] = self.api_client.select_header_accept( - ['application/json; charset=utf-8']) # noqa: E501 + header_params["Accept"] = self.api_client.select_header_accept( + ["application/json; charset=utf-8"] + ) # noqa: E501 # Authentication setting auth_settings = [] # noqa: E501 return self.api_client.call_api( - '/v1/experiment/{id}', 'GET', + "/v1/experiment/{id}", + "GET", path_params, query_params, header_params, body=body_params, post_params=form_params, files=local_var_files, - response_type='JsonResponse', # noqa: E501 + response_type="JsonResponse", # noqa: E501 auth_settings=auth_settings, - async_req=local_var_params.get('async_req'), - _return_http_data_only=local_var_params.get('_return_http_data_only'), # noqa: E501 - _preload_content=local_var_params.get('_preload_content', True), - _request_timeout=local_var_params.get('_request_timeout'), - collection_formats=collection_formats) + async_req=local_var_params.get("async_req"), + _return_http_data_only=local_var_params.get("_return_http_data_only"), # noqa: E501 + _preload_content=local_var_params.get("_preload_content", True), + _request_timeout=local_var_params.get("_request_timeout"), + collection_formats=collection_formats, + ) def get_log(self, id, **kwargs): # noqa: E501 """Log experiment by id # noqa: E501 @@ -409,7 +398,7 @@ def get_log(self, id, **kwargs): # noqa: E501 If the method is called asynchronously, returns the request thread. 
""" - kwargs['_return_http_data_only'] = True + kwargs["_return_http_data_only"] = True return self.get_log_with_http_info(id, **kwargs) # noqa: E501 def get_log_with_http_info(self, id, **kwargs): # noqa: E501 @@ -438,36 +427,31 @@ def get_log_with_http_info(self, id, **kwargs): # noqa: E501 local_var_params = locals() - all_params = [ - 'id' - ] + all_params = ["id"] all_params.extend( - [ - 'async_req', - '_return_http_data_only', - '_preload_content', - '_request_timeout' - ] + ["async_req", "_return_http_data_only", "_preload_content", "_request_timeout"] ) - for key, val in six.iteritems(local_var_params['kwargs']): + for key, val in six.iteritems(local_var_params["kwargs"]): if key not in all_params: raise ApiTypeError( - "Got an unexpected keyword argument '%s'" - " to method get_log" % key + "Got an unexpected keyword argument '%s' to method get_log" % key ) local_var_params[key] = val - del local_var_params['kwargs'] + del local_var_params["kwargs"] # verify the required parameter 'id' is set - if self.api_client.client_side_validation and ('id' not in local_var_params or # noqa: E501 - local_var_params['id'] is None): # noqa: E501 - raise ApiValueError("Missing the required parameter `id` when calling `get_log`") # noqa: E501 + if self.api_client.client_side_validation and ( + "id" not in local_var_params or local_var_params["id"] is None # noqa: E501 + ): # noqa: E501 + raise ApiValueError( + "Missing the required parameter `id` when calling `get_log`" + ) # noqa: E501 collection_formats = {} path_params = {} - if 'id' in local_var_params: - path_params['id'] = local_var_params['id'] # noqa: E501 + if "id" in local_var_params: + path_params["id"] = local_var_params["id"] # noqa: E501 query_params = [] @@ -478,27 +462,30 @@ def get_log_with_http_info(self, id, **kwargs): # noqa: E501 body_params = None # HTTP header `Accept` - header_params['Accept'] = self.api_client.select_header_accept( - ['application/json; charset=utf-8']) # noqa: E501 + header_params["Accept"] = self.api_client.select_header_accept( + ["application/json; charset=utf-8"] + ) # noqa: E501 # Authentication setting auth_settings = [] # noqa: E501 return self.api_client.call_api( - '/v1/experiment/logs/{id}', 'GET', + "/v1/experiment/logs/{id}", + "GET", path_params, query_params, header_params, body=body_params, post_params=form_params, files=local_var_files, - response_type='JsonResponse', # noqa: E501 + response_type="JsonResponse", # noqa: E501 auth_settings=auth_settings, - async_req=local_var_params.get('async_req'), - _return_http_data_only=local_var_params.get('_return_http_data_only'), # noqa: E501 - _preload_content=local_var_params.get('_preload_content', True), - _request_timeout=local_var_params.get('_request_timeout'), - collection_formats=collection_formats) + async_req=local_var_params.get("async_req"), + _return_http_data_only=local_var_params.get("_return_http_data_only"), # noqa: E501 + _preload_content=local_var_params.get("_preload_content", True), + _request_timeout=local_var_params.get("_request_timeout"), + collection_formats=collection_formats, + ) def list_experiments(self, **kwargs): # noqa: E501 """List experiments # noqa: E501 @@ -521,7 +508,7 @@ def list_experiments(self, **kwargs): # noqa: E501 If the method is called asynchronously, returns the request thread. 
""" - kwargs['_return_http_data_only'] = True + kwargs["_return_http_data_only"] = True return self.list_experiments_with_http_info(**kwargs) # noqa: E501 def list_experiments_with_http_info(self, **kwargs): # noqa: E501 @@ -550,34 +537,26 @@ def list_experiments_with_http_info(self, **kwargs): # noqa: E501 local_var_params = locals() - all_params = [ - 'status' - ] + all_params = ["status"] all_params.extend( - [ - 'async_req', - '_return_http_data_only', - '_preload_content', - '_request_timeout' - ] + ["async_req", "_return_http_data_only", "_preload_content", "_request_timeout"] ) - for key, val in six.iteritems(local_var_params['kwargs']): + for key, val in six.iteritems(local_var_params["kwargs"]): if key not in all_params: raise ApiTypeError( - "Got an unexpected keyword argument '%s'" - " to method list_experiments" % key + "Got an unexpected keyword argument '%s' to method list_experiments" % key ) local_var_params[key] = val - del local_var_params['kwargs'] + del local_var_params["kwargs"] collection_formats = {} path_params = {} query_params = [] - if 'status' in local_var_params and local_var_params['status'] is not None: # noqa: E501 - query_params.append(('status', local_var_params['status'])) # noqa: E501 + if "status" in local_var_params and local_var_params["status"] is not None: # noqa: E501 + query_params.append(("status", local_var_params["status"])) # noqa: E501 header_params = {} @@ -586,27 +565,30 @@ def list_experiments_with_http_info(self, **kwargs): # noqa: E501 body_params = None # HTTP header `Accept` - header_params['Accept'] = self.api_client.select_header_accept( - ['application/json; charset=utf-8']) # noqa: E501 + header_params["Accept"] = self.api_client.select_header_accept( + ["application/json; charset=utf-8"] + ) # noqa: E501 # Authentication setting auth_settings = [] # noqa: E501 return self.api_client.call_api( - '/v1/experiment', 'GET', + "/v1/experiment", + "GET", path_params, query_params, header_params, body=body_params, post_params=form_params, files=local_var_files, - response_type='JsonResponse', # noqa: E501 + response_type="JsonResponse", # noqa: E501 auth_settings=auth_settings, - async_req=local_var_params.get('async_req'), - _return_http_data_only=local_var_params.get('_return_http_data_only'), # noqa: E501 - _preload_content=local_var_params.get('_preload_content', True), - _request_timeout=local_var_params.get('_request_timeout'), - collection_formats=collection_formats) + async_req=local_var_params.get("async_req"), + _return_http_data_only=local_var_params.get("_return_http_data_only"), # noqa: E501 + _preload_content=local_var_params.get("_preload_content", True), + _request_timeout=local_var_params.get("_request_timeout"), + collection_formats=collection_formats, + ) def list_log(self, **kwargs): # noqa: E501 """List experiment's log # noqa: E501 @@ -629,7 +611,7 @@ def list_log(self, **kwargs): # noqa: E501 If the method is called asynchronously, returns the request thread. 
""" - kwargs['_return_http_data_only'] = True + kwargs["_return_http_data_only"] = True return self.list_log_with_http_info(**kwargs) # noqa: E501 def list_log_with_http_info(self, **kwargs): # noqa: E501 @@ -658,34 +640,26 @@ def list_log_with_http_info(self, **kwargs): # noqa: E501 local_var_params = locals() - all_params = [ - 'status' - ] + all_params = ["status"] all_params.extend( - [ - 'async_req', - '_return_http_data_only', - '_preload_content', - '_request_timeout' - ] + ["async_req", "_return_http_data_only", "_preload_content", "_request_timeout"] ) - for key, val in six.iteritems(local_var_params['kwargs']): + for key, val in six.iteritems(local_var_params["kwargs"]): if key not in all_params: raise ApiTypeError( - "Got an unexpected keyword argument '%s'" - " to method list_log" % key + "Got an unexpected keyword argument '%s' to method list_log" % key ) local_var_params[key] = val - del local_var_params['kwargs'] + del local_var_params["kwargs"] collection_formats = {} path_params = {} query_params = [] - if 'status' in local_var_params and local_var_params['status'] is not None: # noqa: E501 - query_params.append(('status', local_var_params['status'])) # noqa: E501 + if "status" in local_var_params and local_var_params["status"] is not None: # noqa: E501 + query_params.append(("status", local_var_params["status"])) # noqa: E501 header_params = {} @@ -694,27 +668,30 @@ def list_log_with_http_info(self, **kwargs): # noqa: E501 body_params = None # HTTP header `Accept` - header_params['Accept'] = self.api_client.select_header_accept( - ['application/json; charset=utf-8']) # noqa: E501 + header_params["Accept"] = self.api_client.select_header_accept( + ["application/json; charset=utf-8"] + ) # noqa: E501 # Authentication setting auth_settings = [] # noqa: E501 return self.api_client.call_api( - '/v1/experiment/logs', 'GET', + "/v1/experiment/logs", + "GET", path_params, query_params, header_params, body=body_params, post_params=form_params, files=local_var_files, - response_type='JsonResponse', # noqa: E501 + response_type="JsonResponse", # noqa: E501 auth_settings=auth_settings, - async_req=local_var_params.get('async_req'), - _return_http_data_only=local_var_params.get('_return_http_data_only'), # noqa: E501 - _preload_content=local_var_params.get('_preload_content', True), - _request_timeout=local_var_params.get('_request_timeout'), - collection_formats=collection_formats) + async_req=local_var_params.get("async_req"), + _return_http_data_only=local_var_params.get("_return_http_data_only"), # noqa: E501 + _preload_content=local_var_params.get("_preload_content", True), + _request_timeout=local_var_params.get("_request_timeout"), + collection_formats=collection_formats, + ) def patch_experiment(self, id, **kwargs): # noqa: E501 """Update the experiment in the submarine server with spec # noqa: E501 @@ -738,7 +715,7 @@ def patch_experiment(self, id, **kwargs): # noqa: E501 If the method is called asynchronously, returns the request thread. 
""" - kwargs['_return_http_data_only'] = True + kwargs["_return_http_data_only"] = True return self.patch_experiment_with_http_info(id, **kwargs) # noqa: E501 def patch_experiment_with_http_info(self, id, **kwargs): # noqa: E501 @@ -768,37 +745,31 @@ def patch_experiment_with_http_info(self, id, **kwargs): # noqa: E501 local_var_params = locals() - all_params = [ - 'id', - 'experiment_spec' - ] + all_params = ["id", "experiment_spec"] all_params.extend( - [ - 'async_req', - '_return_http_data_only', - '_preload_content', - '_request_timeout' - ] + ["async_req", "_return_http_data_only", "_preload_content", "_request_timeout"] ) - for key, val in six.iteritems(local_var_params['kwargs']): + for key, val in six.iteritems(local_var_params["kwargs"]): if key not in all_params: raise ApiTypeError( - "Got an unexpected keyword argument '%s'" - " to method patch_experiment" % key + "Got an unexpected keyword argument '%s' to method patch_experiment" % key ) local_var_params[key] = val - del local_var_params['kwargs'] + del local_var_params["kwargs"] # verify the required parameter 'id' is set - if self.api_client.client_side_validation and ('id' not in local_var_params or # noqa: E501 - local_var_params['id'] is None): # noqa: E501 - raise ApiValueError("Missing the required parameter `id` when calling `patch_experiment`") # noqa: E501 + if self.api_client.client_side_validation and ( + "id" not in local_var_params or local_var_params["id"] is None # noqa: E501 + ): # noqa: E501 + raise ApiValueError( + "Missing the required parameter `id` when calling `patch_experiment`" + ) # noqa: E501 collection_formats = {} path_params = {} - if 'id' in local_var_params: - path_params['id'] = local_var_params['id'] # noqa: E501 + if "id" in local_var_params: + path_params["id"] = local_var_params["id"] # noqa: E501 query_params = [] @@ -808,34 +779,38 @@ def patch_experiment_with_http_info(self, id, **kwargs): # noqa: E501 local_var_files = {} body_params = None - if 'experiment_spec' in local_var_params: - body_params = local_var_params['experiment_spec'] + if "experiment_spec" in local_var_params: + body_params = local_var_params["experiment_spec"] # HTTP header `Accept` - header_params['Accept'] = self.api_client.select_header_accept( - ['application/json; charset=utf-8']) # noqa: E501 + header_params["Accept"] = self.api_client.select_header_accept( + ["application/json; charset=utf-8"] + ) # noqa: E501 # HTTP header `Content-Type` - header_params['Content-Type'] = self.api_client.select_header_content_type( # noqa: E501 - ['application/yaml', 'application/json']) # noqa: E501 + header_params["Content-Type"] = self.api_client.select_header_content_type( # noqa: E501 + ["application/yaml", "application/json"] + ) # noqa: E501 # Authentication setting auth_settings = [] # noqa: E501 return self.api_client.call_api( - '/v1/experiment/{id}', 'PATCH', + "/v1/experiment/{id}", + "PATCH", path_params, query_params, header_params, body=body_params, post_params=form_params, files=local_var_files, - response_type='JsonResponse', # noqa: E501 + response_type="JsonResponse", # noqa: E501 auth_settings=auth_settings, - async_req=local_var_params.get('async_req'), - _return_http_data_only=local_var_params.get('_return_http_data_only'), # noqa: E501 - _preload_content=local_var_params.get('_preload_content', True), - _request_timeout=local_var_params.get('_request_timeout'), - collection_formats=collection_formats) + async_req=local_var_params.get("async_req"), + 
_return_http_data_only=local_var_params.get("_return_http_data_only"), # noqa: E501 + _preload_content=local_var_params.get("_preload_content", True), + _request_timeout=local_var_params.get("_request_timeout"), + collection_formats=collection_formats, + ) def ping(self, **kwargs): # noqa: E501 """Ping submarine server # noqa: E501 @@ -858,7 +833,7 @@ def ping(self, **kwargs): # noqa: E501 If the method is called asynchronously, returns the request thread. """ - kwargs['_return_http_data_only'] = True + kwargs["_return_http_data_only"] = True return self.ping_with_http_info(**kwargs) # noqa: E501 def ping_with_http_info(self, **kwargs): # noqa: E501 @@ -887,25 +862,16 @@ def ping_with_http_info(self, **kwargs): # noqa: E501 local_var_params = locals() - all_params = [ - ] + all_params = [] all_params.extend( - [ - 'async_req', - '_return_http_data_only', - '_preload_content', - '_request_timeout' - ] + ["async_req", "_return_http_data_only", "_preload_content", "_request_timeout"] ) - for key, val in six.iteritems(local_var_params['kwargs']): + for key, val in six.iteritems(local_var_params["kwargs"]): if key not in all_params: - raise ApiTypeError( - "Got an unexpected keyword argument '%s'" - " to method ping" % key - ) + raise ApiTypeError("Got an unexpected keyword argument '%s' to method ping" % key) local_var_params[key] = val - del local_var_params['kwargs'] + del local_var_params["kwargs"] collection_formats = {} @@ -920,24 +886,27 @@ def ping_with_http_info(self, **kwargs): # noqa: E501 body_params = None # HTTP header `Accept` - header_params['Accept'] = self.api_client.select_header_accept( - ['application/json; charset=utf-8']) # noqa: E501 + header_params["Accept"] = self.api_client.select_header_accept( + ["application/json; charset=utf-8"] + ) # noqa: E501 # Authentication setting auth_settings = [] # noqa: E501 return self.api_client.call_api( - '/v1/experiment/ping', 'GET', + "/v1/experiment/ping", + "GET", path_params, query_params, header_params, body=body_params, post_params=form_params, files=local_var_files, - response_type='str', # noqa: E501 + response_type="str", # noqa: E501 auth_settings=auth_settings, - async_req=local_var_params.get('async_req'), - _return_http_data_only=local_var_params.get('_return_http_data_only'), # noqa: E501 - _preload_content=local_var_params.get('_preload_content', True), - _request_timeout=local_var_params.get('_request_timeout'), - collection_formats=collection_formats) + async_req=local_var_params.get("async_req"), + _return_http_data_only=local_var_params.get("_return_http_data_only"), # noqa: E501 + _preload_content=local_var_params.get("_preload_content", True), + _request_timeout=local_var_params.get("_request_timeout"), + collection_formats=collection_formats, + ) diff --git a/submarine-sdk/pysubmarine/submarine/experiment/api/experiment_client.py b/submarine-sdk/pysubmarine/submarine/experiment/api/experiment_client.py index 50b909dd7b..d65987608d 100644 --- a/submarine-sdk/pysubmarine/submarine/experiment/api/experiment_client.py +++ b/submarine-sdk/pysubmarine/submarine/experiment/api/experiment_client.py @@ -17,12 +17,12 @@ import os import time -from submarine.experiment.configuration import Configuration -from submarine.experiment.api_client import ApiClient from submarine.experiment.api.experiment_api import ExperimentApi +from submarine.experiment.api_client import ApiClient +from submarine.experiment.configuration import Configuration logger = logging.getLogger(__name__) -logging.basicConfig(format='%(message)s') 
+logging.basicConfig(format="%(message)s") logging.getLogger().setLevel(logging.INFO) @@ -33,7 +33,7 @@ def generate_host(): """ submarine_server_dns_name = str(os.environ.get("SUBMARINE_SERVER_DNS_NAME")) submarine_server_port = str(os.environ.get("SUBMARINE_SERVER_PORT")) - host = submarine_server_dns_name + ':' + submarine_server_port + host = submarine_server_dns_name + ":" + submarine_server_port return host @@ -46,7 +46,7 @@ def __init__(self, host=generate_host()): # TODO(pingsutw): support authentication for talking to the submarine server self.host = host configuration = Configuration() - configuration.host = host + '/api' + configuration.host = host + "/api" api_client = ApiClient(configuration=configuration) self.experiment_api = ExperimentApi(api_client=api_client) @@ -68,8 +68,8 @@ def wait_for_finish(self, id, polling_interval=10): """ index = 0 while True: - status = self.get_experiment(id)['status'] - if status == 'Succeeded' or status == 'Deleted': + status = self.get_experiment(id)["status"] + if status == "Succeeded" or status == "Deleted": self._log_pod(id, index) break index = self._log_pod(id, index) @@ -77,11 +77,11 @@ def wait_for_finish(self, id, polling_interval=10): def _log_pod(self, id, index): response = self.experiment_api.get_log(id) - log_contents = response.result['logContent'] + log_contents = response.result["logContent"] if len(log_contents) == 0: return index log_content = log_contents[0] - for i, log in enumerate(log_content['podLog']): + for i, log in enumerate(log_content["podLog"]): if i < index: continue index += 1 @@ -135,14 +135,14 @@ def get_log(self, id, onlyMaster=False): :return: str: pods logs """ response = self.experiment_api.get_log(id) - log_contents = response.result['logContent'] + log_contents = response.result["logContent"] if onlyMaster is True and len(log_contents) != 0: log_contents = [log_contents[0]] for log_content in log_contents: - logging.info("The logs of Pod %s:\n", log_content['podName']) - for log in log_content['podLog']: + logging.info("The logs of Pod %s:\n", log_content["podName"]) + for log in log_content["podLog"]: logging.info("%s", log) def list_log(self, status): diff --git a/submarine-sdk/pysubmarine/submarine/experiment/api_client.py b/submarine-sdk/pysubmarine/submarine/experiment/api_client.py index e2f3d0a94d..54c18d3cad 100644 --- a/submarine-sdk/pysubmarine/submarine/experiment/api_client.py +++ b/submarine-sdk/pysubmarine/submarine/experiment/api_client.py @@ -70,23 +70,20 @@ class ApiClient(object): PRIMITIVE_TYPES = (float, bool, bytes, six.text_type) + six.integer_types NATIVE_TYPES_MAPPING = { - 'int': int, - 'long': int if six.PY3 else long, # noqa: F821 - 'float': float, - 'str': str, - 'bool': bool, - 'date': datetime.date, - 'datetime': datetime.datetime, - 'object': object, + "int": int, + "long": int if six.PY3 else long, # noqa: F821 + "float": float, + "str": str, + "bool": bool, + "date": datetime.date, + "datetime": datetime.datetime, + "object": object, } _pool = None - def __init__(self, - configuration=None, - header_name=None, - header_value=None, - cookie=None, - pool_threads=1): + def __init__( + self, configuration=None, header_name=None, header_value=None, cookie=None, pool_threads=1 + ): if configuration is None: configuration = Configuration.get_default_copy() self.configuration = configuration @@ -98,7 +95,7 @@ def __init__(self, self.default_headers[header_name] = header_value self.cookie = cookie # Set default User-Agent. 
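The experiment_client.py hunk above only reflows the code; the client behaviour is unchanged. A rough usage sketch follows. It assumes `ExperimentClient` is importable from the top-level `submarine` package (it appears in the `__all__` list earlier in this patch), that SUBMARINE_SERVER_DNS_NAME and SUBMARINE_SERVER_PORT are set so `generate_host()` can build the host string, and that the experiment id is purely illustrative:

```python
# Rough usage sketch of the client touched above. Assumptions: ExperimentClient
# is exported from the submarine package, SUBMARINE_SERVER_DNS_NAME and
# SUBMARINE_SERVER_PORT are set, and the experiment id below is illustrative.
from submarine import ExperimentClient

client = ExperimentClient()  # host defaults to generate_host()
print(client.list_experiments(status="running"))

# Stream pod logs until the experiment reaches Succeeded or Deleted,
# then fetch the master pod's log, mirroring wait_for_finish()/get_log() above.
client.wait_for_finish(id="experiment-0001", polling_interval=10)
client.get_log(id="experiment-0001", onlyMaster=True)
```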
- self.user_agent = 'OpenAPI-Generator/0.6.0-SNAPSHOT/python' + self.user_agent = "OpenAPI-Generator/0.6.0-SNAPSHOT/python" self.client_side_validation = configuration.client_side_validation def __enter__(self): @@ -112,13 +109,13 @@ def close(self): self._pool.close() self._pool.join() self._pool = None - if hasattr(atexit, 'unregister'): + if hasattr(atexit, "unregister"): atexit.unregister(self.close) @property def pool(self): """Create thread pool on first request - avoids instantiating unused threadpool for blocking clients. + avoids instantiating unused threadpool for blocking clients. """ if self._pool is None: atexit.register(self.close) @@ -128,31 +125,33 @@ def pool(self): @property def user_agent(self): """User agent for this API client""" - return self.default_headers['User-Agent'] + return self.default_headers["User-Agent"] @user_agent.setter def user_agent(self, value): - self.default_headers['User-Agent'] = value + self.default_headers["User-Agent"] = value def set_default_header(self, header_name, header_value): self.default_headers[header_name] = header_value - def __call_api(self, - resource_path, - method, - path_params=None, - query_params=None, - header_params=None, - body=None, - post_params=None, - files=None, - response_type=None, - auth_settings=None, - _return_http_data_only=None, - collection_formats=None, - _preload_content=True, - _request_timeout=None, - _host=None): + def __call_api( + self, + resource_path, + method, + path_params=None, + query_params=None, + header_params=None, + body=None, + post_params=None, + files=None, + response_type=None, + auth_settings=None, + _return_http_data_only=None, + collection_formats=None, + _preload_content=True, + _request_timeout=None, + _host=None, + ): config = self.configuration @@ -160,35 +159,31 @@ def __call_api(self, header_params = header_params or {} header_params.update(self.default_headers) if self.cookie: - header_params['Cookie'] = self.cookie + header_params["Cookie"] = self.cookie if header_params: header_params = self.sanitize_for_serialization(header_params) - header_params = dict( - self.parameters_to_tuples(header_params, collection_formats)) + header_params = dict(self.parameters_to_tuples(header_params, collection_formats)) # path parameters if path_params: path_params = self.sanitize_for_serialization(path_params) - path_params = self.parameters_to_tuples(path_params, - collection_formats) + path_params = self.parameters_to_tuples(path_params, collection_formats) for k, v in path_params: # specified safe chars, encode everything resource_path = resource_path.replace( - '{%s}' % k, - quote(str(v), safe=config.safe_chars_for_path_param)) + "{%s}" % k, quote(str(v), safe=config.safe_chars_for_path_param) + ) # query parameters if query_params: query_params = self.sanitize_for_serialization(query_params) - query_params = self.parameters_to_tuples(query_params, - collection_formats) + query_params = self.parameters_to_tuples(query_params, collection_formats) # post parameters if post_params or files: post_params = post_params if post_params else [] post_params = self.sanitize_for_serialization(post_params) - post_params = self.parameters_to_tuples(post_params, - collection_formats) + post_params = self.parameters_to_tuples(post_params, collection_formats) post_params.extend(self.files_parameters(files)) # auth setting @@ -207,19 +202,21 @@ def __call_api(self, try: # perform request and return response - response_data = self.request(method, - url, - query_params=query_params, - headers=header_params, - 
post_params=post_params, - body=body, - _preload_content=_preload_content, - _request_timeout=_request_timeout) + response_data = self.request( + method, + url, + query_params=query_params, + headers=header_params, + post_params=post_params, + body=body, + _preload_content=_preload_content, + _request_timeout=_request_timeout, + ) except ApiException as e: - e.body = e.body.decode('utf-8') if six.PY3 else e.body + e.body = e.body.decode("utf-8") if six.PY3 else e.body raise e - content_type = response_data.getheader('content-type') + content_type = response_data.getheader("content-type") self.last_response = response_data @@ -231,8 +228,7 @@ def __call_api(self, if six.PY3 and response_type not in ["file", "bytes"]: match = None if content_type is not None: - match = re.search(r"charset=([a-zA-Z\-\d]+)[\s\;]?", - content_type) + match = re.search(r"charset=([a-zA-Z\-\d]+)[\s\;]?", content_type) encoding = match.group(1) if match else "utf-8" response_data.data = response_data.data.decode(encoding) @@ -243,10 +239,9 @@ def __call_api(self, return_data = None if _return_http_data_only: - return (return_data) + return return_data else: - return (return_data, response_data.status, - response_data.getheaders()) + return (return_data, response_data.status, response_data.getheaders()) def sanitize_for_serialization(self, obj): """Builds a JSON POST object. @@ -269,8 +264,7 @@ def sanitize_for_serialization(self, obj): elif isinstance(obj, list): return [self.sanitize_for_serialization(sub_obj) for sub_obj in obj] elif isinstance(obj, tuple): - return tuple( - self.sanitize_for_serialization(sub_obj) for sub_obj in obj) + return tuple(self.sanitize_for_serialization(sub_obj) for sub_obj in obj) elif isinstance(obj, (datetime.datetime, datetime.date)): return obj.isoformat() @@ -288,10 +282,7 @@ def sanitize_for_serialization(self, obj): if getattr(obj, attr) is not None } - return { - key: self.sanitize_for_serialization(val) - for key, val in six.iteritems(obj_dict) - } + return {key: self.sanitize_for_serialization(val) for key, val in six.iteritems(obj_dict)} def deserialize(self, response, response_type): """Deserializes response into an object. 
@@ -327,18 +318,13 @@ def __deserialize(self, data, klass): return None if type(klass) == str: - if klass.startswith('list['): - sub_kls = re.match(r'list\[(.*)\]', klass).group(1) - return [ - self.__deserialize(sub_data, sub_kls) for sub_data in data - ] - - if klass.startswith('dict('): - sub_kls = re.match(r'dict\(([^,]*), (.*)\)', klass).group(2) - return { - k: self.__deserialize(v, sub_kls) - for k, v in six.iteritems(data) - } + if klass.startswith("list["): + sub_kls = re.match(r"list\[(.*)\]", klass).group(1) + return [self.__deserialize(sub_data, sub_kls) for sub_data in data] + + if klass.startswith("dict("): + sub_kls = re.match(r"dict\(([^,]*), (.*)\)", klass).group(2) + return {k: self.__deserialize(v, sub_kls) for k, v in six.iteritems(data)} # convert str to class if klass in self.NATIVE_TYPES_MAPPING: @@ -357,23 +343,25 @@ def __deserialize(self, data, klass): else: return self.__deserialize_model(data, klass) - def call_api(self, - resource_path, - method, - path_params=None, - query_params=None, - header_params=None, - body=None, - post_params=None, - files=None, - response_type=None, - auth_settings=None, - async_req=None, - _return_http_data_only=None, - collection_formats=None, - _preload_content=True, - _request_timeout=None, - _host=None): + def call_api( + self, + resource_path, + method, + path_params=None, + query_params=None, + header_params=None, + body=None, + post_params=None, + files=None, + response_type=None, + auth_settings=None, + async_req=None, + _return_http_data_only=None, + collection_formats=None, + _preload_content=True, + _request_timeout=None, + _host=None, + ): """Makes the HTTP request (synchronous) and returns deserialized data. To make an async_req request, set the async_req parameter. @@ -411,82 +399,124 @@ def call_api(self, then the method will return the response directly. 
""" if not async_req: - return self.__call_api(resource_path, method, path_params, - query_params, header_params, body, - post_params, files, response_type, - auth_settings, _return_http_data_only, - collection_formats, _preload_content, - _request_timeout, _host) + return self.__call_api( + resource_path, + method, + path_params, + query_params, + header_params, + body, + post_params, + files, + response_type, + auth_settings, + _return_http_data_only, + collection_formats, + _preload_content, + _request_timeout, + _host, + ) return self.pool.apply_async( self.__call_api, - (resource_path, method, path_params, query_params, header_params, - body, post_params, files, response_type, auth_settings, - _return_http_data_only, collection_formats, _preload_content, - _request_timeout, _host)) - - def request(self, + ( + resource_path, method, - url, - query_params=None, - headers=None, - post_params=None, - body=None, - _preload_content=True, - _request_timeout=None): + path_params, + query_params, + header_params, + body, + post_params, + files, + response_type, + auth_settings, + _return_http_data_only, + collection_formats, + _preload_content, + _request_timeout, + _host, + ), + ) + + def request( + self, + method, + url, + query_params=None, + headers=None, + post_params=None, + body=None, + _preload_content=True, + _request_timeout=None, + ): """Makes the HTTP request using RESTClient.""" if method == "GET": - return self.rest_client.GET(url, - query_params=query_params, - _preload_content=_preload_content, - _request_timeout=_request_timeout, - headers=headers) + return self.rest_client.GET( + url, + query_params=query_params, + _preload_content=_preload_content, + _request_timeout=_request_timeout, + headers=headers, + ) elif method == "HEAD": - return self.rest_client.HEAD(url, - query_params=query_params, - _preload_content=_preload_content, - _request_timeout=_request_timeout, - headers=headers) + return self.rest_client.HEAD( + url, + query_params=query_params, + _preload_content=_preload_content, + _request_timeout=_request_timeout, + headers=headers, + ) elif method == "OPTIONS": - return self.rest_client.OPTIONS(url, - query_params=query_params, - headers=headers, - _preload_content=_preload_content, - _request_timeout=_request_timeout) + return self.rest_client.OPTIONS( + url, + query_params=query_params, + headers=headers, + _preload_content=_preload_content, + _request_timeout=_request_timeout, + ) elif method == "POST": - return self.rest_client.POST(url, - query_params=query_params, - headers=headers, - post_params=post_params, - _preload_content=_preload_content, - _request_timeout=_request_timeout, - body=body) + return self.rest_client.POST( + url, + query_params=query_params, + headers=headers, + post_params=post_params, + _preload_content=_preload_content, + _request_timeout=_request_timeout, + body=body, + ) elif method == "PUT": - return self.rest_client.PUT(url, - query_params=query_params, - headers=headers, - post_params=post_params, - _preload_content=_preload_content, - _request_timeout=_request_timeout, - body=body) + return self.rest_client.PUT( + url, + query_params=query_params, + headers=headers, + post_params=post_params, + _preload_content=_preload_content, + _request_timeout=_request_timeout, + body=body, + ) elif method == "PATCH": - return self.rest_client.PATCH(url, - query_params=query_params, - headers=headers, - post_params=post_params, - _preload_content=_preload_content, - _request_timeout=_request_timeout, - body=body) + return 
self.rest_client.PATCH( + url, + query_params=query_params, + headers=headers, + post_params=post_params, + _preload_content=_preload_content, + _request_timeout=_request_timeout, + body=body, + ) elif method == "DELETE": - return self.rest_client.DELETE(url, - query_params=query_params, - headers=headers, - _preload_content=_preload_content, - _request_timeout=_request_timeout, - body=body) + return self.rest_client.DELETE( + url, + query_params=query_params, + headers=headers, + _preload_content=_preload_content, + _request_timeout=_request_timeout, + body=body, + ) else: - raise ApiValueError("http method must be `GET`, `HEAD`, `OPTIONS`," - " `POST`, `PATCH`, `PUT` or `DELETE`.") + raise ApiValueError( + "http method must be `GET`, `HEAD`, `OPTIONS`, `POST`, `PATCH`, `PUT` or `DELETE`." + ) def parameters_to_tuples(self, params, collection_formats): """Get parameters as list of tuples, formatting collections. @@ -498,23 +528,21 @@ def parameters_to_tuples(self, params, collection_formats): new_params = [] if collection_formats is None: collection_formats = {} - for k, v in six.iteritems(params) if isinstance( - params, dict) else params: # noqa: E501 + for k, v in six.iteritems(params) if isinstance(params, dict) else params: # noqa: E501 if k in collection_formats: collection_format = collection_formats[k] - if collection_format == 'multi': + if collection_format == "multi": new_params.extend((k, value) for value in v) else: - if collection_format == 'ssv': - delimiter = ' ' - elif collection_format == 'tsv': - delimiter = '\t' - elif collection_format == 'pipes': - delimiter = '|' + if collection_format == "ssv": + delimiter = " " + elif collection_format == "tsv": + delimiter = "\t" + elif collection_format == "pipes": + delimiter = "|" else: # csv is the default - delimiter = ',' - new_params.append( - (k, delimiter.join(str(value) for value in v))) + delimiter = "," + new_params.append((k, delimiter.join(str(value) for value in v))) else: new_params.append((k, v)) return new_params @@ -533,13 +561,11 @@ def files_parameters(self, files=None): continue file_names = v if type(v) is list else [v] for n in file_names: - with open(n, 'rb') as f: + with open(n, "rb") as f: filename = os.path.basename(f.name) filedata = f.read() - mimetype = (mimetypes.guess_type(filename)[0] or - 'application/octet-stream') - params.append( - tuple([k, tuple([filename, filedata, mimetype])])) + mimetype = mimetypes.guess_type(filename)[0] or "application/octet-stream" + params.append(tuple([k, tuple([filename, filedata, mimetype])])) return params @@ -554,10 +580,10 @@ def select_header_accept(self, accepts): accepts = [x.lower() for x in accepts] - if 'application/json' in accepts: - return 'application/json' + if "application/json" in accepts: + return "application/json" else: - return ', '.join(accepts) + return ", ".join(accepts) def select_header_content_type(self, content_types): """Returns `Content-Type` based on an array of content_types provided. @@ -566,12 +592,12 @@ def select_header_content_type(self, content_types): :return: Content-Type (e.g. application/json). 
""" if not content_types: - return 'application/json' + return "application/json" content_types = [x.lower() for x in content_types] - if 'application/json' in content_types or '*/*' in content_types: - return 'application/json' + if "application/json" in content_types or "*/*" in content_types: + return "application/json" else: return content_types[0] @@ -588,15 +614,14 @@ def update_params_for_auth(self, headers, querys, auth_settings): for auth in auth_settings: auth_setting = self.configuration.auth_settings().get(auth) if auth_setting: - if auth_setting['in'] == 'cookie': - headers['Cookie'] = auth_setting['value'] - elif auth_setting['in'] == 'header': - headers[auth_setting['key']] = auth_setting['value'] - elif auth_setting['in'] == 'query': - querys.append((auth_setting['key'], auth_setting['value'])) + if auth_setting["in"] == "cookie": + headers["Cookie"] = auth_setting["value"] + elif auth_setting["in"] == "header": + headers[auth_setting["key"]] = auth_setting["value"] + elif auth_setting["in"] == "query": + querys.append((auth_setting["key"], auth_setting["value"])) else: - raise ApiValueError( - 'Authentication token must be in `query` or `header`') + raise ApiValueError("Authentication token must be in `query` or `header`") def __deserialize_file(self, response): """Deserializes body to file @@ -613,8 +638,7 @@ def __deserialize_file(self, response): content_disposition = response.getheader("Content-Disposition") if content_disposition: - filename = re.search(r'filename=[\'"]?([^\'"\s]+)[\'"]?', - content_disposition).group(1) + filename = re.search(r'filename=[\'"]?([^\'"\s]+)[\'"]?', content_disposition).group(1) path = os.path.join(os.path.dirname(path), filename) with open(path, "wb") as f: @@ -656,8 +680,8 @@ def __deserialize_date(self, string): return string except ValueError: raise rest.ApiException( - status=0, - reason="Failed to parse `{0}` as date object".format(string)) + status=0, reason="Failed to parse `{0}` as date object".format(string) + ) def __deserialize_datetime(self, string): """Deserializes string to datetime. @@ -673,9 +697,8 @@ def __deserialize_datetime(self, string): return string except ValueError: raise rest.ApiException( - status=0, - reason=( - "Failed to parse `{0}` as datetime object".format(string))) + status=0, reason="Failed to parse `{0}` as datetime object".format(string) + ) def __deserialize_model(self, data, klass): """Deserializes list or dict to model. @@ -685,16 +708,14 @@ def __deserialize_model(self, data, klass): :return: model object. 
""" has_discriminator = False - if (hasattr(klass, 'get_real_child_model') and - klass.discriminator_value_class_map): + if hasattr(klass, "get_real_child_model") and klass.discriminator_value_class_map: has_discriminator = True if not klass.openapi_types and has_discriminator is False: return data kwargs = {} - if (data is not None and klass.openapi_types is not None and - isinstance(data, (list, dict))): + if data is not None and klass.openapi_types is not None and isinstance(data, (list, dict)): for attr, attr_type in six.iteritems(klass.openapi_types): if klass.attribute_map[attr] in data: value = data[klass.attribute_map[attr]] diff --git a/submarine-sdk/pysubmarine/submarine/experiment/configuration.py b/submarine-sdk/pysubmarine/submarine/experiment/configuration.py index 7243663534..098749475d 100644 --- a/submarine-sdk/pysubmarine/submarine/experiment/configuration.py +++ b/submarine-sdk/pysubmarine/submarine/experiment/configuration.py @@ -77,8 +77,7 @@ def __init__( password=None, discard_unknown_keys=False, ): - """Constructor - """ + """Constructor""" self.host = host """Default Base url """ @@ -109,10 +108,9 @@ def __init__( self.logger = {} """Logging Settings """ - self.logger["package_logger"] = logging.getLogger( - "submarine.experiment") + self.logger["package_logger"] = logging.getLogger("submarine.experiment") self.logger["urllib3_logger"] = logging.getLogger("urllib3") - self.logger_format = '%(asctime)s %(levelname)s %(message)s' + self.logger_format = "%(asctime)s %(levelname)s %(message)s" """Log format """ self.logger_stream_handler = None @@ -160,7 +158,7 @@ def __init__( self.proxy_headers = None """Proxy headers """ - self.safe_chars_for_path_param = '' + self.safe_chars_for_path_param = "" """Safe chars for path_param """ self.retries = None @@ -174,7 +172,7 @@ def __deepcopy__(self, memo): result = cls.__new__(cls) memo[id(self)] = result for k, v in self.__dict__.items(): - if k not in ('logger', 'logger_file_handler'): + if k not in ("logger", "logger_file_handler"): setattr(result, k, copy.deepcopy(v, memo)) # shallow copy of loggers result.logger = copy.copy(self.logger) @@ -323,8 +321,7 @@ def get_basic_auth_token(self): password = "" if self.password is not None: password = self.password - return urllib3.util.make_headers(basic_auth=username + ':' + - password).get('authorization') + return urllib3.util.make_headers(basic_auth=username + ":" + password).get("authorization") def auth_settings(self): """Gets Auth Settings dict for api client. @@ -339,22 +336,25 @@ def to_debug_report(self): :return: The report for debugging. 
""" - return "Python SDK Debug Report:\n"\ - "OS: {env}\n"\ - "Python Version: {pyversion}\n"\ - "Version of the API: 0.6.0-SNAPSHOT\n"\ - "SDK Package Version: 0.6.0-SNAPSHOT".\ - format(env=sys.platform, pyversion=sys.version) + return ( + "Python SDK Debug Report:\n" + "OS: {env}\n" + "Python Version: {pyversion}\n" + "Version of the API: 0.6.0-SNAPSHOT\n" + "SDK Package Version: 0.6.0-SNAPSHOT".format(env=sys.platform, pyversion=sys.version) + ) def get_host_settings(self): """Gets an array of host settings :return: An array of host settings """ - return [{ - 'url': "/api", - 'description': "No description provided", - }] + return [ + { + "url": "/api", + "description": "No description provided", + } + ] def get_host_from_settings(self, index, variables=None): """Gets host URL based on the index and variables @@ -369,22 +369,23 @@ def get_host_from_settings(self, index, variables=None): server = servers[index] except IndexError: raise ValueError( - "Invalid index {0} when selecting the host settings. " - "Must be less than {1}".format(index, len(servers))) + "Invalid index {0} when selecting the host settings. Must be less than {1}".format( + index, len(servers) + ) + ) - url = server['url'] + url = server["url"] # go through variables and replace placeholders - for variable_name, variable in server['variables'].items(): - used_value = variables.get(variable_name, variable['default_value']) + for variable_name, variable in server["variables"].items(): + used_value = variables.get(variable_name, variable["default_value"]) - if 'enum_values' in variable \ - and used_value not in variable['enum_values']: + if "enum_values" in variable and used_value not in variable["enum_values"]: raise ValueError( - "The variable `{0}` in the host URL has invalid value " - "{1}. Must be {2}.".format(variable_name, - variables[variable_name], - variable['enum_values'])) + "The variable `{0}` in the host URL has invalid value {1}. 
Must be {2}.".format( + variable_name, variables[variable_name], variable["enum_values"] + ) + ) url = url.replace("{" + variable_name + "}", used_value) diff --git a/submarine-sdk/pysubmarine/submarine/experiment/exceptions.py b/submarine-sdk/pysubmarine/submarine/experiment/exceptions.py index 87f283ee06..48df32475b 100644 --- a/submarine-sdk/pysubmarine/submarine/experiment/exceptions.py +++ b/submarine-sdk/pysubmarine/submarine/experiment/exceptions.py @@ -32,13 +32,8 @@ class OpenApiException(Exception): class ApiTypeError(OpenApiException, TypeError): - - def __init__(self, - msg, - path_to_item=None, - valid_classes=None, - key_type=None): - """ Raises an exception for TypeErrors + def __init__(self, msg, path_to_item=None, valid_classes=None, key_type=None): + """Raises an exception for TypeErrors Args: msg (str): the exception message @@ -65,7 +60,6 @@ def __init__(self, class ApiValueError(OpenApiException, ValueError): - def __init__(self, msg, path_to_item=None): """ Args: @@ -84,7 +78,6 @@ def __init__(self, msg, path_to_item=None): class ApiKeyError(OpenApiException, KeyError): - def __init__(self, msg, path_to_item=None): """ Args: @@ -102,7 +95,6 @@ def __init__(self, msg, path_to_item=None): class ApiException(OpenApiException): - def __init__(self, status=None, reason=None, http_resp=None): if http_resp: self.status = http_resp.status @@ -117,8 +109,7 @@ def __init__(self, status=None, reason=None, http_resp=None): def __str__(self): """Custom error messages for exception""" - error_message = "({0})\n"\ - "Reason: {1}\n".format(self.status, self.reason) + error_message = "({0})\nReason: {1}\n".format(self.status, self.reason) if self.headers: error_message += "HTTP response headers: {0}\n".format(self.headers) diff --git a/submarine-sdk/pysubmarine/submarine/experiment/models/code_spec.py b/submarine-sdk/pysubmarine/submarine/experiment/models/code_spec.py index 3d4a4b9295..b394a797f6 100644 --- a/submarine-sdk/pysubmarine/submarine/experiment/models/code_spec.py +++ b/submarine-sdk/pysubmarine/submarine/experiment/models/code_spec.py @@ -48,15 +48,9 @@ class CodeSpec(object): attribute_map (dict): The key is attribute name and the value is json key in definition. 
""" - openapi_types = { - 'sync_mode': 'str', - 'url': 'str' - } + openapi_types = {"sync_mode": "str", "url": "str"} - attribute_map = { - 'sync_mode': 'syncMode', - 'url': 'url' - } + attribute_map = {"sync_mode": "syncMode", "url": "url"} def __init__(self, sync_mode=None, url=None, local_vars_configuration=None): # noqa: E501 """CodeSpec - a model defined in OpenAPI""" # noqa: E501 @@ -122,18 +116,20 @@ def to_dict(self): for attr, _ in six.iteritems(self.openapi_types): value = getattr(self, attr) if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) elif hasattr(value, "to_dict"): result[attr] = value.to_dict() elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) + result[attr] = dict( + map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item, + value.items(), + ) + ) else: result[attr] = value diff --git a/submarine-sdk/pysubmarine/submarine/experiment/models/environment_spec.py b/submarine-sdk/pysubmarine/submarine/experiment/models/environment_spec.py index 2f976b93b3..68f151f434 100644 --- a/submarine-sdk/pysubmarine/submarine/experiment/models/environment_spec.py +++ b/submarine-sdk/pysubmarine/submarine/experiment/models/environment_spec.py @@ -49,22 +49,30 @@ class EnvironmentSpec(object): and the value is json key in definition. """ openapi_types = { - 'name': 'str', - 'docker_image': 'str', - 'kernel_spec': 'KernelSpec', - 'description': 'str', - 'image': 'str' + "name": "str", + "docker_image": "str", + "kernel_spec": "KernelSpec", + "description": "str", + "image": "str", } attribute_map = { - 'name': 'name', - 'docker_image': 'dockerImage', - 'kernel_spec': 'kernelSpec', - 'description': 'description', - 'image': 'image' + "name": "name", + "docker_image": "dockerImage", + "kernel_spec": "kernelSpec", + "description": "description", + "image": "image", } - def __init__(self, name=None, docker_image=None, kernel_spec=None, description=None, image=None, local_vars_configuration=None): # noqa: E501 + def __init__( + self, + name=None, + docker_image=None, + kernel_spec=None, + description=None, + image=None, + local_vars_configuration=None, + ): # noqa: E501 """EnvironmentSpec - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -200,18 +208,20 @@ def to_dict(self): for attr, _ in six.iteritems(self.openapi_types): value = getattr(self, attr) if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) elif hasattr(value, "to_dict"): result[attr] = value.to_dict() elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) + result[attr] = dict( + map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item, + value.items(), + ) + ) else: result[attr] = value diff --git a/submarine-sdk/pysubmarine/submarine/experiment/models/experiment_meta.py b/submarine-sdk/pysubmarine/submarine/experiment/models/experiment_meta.py index 9a70d807d2..8e91a6fa8e 100644 --- 
a/submarine-sdk/pysubmarine/submarine/experiment/models/experiment_meta.py +++ b/submarine-sdk/pysubmarine/submarine/experiment/models/experiment_meta.py @@ -49,22 +49,30 @@ class ExperimentMeta(object): and the value is json key in definition. """ openapi_types = { - 'name': 'str', - 'namespace': 'str', - 'framework': 'str', - 'cmd': 'str', - 'env_vars': 'dict(str, str)' + "name": "str", + "namespace": "str", + "framework": "str", + "cmd": "str", + "env_vars": "dict(str, str)", } attribute_map = { - 'name': 'name', - 'namespace': 'namespace', - 'framework': 'framework', - 'cmd': 'cmd', - 'env_vars': 'envVars' + "name": "name", + "namespace": "namespace", + "framework": "framework", + "cmd": "cmd", + "env_vars": "envVars", } - def __init__(self, name=None, namespace=None, framework=None, cmd=None, env_vars=None, local_vars_configuration=None): # noqa: E501 + def __init__( + self, + name=None, + namespace=None, + framework=None, + cmd=None, + env_vars=None, + local_vars_configuration=None, + ): # noqa: E501 """ExperimentMeta - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -200,18 +208,20 @@ def to_dict(self): for attr, _ in six.iteritems(self.openapi_types): value = getattr(self, attr) if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) elif hasattr(value, "to_dict"): result[attr] = value.to_dict() elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) + result[attr] = dict( + map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item, + value.items(), + ) + ) else: result[attr] = value diff --git a/submarine-sdk/pysubmarine/submarine/experiment/models/experiment_spec.py b/submarine-sdk/pysubmarine/submarine/experiment/models/experiment_spec.py index e83865de7a..de480a8400 100644 --- a/submarine-sdk/pysubmarine/submarine/experiment/models/experiment_spec.py +++ b/submarine-sdk/pysubmarine/submarine/experiment/models/experiment_spec.py @@ -49,20 +49,17 @@ class ExperimentSpec(object): and the value is json key in definition. 
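# Illustrative sketch of the to_dict pattern black reformats above: nested model objects
# are serialized recursively whenever they expose a to_dict() method. SimpleSpec is a
# made-up stand-in for a generated model class, not part of the SDK.
class SimpleSpec:
    def __init__(self, name):
        self.name = name

    def to_dict(self):
        return {"name": self.name}


def serialize(value):
    if isinstance(value, list):
        return [x.to_dict() if hasattr(x, "to_dict") else x for x in value]
    if hasattr(value, "to_dict"):
        return value.to_dict()
    if isinstance(value, dict):
        return {k: v.to_dict() if hasattr(v, "to_dict") else v for k, v in value.items()}
    return value


print(serialize({"worker": SimpleSpec("worker"), "replicas": 2}))
# {'worker': {'name': 'worker'}, 'replicas': 2}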
""" openapi_types = { - 'meta': 'ExperimentMeta', - 'environment': 'EnvironmentSpec', - 'spec': 'dict(str, ExperimentTaskSpec)', - 'code': 'CodeSpec' + "meta": "ExperimentMeta", + "environment": "EnvironmentSpec", + "spec": "dict(str, ExperimentTaskSpec)", + "code": "CodeSpec", } - attribute_map = { - 'meta': 'meta', - 'environment': 'environment', - 'spec': 'spec', - 'code': 'code' - } + attribute_map = {"meta": "meta", "environment": "environment", "spec": "spec", "code": "code"} - def __init__(self, meta=None, environment=None, spec=None, code=None, local_vars_configuration=None): # noqa: E501 + def __init__( + self, meta=None, environment=None, spec=None, code=None, local_vars_configuration=None + ): # noqa: E501 """ExperimentSpec - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -174,18 +171,20 @@ def to_dict(self): for attr, _ in six.iteritems(self.openapi_types): value = getattr(self, attr) if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) elif hasattr(value, "to_dict"): result[attr] = value.to_dict() elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) + result[attr] = dict( + map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item, + value.items(), + ) + ) else: result[attr] = value diff --git a/submarine-sdk/pysubmarine/submarine/experiment/models/experiment_task_spec.py b/submarine-sdk/pysubmarine/submarine/experiment/models/experiment_task_spec.py index 813a6069e3..2b89813ebd 100644 --- a/submarine-sdk/pysubmarine/submarine/experiment/models/experiment_task_spec.py +++ b/submarine-sdk/pysubmarine/submarine/experiment/models/experiment_task_spec.py @@ -49,30 +49,42 @@ class ExperimentTaskSpec(object): and the value is json key in definition. 
""" openapi_types = { - 'replicas': 'int', - 'resources': 'str', - 'name': 'str', - 'image': 'str', - 'cmd': 'str', - 'env_vars': 'dict(str, str)', - 'cpu': 'str', - 'gpu': 'str', - 'memory': 'str' + "replicas": "int", + "resources": "str", + "name": "str", + "image": "str", + "cmd": "str", + "env_vars": "dict(str, str)", + "cpu": "str", + "gpu": "str", + "memory": "str", } attribute_map = { - 'replicas': 'replicas', - 'resources': 'resources', - 'name': 'name', - 'image': 'image', - 'cmd': 'cmd', - 'env_vars': 'envVars', - 'cpu': 'cpu', - 'gpu': 'gpu', - 'memory': 'memory' + "replicas": "replicas", + "resources": "resources", + "name": "name", + "image": "image", + "cmd": "cmd", + "env_vars": "envVars", + "cpu": "cpu", + "gpu": "gpu", + "memory": "memory", } - def __init__(self, replicas=None, resources=None, name=None, image=None, cmd=None, env_vars=None, cpu=None, gpu=None, memory=None, local_vars_configuration=None): # noqa: E501 + def __init__( + self, + replicas=None, + resources=None, + name=None, + image=None, + cmd=None, + env_vars=None, + cpu=None, + gpu=None, + memory=None, + local_vars_configuration=None, + ): # noqa: E501 """ExperimentTaskSpec - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -304,18 +316,20 @@ def to_dict(self): for attr, _ in six.iteritems(self.openapi_types): value = getattr(self, attr) if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) elif hasattr(value, "to_dict"): result[attr] = value.to_dict() elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) + result[attr] = dict( + map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item, + value.items(), + ) + ) else: result[attr] = value diff --git a/submarine-sdk/pysubmarine/submarine/experiment/models/json_response.py b/submarine-sdk/pysubmarine/submarine/experiment/models/json_response.py index 5e3e4a815a..625b508350 100644 --- a/submarine-sdk/pysubmarine/submarine/experiment/models/json_response.py +++ b/submarine-sdk/pysubmarine/submarine/experiment/models/json_response.py @@ -49,20 +49,22 @@ class JsonResponse(object): and the value is json key in definition. 
""" openapi_types = { - 'code': 'int', - 'success': 'bool', - 'result': 'object', - 'attributes': 'dict(str, object)' + "code": "int", + "success": "bool", + "result": "object", + "attributes": "dict(str, object)", } attribute_map = { - 'code': 'code', - 'success': 'success', - 'result': 'result', - 'attributes': 'attributes' + "code": "code", + "success": "success", + "result": "result", + "attributes": "attributes", } - def __init__(self, code=None, success=None, result=None, attributes=None, local_vars_configuration=None): # noqa: E501 + def __init__( + self, code=None, success=None, result=None, attributes=None, local_vars_configuration=None + ): # noqa: E501 """JsonResponse - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -174,18 +176,20 @@ def to_dict(self): for attr, _ in six.iteritems(self.openapi_types): value = getattr(self, attr) if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) elif hasattr(value, "to_dict"): result[attr] = value.to_dict() elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) + result[attr] = dict( + map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item, + value.items(), + ) + ) else: result[attr] = value diff --git a/submarine-sdk/pysubmarine/submarine/experiment/models/kernel_spec.py b/submarine-sdk/pysubmarine/submarine/experiment/models/kernel_spec.py index 9cc7897a50..79124b874a 100644 --- a/submarine-sdk/pysubmarine/submarine/experiment/models/kernel_spec.py +++ b/submarine-sdk/pysubmarine/submarine/experiment/models/kernel_spec.py @@ -48,19 +48,13 @@ class KernelSpec(object): attribute_map (dict): The key is attribute name and the value is json key in definition. 
""" - openapi_types = { - 'name': 'str', - 'channels': 'list[str]', - 'dependencies': 'list[str]' - } - - attribute_map = { - 'name': 'name', - 'channels': 'channels', - 'dependencies': 'dependencies' - } - - def __init__(self, name=None, channels=None, dependencies=None, local_vars_configuration=None): # noqa: E501 + openapi_types = {"name": "str", "channels": "list[str]", "dependencies": "list[str]"} + + attribute_map = {"name": "name", "channels": "channels", "dependencies": "dependencies"} + + def __init__( + self, name=None, channels=None, dependencies=None, local_vars_configuration=None + ): # noqa: E501 """KernelSpec - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -148,18 +142,20 @@ def to_dict(self): for attr, _ in six.iteritems(self.openapi_types): value = getattr(self, attr) if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) elif hasattr(value, "to_dict"): result[attr] = value.to_dict() elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) + result[attr] = dict( + map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item, + value.items(), + ) + ) else: result[attr] = value diff --git a/submarine-sdk/pysubmarine/submarine/experiment/rest.py b/submarine-sdk/pysubmarine/submarine/experiment/rest.py index dccfb8e8f0..09aefd6276 100644 --- a/submarine-sdk/pysubmarine/submarine/experiment/rest.py +++ b/submarine-sdk/pysubmarine/submarine/experiment/rest.py @@ -33,6 +33,7 @@ import ssl import certifi + # python 2 and python 3 compatibility library import six import urllib3 @@ -44,7 +45,6 @@ class RESTResponse(io.IOBase): - def __init__(self, resp): self.urllib3_response = resp self.status = resp.status @@ -61,7 +61,6 @@ def getheader(self, name, default=None): class RESTClientObject(object): - def __init__(self, configuration, pools_size=4, maxsize=None): # urllib3.PoolManager will pass all kw parameters to connectionpool # https://github.com/shazow/urllib3/blob/f9409436f83aeb79fbaf090181cd81b784f1b8ce/urllib3/poolmanager.py#L75 # noqa: E501 @@ -84,11 +83,10 @@ def __init__(self, configuration, pools_size=4, maxsize=None): addition_pool_args = {} if configuration.assert_hostname is not None: - addition_pool_args[ - 'assert_hostname'] = configuration.assert_hostname # noqa: E501 + addition_pool_args["assert_hostname"] = configuration.assert_hostname # noqa: E501 if configuration.retries is not None: - addition_pool_args['retries'] = configuration.retries + addition_pool_args["retries"] = configuration.retries if maxsize is None: if configuration.connection_pool_maxsize is not None: @@ -107,7 +105,8 @@ def __init__(self, configuration, pools_size=4, maxsize=None): key_file=configuration.key_file, proxy_url=configuration.proxy, proxy_headers=configuration.proxy_headers, - **addition_pool_args) + **addition_pool_args + ) else: self.pool_manager = urllib3.PoolManager( num_pools=pools_size, @@ -116,17 +115,20 @@ def __init__(self, configuration, pools_size=4, maxsize=None): ca_certs=ca_certs, cert_file=configuration.cert_file, key_file=configuration.key_file, - **addition_pool_args) - - def request(self, - method, - url, - query_params=None, - headers=None, - body=None, - post_params=None, - 
_preload_content=True, - _request_timeout=None): + **addition_pool_args + ) + + def request( + self, + method, + url, + query_params=None, + headers=None, + body=None, + post_params=None, + _preload_content=True, + _request_timeout=None, + ): """Perform requests. :param method: http request method @@ -146,36 +148,30 @@ def request(self, (connection, read) timeouts. """ method = method.upper() - assert method in [ - 'GET', 'HEAD', 'DELETE', 'POST', 'PUT', 'PATCH', 'OPTIONS' - ] + assert method in ["GET", "HEAD", "DELETE", "POST", "PUT", "PATCH", "OPTIONS"] if post_params and body: - raise ApiValueError( - "body parameter cannot be used with post_params parameter.") + raise ApiValueError("body parameter cannot be used with post_params parameter.") post_params = post_params or {} headers = headers or {} timeout = None if _request_timeout: - if isinstance(_request_timeout, (int,) if six.PY3 else - (int, long)): # noqa: E501,F821 + if isinstance(_request_timeout, (int,) if six.PY3 else (int, long)): # noqa: E501,F821 timeout = urllib3.Timeout(total=_request_timeout) - elif (isinstance(_request_timeout, tuple) and - len(_request_timeout) == 2): - timeout = urllib3.Timeout(connect=_request_timeout[0], - read=_request_timeout[1]) + elif isinstance(_request_timeout, tuple) and len(_request_timeout) == 2: + timeout = urllib3.Timeout(connect=_request_timeout[0], read=_request_timeout[1]) - if 'Content-Type' not in headers: - headers['Content-Type'] = 'application/json' + if "Content-Type" not in headers: + headers["Content-Type"] = "application/json" try: # For `POST`, `PUT`, `PATCH`, `OPTIONS`, `DELETE` - if method in ['POST', 'PUT', 'PATCH', 'OPTIONS', 'DELETE']: + if method in ["POST", "PUT", "PATCH", "OPTIONS", "DELETE"]: if query_params: - url += '?' + urlencode(query_params) - if re.search('json', headers['Content-Type'], re.IGNORECASE): + url += "?" + urlencode(query_params) + if re.search("json", headers["Content-Type"], re.IGNORECASE): request_body = None if body is not None: request_body = json.dumps(body) @@ -185,9 +181,9 @@ def request(self, body=request_body, preload_content=_preload_content, timeout=timeout, - headers=headers) - elif headers[ - 'Content-Type'] == 'application/x-www-form-urlencoded': # noqa: E501 + headers=headers, + ) + elif headers["Content-Type"] == "application/x-www-form-urlencoded": # noqa: E501 r = self.pool_manager.request( method, url, @@ -195,12 +191,13 @@ def request(self, encode_multipart=False, preload_content=_preload_content, timeout=timeout, - headers=headers) - elif headers['Content-Type'] == 'multipart/form-data': + headers=headers, + ) + elif headers["Content-Type"] == "multipart/form-data": # must del headers['Content-Type'], or the correct # Content-Type which generated by urllib3 will be # overwritten. 
- del headers['Content-Type'] + del headers["Content-Type"] r = self.pool_manager.request( method, url, @@ -208,7 +205,8 @@ def request(self, encode_multipart=True, preload_content=_preload_content, timeout=timeout, - headers=headers) + headers=headers, + ) # Pass a `string` parameter directly in the body to support # other content types than Json when `body` argument is # provided in serialized form @@ -220,7 +218,8 @@ def request(self, body=request_body, preload_content=_preload_content, timeout=timeout, - headers=headers) + headers=headers, + ) else: # Cannot generate the request from given parameters msg = """Cannot prepare a request message for provided @@ -229,12 +228,14 @@ def request(self, raise ApiException(status=0, reason=msg) # For `GET`, `HEAD` else: - r = self.pool_manager.request(method, - url, - fields=query_params, - preload_content=_preload_content, - timeout=timeout, - headers=headers) + r = self.pool_manager.request( + method, + url, + fields=query_params, + preload_content=_preload_content, + timeout=timeout, + headers=headers, + ) except urllib3.exceptions.SSLError as e: msg = "{0}\n{1}".format(type(e).__name__, str(e)) raise ApiException(status=0, reason=msg) @@ -250,111 +251,129 @@ def request(self, return r - def GET(self, + def GET( + self, url, headers=None, query_params=None, _preload_content=True, _request_timeout=None + ): + return self.request( + "GET", + url, + headers=headers, + _preload_content=_preload_content, + _request_timeout=_request_timeout, + query_params=query_params, + ) + + def HEAD( + self, url, headers=None, query_params=None, _preload_content=True, _request_timeout=None + ): + return self.request( + "HEAD", + url, + headers=headers, + _preload_content=_preload_content, + _request_timeout=_request_timeout, + query_params=query_params, + ) + + def OPTIONS( + self, + url, + headers=None, + query_params=None, + post_params=None, + body=None, + _preload_content=True, + _request_timeout=None, + ): + return self.request( + "OPTIONS", + url, + headers=headers, + query_params=query_params, + post_params=post_params, + _preload_content=_preload_content, + _request_timeout=_request_timeout, + body=body, + ) + + def DELETE( + self, + url, + headers=None, + query_params=None, + body=None, + _preload_content=True, + _request_timeout=None, + ): + return self.request( + "DELETE", + url, + headers=headers, + query_params=query_params, + _preload_content=_preload_content, + _request_timeout=_request_timeout, + body=body, + ) + + def POST( + self, + url, + headers=None, + query_params=None, + post_params=None, + body=None, + _preload_content=True, + _request_timeout=None, + ): + return self.request( + "POST", + url, + headers=headers, + query_params=query_params, + post_params=post_params, + _preload_content=_preload_content, + _request_timeout=_request_timeout, + body=body, + ) + + def PUT( + self, + url, + headers=None, + query_params=None, + post_params=None, + body=None, + _preload_content=True, + _request_timeout=None, + ): + return self.request( + "PUT", url, - headers=None, - query_params=None, - _preload_content=True, - _request_timeout=None): - return self.request("GET", - url, - headers=headers, - _preload_content=_preload_content, - _request_timeout=_request_timeout, - query_params=query_params) - - def HEAD(self, - url, - headers=None, - query_params=None, - _preload_content=True, - _request_timeout=None): - return self.request("HEAD", - url, - headers=headers, - _preload_content=_preload_content, - _request_timeout=_request_timeout, - 
query_params=query_params) - - def OPTIONS(self, - url, - headers=None, - query_params=None, - post_params=None, - body=None, - _preload_content=True, - _request_timeout=None): - return self.request("OPTIONS", - url, - headers=headers, - query_params=query_params, - post_params=post_params, - _preload_content=_preload_content, - _request_timeout=_request_timeout, - body=body) - - def DELETE(self, - url, - headers=None, - query_params=None, - body=None, - _preload_content=True, - _request_timeout=None): - return self.request("DELETE", - url, - headers=headers, - query_params=query_params, - _preload_content=_preload_content, - _request_timeout=_request_timeout, - body=body) - - def POST(self, - url, - headers=None, - query_params=None, - post_params=None, - body=None, - _preload_content=True, - _request_timeout=None): - return self.request("POST", - url, - headers=headers, - query_params=query_params, - post_params=post_params, - _preload_content=_preload_content, - _request_timeout=_request_timeout, - body=body) - - def PUT(self, + headers=headers, + query_params=query_params, + post_params=post_params, + _preload_content=_preload_content, + _request_timeout=_request_timeout, + body=body, + ) + + def PATCH( + self, + url, + headers=None, + query_params=None, + post_params=None, + body=None, + _preload_content=True, + _request_timeout=None, + ): + return self.request( + "PATCH", url, - headers=None, - query_params=None, - post_params=None, - body=None, - _preload_content=True, - _request_timeout=None): - return self.request("PUT", - url, - headers=headers, - query_params=query_params, - post_params=post_params, - _preload_content=_preload_content, - _request_timeout=_request_timeout, - body=body) - - def PATCH(self, - url, - headers=None, - query_params=None, - post_params=None, - body=None, - _preload_content=True, - _request_timeout=None): - return self.request("PATCH", - url, - headers=headers, - query_params=query_params, - post_params=post_params, - _preload_content=_preload_content, - _request_timeout=_request_timeout, - body=body) + headers=headers, + query_params=query_params, + post_params=post_params, + _preload_content=_preload_content, + _request_timeout=_request_timeout, + body=body, + ) diff --git a/submarine-sdk/pysubmarine/submarine/ml/abstract_model.py b/submarine-sdk/pysubmarine/submarine/ml/abstract_model.py index a93201013f..963f4a8a55 100644 --- a/submarine-sdk/pysubmarine/submarine/ml/abstract_model.py +++ b/submarine-sdk/pysubmarine/submarine/ml/abstract_model.py @@ -28,7 +28,9 @@ class AbstractModel: __metaclass__ = ABCMeta @abstractmethod - def __init__(self,): + def __init__( + self, + ): pass @abstractmethod diff --git a/submarine-sdk/pysubmarine/submarine/ml/pytorch/input/libsvm_dataset.py b/submarine-sdk/pysubmarine/submarine/ml/pytorch/input/libsvm_dataset.py index dc35c4a775..0ec3b0d80e 100644 --- a/submarine-sdk/pysubmarine/submarine/ml/pytorch/input/libsvm_dataset.py +++ b/submarine-sdk/pysubmarine/submarine/ml/pytorch/input/libsvm_dataset.py @@ -13,22 +13,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
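# Illustrative sketch of the JSON branch of RESTClientObject.request reformatted above:
# the body is json-dumped and sent through urllib3 with an explicit Content-Type. The URL
# is a placeholder, not a real Submarine endpoint, and the call needs network access.
import json
import ssl

import certifi
import urllib3

pool = urllib3.PoolManager(cert_reqs=ssl.CERT_REQUIRED, ca_certs=certifi.where())
response = pool.request(
    "POST",
    "https://example.com/api/v1/experiment",
    body=json.dumps({"meta": {"name": "mnist"}}),
    headers={"Content-Type": "application/json"},
)
print(response.status)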
+import functools +import itertools +import multiprocessing as mp +import os +from typing import List, Tuple + import numpy as np import torch from torch.utils.data import DataLoader, Dataset from torch.utils.data.distributed import DistributedSampler -from submarine.utils.fileio import open_buffered_file_reader, file_info - -import os -import itertools -import functools -import multiprocessing as mp -from typing import List, Tuple +from submarine.utils.fileio import file_info, open_buffered_file_reader class LIBSVMDataset(Dataset): - def __init__(self, data_uri: str, sample_offset: np.ndarray): super().__init__() self.data_uri = data_uri @@ -44,20 +43,18 @@ def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor, int]: return LIBSVMDataset.parse_sample(sample) @classmethod - def parse_sample(cls, - sample: bytes) -> Tuple[torch.Tensor, torch.Tensor, int]: - label, *entries = sample.rstrip(b'\n').split(b' ') + def parse_sample(cls, sample: bytes) -> Tuple[torch.Tensor, torch.Tensor, int]: + label, *entries = sample.rstrip(b"\n").split(b" ") feature_idx = torch.zeros(len(entries), dtype=torch.long) feature_value = torch.zeros(len(entries), dtype=torch.float) for i, entry in enumerate(entries): - fidx, fvalue = entry.split(b':') + fidx, fvalue = entry.split(b":") feature_idx[i], feature_value[i] = int(fidx), float(fvalue) return feature_idx, feature_value, int(label) @classmethod def prepare_dataset(cls, data_uri: str, n_jobs: int = os.cpu_count()): - sample_offset = LIBSVMDataset._locate_sample_offsets(data_uri=data_uri, - n_jobs=n_jobs) + sample_offset = LIBSVMDataset._locate_sample_offsets(data_uri=data_uri, n_jobs=n_jobs) return LIBSVMDataset(data_uri=data_uri, sample_offset=sample_offset) @classmethod @@ -72,20 +69,22 @@ def _locate_sample_offsets(cls, data_uri: str, n_jobs: int) -> np.ndarray: infile.readline() chunk_starts.append(min(infile.tell(), finfo.size)) - with mp.Pool(processes=n_jobs, - maxtasksperchild=1) as pool: + with mp.Pool(processes=n_jobs, maxtasksperchild=1) as pool: return np.asarray( list( itertools.chain.from_iterable( - pool.imap(functools.partial( - LIBSVMDataset._locate_sample_offsets_job, data_uri), - iterable=enumerate( - zip(chunk_starts[:-1], - chunk_starts[1:])))))) + pool.imap( + functools.partial(LIBSVMDataset._locate_sample_offsets_job, data_uri), + iterable=enumerate(zip(chunk_starts[:-1], chunk_starts[1:])), + ) + ) + ) + ) @classmethod def _locate_sample_offsets_job( - cls, data_uri: str, task: Tuple[int, Tuple[int, int]]) -> List[int]: + cls, data_uri: str, task: Tuple[int, Tuple[int, int]] + ) -> List[int]: _, (start, end) = task offsets = [start] with open_buffered_file_reader(data_uri) as infile: @@ -98,14 +97,11 @@ def _locate_sample_offsets_job( def libsvm_input_fn(filepath, batch_size=256, num_threads=1, **kwargs): - def _input_fn(): - dataset = LIBSVMDataset.prepare_dataset(data_uri=filepath, - n_jobs=num_threads) + dataset = LIBSVMDataset.prepare_dataset(data_uri=filepath, n_jobs=num_threads) sampler = DistributedSampler(dataset) - return DataLoader(dataset=dataset, - batch_size=batch_size, - sampler=sampler, - num_workers=0) # should be 0 (pytorch bug) + return DataLoader( + dataset=dataset, batch_size=batch_size, sampler=sampler, num_workers=0 + ) # should be 0 (pytorch bug) return _input_fn diff --git a/submarine-sdk/pysubmarine/submarine/ml/pytorch/layers/core.py b/submarine-sdk/pysubmarine/submarine/ml/pytorch/layers/core.py index d0fb2f584b..265ea1f1b5 100644 --- a/submarine-sdk/pysubmarine/submarine/ml/pytorch/layers/core.py 
+++ b/submarine-sdk/pysubmarine/submarine/ml/pytorch/layers/core.py @@ -19,15 +19,13 @@ # pylint: disable=W0223 class FeatureLinear(nn.Module): - def __init__(self, num_features, out_features): """ :param num_features: number of total features. :param out_features: The number of output features. """ super().__init__() - self.weight = nn.Embedding(num_embeddings=num_features, - embedding_dim=out_features) + self.weight = nn.Embedding(num_embeddings=num_features, embedding_dim=out_features) self.bias = nn.Parameter(torch.zeros((out_features,))) def forward(self, feature_idx, feature_value): @@ -35,17 +33,15 @@ def forward(self, feature_idx, feature_value): :param feature_idx: torch.LongTensor (batch_size, num_fields) :param feature_value: torch.LongTensor (batch_size, num_fields) """ - return torch.sum( - self.weight(feature_idx) * feature_value.unsqueeze(dim=-1), - dim=1) + self.bias + return ( + torch.sum(self.weight(feature_idx) * feature_value.unsqueeze(dim=-1), dim=1) + self.bias + ) class FeatureEmbedding(nn.Module): - def __init__(self, num_features, embedding_dim): super().__init__() - self.weight = nn.Embedding(num_embeddings=num_features, - embedding_dim=embedding_dim) + self.weight = nn.Embedding(num_embeddings=num_features, embedding_dim=embedding_dim) def forward(self, feature_idx, feature_value): """ @@ -56,32 +52,34 @@ def forward(self, feature_idx, feature_value): class PairwiseInteraction(nn.Module): - def forward(self, x): """ :param x: torch.Tensor (batch_size, num_fields, embedding_dim) """ - square_of_sum = torch.square(torch.sum( - x, dim=1)) # (batch_size, embedding_dim) + square_of_sum = torch.square(torch.sum(x, dim=1)) # (batch_size, embedding_dim) # (batch_size, embedding_dim) sum_of_square = torch.sum(torch.square(x), dim=1) - return 0.5 * torch.sum(square_of_sum - sum_of_square, - dim=1, - keepdim=True) # (batch_size, 1) + return 0.5 * torch.sum( + square_of_sum - sum_of_square, dim=1, keepdim=True + ) # (batch_size, 1) class DNN(nn.Module): - def __init__(self, in_features, out_features, hidden_units, dropout_rates): super().__init__() - *layers, out_layer = list( - zip([in_features, *hidden_units], [*hidden_units, out_features])) + *layers, out_layer = list(zip([in_features, *hidden_units], [*hidden_units, out_features])) self.net = nn.Sequential( - *(nn.Sequential(nn.Linear(in_features=i, out_features=o), - nn.BatchNorm1d(num_features=o), nn.ReLU(), - nn.Dropout(p=p)) - for (i, o), p in zip(layers, dropout_rates)), - nn.Linear(*out_layer)) + *( + nn.Sequential( + nn.Linear(in_features=i, out_features=o), + nn.BatchNorm1d(num_features=o), + nn.ReLU(), + nn.Dropout(p=p), + ) + for (i, o), p in zip(layers, dropout_rates) + ), + nn.Linear(*out_layer) + ) def forward(self, x): """ diff --git a/submarine-sdk/pysubmarine/submarine/ml/pytorch/loss.py b/submarine-sdk/pysubmarine/submarine/ml/pytorch/loss.py index 823e233d73..234c237d47 100644 --- a/submarine-sdk/pysubmarine/submarine/ml/pytorch/loss.py +++ b/submarine-sdk/pysubmarine/submarine/ml/pytorch/loss.py @@ -17,10 +17,10 @@ class LossKey: - BCELoss = 'BCELoss'.lower() - CrossEntropyLoss = 'CrossEntropyLoss'.lower() - NLLLoss = 'NLLLoss'.lower() - BCEWithLogitsLoss = 'BCEWithLogitsLoss'.lower() + BCELoss = "BCELoss".lower() + CrossEntropyLoss = "CrossEntropyLoss".lower() + NLLLoss = "NLLLoss".lower() + BCEWithLogitsLoss = "BCEWithLogitsLoss".lower() def get_loss_fn(key): @@ -33,4 +33,4 @@ def get_loss_fn(key): return nn.CrossEntropyLoss if key == LossKey.NLLLoss: return nn.NLLLoss - raise ValueError('Invalid 
loss_key:', key) + raise ValueError("Invalid loss_key:", key) diff --git a/submarine-sdk/pysubmarine/submarine/ml/pytorch/metric.py b/submarine-sdk/pysubmarine/submarine/ml/pytorch/metric.py index 69d360faf1..43f3d26c58 100644 --- a/submarine-sdk/pysubmarine/submarine/ml/pytorch/metric.py +++ b/submarine-sdk/pysubmarine/submarine/ml/pytorch/metric.py @@ -17,11 +17,11 @@ class MetricKey: - F1_SCORE = 'f1_score' - ACCURACY = 'accuracy' - ROC_AUC = 'roc_auc' - PRECISION = 'precision' - RECALL = 'recall' + F1_SCORE = "f1_score" + ACCURACY = "accuracy" + ROC_AUC = "roc_auc" + PRECISION = "precision" + RECALL = "recall" def get_metric_fn(key): @@ -36,4 +36,4 @@ def get_metric_fn(key): return metrics.precision_score if key == MetricKey.RECALL: return metrics.recall_score - raise ValueError('Invalid metric_key:', key) + raise ValueError("Invalid metric_key:", key) diff --git a/submarine-sdk/pysubmarine/submarine/ml/pytorch/model/base_pytorch_model.py b/submarine-sdk/pysubmarine/submarine/ml/pytorch/model/base_pytorch_model.py index 168a78ad8f..846241d46f 100644 --- a/submarine-sdk/pysubmarine/submarine/ml/pytorch/model/base_pytorch_model.py +++ b/submarine-sdk/pysubmarine/submarine/ml/pytorch/model/base_pytorch_model.py @@ -29,8 +29,7 @@ from submarine.ml.pytorch.optimizer import get_optimizer from submarine.ml.pytorch.parameters import default_parameters from submarine.ml.pytorch.registries import input_fn_registry -from submarine.utils.env import (get_from_dicts, get_from_json, - get_from_registry) +from submarine.utils.env import get_from_dicts, get_from_json, get_from_registry from submarine.utils.fileio import write_file from submarine.utils.pytorch_utils import get_device @@ -39,34 +38,32 @@ # pylint: disable=W0221 class BasePyTorchModel(AbstractModel, ABC): - def __init__(self, params=None, json_path=None): super().__init__() self.params = get_from_dicts(params, default_parameters) self.params = get_from_json(json_path, self.params) self._sanity_check() - Path(self.params['output'] - ['save_model_dir']).expanduser().resolve().mkdir(parents=True, - exist_ok=True) + Path(self.params["output"]["save_model_dir"]).expanduser().resolve().mkdir( + parents=True, exist_ok=True + ) logging.info("Model parameters : %s", self.params) - self.input_type = self.params['input']['type'] + self.input_type = self.params["input"]["type"] self.init_process_group() - self.model = DistributedDataParallel( - self.model_fn(self.params).to(get_device(self.params))) - self.optimizer = get_optimizer(key=self.params['optimizer']['name'])( - params=self.model.parameters(), - **self.params['optimizer']['kwargs']) - self.loss = get_loss_fn(key=self.params['loss']['name'])( - **self.params['loss']['kwargs']) - self.metric = get_metric_fn(key=self.params['output']['metric']) + self.model = DistributedDataParallel(self.model_fn(self.params).to(get_device(self.params))) + self.optimizer = get_optimizer(key=self.params["optimizer"]["name"])( + params=self.model.parameters(), **self.params["optimizer"]["kwargs"] + ) + self.loss = get_loss_fn(key=self.params["loss"]["name"])(**self.params["loss"]["kwargs"]) + self.metric = get_metric_fn(key=self.params["output"]["metric"]) def init_process_group(self): distributed.init_process_group( - backend=os.environ.get('backend', distributed.Backend.GLOO), - init_method=os.environ.get('INIT_METHOD', 'tcp://127.0.0.1:23456'), - world_size=int(os.environ.get('WORLD', 1)), - rank=int(os.environ.get('RANK', 0))) + backend=os.environ.get("backend", distributed.Backend.GLOO), + 
init_method=os.environ.get("INIT_METHOD", "tcp://127.0.0.1:23456"), + world_size=int(os.environ.get("WORLD", 1)), + rank=int(os.environ.get("RANK", 0)), + ) def __del__(self): distributed.destroy_process_group() @@ -87,8 +84,8 @@ def evaluate(self): labels = [] valid_loader = get_from_registry(self.input_type, input_fn_registry)( - filepath=self.params['input']['valid_data'], - **self.params['training'])() + filepath=self.params["input"]["valid_data"], **self.params["training"] + )() self.model.eval() with torch.no_grad(): for _, batch in enumerate(valid_loader): @@ -99,15 +96,15 @@ def evaluate(self): labels.append(label) return self.metric( - torch.cat(labels, dim=0).cpu().numpy(), - torch.cat(outputs, dim=0).cpu().numpy()) + torch.cat(labels, dim=0).cpu().numpy(), torch.cat(outputs, dim=0).cpu().numpy() + ) def predict(self): outputs = [] test_loader = get_from_registry(self.input_type, input_fn_registry)( - filepath=self.params['input']['test_data'], - **self.params['training'])() + filepath=self.params["input"]["test_data"], **self.params["training"] + )() self.model.eval() with torch.no_grad(): for _, batch in enumerate(test_loader): @@ -126,10 +123,10 @@ def fit(self): # should be replaced by a indicator function. best_eval_score = 0.0 train_loader = get_from_registry(self.input_type, input_fn_registry)( - filepath=self.params['input']['train_data'], - **self.params['training'])() + filepath=self.params["input"]["train_data"], **self.params["training"] + )() - for epoch in range(self.params['training']['num_epochs']): + for epoch in range(self.params["training"]["num_epochs"]): train_loader.sampler.set_epoch(epoch) self.train(train_loader) eval_score = self.evaluate() @@ -142,21 +139,18 @@ def fit(self): def save_checkpoint(self): with io.BytesIO() as buffer: torch.save( - { - 'model': self.model.module.state_dict(), - 'optimizer': self.optimizer.state_dict() - }, buffer) - write_file(buffer, - uri=os.path.join(self.params['output']['save_model_dir'], - 'ckpt.pkl')) + {"model": self.model.module.state_dict(), "optimizer": self.optimizer.state_dict()}, + buffer, + ) + write_file( + buffer, uri=os.path.join(self.params["output"]["save_model_dir"], "ckpt.pkl") + ) def model_fn(self, params): seed = params["training"]["seed"] torch.manual_seed(seed) def _sanity_check(self): - assert 'input' in self.params, ('Does not define any input parameters') - assert 'type' in self.params['input'], ( - 'Does not define any input type') - assert 'output' in self.params, ( - 'Does not define any output parameters') + assert "input" in self.params, "Does not define any input parameters" + assert "type" in self.params["input"], "Does not define any input type" + assert "output" in self.params, "Does not define any output parameters" diff --git a/submarine-sdk/pysubmarine/submarine/ml/pytorch/model/ctr/__init__.py b/submarine-sdk/pysubmarine/submarine/ml/pytorch/model/ctr/__init__.py index 34bc8d677f..7879f4a840 100644 --- a/submarine-sdk/pysubmarine/submarine/ml/pytorch/model/ctr/__init__.py +++ b/submarine-sdk/pysubmarine/submarine/ml/pytorch/model/ctr/__init__.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
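# Illustrative check of the identity behind the PairwiseInteraction layer reformatted
# earlier: 0.5 * ((sum_i x_i)^2 - sum_i x_i^2) equals the sum of pairwise dot products.
import torch

x = torch.rand(4, 5, 8)  # (batch_size, num_fields, embedding_dim)

fm_term = 0.5 * torch.sum(
    torch.square(torch.sum(x, dim=1)) - torch.sum(torch.square(x), dim=1),
    dim=1,
    keepdim=True,
)  # (batch_size, 1)

explicit = torch.zeros(4, 1)
num_fields = x.size(1)
for i in range(num_fields):
    for j in range(i + 1, num_fields):
        explicit += torch.sum(x[:, i] * x[:, j], dim=1, keepdim=True)

print(torch.allclose(fm_term, explicit, atol=1e-4))  # True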
-from .deepfm import DeepFM from .afm import AFM +from .deepfm import DeepFM -__all__ = ['DeepFM', 'AFM'] +__all__ = ["DeepFM", "AFM"] diff --git a/submarine-sdk/pysubmarine/submarine/ml/pytorch/model/ctr/afm.py b/submarine-sdk/pysubmarine/submarine/ml/pytorch/model/ctr/afm.py index cb3454f25d..106254819c 100644 --- a/submarine-sdk/pysubmarine/submarine/ml/pytorch/model/ctr/afm.py +++ b/submarine-sdk/pysubmarine/submarine/ml/pytorch/model/ctr/afm.py @@ -16,58 +16,63 @@ import torch from torch import nn -from submarine.ml.pytorch.layers.core import (FeatureEmbedding, FeatureLinear) +from submarine.ml.pytorch.layers.core import FeatureEmbedding, FeatureLinear from submarine.ml.pytorch.model.base_pytorch_model import BasePyTorchModel class AFM(BasePyTorchModel): - def model_fn(self, params): super().model_fn(params) - return _AFM(**self.params['model']['kwargs']) + return _AFM(**self.params["model"]["kwargs"]) # pylint: disable=W0223 class _AFM(nn.Module): - - def __init__(self, num_features: int, embedding_dim: int, - attention_dim: int, out_features: int, dropout_rate: float, - **kwargs): + def __init__( + self, + num_features: int, + embedding_dim: int, + attention_dim: int, + out_features: int, + dropout_rate: float, + **kwargs + ): super().__init__() - self.feature_linear = FeatureLinear(num_features=num_features, - out_features=out_features) - self.feature_embedding = FeatureEmbedding(num_features=num_features, - embedding_dim=embedding_dim) + self.feature_linear = FeatureLinear(num_features=num_features, out_features=out_features) + self.feature_embedding = FeatureEmbedding( + num_features=num_features, embedding_dim=embedding_dim + ) self.attentional_interaction = AttentionalInteratction( embedding_dim=embedding_dim, attention_dim=attention_dim, out_features=out_features, - dropout_rate=dropout_rate) + dropout_rate=dropout_rate, + ) - def forward(self, feature_idx: torch.LongTensor, - feature_value: torch.LongTensor): + def forward(self, feature_idx: torch.LongTensor, feature_value: torch.LongTensor): """ :param feature_idx: torch.LongTensor (batch_size, num_fields) :param feature_value: torch.LongTensor (batch_size, num_fields) """ - return self.feature_linear( - feature_idx, feature_value) + self.attentional_interaction( - self.feature_embedding(feature_idx, feature_value)) + return self.feature_linear(feature_idx, feature_value) + self.attentional_interaction( + self.feature_embedding(feature_idx, feature_value) + ) class AttentionalInteratction(nn.Module): - - def __init__(self, embedding_dim: int, attention_dim: int, - out_features: int, dropout_rate: float): + def __init__( + self, embedding_dim: int, attention_dim: int, out_features: int, dropout_rate: float + ): super().__init__() self.attention_score = nn.Sequential( nn.Linear(in_features=embedding_dim, out_features=attention_dim), - nn.ReLU(), nn.Linear(in_features=attention_dim, out_features=1), - nn.Softmax(dim=1)) + nn.ReLU(), + nn.Linear(in_features=attention_dim, out_features=1), + nn.Softmax(dim=1), + ) self.pairwise_product = PairwiseProduct() self.dropout = nn.Dropout(p=dropout_rate) - self.fc = nn.Linear(in_features=embedding_dim, - out_features=out_features) + self.fc = nn.Linear(in_features=embedding_dim, out_features=out_features) def forward(self, x: torch.FloatTensor): """ @@ -80,7 +85,6 @@ def forward(self, x: torch.FloatTensor): class PairwiseProduct(nn.Module): - def forward(self, x: torch.FloatTensor): """ :param x: torch.FloatTensor (batch_sie, num_fields, embedding_dim) @@ -88,8 +92,5 @@ def forward(self, 
x: torch.FloatTensor): _, num_fields, _ = x.size() all_pairs_product = x.unsqueeze(dim=1) * x.unsqueeze(dim=2) - idx_row, idx_col = torch.unbind(torch.triu_indices(num_fields, - num_fields, - offset=1), - dim=0) + idx_row, idx_col = torch.unbind(torch.triu_indices(num_fields, num_fields, offset=1), dim=0) return all_pairs_product[:, idx_row, idx_col] diff --git a/submarine-sdk/pysubmarine/submarine/ml/pytorch/model/ctr/deepfm.py b/submarine-sdk/pysubmarine/submarine/ml/pytorch/model/ctr/deepfm.py index fd239604e3..6bc97604af 100644 --- a/submarine-sdk/pysubmarine/submarine/ml/pytorch/model/ctr/deepfm.py +++ b/submarine-sdk/pysubmarine/submarine/ml/pytorch/model/ctr/deepfm.py @@ -16,34 +16,45 @@ import torch from torch import nn -from submarine.ml.pytorch.layers.core import (DNN, FeatureEmbedding, - FeatureLinear, - PairwiseInteraction) +from submarine.ml.pytorch.layers.core import ( + DNN, + FeatureEmbedding, + FeatureLinear, + PairwiseInteraction, +) from submarine.ml.pytorch.model.base_pytorch_model import BasePyTorchModel class DeepFM(BasePyTorchModel): - def model_fn(self, params): super().model_fn(params) - return _DeepFM(**self.params['model']['kwargs']) + return _DeepFM(**self.params["model"]["kwargs"]) # pylint: disable=W0223 class _DeepFM(nn.Module): - - def __init__(self, num_fields, num_features, embedding_dim, out_features, - hidden_units, dropout_rates, **kwargs): + def __init__( + self, + num_fields, + num_features, + embedding_dim, + out_features, + hidden_units, + dropout_rates, + **kwargs + ): super().__init__() - self.feature_linear = FeatureLinear(num_features=num_features, - out_features=out_features) - self.feature_embedding = FeatureEmbedding(num_features=num_features, - embedding_dim=embedding_dim) + self.feature_linear = FeatureLinear(num_features=num_features, out_features=out_features) + self.feature_embedding = FeatureEmbedding( + num_features=num_features, embedding_dim=embedding_dim + ) self.pairwise_interaction = PairwiseInteraction() - self.dnn = DNN(in_features=num_fields * embedding_dim, - out_features=out_features, - hidden_units=hidden_units, - dropout_rates=dropout_rates) + self.dnn = DNN( + in_features=num_fields * embedding_dim, + out_features=out_features, + hidden_units=hidden_units, + dropout_rates=dropout_rates, + ) def forward(self, feature_idx, feature_value): """ @@ -51,8 +62,8 @@ def forward(self, feature_idx, feature_value): :param feature_value: torch.LongTensor (batch_size, num_fields) """ emb = self.feature_embedding( - feature_idx, - feature_value) # (batch_size, num_fields, embedding_dim) + feature_idx, feature_value + ) # (batch_size, num_fields, embedding_dim) linear_logit = self.feature_linear(feature_idx, feature_value) fm_logit = self.pairwise_interaction(emb) deep_logit = self.dnn(torch.flatten(emb, start_dim=1)) diff --git a/submarine-sdk/pysubmarine/submarine/ml/pytorch/optimizer.py b/submarine-sdk/pysubmarine/submarine/ml/pytorch/optimizer.py index 15bbd184d7..08e1ca2e73 100644 --- a/submarine-sdk/pysubmarine/submarine/ml/pytorch/optimizer.py +++ b/submarine-sdk/pysubmarine/submarine/ml/pytorch/optimizer.py @@ -17,9 +17,9 @@ class OptimizerKey: - ADAM = 'adam' - ADAGRAD = 'adagrad' - SGD = 'sgd' + ADAM = "adam" + ADAGRAD = "adagrad" + SGD = "sgd" def get_optimizer(key): @@ -30,4 +30,4 @@ def get_optimizer(key): return optim.Adagrad if key == OptimizerKey.SGD: return optim.SGD - raise ValueError('Invalid optimizer_key:', key) + raise ValueError("Invalid optimizer_key:", key) diff --git 
a/submarine-sdk/pysubmarine/submarine/ml/pytorch/parameters.py b/submarine-sdk/pysubmarine/submarine/ml/pytorch/parameters.py index 4619460a51..596fb44322 100644 --- a/submarine-sdk/pysubmarine/submarine/ml/pytorch/parameters.py +++ b/submarine-sdk/pysubmarine/submarine/ml/pytorch/parameters.py @@ -14,10 +14,7 @@ # limitations under the License. default_parameters = { - "output": { - "save_model_dir": "./output", - "metric": "roc_auc" - }, + "output": {"save_model_dir": "./output", "metric": "roc_auc"}, "training": { "batch_size": 64, "num_epochs": 1, @@ -26,7 +23,7 @@ "num_gpus": 0, "seed": 42, "mode": "distributed", - "backend": "gloo" + "backend": "gloo", }, "model": { "name": "ctr.deepfm", @@ -34,22 +31,10 @@ "out_features": 1, "embedding_dim": 256, "hidden_units": [400, 400], - "dropout_rates": [0.2, 0.2] - } - }, - "loss": { - "name": "BCEWithLogitsLoss", - "kwargs": {} + "dropout_rates": [0.2, 0.2], + }, }, - "optimizer": { - "name": "adam", - "kwargs": { - "lr": 1e-3 - } - }, - "resource": { - "num_cpus": 4, - "num_gpus": 0, - "num_threads": 0 - } + "loss": {"name": "BCEWithLogitsLoss", "kwargs": {}}, + "optimizer": {"name": "adam", "kwargs": {"lr": 1e-3}}, + "resource": {"num_cpus": 4, "num_gpus": 0, "num_threads": 0}, } diff --git a/submarine-sdk/pysubmarine/submarine/ml/tensorflow/input/input.py b/submarine-sdk/pysubmarine/submarine/ml/tensorflow/input/input.py index f32285fc05..f779fb6093 100644 --- a/submarine-sdk/pysubmarine/submarine/ml/tensorflow/input/input.py +++ b/submarine-sdk/pysubmarine/submarine/ml/tensorflow/input/input.py @@ -23,29 +23,29 @@ def libsvm_input_fn( - filepath, - batch_size=256, - num_epochs=3, # pylint: disable=W0613 - perform_shuffle=False, - delimiter=" ", - **kwargs): - + filepath, + batch_size=256, + num_epochs=3, # pylint: disable=W0613 + perform_shuffle=False, + delimiter=" ", + **kwargs +): def _input_fn(): - def decode_libsvm(line): columns = tf.string_split([line], delimiter) labels = tf.string_to_number(columns.values[0], out_type=tf.float32) - splits = tf.string_split(columns.values[1:], ':') + splits = tf.string_split(columns.values[1:], ":") id_vals = tf.reshape(splits.values, splits.dense_shape) - feat_ids, feat_vals = tf.split(id_vals, - num_or_size_splits=2, - axis=1) + feat_ids, feat_vals = tf.split(id_vals, num_or_size_splits=2, axis=1) feat_ids = tf.string_to_number(feat_ids, out_type=tf.int32) feat_vals = tf.string_to_number(feat_vals, out_type=tf.float32) return {"feat_ids": feat_ids, "feat_vals": feat_vals}, labels - dataset = tf.data.TextLineDataset(filepath)\ - .map(decode_libsvm, num_parallel_calls=AUTOTUNE).prefetch(AUTOTUNE) + dataset = ( + tf.data.TextLineDataset(filepath) + .map(decode_libsvm, num_parallel_calls=AUTOTUNE) + .prefetch(AUTOTUNE) + ) if perform_shuffle: dataset = dataset.shuffle(buffer_size=batch_size) diff --git a/submarine-sdk/pysubmarine/submarine/ml/tensorflow/layers/core.py b/submarine-sdk/pysubmarine/submarine/ml/tensorflow/layers/core.py index ec0f18fec3..dbe048f1d4 100644 --- a/submarine-sdk/pysubmarine/submarine/ml/tensorflow/layers/core.py +++ b/submarine-sdk/pysubmarine/submarine/ml/tensorflow/layers/core.py @@ -18,34 +18,39 @@ def batch_norm_layer(x, train_phase, scope_bn, batch_norm_decay): - bn_train = tf.contrib.layers.batch_norm(x, - decay=batch_norm_decay, - center=True, - scale=True, - updates_collections=None, - is_training=True, - reuse=None, - scope=scope_bn) - bn_infer = tf.contrib.layers.batch_norm(x, - decay=batch_norm_decay, - center=True, - scale=True, - updates_collections=None, - 
is_training=False, - reuse=True, - scope=scope_bn) - return tf.cond(tf.cast(train_phase, tf.bool), lambda: bn_train, - lambda: bn_infer) - - -def dnn_layer(inputs, - estimator_mode, - batch_norm, - deep_layers, - dropout, - batch_norm_decay=0.9, - l2_reg=0, - **kwargs): + bn_train = tf.contrib.layers.batch_norm( + x, + decay=batch_norm_decay, + center=True, + scale=True, + updates_collections=None, + is_training=True, + reuse=None, + scope=scope_bn, + ) + bn_infer = tf.contrib.layers.batch_norm( + x, + decay=batch_norm_decay, + center=True, + scale=True, + updates_collections=None, + is_training=False, + reuse=True, + scope=scope_bn, + ) + return tf.cond(tf.cast(train_phase, tf.bool), lambda: bn_train, lambda: bn_infer) + + +def dnn_layer( + inputs, + estimator_mode, + batch_norm, + deep_layers, + dropout, + batch_norm_decay=0.9, + l2_reg=0, + **kwargs +): """ The Multi Layer Perceptron :param inputs: A tensor of at least rank 2 and static value for the last dimension; i.e. @@ -72,13 +77,15 @@ def dnn_layer(inputs, inputs=inputs, num_outputs=deep_layers[i], weights_regularizer=tf.contrib.layers.l2_regularizer(l2_reg), - scope='mlp%d' % i) + scope="mlp%d" % i, + ) if batch_norm: deep_inputs = batch_norm_layer( deep_inputs, train_phase=train_phase, - scope_bn='bn_%d' % i, - batch_norm_decay=batch_norm_decay) + scope_bn="bn_%d" % i, + batch_norm_decay=batch_norm_decay, + ) if estimator_mode == tf.estimator.ModeKeys.TRAIN: deep_inputs = tf.nn.dropout(deep_inputs, keep_prob=dropout[i]) @@ -87,7 +94,8 @@ def dnn_layer(inputs, num_outputs=1, activation_fn=tf.identity, weights_regularizer=tf.contrib.layers.l2_regularizer(l2_reg), - scope='deep_out') + scope="deep_out", + ) deep_out = tf.reshape(deep_out, shape=[-1]) return deep_out @@ -101,34 +109,29 @@ def linear_layer(features, feature_size, field_size, l2_reg=0, **kwargs): :param l2_reg: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix. """ - feat_ids = features['feat_ids'] + feat_ids = features["feat_ids"] feat_ids = tf.reshape(feat_ids, shape=[-1, field_size]) - feat_vals = features['feat_vals'] + feat_vals = features["feat_vals"] feat_vals = tf.reshape(feat_vals, shape=[-1, field_size]) regularizer = tf.contrib.layers.l2_regularizer(l2_reg) with tf.variable_scope("LinearLayer_Layer"): - linear_bias = tf.get_variable(name='linear_bias', - shape=[1], - initializer=tf.constant_initializer(0.0)) + linear_bias = tf.get_variable( + name="linear_bias", shape=[1], initializer=tf.constant_initializer(0.0) + ) linear_weight = tf.get_variable( - name='linear_weight', + name="linear_weight", shape=[feature_size], initializer=tf.glorot_normal_initializer(), - regularizer=regularizer) + regularizer=regularizer, + ) feat_weights = tf.nn.embedding_lookup(linear_weight, feat_ids) - linear_out = tf.reduce_sum(tf.multiply(feat_weights, feat_vals), - 1) + linear_bias + linear_out = tf.reduce_sum(tf.multiply(feat_weights, feat_vals), 1) + linear_bias return linear_out -def embedding_layer(features, - feature_size, - field_size, - embedding_size, - l2_reg=0, - **kwargs): +def embedding_layer(features, feature_size, field_size, embedding_size, l2_reg=0, **kwargs): """ Turns positive integers (indexes) into dense vectors of fixed size. eg. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]] @@ -139,18 +142,19 @@ def embedding_layer(features, :param l2_reg: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix. 
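# Illustrative sketch (plain NumPy) of what linear_layer above computes for libsvm input:
# look up one weight per feature id, scale it by the feature value, and sum per row.
# The ids, values, and weights below are made-up example data.
import numpy as np

feature_size = 10
linear_weight = np.random.rand(feature_size)
linear_bias = 0.0

feat_ids = np.array([[1, 4, 7], [0, 2, 9]])               # (batch_size, field_size)
feat_vals = np.array([[1.0, 0.5, 2.0], [1.0, 1.0, 1.0]])  # (batch_size, field_size)

linear_out = np.sum(linear_weight[feat_ids] * feat_vals, axis=1) + linear_bias
print(linear_out.shape)  # (2,)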
""" - feat_ids = features['feat_ids'] + feat_ids = features["feat_ids"] feat_ids = tf.reshape(feat_ids, shape=[-1, field_size]) - feat_vals = features['feat_vals'] + feat_vals = features["feat_vals"] feat_vals = tf.reshape(feat_vals, shape=[-1, field_size]) with tf.variable_scope("Embedding_Layer"): regularizer = tf.contrib.layers.l2_regularizer(l2_reg) embedding_dict = tf.get_variable( - name='embedding_dict', + name="embedding_dict", shape=[feature_size, embedding_size], initializer=tf.glorot_normal_initializer(), - regularizer=regularizer) + regularizer=regularizer, + ) embeddings = tf.nn.embedding_lookup(embedding_dict, feat_ids) feat_vals = tf.reshape(feat_vals, shape=[-1, field_size, 1]) embedding_out = tf.multiply(embeddings, feat_vals) @@ -186,14 +190,14 @@ def fm_layer(inputs, **kwargs): class KMaxPooling(Layer): """K Max pooling that selects the k biggest value along the specific axis. - Input shape - - nD tensor with shape: ``(batch_size, ..., input_dim)``. - Output shape - - nD tensor with shape: ``(batch_size, ..., output_dim)``. - Arguments - - **k**: positive integer, number of top elements to look for along the ``axis`` dimension. - - **axis**: positive integer, the dimension to look for elements. - """ + Input shape + - nD tensor with shape: ``(batch_size, ..., input_dim)``. + Output shape + - nD tensor with shape: ``(batch_size, ..., output_dim)``. + Arguments + - **k**: positive integer, number of top elements to look for along the ``axis`` dimension. + - **axis**: positive integer, the dimension to look for elements. + """ def __init__(self, k=1, axis=-1, **kwargs): @@ -205,12 +209,10 @@ def __init__(self, k=1, axis=-1, **kwargs): def build(self, input_shape): if self.axis < 1 or self.axis > len(input_shape): - raise ValueError("axis must be 1~%d,now is %d" % - (len(input_shape), self.axis)) + raise ValueError("axis must be 1~%d,now is %d" % (len(input_shape), self.axis)) if self.k < 1 or self.k > input_shape[self.axis]: - raise ValueError("k must be in 1 ~ %d,now k is %d" % - (input_shape[self.axis], self.k)) + raise ValueError("k must be in 1 ~ %d,now k is %d" % (input_shape[self.axis], self.k)) self.dims = len(input_shape) super(KMaxPooling, self).build(input_shape) @@ -230,7 +232,9 @@ def compute_output_shape(self, input_shape): output_shape[self.axis] = self.k return tuple(output_shape) - def get_config(self, ): - config = {'k': self.k, 'axis': self.axis} + def get_config( + self, + ): + config = {"k": self.k, "axis": self.axis} base_config = super(KMaxPooling, self).get_config() return dict(list(base_config.items()) + list(config.items())) diff --git a/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/__init__.py b/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/__init__.py index 7f561f964c..6651f47da3 100644 --- a/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/__init__.py +++ b/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/__init__.py @@ -13,9 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from .ccpm import CCPM from .deepfm import DeepFM from .fm import FM from .nfm import NFM -from .ccpm import CCPM __all__ = ["DeepFM", "FM", "NFM", "CCPM"] diff --git a/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/base_tf_model.py b/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/base_tf_model.py index 118f876b7a..1f9845f293 100644 --- a/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/base_tf_model.py +++ b/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/base_tf_model.py @@ -22,8 +22,7 @@ from submarine.ml.abstract_model import AbstractModel from submarine.ml.tensorflow.parameters import default_parameters from submarine.ml.tensorflow.registries import input_fn_registry -from submarine.utils.env import (get_from_dicts, get_from_json, - get_from_registry) +from submarine.utils.env import get_from_dicts, get_from_json, get_from_registry from submarine.utils.tf_utils import get_tf_config logger = logging.getLogger(__name__) @@ -31,20 +30,21 @@ # pylint: disable=W0221 class BaseTFModel(AbstractModel, ABC): - def __init__(self, model_params=None, json_path=None): super().__init__() self.model_params = get_from_dicts(model_params, default_parameters) self.model_params = get_from_json(json_path, self.model_params) self._sanity_checks() logging.info("Model parameters : %s", self.model_params) - self.input_type = self.model_params['input']['type'] - self.model_dir = self.model_params['output']['save_model_dir'] + self.input_type = self.model_params["input"]["type"] + self.model_dir = self.model_params["output"]["save_model_dir"] self.config = get_tf_config(self.model_params) - self.model = tf.estimator.Estimator(model_fn=self.model_fn, - model_dir=self.model_dir, - params=self.model_params, - config=self.config) + self.model = tf.estimator.Estimator( + model_fn=self.model_fn, + model_dir=self.model_dir, + params=self.model_params, + config=self.config, + ) def train(self, train_input_fn=None, eval_input_fn=None, **kwargs): """ @@ -54,20 +54,17 @@ def train(self, train_input_fn=None, eval_input_fn=None, **kwargs): :return: None """ if train_input_fn is None: - train_input_fn = get_from_registry( - self.input_type, input_fn_registry)( - filepath=self.model_params['input']['train_data'], - **self.model_params['training']) + train_input_fn = get_from_registry(self.input_type, input_fn_registry)( + filepath=self.model_params["input"]["train_data"], **self.model_params["training"] + ) if eval_input_fn is None: - eval_input_fn = get_from_registry( - self.input_type, input_fn_registry)( - filepath=self.model_params['input']['valid_data'], - **self.model_params['training']) + eval_input_fn = get_from_registry(self.input_type, input_fn_registry)( + filepath=self.model_params["input"]["valid_data"], **self.model_params["training"] + ) train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn) eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn) - tf.estimator.train_and_evaluate(self.model, train_spec, eval_spec, - **kwargs) + tf.estimator.train_and_evaluate(self.model, train_spec, eval_spec, **kwargs) def evaluate(self, eval_input_fn=None, **kwargs): """ @@ -78,10 +75,9 @@ def evaluate(self, eval_input_fn=None, **kwargs): global step for which this evaluation was performed """ if eval_input_fn is None: - eval_input_fn = get_from_registry( - self.input_type, input_fn_registry)( - filepath=self.model_params['input']['valid_data'], - **self.model_params['training']) + eval_input_fn = get_from_registry(self.input_type, input_fn_registry)( + 
filepath=self.model_params["input"]["valid_data"], **self.model_params["training"] + ) return self.model.evaluate(input_fn=eval_input_fn, **kwargs) @@ -93,20 +89,16 @@ def predict(self, predict_input_fn=None, **kwargs): :return: Evaluated values of predictions tensors. """ if predict_input_fn is None: - predict_input_fn = get_from_registry( - self.input_type, input_fn_registry)( - filepath=self.model_params['input']['test_data'], - **self.model_params['training']) + predict_input_fn = get_from_registry(self.input_type, input_fn_registry)( + filepath=self.model_params["input"]["test_data"], **self.model_params["training"] + ) return self.model.predict(input_fn=predict_input_fn, **kwargs) def _sanity_checks(self): - assert 'input' in self.model_params, ( - 'Does not define any input parameters') - assert 'type' in self.model_params['input'], ( - 'Does not define any input type') - assert 'output' in self.model_params, ( - 'Does not define any output parameters') + assert "input" in self.model_params, "Does not define any input parameters" + assert "type" in self.model_params["input"], "Does not define any input type" + assert "output" in self.model_params, "Does not define any output parameters" def model_fn(self, features, labels, mode, params): seed = params["training"]["seed"] diff --git a/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/ccpm.py b/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/ccpm.py index de41adf8d7..18f55797a1 100644 --- a/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/ccpm.py +++ b/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/ccpm.py @@ -14,10 +14,15 @@ # limitations under the License. import logging + import tensorflow as tf -from submarine.ml.tensorflow.layers.core import (dnn_layer, embedding_layer, linear_layer, - KMaxPooling) +from submarine.ml.tensorflow.layers.core import ( + KMaxPooling, + dnn_layer, + embedding_layer, + linear_layer, +) from submarine.ml.tensorflow.model.base_tf_model import BaseTFModel from submarine.utils.tf_utils import get_estimator_spec @@ -28,21 +33,19 @@ class CCPM(BaseTFModel): def model_fn(self, features, labels, mode, params): super().model_fn(features, labels, mode, params) - if len(params['training']['conv_kernel_width']) != len(params['training']['conv_filters']): - raise ValueError( - "conv_kernel_width must have same element with conv_filters") + if len(params["training"]["conv_kernel_width"]) != len(params["training"]["conv_filters"]): + raise ValueError("conv_kernel_width must have same element with conv_filters") - linear_logit = linear_layer(features, **params['training']) - embedding_outputs = embedding_layer(features, **params['training']) - conv_filters = params['training']['conv_filters'] - conv_kernel_width = params['training']['conv_kernel_width'] + linear_logit = linear_layer(features, **params["training"]) + embedding_outputs = embedding_layer(features, **params["training"]) + conv_filters = params["training"]["conv_filters"] + conv_kernel_width = params["training"]["conv_kernel_width"] - n = params['training']['embedding_size'] + n = params["training"]["embedding_size"] conv_filters_len = len(conv_filters) conv_input = tf.concat(embedding_outputs, axis=1) - pooling_result = tf.keras.layers.Lambda( - lambda x: tf.expand_dims(x, axis=3))(conv_input) + pooling_result = tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=3))(conv_input) for i in range(1, conv_filters_len + 1): filters = conv_filters[i - 1] @@ -50,15 +53,19 @@ def model_fn(self, features, labels, mode, params): p 
= pow(i / conv_filters_len, conv_filters_len - i) k = max(1, int((1 - p) * n)) if i < conv_filters_len else 3 - conv_result = tf.keras.layers.Conv2D(filters=filters, kernel_size=(width, 1), - strides=(1, 1), padding='same', - activation='tanh', use_bias=True, )(pooling_result) + conv_result = tf.keras.layers.Conv2D( + filters=filters, + kernel_size=(width, 1), + strides=(1, 1), + padding="same", + activation="tanh", + use_bias=True, + )(pooling_result) - pooling_result = KMaxPooling( - k=min(k, int(conv_result.shape[1])), axis=1)(conv_result) + pooling_result = KMaxPooling(k=min(k, int(conv_result.shape[1])), axis=1)(conv_result) flatten_result = tf.keras.layers.Flatten()(pooling_result) - deep_logit = dnn_layer(flatten_result, mode, **params['training']) + deep_logit = dnn_layer(flatten_result, mode, **params["training"]) with tf.variable_scope("CCPM_out"): logit = linear_logit + deep_logit diff --git a/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/deepfm.py b/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/deepfm.py index 844b9a6052..64fb001be0 100644 --- a/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/deepfm.py +++ b/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/deepfm.py @@ -28,8 +28,7 @@ import tensorflow as tf -from submarine.ml.tensorflow.layers.core import (dnn_layer, embedding_layer, - fm_layer, linear_layer) +from submarine.ml.tensorflow.layers.core import dnn_layer, embedding_layer, fm_layer, linear_layer from submarine.ml.tensorflow.model.base_tf_model import BaseTFModel from submarine.utils.tf_utils import get_estimator_spec @@ -37,20 +36,18 @@ class DeepFM(BaseTFModel): - def model_fn(self, features, labels, mode, params): super().model_fn(features, labels, mode, params) - linear_logit = linear_layer(features, **params['training']) + linear_logit = linear_layer(features, **params["training"]) - embedding_outputs = embedding_layer(features, **params['training']) - fm_logit = fm_layer(embedding_outputs, **params['training']) + embedding_outputs = embedding_layer(features, **params["training"]) + fm_logit = fm_layer(embedding_outputs, **params["training"]) - field_size = params['training']['field_size'] - embedding_size = params['training']['embedding_size'] - deep_inputs = tf.reshape(embedding_outputs, - shape=[-1, field_size * embedding_size]) - deep_logit = dnn_layer(deep_inputs, mode, **params['training']) + field_size = params["training"]["field_size"] + embedding_size = params["training"]["embedding_size"] + deep_inputs = tf.reshape(embedding_outputs, shape=[-1, field_size * embedding_size]) + deep_logit = dnn_layer(deep_inputs, mode, **params["training"]) with tf.variable_scope("DeepFM_out"): logit = linear_logit + fm_logit + deep_logit diff --git a/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/fm.py b/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/fm.py index 74b7e8dc36..9716d171d8 100644 --- a/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/fm.py +++ b/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/fm.py @@ -24,8 +24,7 @@ import tensorflow as tf -from submarine.ml.tensorflow.layers.core import (embedding_layer, fm_layer, - linear_layer) +from submarine.ml.tensorflow.layers.core import embedding_layer, fm_layer, linear_layer from submarine.ml.tensorflow.model.base_tf_model import BaseTFModel from submarine.utils.tf_utils import get_estimator_spec @@ -33,13 +32,12 @@ class FM(BaseTFModel): - def model_fn(self, features, labels, mode, params): super().model_fn(features, labels, mode, params) - 
linear_logit = linear_layer(features, **params['training']) - embedding_outputs = embedding_layer(features, **params['training']) - fm_logit = fm_layer(embedding_outputs, **params['training']) + linear_logit = linear_layer(features, **params["training"]) + embedding_outputs = embedding_layer(features, **params["training"]) + fm_logit = fm_layer(embedding_outputs, **params["training"]) with tf.variable_scope("FM_out"): logit = linear_logit + fm_logit diff --git a/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/nfm.py b/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/nfm.py index 2ae4d7a619..e3d838dc6c 100644 --- a/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/nfm.py +++ b/submarine-sdk/pysubmarine/submarine/ml/tensorflow/model/nfm.py @@ -25,8 +25,12 @@ import tensorflow as tf -from submarine.ml.tensorflow.layers.core import (bilinear_layer, dnn_layer, - embedding_layer, linear_layer) +from submarine.ml.tensorflow.layers.core import ( + bilinear_layer, + dnn_layer, + embedding_layer, + linear_layer, +) from submarine.ml.tensorflow.model.base_tf_model import BaseTFModel from submarine.utils.tf_utils import get_estimator_spec @@ -34,14 +38,13 @@ class NFM(BaseTFModel): - def model_fn(self, features, labels, mode, params): super().model_fn(features, labels, mode, params) - linear_logit = linear_layer(features, **params['training']) - embedding_outputs = embedding_layer(features, **params['training']) - deep_inputs = bilinear_layer(embedding_outputs, **params['training']) - deep_logit = dnn_layer(deep_inputs, mode, **params['training']) + linear_logit = linear_layer(features, **params["training"]) + embedding_outputs = embedding_layer(features, **params["training"]) + deep_inputs = bilinear_layer(embedding_outputs, **params["training"]) + deep_logit = dnn_layer(deep_inputs, mode, **params["training"]) with tf.variable_scope("NFM_out"): logit = linear_logit + deep_logit diff --git a/submarine-sdk/pysubmarine/submarine/ml/tensorflow/optimizer.py b/submarine-sdk/pysubmarine/submarine/ml/tensorflow/optimizer.py index 89bc28a654..dd61d6e1d2 100644 --- a/submarine-sdk/pysubmarine/submarine/ml/tensorflow/optimizer.py +++ b/submarine-sdk/pysubmarine/submarine/ml/tensorflow/optimizer.py @@ -22,26 +22,24 @@ class OptimizerKey(object): """Optimizer key strings.""" - ADAM = 'adam' - ADAGRAD = 'adagrad' - MOMENTUM = 'momentum' - FTRL = 'ftrl' + + ADAM = "adam" + ADAGRAD = "adagrad" + MOMENTUM = "momentum" + FTRL = "ftrl" def get_optimizer(optimizer_key, learning_rate): optimizer_key = optimizer_key.lower() if optimizer_key == OptimizerKey.ADAM: - op = tf.train.AdamOptimizer(learning_rate=learning_rate, - beta1=0.9, - beta2=0.999, - epsilon=1e-8) + op = tf.train.AdamOptimizer( + learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8 + ) elif optimizer_key == OptimizerKey.ADAGRAD: - op = tf.train.AdagradOptimizer(learning_rate=learning_rate, - initial_accumulator_value=1e-8) + op = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=1e-8) elif optimizer_key == OptimizerKey.MOMENTUM: - op = tf.train.MomentumOptimizer(learning_rate=learning_rate, - momentum=0.95) + op = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.95) elif optimizer_key == OptimizerKey.FTRL: op = tf.train.FtrlOptimizer(learning_rate) else: diff --git a/submarine-sdk/pysubmarine/submarine/ml/tensorflow/parameters.py b/submarine-sdk/pysubmarine/submarine/ml/tensorflow/parameters.py index 0815a2d9e1..261f62b879 100644 --- 
a/submarine-sdk/pysubmarine/submarine/ml/tensorflow/parameters.py +++ b/submarine-sdk/pysubmarine/submarine/ml/tensorflow/parameters.py @@ -14,10 +14,7 @@ # limitations under the License. default_parameters = { - "output": { - "save_model_dir": "./experiment", - "metric": "auc" - }, + "output": {"save_model_dir": "./experiment", "metric": "auc"}, "training": { "batch_size": 512, "field_size": 39, @@ -37,11 +34,7 @@ "num_threads": 4, "num_gpu": 0, "seed": 77, - "mode": "local" + "mode": "local", }, - "resource": { - "num_cpu": 4, - "num_gpu": 0, - "num_thread": 0 # tf determines automatically - } + "resource": {"num_cpu": 4, "num_gpu": 0, "num_thread": 0}, # tf determines automatically } diff --git a/submarine-sdk/pysubmarine/submarine/models/client.py b/submarine-sdk/pysubmarine/submarine/models/client.py index d258d48cb1..a954bace2a 100644 --- a/submarine-sdk/pysubmarine/submarine/models/client.py +++ b/submarine-sdk/pysubmarine/submarine/models/client.py @@ -21,21 +21,23 @@ from mlflow.exceptions import MlflowException from mlflow.tracking import MlflowClient -from .constant import (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, - MLFLOW_S3_ENDPOINT_URL, MLFLOW_TRACKING_URI) +from .constant import ( + AWS_ACCESS_KEY_ID, + AWS_SECRET_ACCESS_KEY, + MLFLOW_S3_ENDPOINT_URL, + MLFLOW_TRACKING_URI, +) from .utils import exist_ps, get_job_id, get_worker_index -class ModelsClient(): - +class ModelsClient: def __init__(self, tracking_uri=None, registry_uri=None): """ Set up mlflow server connection, including: s3 endpoint, aws, tracking server """ # if setting url in environment variable, # there is no need to set it by MlflowClient() or mlflow.set_tracking_uri() again - os.environ[ - "MLFLOW_S3_ENDPOINT_URL"] = registry_uri or MLFLOW_S3_ENDPOINT_URL + os.environ["MLFLOW_S3_ENDPOINT_URL"] = registry_uri or MLFLOW_S3_ENDPOINT_URL os.environ["AWS_ACCESS_KEY_ID"] = AWS_ACCESS_KEY_ID os.environ["AWS_SECRET_ACCESS_KEY"] = AWS_SECRET_ACCESS_KEY os.environ["MLFLOW_TRACKING_URI"] = tracking_uri or MLFLOW_TRACKING_URI @@ -44,7 +46,7 @@ def __init__(self, tracking_uri=None, registry_uri=None): "pytorch": mlflow.pytorch.log_model, "sklearn": mlflow.sklearn.log_model, "tensorflow": mlflow.tensorflow.log_model, - "keras": mlflow.keras.log_model + "keras": mlflow.keras.log_model, } def start(self): @@ -83,11 +85,7 @@ def update_model(self, name, new_name): def delete_model(self, name, version): self.client.delete_model_version(name=name, version=version) - def save_model(self, - model_type, - model, - artifact_path, - registered_model_name=None): + def save_model(self, model_type, model, artifact_path, registered_model_name=None): run_name = get_worker_index() if exist_ps(): # TODO for Tensorflow ParameterServer strategy @@ -95,9 +93,8 @@ def save_model(self, elif run_name == "worker-0": if model_type in self.type_to_log_model: self.type_to_log_model[model_type]( - model, - artifact_path, - registered_model_name=registered_model_name) + model, artifact_path, registered_model_name=registered_model_name + ) else: raise MlflowException("No valid type of model has been matched") @@ -116,8 +113,7 @@ def _get_or_create_experiment(self, experiment_name): else: while experiment is None: time.sleep(1) - experiment = mlflow.get_experiment_by_name( - experiment_name) + experiment = mlflow.get_experiment_by_name(experiment_name) return experiment.experiment_id # if found except MlflowException: experiment = mlflow.create_experiment(name=experiment_name) diff --git a/submarine-sdk/pysubmarine/submarine/models/utils.py 
b/submarine-sdk/pysubmarine/submarine/models/utils.py index d8281a0a70..88c2e00166 100644 --- a/submarine-sdk/pysubmarine/submarine/models/utils.py +++ b/submarine-sdk/pysubmarine/submarine/models/utils.py @@ -58,13 +58,13 @@ def get_worker_index(): task_config = tf_config.get(_TASK) task_type = task_config.get(_TYPE) task_index = task_config.get(_INDEX) - worker_index = task_type + '-' + str(task_index) + worker_index = task_type + "-" + str(task_index) elif env.get_env(_CLUSTER_SPEC) is not None: cluster_spec = json.loads(os.environ.get(_CLUSTER_SPEC)) task_config = cluster_spec.get(_TASK) task_type = task_config.get(_JOB_NAME) task_index = task_config.get(_INDEX) - worker_index = task_type + '-' + str(task_index) + worker_index = task_type + "-" + str(task_index) # Get PyTorch worker index elif env.get_env(_RANK) is not None: rank = env.get_env(_RANK) diff --git a/submarine-sdk/pysubmarine/submarine/store/database/db_types.py b/submarine-sdk/pysubmarine/submarine/store/database/db_types.py index f7883710f7..dd3f08190b 100644 --- a/submarine-sdk/pysubmarine/submarine/store/database/db_types.py +++ b/submarine-sdk/pysubmarine/submarine/store/database/db_types.py @@ -16,9 +16,9 @@ Set of SQLAlchemy database schemas supported in Submarine for tracking server backends. """ -POSTGRES = 'postgresql' -MYSQL = 'mysql' -SQLITE = 'sqlite' -MSSQL = 'mssql' +POSTGRES = "postgresql" +MYSQL = "mysql" +SQLITE = "sqlite" +MSSQL = "mssql" DATABASE_ENGINES = [POSTGRES, MYSQL, SQLITE, MSSQL] diff --git a/submarine-sdk/pysubmarine/submarine/store/database/models.py b/submarine-sdk/pysubmarine/submarine/store/database/models.py index 8c8db56cb7..8b9752ed57 100644 --- a/submarine-sdk/pysubmarine/submarine/store/database/models.py +++ b/submarine-sdk/pysubmarine/submarine/store/database/models.py @@ -16,8 +16,7 @@ import time import sqlalchemy as sa -from sqlalchemy import (BigInteger, Boolean, Column, PrimaryKeyConstraint, - String) +from sqlalchemy import BigInteger, Boolean, Column, PrimaryKeyConstraint, String from sqlalchemy.ext.declarative import declarative_base from submarine.entities import Metric, Param @@ -36,7 +35,7 @@ class SqlMetric(Base): - __tablename__ = 'metrics' + __tablename__ = "metrics" id = Column(String(64)) """ @@ -69,28 +68,27 @@ class SqlMetric(Base): True if the value is in fact NaN. """ - __table_args__ = (PrimaryKeyConstraint('id', - 'key', - 'timestamp', - 'worker_index', - name='metric_pk'),) + __table_args__ = ( + PrimaryKeyConstraint("id", "key", "timestamp", "worker_index", name="metric_pk"), + ) def __repr__(self): - return '<SqlMetric({}, {}, {}, {}, {})>'.format(self.key, self.value, - self.worker_index, - self.timestamp, - self.step) + return "<SqlMetric({}, {}, {}, {}, {})>".format( + self.key, self.value, self.worker_index, self.timestamp, self.step + ) def to_submarine_entity(self): """ Convert DB model to corresponding Submarine entity. :return: :py:class:`submarine.entities.Metric`. """ - return Metric(key=self.key, - value=self.value if not self.is_nan else float("nan"), - worker_index=self.worker_index, - timestamp=self.timestamp, - step=self.step) + return Metric( + key=self.key, + value=self.value if not self.is_nan else float("nan"), + worker_index=self.worker_index, + timestamp=self.timestamp, + step=self.step, + ) # +-----------------------+----------+-------+--------------+ @@ -103,7 +101,7 @@ def to_submarine_entity(self): class SqlParam(Base): - __tablename__ = 'params' + __tablename__ = "params" id = Column(String(64)) """ @@ -123,20 +121,14 @@ class SqlParam(Base): ``metrics`` table.
""" - __table_args__ = (PrimaryKeyConstraint('id', - 'key', - 'worker_index', - name='param_pk'),) + __table_args__ = (PrimaryKeyConstraint("id", "key", "worker_index", name="param_pk"),) def __repr__(self): - return ''.format(self.key, self.value, - self.worker_index) + return "".format(self.key, self.value, self.worker_index) def to_submarine_entity(self): """ Convert DB model to corresponding submarine entity. :return: :py:class:`submarine.entities.Param`. """ - return Param(key=self.key, - value=self.value, - worker_index=self.worker_index) + return Param(key=self.key, value=self.value, worker_index=self.worker_index) diff --git a/submarine-sdk/pysubmarine/submarine/store/sqlalchemy_store.py b/submarine-sdk/pysubmarine/submarine/store/sqlalchemy_store.py index 9771edf4f4..01adec5c89 100644 --- a/submarine-sdk/pysubmarine/submarine/store/sqlalchemy_store.py +++ b/submarine-sdk/pysubmarine/submarine/store/sqlalchemy_store.py @@ -137,27 +137,31 @@ def log_metric(self, job_id, metric): value = float(metric.value) with self.ManagedSessionMaker() as session: try: - self._get_or_create(model=SqlMetric, - id=job_id, - key=metric.key, - value=value, - worker_index=metric.worker_index, - timestamp=metric.timestamp, - step=metric.step, - session=session, - is_nan=is_nan) + self._get_or_create( + model=SqlMetric, + id=job_id, + key=metric.key, + value=value, + worker_index=metric.worker_index, + timestamp=metric.timestamp, + step=metric.step, + session=session, + is_nan=is_nan, + ) except sqlalchemy.exc.IntegrityError: session.rollback() def log_param(self, job_id, param): with self.ManagedSessionMaker() as session: try: - self._get_or_create(model=SqlParam, - id=job_id, - session=session, - key=param.key, - value=param.value, - worker_index=param.worker_index) + self._get_or_create( + model=SqlParam, + id=job_id, + session=session, + key=param.key, + value=param.value, + worker_index=param.worker_index, + ) session.commit() except sqlalchemy.exc.IntegrityError: session.rollback() diff --git a/submarine-sdk/pysubmarine/submarine/tracking/__init__.py b/submarine-sdk/pysubmarine/submarine/tracking/__init__.py index bb1303ac9e..24e3314b67 100644 --- a/submarine-sdk/pysubmarine/submarine/tracking/__init__.py +++ b/submarine-sdk/pysubmarine/submarine/tracking/__init__.py @@ -14,8 +14,12 @@ # limitations under the License. from submarine.tracking.client import SubmarineClient -from submarine.tracking.utils import (_JOB_ID_ENV_VAR, _TRACKING_URI_ENV_VAR, - get_tracking_uri, set_tracking_uri) +from submarine.tracking.utils import ( + _JOB_ID_ENV_VAR, + _TRACKING_URI_ENV_VAR, + get_tracking_uri, + set_tracking_uri, +) __all__ = [ "SubmarineClient", diff --git a/submarine-sdk/pysubmarine/submarine/tracking/client.py b/submarine-sdk/pysubmarine/submarine/tracking/client.py index 171688f5c2..b342b24582 100644 --- a/submarine-sdk/pysubmarine/submarine/tracking/client.py +++ b/submarine-sdk/pysubmarine/submarine/tracking/client.py @@ -34,13 +34,7 @@ def __init__(self, tracking_uri=None): self.tracking_uri = tracking_uri or utils.get_tracking_uri() self.store = utils.get_sqlalchemy_store(self.tracking_uri) - def log_metric(self, - job_id, - key, - value, - worker_index, - timestamp=None, - step=None): + def log_metric(self, job_id, key, value, worker_index, timestamp=None, step=None): """ Log a metric against the run ID. :param job_id: The job name to which the metric should be logged. 
diff --git a/submarine-sdk/pysubmarine/submarine/tracking/fluent.py b/submarine-sdk/pysubmarine/submarine/tracking/fluent.py index 65a733aa45..816569ea05 100644 --- a/submarine-sdk/pysubmarine/submarine/tracking/fluent.py +++ b/submarine-sdk/pysubmarine/submarine/tracking/fluent.py @@ -52,5 +52,6 @@ def log_metric(key, value, step=None): """ job_id = get_job_id() worker_index = get_worker_index() - SubmarineClient().log_metric(job_id, key, value, worker_index, - int(time.time() * 1000), step or 0) + SubmarineClient().log_metric( + job_id, key, value, worker_index, int(time.time() * 1000), step or 0 + ) diff --git a/submarine-sdk/pysubmarine/submarine/tracking/utils.py b/submarine-sdk/pysubmarine/submarine/tracking/utils.py index 95a3a285bd..af169ecac5 100644 --- a/submarine-sdk/pysubmarine/submarine/tracking/utils.py +++ b/submarine-sdk/pysubmarine/submarine/tracking/utils.py @@ -101,13 +101,13 @@ def get_worker_index(): task_config = tf_config.get(_TASK) task_type = task_config.get(_TYPE) task_index = task_config.get(_INDEX) - worker_index = task_type + '-' + str(task_index) + worker_index = task_type + "-" + str(task_index) elif env.get_env(_CLUSTER_SPEC) is not None: cluster_spec = json.loads(os.environ.get(_CLUSTER_SPEC)) task_config = cluster_spec.get(_TASK) task_type = task_config.get(_JOB_NAME) task_index = task_config.get(_INDEX) - worker_index = task_type + '-' + str(task_index) + worker_index = task_type + "-" + str(task_index) # Get PyTorch worker index elif env.get_env(_RANK) is not None: rank = env.get_env(_RANK) diff --git a/submarine-sdk/pysubmarine/submarine/utils/__init__.py b/submarine-sdk/pysubmarine/submarine/utils/__init__.py index 8a94b605a7..6f2b95c674 100644 --- a/submarine-sdk/pysubmarine/submarine/utils/__init__.py +++ b/submarine-sdk/pysubmarine/submarine/utils/__init__.py @@ -24,15 +24,14 @@ def extract_db_type_from_uri(db_uri): supported. If a driver is specified, confirm it passes a plausible regex. """ scheme = urllib.parse.urlparse(db_uri).scheme - scheme_plus_count = scheme.count('+') + scheme_plus_count = scheme.count("+") if scheme_plus_count == 0: db_type = scheme elif scheme_plus_count == 1: - db_type, _ = scheme.split('+') + db_type, _ = scheme.split("+") else: - error_msg = "Invalid database URI: '%s'. %s" % (db_uri, - 'INVALID_DB_URI_MSG') + error_msg = "Invalid database URI: '%s'. 
%s" % (db_uri, "INVALID_DB_URI_MSG") raise SubmarineException(error_msg) return db_type diff --git a/submarine-sdk/pysubmarine/submarine/utils/env.py b/submarine-sdk/pysubmarine/submarine/utils/env.py index f3396507c5..3797efccdf 100644 --- a/submarine-sdk/pysubmarine/submarine/utils/env.py +++ b/submarine-sdk/pysubmarine/submarine/utils/env.py @@ -64,8 +64,7 @@ def get_from_dicts(params, defaultParams): dct = copy.deepcopy(defaultParams) for k, _ in params.items(): - if (k in dct and isinstance(dct[k], dict) and - isinstance(defaultParams[k], Mapping)): + if k in dct and isinstance(dct[k], dict) and isinstance(defaultParams[k], Mapping): dct[k] = get_from_dicts(params[k], dct[k]) else: dct[k] = params[k] @@ -73,10 +72,9 @@ def get_from_dicts(params, defaultParams): def get_from_registry(key, registry): - if hasattr(key, 'lower'): + if hasattr(key, "lower"): key = key.lower() if key in registry: return registry[key] else: - raise ValueError('Key {} not supported, available options: {}'.format( - key, registry.keys())) + raise ValueError("Key {} not supported, available options: {}".format(key, registry.keys())) diff --git a/submarine-sdk/pysubmarine/submarine/utils/fileio.py b/submarine-sdk/pysubmarine/submarine/utils/fileio.py index d756757d02..53a1145ad2 100644 --- a/submarine-sdk/pysubmarine/submarine/utils/fileio.py +++ b/submarine-sdk/pysubmarine/submarine/utils/fileio.py @@ -22,8 +22,8 @@ def open_buffered_file_reader( - uri: str, - buffer_size: int = io.DEFAULT_BUFFER_SIZE) -> io.BufferedReader: + uri: str, buffer_size: int = io.DEFAULT_BUFFER_SIZE +) -> io.BufferedReader: try: input_file = open_input_file(uri) return io.BufferedReader(input_file, buffer_size=buffer_size) @@ -33,8 +33,8 @@ def open_buffered_file_reader( def open_buffered_stream_writer( - uri: str, - buffer_size: int = io.DEFAULT_BUFFER_SIZE) -> io.BufferedWriter: + uri: str, buffer_size: int = io.DEFAULT_BUFFER_SIZE +) -> io.BufferedWriter: try: output_stream = open_output_stream(uri) return io.BufferedWriter(output_stream, buffer_size=buffer_size) @@ -43,11 +43,8 @@ def open_buffered_stream_writer( raise e -def write_file(buffer: io.BytesIO, - uri: str, - buffer_size: int = io.DEFAULT_BUFFER_SIZE) -> None: - with open_buffered_stream_writer(uri, - buffer_size=buffer_size) as output_stream: +def write_file(buffer: io.BytesIO, uri: str, buffer_size: int = io.DEFAULT_BUFFER_SIZE) -> None: + with open_buffered_stream_writer(uri, buffer_size=buffer_size) as output_stream: output_stream.write(buffer.getbuffer()) @@ -63,13 +60,12 @@ def open_output_stream(uri: str): def file_info(uri: str) -> fs.FileInfo: filesystem, path = _parse_uri(uri) - info, = filesystem.get_file_info([path]) + (info,) = filesystem.get_file_info([path]) return info def _parse_uri(uri: str) -> Tuple[fs.FileSystem, str]: parsed = urlparse(uri) - uri = uri if parsed.scheme else str( - Path(parsed.path).expanduser().absolute()) + uri = uri if parsed.scheme else str(Path(parsed.path).expanduser().absolute()) filesystem, path = fs.FileSystem.from_uri(uri) return filesystem, path diff --git a/submarine-sdk/pysubmarine/submarine/utils/pytorch_utils.py b/submarine-sdk/pysubmarine/submarine/utils/pytorch_utils.py index ddbb7d4357..5add47f224 100644 --- a/submarine-sdk/pysubmarine/submarine/utils/pytorch_utils.py +++ b/submarine-sdk/pysubmarine/submarine/utils/pytorch_utils.py @@ -17,7 +17,7 @@ def get_device(params): - if params['resource']['num_gpus'] > 0: - return torch.device('cuda:0') + if params["resource"]["num_gpus"] > 0: + return 
torch.device("cuda:0") else: - return torch.device('cpu') + return torch.device("cpu") diff --git a/submarine-sdk/pysubmarine/submarine/utils/rest_utils.py b/submarine-sdk/pysubmarine/submarine/utils/rest_utils.py index 0437469fba..db71b13477 100644 --- a/submarine-sdk/pysubmarine/submarine/utils/rest_utils.py +++ b/submarine-sdk/pysubmarine/submarine/utils/rest_utils.py @@ -23,13 +23,7 @@ _logger = logging.getLogger(__name__) -def http_request(base_url, - endpoint, - method, - json_body, - timeout=60, - headers=None, - **kwargs): +def http_request(base_url, endpoint, method, json_body, timeout=60, headers=None, **kwargs): """ Perform requests. :param base_url: http request base url containing hostname and port. e.g. https://submarine:8088 @@ -41,24 +35,19 @@ def http_request(base_url, :return: """ method = method.upper() - assert method in [ - 'GET', 'HEAD', 'DELETE', 'POST', 'PUT', 'PATCH', 'OPTIONS' - ] + assert method in ["GET", "HEAD", "DELETE", "POST", "PUT", "PATCH", "OPTIONS"] headers = headers or {} - if 'Content-Type' not in headers: - headers['Content-Type'] = 'application/json' + if "Content-Type" not in headers: + headers["Content-Type"] = "application/json" url = base_url + endpoint - response = requests.request(url=url, - method=method, - json=json_body, - headers=headers, - timeout=timeout, - **kwargs) + response = requests.request( + url=url, method=method, json=json_body, headers=headers, timeout=timeout, **kwargs + ) verify_rest_response(response, endpoint) response = json.loads(response.text) - result = response['result'] + result = response["result"] return result @@ -76,8 +65,9 @@ def verify_rest_response(response, endpoint): if _can_parse_as_json(response.text): raise RestException(json.loads(response.text)) else: - base_msg = "API request to endpoint %s failed with error code " \ - "%s != 200" % (endpoint, response.status_code) - raise SubmarineException("%s. Response body: '%s'" % - (base_msg, response.text)) + base_msg = "API request to endpoint %s failed with error code %s != 200" % ( + endpoint, + response.status_code, + ) + raise SubmarineException("%s. Response body: '%s'" % (base_msg, response.text)) return response diff --git a/submarine-sdk/pysubmarine/submarine/utils/tf_utils.py b/submarine-sdk/pysubmarine/submarine/utils/tf_utils.py index 96da0ce1f7..4b79e065bb 100644 --- a/submarine-sdk/pysubmarine/submarine/utils/tf_utils.py +++ b/submarine-sdk/pysubmarine/submarine/utils/tf_utils.py @@ -24,25 +24,28 @@ def _get_session_config_from_env_var(params): """Returns a tf.ConfigProto instance with appropriate device_filters set.""" - tf_config = json.loads(os.environ.get('TF_CONFIG', '{}')) - - if tf_config and 'task' in tf_config and 'type' in tf_config['task'] \ - and 'index' in tf_config['task']: + tf_config = json.loads(os.environ.get("TF_CONFIG", "{}")) + + if ( + tf_config + and "task" in tf_config + and "type" in tf_config["task"] + and "index" in tf_config["task"] + ): # Master should only communicate with itself and ps. - if tf_config['task']['type'] == 'master': + if tf_config["task"]["type"] == "master": return tf.ConfigProto( - device_filters=['/job:ps', '/job:master'], - intra_op_parallelism_threads=params["resource"]['num_thread'], - inter_op_parallelism_threads=params["resource"]['num_thread']) + device_filters=["/job:ps", "/job:master"], + intra_op_parallelism_threads=params["resource"]["num_thread"], + inter_op_parallelism_threads=params["resource"]["num_thread"], + ) # Worker should only communicate with itself and ps. 
- elif tf_config['task']['type'] == 'worker': + elif tf_config["task"]["type"] == "worker": return tf.ConfigProto( # gpu_options=gpu_options, - device_filters=[ - '/job:ps', - '/job:worker/task:%d' % tf_config['task']['index'] - ], - intra_op_parallelism_threads=params["resource"]['num_thread'], - inter_op_parallelism_threads=params["resource"]['num_thread']) + device_filters=["/job:ps", "/job:worker/task:%d" % tf_config["task"]["index"]], + intra_op_parallelism_threads=params["resource"]["num_thread"], + inter_op_parallelism_threads=params["resource"]["num_thread"], + ) return None @@ -55,28 +58,30 @@ def get_tf_config(params): :type params: Dictionary :return: The class specifies the configurations for an Estimator run """ - if params["training"]['mode'] == 'local': # local mode + if params["training"]["mode"] == "local": # local mode tf_config = tf.estimator.RunConfig().replace( session_config=tf.ConfigProto( device_count={ - 'GPU': params["resource"]['num_gpu'], - 'CPU': params["resource"]['num_cpu'] + "GPU": params["resource"]["num_gpu"], + "CPU": params["resource"]["num_cpu"], }, - intra_op_parallelism_threads=params["resource"]['num_thread'], - inter_op_parallelism_threads=params["resource"]['num_thread']), - log_step_count_steps=params["training"]['log_steps'], - save_summary_steps=params["training"]['log_steps']) - - elif params["training"]['mode'] == 'distributed': + intra_op_parallelism_threads=params["resource"]["num_thread"], + inter_op_parallelism_threads=params["resource"]["num_thread"], + ), + log_step_count_steps=params["training"]["log_steps"], + save_summary_steps=params["training"]["log_steps"], + ) + + elif params["training"]["mode"] == "distributed": tf_config = tf.estimator.RunConfig( experimental_distribute=tf.contrib.distribute.DistributeConfig( - train_distribute=tf.contrib.distribute.ParameterServerStrategy( - ), - eval_distribute=tf.contrib.distribute.ParameterServerStrategy( - )), + train_distribute=tf.contrib.distribute.ParameterServerStrategy(), + eval_distribute=tf.contrib.distribute.ParameterServerStrategy(), + ), session_config=_get_session_config_from_env_var(params), - save_summary_steps=params["training"]['log_steps'], - log_step_count_steps=params["training"]['log_steps']) + save_summary_steps=params["training"]["log_steps"], + log_step_count_steps=params["training"]["log_steps"], + ) else: raise ValueError("mode should be local or distributed") return tf_config @@ -94,37 +99,37 @@ def get_estimator_spec(logit, labels, mode, params): """ learning_rate = params["training"]["learning_rate"] optimizer = params["training"]["optimizer"] - metric = params['output']['metric'] + metric = params["output"]["metric"] output = tf.sigmoid(logit) predictions = {"probabilities": output} export_outputs = { + # https://github.com/psf/black/issues/2434 + # fmt: off tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: tf.estimator.export.PredictOutput(predictions) + # fmt: on } # Provide an estimator spec for `ModeKeys.PREDICT` if mode == tf.estimator.ModeKeys.PREDICT: - return tf.estimator.EstimatorSpec(mode=mode, - predictions=predictions, - export_outputs=export_outputs) + return tf.estimator.EstimatorSpec( + mode=mode, predictions=predictions, export_outputs=export_outputs + ) with tf.name_scope("Loss"): - loss = tf.reduce_mean( - tf.nn.sigmoid_cross_entropy_with_logits(logits=logit, - labels=labels)) + loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logit, labels=labels)) # Provide an estimator spec for `ModeKeys.EVAL` 
eval_metric_ops = {} - if metric == 'auc': - eval_metric_ops['auc'] = tf.metrics.auc(labels, output) + if metric == "auc": + eval_metric_ops["auc"] = tf.metrics.auc(labels, output) else: raise TypeError("Invalid metric :", metric) if mode == tf.estimator.ModeKeys.EVAL: - return tf.estimator.EstimatorSpec(mode=mode, - predictions=predictions, - loss=loss, - eval_metric_ops=eval_metric_ops) + return tf.estimator.EstimatorSpec( + mode=mode, predictions=predictions, loss=loss, eval_metric_ops=eval_metric_ops + ) with tf.name_scope("Train"): op = get_optimizer(optimizer, learning_rate) @@ -132,7 +137,6 @@ def get_estimator_spec(logit, labels, mode, params): # Provide an estimator spec for `ModeKeys.TRAIN` modes if mode == tf.estimator.ModeKeys.TRAIN: - return tf.estimator.EstimatorSpec(mode=mode, - predictions=predictions, - loss=loss, - train_op=train_op) + return tf.estimator.EstimatorSpec( + mode=mode, predictions=predictions, loss=loss, train_op=train_op + ) diff --git a/submarine-sdk/pysubmarine/submarine/utils/validation.py b/submarine-sdk/pysubmarine/submarine/utils/validation.py index 304adac51c..37e5766443 100644 --- a/submarine-sdk/pysubmarine/submarine/utils/validation.py +++ b/submarine-sdk/pysubmarine/submarine/utils/validation.py @@ -29,53 +29,55 @@ _BAD_CHARACTERS_MESSAGE = ( "Names may only contain alphanumerics, underscores (_), dashes (-), periods (.)," - " spaces ( ), and slashes (/).") + " spaces ( ), and slashes (/)." +) -_UNSUPPORTED_DB_TYPE_MSG = "Supported database engines are {%s}" % ', '.join( - DATABASE_ENGINES) +_UNSUPPORTED_DB_TYPE_MSG = "Supported database engines are {%s}" % ", ".join(DATABASE_ENGINES) def bad_path_message(name): return ( "Names may be treated as files in certain cases, and must not resolve to other names" " when treated as such. This name would resolve to '%s'" - ) % posixpath.normpath(name) + % posixpath.normpath(name) + ) def path_not_unique(name): norm = posixpath.normpath(name) - return norm != name or norm == '.' or norm.startswith( - '..') or norm.startswith('/') + return norm != name or norm == "." or norm.startswith("..") or norm.startswith("/") def _validate_param_name(name): """Check that `name` is a valid parameter name and raise an exception if it isn't.""" if not _VALID_PARAM_AND_METRIC_NAMES.match(name): raise SubmarineException( - "Invalid parameter name: '%s'. %s" % - (name, _BAD_CHARACTERS_MESSAGE),) + "Invalid parameter name: '%s'. %s" % (name, _BAD_CHARACTERS_MESSAGE), + ) if path_not_unique(name): - raise SubmarineException("Invalid parameter name: '%s'. %s" % - (name, bad_path_message(name))) + raise SubmarineException( + "Invalid parameter name: '%s'. %s" % (name, bad_path_message(name)) + ) def _validate_metric_name(name): """Check that `name` is a valid metric name and raise an exception if it isn't.""" if not _VALID_PARAM_AND_METRIC_NAMES.match(name): raise SubmarineException( - "Invalid metric name: '%s'. %s" % (name, _BAD_CHARACTERS_MESSAGE),) + "Invalid metric name: '%s'. %s" % (name, _BAD_CHARACTERS_MESSAGE), + ) if path_not_unique(name): - raise SubmarineException("Invalid metric name: '%s'. %s" % - (name, bad_path_message(name))) + raise SubmarineException("Invalid metric name: '%s'. 
%s" % (name, bad_path_message(name))) def _validate_length_limit(entity_name, limit, value): if len(value) > limit: raise SubmarineException( - "%s '%s' had length %s, which exceeded length limit of %s" % - (entity_name, value[:250], len(value), limit)) + "%s '%s' had length %s, which exceeded length limit of %s" + % (entity_name, value[:250], len(value), limit) + ) def validate_metric(key, value, timestamp, step): @@ -87,17 +89,20 @@ def validate_metric(key, value, timestamp, step): if not isinstance(value, numbers.Number): raise SubmarineException( "Got invalid value %s for metric '%s' (timestamp=%s). Please specify value as a valid " - "double (64-bit floating point)" % (value, key, timestamp),) + "double (64-bit floating point)" % (value, key, timestamp), + ) if not isinstance(timestamp, numbers.Number) or timestamp < 0: raise SubmarineException( "Got invalid timestamp %s for metric '%s' (value=%s). Timestamp must be a nonnegative " - "long (64-bit integer) " % (timestamp, key, value),) + "long (64-bit integer) " % (timestamp, key, value), + ) if not isinstance(step, numbers.Number): raise SubmarineException( "Got invalid step %s for metric '%s' (value=%s). Step must be a valid long " - "(64-bit integer)." % (step, key, value),) + "(64-bit integer)." % (step, key, value), + ) def validate_param(key, value): @@ -113,6 +118,5 @@ def validate_param(key, value): def _validate_db_type_string(db_type): """validates db_type parsed from DB URI is supported""" if db_type not in DATABASE_ENGINES: - error_msg = "Invalid database engine: '%s'. '%s'" % ( - db_type, _UNSUPPORTED_DB_TYPE_MSG) + error_msg = "Invalid database engine: '%s'. '%s'" % (db_type, _UNSUPPORTED_DB_TYPE_MSG) raise SubmarineException(error_msg) diff --git a/submarine-sdk/pysubmarine/tests/experiment/test_experiment_client.py b/submarine-sdk/pysubmarine/tests/experiment/test_experiment_client.py index eee9410a3f..f8603886ed 100644 --- a/submarine-sdk/pysubmarine/tests/experiment/test_experiment_client.py +++ b/submarine-sdk/pysubmarine/tests/experiment/test_experiment_client.py @@ -25,33 +25,30 @@ @pytest.mark.e2e def test_experiment_e2e(): - submarine_client = submarine.ExperimentClient(host='http://localhost:8080') - environment = EnvironmentSpec( - image='apache/submarine:tf-dist-mnist-test-1.0') + submarine_client = submarine.ExperimentClient(host="http://localhost:8080") + environment = EnvironmentSpec(image="apache/submarine:tf-dist-mnist-test-1.0") experiment_meta = ExperimentMeta( - name='mnist-dist', - namespace='default', - framework='Tensorflow', - cmd='python /var/tf_dist_mnist/dist_mnist.py --train_steps=100', - env_vars={'ENV1': 'ENV1'}) - - worker_spec = ExperimentTaskSpec(resources='cpu=1,memory=1024M', replicas=1) - ps_spec = ExperimentTaskSpec(resources='cpu=1,memory=1024M', replicas=1) - - code_spec = CodeSpec(sync_mode='git', - url='https://github.com/apache/submarine.git') - - experiment_spec = ExperimentSpec(meta=experiment_meta, - environment=environment, - code=code_spec, - spec={ - 'Ps': ps_spec, - 'Worker': worker_spec - }) - - experiment = submarine_client.create_experiment( - experiment_spec=experiment_spec) - id = experiment['experimentId'] + name="mnist-dist", + namespace="default", + framework="Tensorflow", + cmd="python /var/tf_dist_mnist/dist_mnist.py --train_steps=100", + env_vars={"ENV1": "ENV1"}, + ) + + worker_spec = ExperimentTaskSpec(resources="cpu=1,memory=1024M", replicas=1) + ps_spec = ExperimentTaskSpec(resources="cpu=1,memory=1024M", replicas=1) + + code_spec = CodeSpec(sync_mode="git", 
url="https://github.com/apache/submarine.git") + + experiment_spec = ExperimentSpec( + meta=experiment_meta, + environment=environment, + code=code_spec, + spec={"Ps": ps_spec, "Worker": worker_spec}, + ) + + experiment = submarine_client.create_experiment(experiment_spec=experiment_spec) + id = experiment["experimentId"] submarine_client.get_experiment(id) submarine_client.list_experiments() diff --git a/submarine-sdk/pysubmarine/tests/ml/pytorch/model/conftest.py b/submarine-sdk/pysubmarine/tests/ml/pytorch/model/conftest.py index 2da47efa43..a425a03c67 100644 --- a/submarine-sdk/pysubmarine/tests/ml/pytorch/model/conftest.py +++ b/submarine-sdk/pysubmarine/tests/ml/pytorch/model/conftest.py @@ -45,12 +45,9 @@ def get_model_param(tmpdir): "train_data": data_file, "valid_data": data_file, "test_data": data_file, - "type": "libsvm" - }, - "output": { - "save_model_dir": save_model_dir, - "metric": "roc_auc" + "type": "libsvm", }, + "output": {"save_model_dir": save_model_dir, "metric": "roc_auc"}, "training": { "batch_size": 4, "num_epochs": 1, @@ -59,7 +56,7 @@ def get_model_param(tmpdir): "num_gpus": 0, "seed": 42, "mode": "distributed", - "backend": "gloo" + "backend": "gloo", }, "model": { "name": "ctr.deepfm", @@ -71,24 +68,12 @@ def get_model_param(tmpdir): "attention_dim": 64, "hidden_units": [400, 400], "dropout_rate": 0.3, - "dropout_rates": [0.2, 0.2] - } - }, - "loss": { - "name": "BCEWithLogitsLoss", - "kwargs": {} + "dropout_rates": [0.2, 0.2], + }, }, - "optimizer": { - "name": "adam", - "kwargs": { - "lr": 1e-3 - } - }, - "resource": { - "num_cpus": 2, - "num_gpus": 0, - "num_threads": 0 - } + "loss": {"name": "BCEWithLogitsLoss", "kwargs": {}}, + "optimizer": {"name": "adam", "kwargs": {"lr": 1e-3}}, + "resource": {"num_cpus": 2, "num_gpus": 0, "num_threads": 0}, } yield params diff --git a/submarine-sdk/pysubmarine/tests/ml/pytorch/test_loss_pytorch.py b/submarine-sdk/pysubmarine/tests/ml/pytorch/test_loss_pytorch.py index bacdd1bc14..432fc7fc0a 100644 --- a/submarine-sdk/pysubmarine/tests/ml/pytorch/test_loss_pytorch.py +++ b/submarine-sdk/pysubmarine/tests/ml/pytorch/test_loss_pytorch.py @@ -19,12 +19,12 @@ def test_get_loss_fn(): - loss_keys = ['BCELoss', 'CrossEntropyLoss', 'NLLLoss', 'BCEWithLogitsLoss'] - invalid_loss_keys = ['NotExistLoss'] + loss_keys = ["BCELoss", "CrossEntropyLoss", "NLLLoss", "BCEWithLogitsLoss"] + invalid_loss_keys = ["NotExistLoss"] for key in loss_keys: get_loss_fn(key) for key_invalid in invalid_loss_keys: - with pytest.raises(ValueError, match='Invalid loss_key:'): + with pytest.raises(ValueError, match="Invalid loss_key:"): get_loss_fn(key_invalid) diff --git a/submarine-sdk/pysubmarine/tests/ml/pytorch/test_metric_pytorch.py b/submarine-sdk/pysubmarine/tests/ml/pytorch/test_metric_pytorch.py index 3a42dba955..254ce4303f 100644 --- a/submarine-sdk/pysubmarine/tests/ml/pytorch/test_metric_pytorch.py +++ b/submarine-sdk/pysubmarine/tests/ml/pytorch/test_metric_pytorch.py @@ -19,12 +19,12 @@ def test_get_metric_fn(): - metric_keys = ['f1_score', 'accuracy', 'roc_auc', 'precision', 'recall'] - invalid_metric_keys = ['NotExistMetric'] + metric_keys = ["f1_score", "accuracy", "roc_auc", "precision", "recall"] + invalid_metric_keys = ["NotExistMetric"] for key in metric_keys: get_metric_fn(key) for key_invalid in invalid_metric_keys: - with pytest.raises(ValueError, match='Invalid metric_key:'): + with pytest.raises(ValueError, match="Invalid metric_key:"): get_metric_fn(key_invalid) diff --git 
a/submarine-sdk/pysubmarine/tests/ml/pytorch/test_optimizer_pytorch.py b/submarine-sdk/pysubmarine/tests/ml/pytorch/test_optimizer_pytorch.py index edb399fe8f..db7dcbb5b0 100644 --- a/submarine-sdk/pysubmarine/tests/ml/pytorch/test_optimizer_pytorch.py +++ b/submarine-sdk/pysubmarine/tests/ml/pytorch/test_optimizer_pytorch.py @@ -19,12 +19,12 @@ def test_get_optimizer(): - optimizer_keys = ['adam', 'adagrad', 'sgd'] - invalid_optimizer_keys = ['adddam'] + optimizer_keys = ["adam", "adagrad", "sgd"] + invalid_optimizer_keys = ["adddam"] for key in optimizer_keys: get_optimizer(key) for key_invalid in invalid_optimizer_keys: - with pytest.raises(ValueError, match='Invalid optimizer_key:'): + with pytest.raises(ValueError, match="Invalid optimizer_key:"): get_optimizer(key_invalid) diff --git a/submarine-sdk/pysubmarine/tests/ml/tensorflow/model/conftest.py b/submarine-sdk/pysubmarine/tests/ml/tensorflow/model/conftest.py index 17cf29571f..132bdeaee4 100644 --- a/submarine-sdk/pysubmarine/tests/ml/tensorflow/model/conftest.py +++ b/submarine-sdk/pysubmarine/tests/ml/tensorflow/model/conftest.py @@ -40,18 +40,10 @@ def get_model_param(tmpdir): "train_data": data_file, "valid_data": data_file, "test_data": data_file, - "type": "libsvm" + "type": "libsvm", }, - "output": { - "save_model_dir": save_model_dir, - "metric": "auc" - }, - "training": { - "batch_size": 256, - "num_epochs": 1, - "field_size": 10, - "feature_size": 1000 - } + "output": {"save_model_dir": save_model_dir, "metric": "auc"}, + "training": {"batch_size": 256, "num_epochs": 1, "field_size": 10, "feature_size": 1000}, } yield params diff --git a/submarine-sdk/pysubmarine/tests/ml/tensorflow/model/test_base_tf_model.py b/submarine-sdk/pysubmarine/tests/ml/tensorflow/model/test_base_tf_model.py index 7fb475ac90..9c31908f05 100644 --- a/submarine-sdk/pysubmarine/tests/ml/tensorflow/model/test_base_tf_model.py +++ b/submarine-sdk/pysubmarine/tests/ml/tensorflow/model/test_base_tf_model.py @@ -20,13 +20,12 @@ def test_create_base_tf_model(): params = {"learning rate": 0.05} - with pytest.raises(AssertionError, - match="Does not define any input parameters"): + with pytest.raises(AssertionError, match="Does not define any input parameters"): BaseTFModel(params) - params.update({'input': {'train_data': '/tmp/train.csv'}}) + params.update({"input": {"train_data": "/tmp/train.csv"}}) with pytest.raises(AssertionError, match="Does not define any input type"): BaseTFModel(params) - params.update({'input': {'type': 'libsvm'}}) + params.update({"input": {"type": "libsvm"}}) BaseTFModel(params) diff --git a/submarine-sdk/pysubmarine/tests/ml/tensorflow/test_optimizer.py b/submarine-sdk/pysubmarine/tests/ml/tensorflow/test_optimizer.py index 96daf09958..831d427354 100644 --- a/submarine-sdk/pysubmarine/tests/ml/tensorflow/test_optimizer.py +++ b/submarine-sdk/pysubmarine/tests/ml/tensorflow/test_optimizer.py @@ -19,13 +19,12 @@ def test_get_optimizer(): - optimizer_keys = ['adam', 'adagrad', 'momentum', 'ftrl'] - invalid_optimizer_keys = ['adddam'] + optimizer_keys = ["adam", "adagrad", "momentum", "ftrl"] + invalid_optimizer_keys = ["adddam"] for optimizer_key in optimizer_keys: get_optimizer(optimizer_key=optimizer_key, learning_rate=0.3) for invalid_optimizer_key in invalid_optimizer_keys: with pytest.raises(ValueError, match="Invalid optimizer_key :"): - get_optimizer(optimizer_key=invalid_optimizer_key, - learning_rate=0.3) + get_optimizer(optimizer_key=invalid_optimizer_key, learning_rate=0.3) diff --git 
a/submarine-sdk/pysubmarine/tests/models/pytorch.py b/submarine-sdk/pysubmarine/tests/models/pytorch.py index a54325251e..eeffc305f7 100644 --- a/submarine-sdk/pysubmarine/tests/models/pytorch.py +++ b/submarine-sdk/pysubmarine/tests/models/pytorch.py @@ -14,12 +14,10 @@ specific language governing permissions and limitations under the License. """ -import numpy as np import torch class LinearNNModel(torch.nn.Module): - def __init__(self): super(LinearNNModel, self).__init__() self.linear = torch.nn.Linear(1, 1) # One in and one out diff --git a/submarine-sdk/pysubmarine/tests/models/test_model.py b/submarine-sdk/pysubmarine/tests/models/test_model.py index e0efcb7a7b..f94acf176d 100644 --- a/submarine-sdk/pysubmarine/tests/models/test_model.py +++ b/submarine-sdk/pysubmarine/tests/models/test_model.py @@ -17,15 +17,13 @@ import mlflow import numpy as np -import pytest from mlflow.tracking import MlflowClient - from pytorch import LinearNNModel -from submarine import ModelsClient +from submarine import ModelsClient -class TestSubmarineModelsClient(): +class TestSubmarineModelsClient: def setUp(self): pass @@ -41,25 +39,21 @@ def test_save_model(self, mocker): mock_method.assert_called_once_with("pytorch", model, "simple-nn-model") def test_update_model(self, mocker): - mock_method = mocker.patch.object(MlflowClient, - "rename_registered_model") + mock_method = mocker.patch.object(MlflowClient, "rename_registered_model") client = ModelsClient() name = "simple-nn-model" new_name = "new-simple-nn-model" client.update_model(name, new_name) - mock_method.assert_called_once_with(name="simple-nn-model", - new_name="new-simple-nn-model") + mock_method.assert_called_once_with(name="simple-nn-model", new_name="new-simple-nn-model") def test_load_model(self, mocker): mock_method = mocker.patch.object(mlflow.pyfunc, "load_model") - mock_method.return_value = mlflow.pytorch._PyTorchWrapper( - LinearNNModel()) + mock_method.return_value = mlflow.pytorch._PyTorchWrapper(LinearNNModel()) client = ModelsClient() name = "simple-nn-model" version = "1" model = client.load_model(name, version) - mock_method.assert_called_once_with( - model_uri="models:/simple-nn-model/1") + mock_method.assert_called_once_with(model_uri="models:/simple-nn-model/1") x = np.float32([[1.0], [2.0]]) y = model.predict(x) assert y.shape[0] == 2 @@ -69,5 +63,5 @@ def test_delete_model(self, mocker): mock_method = mocker.patch.object(MlflowClient, "delete_model_version") client = ModelsClient() name = "simple-nn-model" - client.delete_model(name, '1') + client.delete_model(name, "1") mock_method.assert_called_once_with(name="simple-nn-model", version="1") diff --git a/submarine-sdk/pysubmarine/tests/models/test_model_e2e.py b/submarine-sdk/pysubmarine/tests/models/test_model_e2e.py index 7fb1a55e89..635a7c3e5d 100644 --- a/submarine-sdk/pysubmarine/tests/models/test_model_e2e.py +++ b/submarine-sdk/pysubmarine/tests/models/test_model_e2e.py @@ -15,14 +15,11 @@ under the License. 
""" -import os - import numpy as np import pytest - from pytorch import LinearNNModel + from submarine import ModelsClient -from submarine.models import constant @pytest.fixture(name="models_client", scope="class") @@ -32,16 +29,12 @@ def models_client_fixture(): @pytest.mark.e2e -class TestSubmarineModelsClientE2E(): - +class TestSubmarineModelsClientE2E: def test_model(self, models_client): model = LinearNNModel() # log name = "simple-nn-model" - models_client.save_model("pytorch", - model, - name, - registered_model_name=name) + models_client.save_model("pytorch", model, name, registered_model_name=name) # update new_name = "new-simple-nn-model" models_client.update_model(name, new_name) @@ -54,4 +47,4 @@ def test_model(self, models_client): assert y.shape[0] == 2 assert y.shape[1] == 1 # delete - models_client.delete_model(name, '1') + models_client.delete_model(name, "1") diff --git a/submarine-sdk/pysubmarine/tests/store/test_sqlalchemy_store.py b/submarine-sdk/pysubmarine/tests/store/test_sqlalchemy_store.py index 72c1c00129..f30dcd7dbf 100644 --- a/submarine-sdk/pysubmarine/tests/store/test_sqlalchemy_store.py +++ b/submarine-sdk/pysubmarine/tests/store/test_sqlalchemy_store.py @@ -29,7 +29,6 @@ @pytest.mark.e2e class TestSqlAlchemyStore(unittest.TestCase): - def setUp(self): submarine.set_tracking_uri( "mysql+pymysql://submarine_test:password_test@localhost:3306/submarine_test" @@ -47,10 +46,7 @@ def test_log_param(self): # Validate params with self.store.ManagedSessionMaker() as session: - params = session \ - .query(SqlParam) \ - .options() \ - .filter(SqlParam.id == JOB_ID).all() + params = session.query(SqlParam).options().filter(SqlParam.id == JOB_ID).all() assert params[0].key == "name_1" assert params[0].value == "a" assert params[0].worker_index == "worker-1" @@ -64,10 +60,7 @@ def test_log_metric(self): # Validate metrics with self.store.ManagedSessionMaker() as session: - metrics = session \ - .query(SqlMetric) \ - .options() \ - .filter(SqlMetric.id == JOB_ID).all() + metrics = session.query(SqlMetric).options().filter(SqlMetric.id == JOB_ID).all() assert len(metrics) == 2 assert metrics[0].key == "name_1" assert metrics[0].value == 5 diff --git a/submarine-sdk/pysubmarine/tests/tracking/test_tracking.py b/submarine-sdk/pysubmarine/tests/tracking/test_tracking.py index 699fc4e4c0..70587416e4 100644 --- a/submarine-sdk/pysubmarine/tests/tracking/test_tracking.py +++ b/submarine-sdk/pysubmarine/tests/tracking/test_tracking.py @@ -28,7 +28,6 @@ @pytest.mark.e2e class TestTracking(unittest.TestCase): - def setUp(self): environ["JOB_ID"] = JOB_ID submarine.set_tracking_uri( @@ -45,10 +44,7 @@ def log_param(self): submarine.log_param("name_1", "a") # Validate params with self.store.ManagedSessionMaker() as session: - params = session \ - .query(SqlParam) \ - .options() \ - .filter(SqlParam.id == JOB_ID).all() + params = session.query(SqlParam).options().filter(SqlParam.id == JOB_ID).all() assert params[0].key == "name_1" assert params[0].value == "a" assert params[0].id == JOB_ID @@ -58,10 +54,7 @@ def test_log_metric(self): submarine.log_metric("name_1", 6) # Validate params with self.store.ManagedSessionMaker() as session: - metrics = session \ - .query(SqlMetric) \ - .options() \ - .filter(SqlMetric.id == JOB_ID).all() + metrics = session.query(SqlMetric).options().filter(SqlMetric.id == JOB_ID).all() assert len(metrics) == 2 assert metrics[0].key == "name_1" assert metrics[0].value == 5 diff --git a/submarine-sdk/pysubmarine/tests/tracking/test_utils.py 
b/submarine-sdk/pysubmarine/tests/tracking/test_utils.py index bfcc85eb99..d3e20e2b84 100644 --- a/submarine-sdk/pysubmarine/tests/tracking/test_utils.py +++ b/submarine-sdk/pysubmarine/tests/tracking/test_utils.py @@ -19,9 +19,14 @@ from submarine.store import DEFAULT_SUBMARINE_JDBC_URL from submarine.store.sqlalchemy_store import SqlAlchemyStore -from submarine.tracking.utils import (_JOB_ID_ENV_VAR, _TRACKING_URI_ENV_VAR, - get_job_id, get_sqlalchemy_store, - get_tracking_uri, is_tracking_uri_set) +from submarine.tracking.utils import ( + _JOB_ID_ENV_VAR, + _TRACKING_URI_ENV_VAR, + get_job_id, + get_sqlalchemy_store, + get_tracking_uri, + is_tracking_uri_set, +) def test_is_tracking_uri_set(): @@ -52,8 +57,9 @@ def test_get_sqlalchemy_store(): patch_create_engine = mock.patch("sqlalchemy.create_engine") uri = DEFAULT_SUBMARINE_JDBC_URL env = {_TRACKING_URI_ENV_VAR: uri} - with mock.patch.dict(os.environ, env), patch_create_engine as mock_create_engine, \ - mock.patch("submarine.store.sqlalchemy_store.SqlAlchemyStore._initialize_tables"): + with mock.patch.dict(os.environ, env), patch_create_engine as mock_create_engine, mock.patch( + "submarine.store.sqlalchemy_store.SqlAlchemyStore._initialize_tables" + ): store = get_sqlalchemy_store(uri) assert isinstance(store, SqlAlchemyStore) assert store.db_uri == uri diff --git a/submarine-sdk/pysubmarine/tests/utils/test_env.py b/submarine-sdk/pysubmarine/tests/utils/test_env.py index eefd176e9d..cc6524a8cd 100644 --- a/submarine-sdk/pysubmarine/tests/utils/test_env.py +++ b/submarine-sdk/pysubmarine/tests/utils/test_env.py @@ -18,15 +18,20 @@ import pytest -from submarine.utils.env import (get_env, get_from_dicts, get_from_json, - get_from_registry, unset_variable) +from submarine.utils.env import ( + get_env, + get_from_dicts, + get_from_json, + get_from_registry, + unset_variable, +) @pytest.fixture(scope="function") def output_json_filepath(): params = {"learning_rate": 0.05} - path = '/tmp/data.json' - with open(path, 'w') as f: + path = "/tmp/data.json" + with open(path, "w") as f: json.dump(params, f) return path @@ -45,22 +50,22 @@ def test_unset_variable(): def test_merge_json(output_json_filepath): default_params = {"learning_rate": 0.08, "embedding_size": 256} params = get_from_json(output_json_filepath, default_params) - assert params['learning_rate'] == 0.05 - assert params['embedding_size'] == 256 + assert params["learning_rate"] == 0.05 + assert params["embedding_size"] == 256 def test_merge_dicts(): params = {"learning_rate": 0.05} default_params = {"learning_rate": 0.08, "embedding_size": 256} final = get_from_dicts(params, default_params) - assert final['learning_rate'] == 0.05 - assert final['embedding_size'] == 256 + assert final["learning_rate"] == 0.05 + assert final["embedding_size"] == 256 def test_get_from_registry(): - registry = {'model': 'xgboost'} - val = get_from_registry('MODEL', registry) - assert val == 'xgboost' + registry = {"model": "xgboost"} + val = get_from_registry("MODEL", registry) + assert val == "xgboost" with pytest.raises(ValueError): - get_from_registry('test', registry) + get_from_registry("test", registry) diff --git a/submarine-sdk/pysubmarine/tests/utils/test_rest_utils.py b/submarine-sdk/pysubmarine/tests/utils/test_rest_utils.py index 3a866e93a3..f8da8fa373 100644 --- a/submarine-sdk/pysubmarine/tests/utils/test_rest_utils.py +++ b/submarine-sdk/pysubmarine/tests/utils/test_rest_utils.py @@ -23,47 +23,43 @@ def test_http_request(): - dummy_json = json.dumps({ - 'result': { - 'jobId': 
'job_1234567', - 'name': 'submarine', - 'identifier': 'test' - } - }) + dummy_json = json.dumps( + {"result": {"jobId": "job_1234567", "name": "submarine", "identifier": "test"}} + ) - with patch('requests.request') as mock_requests: + with patch("requests.request") as mock_requests: mock_requests.return_value.text = dummy_json mock_requests.return_value.status_code = 200 - result = http_request('http://submarine:8080', - json_body='dummy', - endpoint='/api/v1/jobs', - method='POST') + result = http_request( + "http://submarine:8080", json_body="dummy", endpoint="/api/v1/jobs", method="POST" + ) - assert result['jobId'] == 'job_1234567' - assert result['name'] == 'submarine' - assert result['identifier'] == 'test' + assert result["jobId"] == "job_1234567" + assert result["name"] == "submarine" + assert result["identifier"] == "test" def test_verify_rest_response(): # Test correct response mock_response = Mock() mock_response.status_code = 200 - verify_rest_response(mock_response, '/api/v1/jobs') + verify_rest_response(mock_response, "/api/v1/jobs") # Test response status code not equal 200(OK) and response can parse as JSON mock_response.status_code = 400 - mock_json_body = {'a': 200, 'b': 2, 'c': 3} + mock_json_body = {"a": 200, "b": 2, "c": 3} dummy_json = json.dumps(mock_json_body) mock_response.text = dummy_json with pytest.raises(RestException, match=str(json.loads(dummy_json))): - verify_rest_response(mock_response, '/api/v1/jobs') + verify_rest_response(mock_response, "/api/v1/jobs") # Test response status code not equal 200(OK) and response can not parse as JSON - mock_json_body = 'test, 123' + mock_json_body = "test, 123" mock_response.text = mock_json_body - with pytest.raises(SubmarineException, - match='API request to endpoint /api/v1/jobs failed ' - 'with error code 400 != 200'): - verify_rest_response(mock_response, '/api/v1/jobs') + with pytest.raises( + SubmarineException, + match="API request to endpoint /api/v1/jobs failed with error code 400 != 200", + ): + verify_rest_response(mock_response, "/api/v1/jobs") diff --git a/submarine-sdk/pysubmarine/tests/utils/test_tf_utils.py b/submarine-sdk/pysubmarine/tests/utils/test_tf_utils.py index 52ed4ad097..a03d4347f5 100644 --- a/submarine-sdk/pysubmarine/tests/utils/test_tf_utils.py +++ b/submarine-sdk/pysubmarine/tests/utils/test_tf_utils.py @@ -19,34 +19,24 @@ def test_get_tf_config(): - params = {'training': {'mode': 'test'}} + params = {"training": {"mode": "test"}} with pytest.raises(ValueError, match="mode should be local or distributed"): get_tf_config(params) # conf for local training - params.update({ - 'training': { - 'mode': 'local', - 'log_steps': 10 - }, - 'resource': { - 'num_cpu': 4, - 'num_thread': 4, - 'num_gpu': 1 + params.update( + { + "training": {"mode": "local", "log_steps": 10}, + "resource": {"num_cpu": 4, "num_thread": 4, "num_gpu": 1}, } - }) + ) get_tf_config(params) # conf for distributed training - params.update({ - 'training': { - 'mode': 'distributed', - 'log_steps': 10 - }, - 'resource': { - 'num_cpu': 4, - 'num_thread': 4, - 'num_gpu': 2 + params.update( + { + "training": {"mode": "distributed", "log_steps": 10}, + "resource": {"num_cpu": 4, "num_thread": 4, "num_gpu": 2}, } - }) + ) get_tf_config(params) diff --git a/submarine-sdk/pysubmarine/tests/utils/test_validation.py b/submarine-sdk/pysubmarine/tests/utils/test_validation.py index 0030913d2e..96f4b13ff7 100644 --- a/submarine-sdk/pysubmarine/tests/utils/test_validation.py +++ b/submarine-sdk/pysubmarine/tests/utils/test_validation.py @@ 
-16,10 +16,12 @@ import pytest from submarine.exceptions import SubmarineException -from submarine.utils.validation import (_validate_db_type_string, - _validate_length_limit, - _validate_metric_name, - _validate_param_name) +from submarine.utils.validation import ( + _validate_db_type_string, + _validate_length_limit, + _validate_metric_name, + _validate_param_name, +) GOOD_METRIC_OR_PARAM_NAMES = [ "a", diff --git a/website/docs/userDocs/submarine-sdk/pysubmarine/development.md b/website/docs/userDocs/submarine-sdk/pysubmarine/development.md index 8975ce12c5..d2f18b6794 100644 --- a/website/docs/userDocs/submarine-sdk/pysubmarine/development.md +++ b/website/docs/userDocs/submarine-sdk/pysubmarine/development.md @@ -27,8 +27,10 @@ in its own conda environment by running the following conda create --name submarine-dev python=3.6 conda activate submarine-dev -# lint-requirements.txt and test-requirements.txt are in ./submarine-sdk/pysubmarine/github-actions +# Install the auto-format and lint requirements (lint-requirements.txt is in ./dev-support/style-check/python) pip install -r lint-requirements.txt + +# test-requirements.txt is in ./submarine-sdk/pysubmarine/github-actions pip install -r test-requirements.txt # Installs pysubmarine from current checkout @@ -52,18 +54,26 @@ The script does the following things: ### Coding Style -- Use [yapf](https://github.com/google/yapf) to format Python code -- yapf style is configured in `.style.yapf` file +- Use [isort](https://github.com/PyCQA/isort) to sort the Python imports and [black](https://github.com/psf/black) to format Python code +- Both styles are configured in `pyproject.toml` - To autoformat code ```bash -./submarine-sdk/pysubmarine/github-actions/auto-format.sh +./dev-support/style-check/python/auto-format.sh ``` +- Use [flake8](https://github.com/PyCQA/flake8) as the linter; its configuration is in `.flake8` - Verify linter pass before submitting a pull request by running: ```bash -./submarine-sdk/pysubmarine/github-actions/lint.sh +./dev-support/style-check/python/lint.sh +``` + +- If you encounter an unexpected format, use the following method +```python +# fmt: off + "Unexpected format, formatted by yourself" +# fmt: on ``` ### Unit Testing diff --git a/website/docs/userDocs/yarn/docker/pytorch/with-cifar10-models/cifar10_tutorial.py b/website/docs/userDocs/yarn/docker/pytorch/with-cifar10-models/cifar10_tutorial.py index 02824eca56..c3bb9914a8 100644 --- a/website/docs/userDocs/yarn/docker/pytorch/with-cifar10-models/cifar10_tutorial.py +++ b/website/docs/userDocs/yarn/docker/pytorch/with-cifar10-models/cifar10_tutorial.py @@ -80,21 +80,20 @@ # We transform them to Tensors of normalized range [-1, 1].
transform = transforms.Compose( - [transforms.ToTensor(), - transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) + [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))] +) -trainset = torchvision.datasets.CIFAR10(root='./data', train=True, - download=True, transform=transform) -trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, - shuffle=True, num_workers=2) +trainset = torchvision.datasets.CIFAR10( + root="./data", train=True, download=True, transform=transform +) +trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, shuffle=True, num_workers=2) -testset = torchvision.datasets.CIFAR10(root='./data', train=False, - download=True, transform=transform) -testloader = torch.utils.data.DataLoader(testset, batch_size=4, - shuffle=False, num_workers=2) +testset = torchvision.datasets.CIFAR10( + root="./data", train=False, download=True, transform=transform +) +testloader = torch.utils.data.DataLoader(testset, batch_size=4, shuffle=False, num_workers=2) -classes = ('plane', 'car', 'bird', 'cat', - 'deer', 'dog', 'frog', 'horse', 'ship', 'truck') +classes = ("plane", "car", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck") ######################################################################## # Let us show some of the training images, for fun. @@ -102,15 +101,14 @@ import matplotlib.pyplot as plt import numpy as np - # functions to show an image def imshow(img): - img = img / 2 + 0.5 # unnormalize - npimg = img.numpy() - plt.imshow(np.transpose(npimg, (1, 2, 0))) - plt.show() + img = img / 2 + 0.5 # unnormalize + npimg = img.numpy() + plt.imshow(np.transpose(npimg, (1, 2, 0))) + plt.show() # get some random training images @@ -120,7 +118,7 @@ def imshow(img): # show images imshow(torchvision.utils.make_grid(images)) # print labels -print(' '.join('%5s' % classes[labels[j]] for j in range(4))) +print(" ".join("%5s" % classes[labels[j]] for j in range(4))) ######################################################################## # 2. 
Define a Convolutional Neural Network @@ -133,23 +131,23 @@ def imshow(img): class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x net = Net() @@ -174,28 +172,27 @@ def forward(self, x): for epoch in range(2): # loop over the dataset multiple times - running_loss = 0.0 - for i, data in enumerate(trainloader, 0): - # get the inputs - inputs, labels = data + running_loss = 0.0 + for i, data in enumerate(trainloader, 0): + # get the inputs + inputs, labels = data - # zero the parameter gradients - optimizer.zero_grad() + # zero the parameter gradients + optimizer.zero_grad() - # forward + backward + optimize - outputs = net(inputs) - loss = criterion(outputs, labels) - loss.backward() - optimizer.step() + # forward + backward + optimize + outputs = net(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() - # print statistics - running_loss += loss.item() - if i % 2000 == 1999: # print every 2000 mini-batches - print('[%d, %5d] loss: %.3f' % - (epoch + 1, i + 1, running_loss / 2000)) - running_loss = 0.0 + # print statistics + running_loss += loss.item() + if i % 2000 == 1999: # print every 2000 mini-batches + print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / 2000)) + running_loss = 0.0 -print('Finished Training') +print("Finished Training") ######################################################################## # 5. Test the network on the test data @@ -215,7 +212,7 @@ def forward(self, x): # print images imshow(torchvision.utils.make_grid(images)) -print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(4))) +print("GroundTruth: ", " ".join("%5s" % classes[labels[j]] for j in range(4))) ######################################################################## # Okay, now let us see what the neural network thinks these examples above are: @@ -229,8 +226,7 @@ def forward(self, x): # So, let's get the index of the highest energy: _, predicted = torch.max(outputs, 1) -print('Predicted: ', ' '.join('%5s' % classes[predicted[j]] - for j in range(4))) +print("Predicted: ", " ".join("%5s" % classes[predicted[j]] for j in range(4))) ######################################################################## # The results seem pretty good. 
@@ -240,15 +236,14 @@ def forward(self, x): correct = 0 total = 0 with torch.no_grad(): - for data in testloader: - images, labels = data - outputs = net(images) - _, predicted = torch.max(outputs.data, 1) - total += labels.size(0) - correct += (predicted == labels).sum().item() + for data in testloader: + images, labels = data + outputs = net(images) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() -print('Accuracy of the network on the 10000 test images: %d %%' % ( - 100 * correct / total)) +print("Accuracy of the network on the 10000 test images: %d %%" % (100 * correct / total)) ######################################################################## # That looks waaay better than chance, which is 10% accuracy (randomly picking @@ -258,22 +253,21 @@ def forward(self, x): # Hmmm, what are the classes that performed well, and the classes that did # not perform well: -class_correct = list(0. for i in range(10)) -class_total = list(0. for i in range(10)) +class_correct = list(0.0 for i in range(10)) +class_total = list(0.0 for i in range(10)) with torch.no_grad(): - for data in testloader: - images, labels = data - outputs = net(images) - _, predicted = torch.max(outputs, 1) - c = (predicted == labels).squeeze() - for i in range(4): - label = labels[i] - class_correct[label] += c[i].item() - class_total[label] += 1 + for data in testloader: + images, labels = data + outputs = net(images) + _, predicted = torch.max(outputs, 1) + c = (predicted == labels).squeeze() + for i in range(4): + label = labels[i] + class_correct[label] += c[i].item() + class_total[label] += 1 for i in range(10): - print('Accuracy of %5s : %2d %%' % ( - classes[i], 100 * class_correct[i] / class_total[i])) + print("Accuracy of %5s : %2d %%" % (classes[i], 100 * class_correct[i] / class_total[i])) ######################################################################## # Okay, so what next? diff --git a/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10.py b/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10.py index 5e1a70895a..29883d291d 100644 --- a/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10.py +++ b/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10.py @@ -26,88 +26,87 @@ class Cifar10DataSet(object): - """Cifar10 data set. - - Described by http://www.cs.toronto.edu/~kriz/cifar.html. - """ - - def __init__(self, data_dir, subset='train', use_distortion=True): - self.data_dir = data_dir - self.subset = subset - self.use_distortion = use_distortion - - def get_filenames(self): - if self.subset in ['train', 'validation', 'eval']: - return [os.path.join(self.data_dir, self.subset + '.tfrecords')] - else: - raise ValueError('Invalid data subset "%s"' % self.subset) - - def parser(self, serialized_example): - """Parses a single tf.Example into image and label tensors.""" - # Dimensions of the images in the CIFAR-10 dataset. - # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the - # input format. 
- features = tf.parse_single_example( - serialized_example, - features={ - 'image': tf.FixedLenFeature([], tf.string), - 'label': tf.FixedLenFeature([], tf.int64), - }) - image = tf.decode_raw(features['image'], tf.uint8) - image.set_shape([DEPTH * HEIGHT * WIDTH]) - - # Reshape from [depth * height * width] to [depth, height, width]. - image = tf.cast( - tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]), - tf.float32) - label = tf.cast(features['label'], tf.int32) - - # Custom preprocessing. - image = self.preprocess(image) - - return image, label - - def make_batch(self, batch_size): - """Read the images and labels from 'filenames'.""" - filenames = self.get_filenames() - # Repeat infinitely. - dataset = tf.data.TFRecordDataset(filenames).repeat() - - # Parse records. - dataset = dataset.map( - self.parser, num_parallel_calls=batch_size) - - # Potentially shuffle records. - if self.subset == 'train': - min_queue_examples = int( - Cifar10DataSet.num_examples_per_epoch(self.subset) * 0.4) - # Ensure that the capacity is sufficiently large to provide good random - # shuffling. - dataset = dataset.shuffle(buffer_size=min_queue_examples + 3 * batch_size) - - # Batch it up. - dataset = dataset.batch(batch_size) - iterator = dataset.make_one_shot_iterator() - image_batch, label_batch = iterator.get_next() - - return image_batch, label_batch - - def preprocess(self, image): - """Preprocess a single image in [height, width, depth] layout.""" - if self.subset == 'train' and self.use_distortion: - # Pad 4 pixels on each dimension of feature map, done in mini-batch - image = tf.image.resize_image_with_crop_or_pad(image, 40, 40) - image = tf.random_crop(image, [HEIGHT, WIDTH, DEPTH]) - image = tf.image.random_flip_left_right(image) - return image - - @staticmethod - def num_examples_per_epoch(subset='train'): - if subset == 'train': - return 45000 - elif subset == 'validation': - return 5000 - elif subset == 'eval': - return 10000 - else: - raise ValueError('Invalid data subset "%s"' % subset) + """Cifar10 data set. + + Described by http://www.cs.toronto.edu/~kriz/cifar.html. + """ + + def __init__(self, data_dir, subset="train", use_distortion=True): + self.data_dir = data_dir + self.subset = subset + self.use_distortion = use_distortion + + def get_filenames(self): + if self.subset in ["train", "validation", "eval"]: + return [os.path.join(self.data_dir, self.subset + ".tfrecords")] + else: + raise ValueError('Invalid data subset "%s"' % self.subset) + + def parser(self, serialized_example): + """Parses a single tf.Example into image and label tensors.""" + # Dimensions of the images in the CIFAR-10 dataset. + # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the + # input format. + features = tf.parse_single_example( + serialized_example, + features={ + "image": tf.FixedLenFeature([], tf.string), + "label": tf.FixedLenFeature([], tf.int64), + }, + ) + image = tf.decode_raw(features["image"], tf.uint8) + image.set_shape([DEPTH * HEIGHT * WIDTH]) + + # Reshape from [depth * height * width] to [depth, height, width]. + image = tf.cast( + tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]), tf.float32 + ) + label = tf.cast(features["label"], tf.int32) + + # Custom preprocessing. + image = self.preprocess(image) + + return image, label + + def make_batch(self, batch_size): + """Read the images and labels from 'filenames'.""" + filenames = self.get_filenames() + # Repeat infinitely. 
+ dataset = tf.data.TFRecordDataset(filenames).repeat() + + # Parse records. + dataset = dataset.map(self.parser, num_parallel_calls=batch_size) + + # Potentially shuffle records. + if self.subset == "train": + min_queue_examples = int(Cifar10DataSet.num_examples_per_epoch(self.subset) * 0.4) + # Ensure that the capacity is sufficiently large to provide good random + # shuffling. + dataset = dataset.shuffle(buffer_size=min_queue_examples + 3 * batch_size) + + # Batch it up. + dataset = dataset.batch(batch_size) + iterator = dataset.make_one_shot_iterator() + image_batch, label_batch = iterator.get_next() + + return image_batch, label_batch + + def preprocess(self, image): + """Preprocess a single image in [height, width, depth] layout.""" + if self.subset == "train" and self.use_distortion: + # Pad 4 pixels on each dimension of feature map, done in mini-batch + image = tf.image.resize_image_with_crop_or_pad(image, 40, 40) + image = tf.random_crop(image, [HEIGHT, WIDTH, DEPTH]) + image = tf.image.random_flip_left_right(image) + return image + + @staticmethod + def num_examples_per_epoch(subset="train"): + if subset == "train": + return 45000 + elif subset == "validation": + return 5000 + elif subset == "eval": + return 10000 + else: + raise ValueError('Invalid data subset "%s"' % subset) diff --git a/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_main.py b/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_main.py index dbd2418905..91e8e48861 100644 --- a/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_main.py +++ b/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_main.py @@ -25,8 +25,7 @@ """ -from __future__ import division -from __future__ import print_function +from __future__ import division, print_function import argparse import functools @@ -38,484 +37,483 @@ import cifar10_utils import numpy as np import six -from six.moves import xrange # pylint: disable=redefined-builtin import tensorflow as tf +from six.moves import xrange # pylint: disable=redefined-builtin tf.logging.set_verbosity(tf.logging.INFO) def get_model_fn(num_gpus, variable_strategy, num_workers): - """Returns a function that will build the resnet model.""" + """Returns a function that will build the resnet model.""" + + def _resnet_model_fn(features, labels, mode, params): + """Resnet model body. + + Support single host, one or more GPU training. Parameter distribution can + be either one of the following scheme. + 1. CPU is the parameter server and manages gradient updates. + 2. Parameters are distributed evenly across all GPUs, and the first GPU + manages gradient updates. + + Args: + features: a list of tensors, one for each tower + labels: a list of tensors, one for each tower + mode: ModeKeys.TRAIN or EVAL + params: Hyperparameters suitable for tuning + Returns: + A EstimatorSpec object. + """ + is_training = mode == tf.estimator.ModeKeys.TRAIN + weight_decay = params.weight_decay + momentum = params.momentum + + tower_features = features + tower_labels = labels + tower_losses = [] + tower_gradvars = [] + tower_preds = [] + + # channels first (NCHW) is normally optimal on GPU and channels last (NHWC) + # on CPU. The exception is Intel MKL on CPU which is optimal with + # channels_last. 
+ data_format = params.data_format + if not data_format: + if num_gpus == 0: + data_format = "channels_last" + else: + data_format = "channels_first" + + if num_gpus == 0: + num_devices = 1 + device_type = "cpu" + else: + num_devices = num_gpus + device_type = "gpu" + + for i in range(num_devices): + worker_device = "/{}:{}".format(device_type, i) + if variable_strategy == "CPU": + device_setter = cifar10_utils.local_device_setter(worker_device=worker_device) + elif variable_strategy == "GPU": + device_setter = cifar10_utils.local_device_setter( + ps_device_type="gpu", + worker_device=worker_device, + ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy( + num_gpus, tf.contrib.training.byte_size_load_fn + ), + ) + with tf.variable_scope("resnet", reuse=bool(i != 0)): + with tf.name_scope("tower_%d" % i) as name_scope: + with tf.device(device_setter): + loss, gradvars, preds = _tower_fn( + is_training, + weight_decay, + tower_features[i], + tower_labels[i], + data_format, + params.num_layers, + params.batch_norm_decay, + params.batch_norm_epsilon, + ) + tower_losses.append(loss) + tower_gradvars.append(gradvars) + tower_preds.append(preds) + if i == 0: + # Only trigger batch_norm moving mean and variance update from + # the 1st tower. Ideally, we should grab the updates from all + # towers but these stats accumulate extremely fast so we can + # ignore the other stats from the other towers without + # significant detriment. + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope) + + # Now compute global loss and gradients. + gradvars = [] + with tf.name_scope("gradient_averaging"): + all_grads = {} + for grad, var in itertools.chain(*tower_gradvars): + if grad is not None: + all_grads.setdefault(var, []).append(grad) + for var, grads in six.iteritems(all_grads): + # Average gradients on the same device as the variables + # to which they apply. + with tf.device(var.device): + if len(grads) == 1: + avg_grad = grads[0] + else: + avg_grad = tf.multiply(tf.add_n(grads), 1.0 / len(grads)) + gradvars.append((avg_grad, var)) + + # Device that runs the ops to apply global gradient updates. 
+ consolidation_device = "/gpu:0" if variable_strategy == "GPU" else "/cpu:0" + with tf.device(consolidation_device): + # Suggested learning rate scheduling from + # https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155 + num_batches_per_epoch = cifar10.Cifar10DataSet.num_examples_per_epoch("train") // ( + params.train_batch_size * num_workers + ) + boundaries = [ + num_batches_per_epoch * x for x in np.array([82, 123, 300], dtype=np.int64) + ] + staged_lr = [params.learning_rate * x for x in [1, 0.1, 0.01, 0.002]] + + learning_rate = tf.train.piecewise_constant( + tf.train.get_global_step(), boundaries, staged_lr + ) + + loss = tf.reduce_mean(tower_losses, name="loss") + + examples_sec_hook = cifar10_utils.ExamplesPerSecondHook( + params.train_batch_size, every_n_steps=10 + ) + + tensors_to_log = {"learning_rate": learning_rate, "loss": loss} + + logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=100) + + train_hooks = [logging_hook, examples_sec_hook] + + optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=momentum) + + if params.sync: + optimizer = tf.train.SyncReplicasOptimizer( + optimizer, replicas_to_aggregate=num_workers + ) + sync_replicas_hook = optimizer.make_session_run_hook(params.is_chief) + train_hooks.append(sync_replicas_hook) + + # Create single grouped train op + train_op = [optimizer.apply_gradients(gradvars, global_step=tf.train.get_global_step())] + train_op.extend(update_ops) + train_op = tf.group(*train_op) + + predictions = { + "classes": tf.concat([p["classes"] for p in tower_preds], axis=0), + "probabilities": tf.concat([p["probabilities"] for p in tower_preds], axis=0), + } + stacked_labels = tf.concat(labels, axis=0) + metrics = {"accuracy": tf.metrics.accuracy(stacked_labels, predictions["classes"])} + + return tf.estimator.EstimatorSpec( + mode=mode, + predictions=predictions, + loss=loss, + train_op=train_op, + training_hooks=train_hooks, + eval_metric_ops=metrics, + ) + + return _resnet_model_fn + + +def _tower_fn( + is_training, + weight_decay, + feature, + label, + data_format, + num_layers, + batch_norm_decay, + batch_norm_epsilon, +): + """Build computation tower (Resnet). + + Args: + is_training: true if is training graph. + weight_decay: weight regularization strength, a float. + feature: a Tensor. + label: a Tensor. + data_format: channels_last (NHWC) or channels_first (NCHW). + num_layers: number of layers, an int. + batch_norm_decay: decay for batch normalization, a float. + batch_norm_epsilon: epsilon for batch normalization, a float. + + Returns: + A tuple with the loss for the tower, the gradients and parameters, and + predictions. + + """ + model = cifar10_model.ResNetCifar10( + num_layers, + batch_norm_decay=batch_norm_decay, + batch_norm_epsilon=batch_norm_epsilon, + is_training=is_training, + data_format=data_format, + ) + logits = model.forward_pass(feature, input_data_format="channels_last") + tower_pred = { + "classes": tf.argmax(input=logits, axis=1), + "probabilities": tf.nn.softmax(logits), + } + + tower_loss = tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=label) + tower_loss = tf.reduce_mean(tower_loss) - def _resnet_model_fn(features, labels, mode, params): - """Resnet model body. + model_params = tf.trainable_variables() + tower_loss += weight_decay * tf.add_n([tf.nn.l2_loss(v) for v in model_params]) - Support single host, one or more GPU training. Parameter distribution can - be either one of the following scheme. - 1. 
CPU is the parameter server and manages gradient updates. - 2. Parameters are distributed evenly across all GPUs, and the first GPU - manages gradient updates. + tower_grad = tf.gradients(tower_loss, model_params) + + return tower_loss, zip(tower_grad, model_params), tower_pred + + +def input_fn(data_dir, subset, num_shards, batch_size, use_distortion_for_training=True): + """Create input graph for model. Args: - features: a list of tensors, one for each tower - labels: a list of tensors, one for each tower - mode: ModeKeys.TRAIN or EVAL - params: Hyperparameters suitable for tuning + data_dir: Directory where TFRecords representing the dataset are located. + subset: one of 'train', 'validate' and 'eval'. + num_shards: num of towers participating in data-parallel training. + batch_size: total batch size for training to be divided by the number of + shards. + use_distortion_for_training: True to use distortions. Returns: - A EstimatorSpec object. + two lists of tensors for features and labels, each of num_shards length. """ - is_training = (mode == tf.estimator.ModeKeys.TRAIN) - weight_decay = params.weight_decay - momentum = params.momentum - - tower_features = features - tower_labels = labels - tower_losses = [] - tower_gradvars = [] - tower_preds = [] - - # channels first (NCHW) is normally optimal on GPU and channels last (NHWC) - # on CPU. The exception is Intel MKL on CPU which is optimal with - # channels_last. - data_format = params.data_format - if not data_format: - if num_gpus == 0: - data_format = 'channels_last' - else: - data_format = 'channels_first' - - if num_gpus == 0: - num_devices = 1 - device_type = 'cpu' - else: - num_devices = num_gpus - device_type = 'gpu' - - for i in range(num_devices): - worker_device = '/{}:{}'.format(device_type, i) - if variable_strategy == 'CPU': - device_setter = cifar10_utils.local_device_setter( - worker_device=worker_device) - elif variable_strategy == 'GPU': - device_setter = cifar10_utils.local_device_setter( - ps_device_type='gpu', - worker_device=worker_device, - ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy( - num_gpus, tf.contrib.training.byte_size_load_fn)) - with tf.variable_scope('resnet', reuse=bool(i != 0)): - with tf.name_scope('tower_%d' % i) as name_scope: - with tf.device(device_setter): - loss, gradvars, preds = _tower_fn( - is_training, weight_decay, tower_features[i], tower_labels[i], - data_format, params.num_layers, params.batch_norm_decay, - params.batch_norm_epsilon) - tower_losses.append(loss) - tower_gradvars.append(gradvars) - tower_preds.append(preds) - if i == 0: - # Only trigger batch_norm moving mean and variance update from - # the 1st tower. Ideally, we should grab the updates from all - # towers but these stats accumulate extremely fast so we can - # ignore the other stats from the other towers without - # significant detriment. - update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, - name_scope) - - # Now compute global loss and gradients. - gradvars = [] - with tf.name_scope('gradient_averaging'): - all_grads = {} - for grad, var in itertools.chain(*tower_gradvars): - if grad is not None: - all_grads.setdefault(var, []).append(grad) - for var, grads in six.iteritems(all_grads): - # Average gradients on the same device as the variables - # to which they apply. - with tf.device(var.device): - if len(grads) == 1: - avg_grad = grads[0] - else: - avg_grad = tf.multiply(tf.add_n(grads), 1. 
/ len(grads)) - gradvars.append((avg_grad, var)) - - # Device that runs the ops to apply global gradient updates. - consolidation_device = '/gpu:0' if variable_strategy == 'GPU' else '/cpu:0' - with tf.device(consolidation_device): - # Suggested learning rate scheduling from - # https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155 - num_batches_per_epoch = cifar10.Cifar10DataSet.num_examples_per_epoch( - 'train') // (params.train_batch_size * num_workers) - boundaries = [ - num_batches_per_epoch * x - for x in np.array([82, 123, 300], dtype=np.int64) - ] - staged_lr = [params.learning_rate * x for x in [1, 0.1, 0.01, 0.002]] - - learning_rate = tf.train.piecewise_constant(tf.train.get_global_step(), - boundaries, staged_lr) - - loss = tf.reduce_mean(tower_losses, name='loss') - - examples_sec_hook = cifar10_utils.ExamplesPerSecondHook( - params.train_batch_size, every_n_steps=10) - - tensors_to_log = {'learning_rate': learning_rate, 'loss': loss} - - logging_hook = tf.train.LoggingTensorHook( - tensors=tensors_to_log, every_n_iter=100) - - train_hooks = [logging_hook, examples_sec_hook] - - optimizer = tf.train.MomentumOptimizer( - learning_rate=learning_rate, momentum=momentum) - - if params.sync: - optimizer = tf.train.SyncReplicasOptimizer( - optimizer, replicas_to_aggregate=num_workers) - sync_replicas_hook = optimizer.make_session_run_hook(params.is_chief) - train_hooks.append(sync_replicas_hook) - - # Create single grouped train op - train_op = [ - optimizer.apply_gradients( - gradvars, global_step=tf.train.get_global_step()) - ] - train_op.extend(update_ops) - train_op = tf.group(*train_op) - - predictions = { - 'classes': - tf.concat([p['classes'] for p in tower_preds], axis=0), - 'probabilities': - tf.concat([p['probabilities'] for p in tower_preds], axis=0) - } - stacked_labels = tf.concat(labels, axis=0) - metrics = { - 'accuracy': - tf.metrics.accuracy(stacked_labels, predictions['classes']) - } - - return tf.estimator.EstimatorSpec( - mode=mode, - predictions=predictions, - loss=loss, - train_op=train_op, - training_hooks=train_hooks, - eval_metric_ops=metrics) - - return _resnet_model_fn - - -def _tower_fn(is_training, weight_decay, feature, label, data_format, - num_layers, batch_norm_decay, batch_norm_epsilon): - """Build computation tower (Resnet). - - Args: - is_training: true if is training graph. - weight_decay: weight regularization strength, a float. - feature: a Tensor. - label: a Tensor. - data_format: channels_last (NHWC) or channels_first (NCHW). - num_layers: number of layers, an int. - batch_norm_decay: decay for batch normalization, a float. - batch_norm_epsilon: epsilon for batch normalization, a float. - - Returns: - A tuple with the loss for the tower, the gradients and parameters, and - predictions. 
- - """ - model = cifar10_model.ResNetCifar10( - num_layers, - batch_norm_decay=batch_norm_decay, - batch_norm_epsilon=batch_norm_epsilon, - is_training=is_training, - data_format=data_format) - logits = model.forward_pass(feature, input_data_format='channels_last') - tower_pred = { - 'classes': tf.argmax(input=logits, axis=1), - 'probabilities': tf.nn.softmax(logits) - } - - tower_loss = tf.losses.sparse_softmax_cross_entropy( - logits=logits, labels=label) - tower_loss = tf.reduce_mean(tower_loss) - - model_params = tf.trainable_variables() - tower_loss += weight_decay * tf.add_n( - [tf.nn.l2_loss(v) for v in model_params]) - - tower_grad = tf.gradients(tower_loss, model_params) - - return tower_loss, zip(tower_grad, model_params), tower_pred - - -def input_fn(data_dir, - subset, - num_shards, - batch_size, - use_distortion_for_training=True): - """Create input graph for model. - - Args: - data_dir: Directory where TFRecords representing the dataset are located. - subset: one of 'train', 'validate' and 'eval'. - num_shards: num of towers participating in data-parallel training. - batch_size: total batch size for training to be divided by the number of - shards. - use_distortion_for_training: True to use distortions. - Returns: - two lists of tensors for features and labels, each of num_shards length. - """ - with tf.device('/cpu:0'): - use_distortion = subset == 'train' and use_distortion_for_training - dataset = cifar10.Cifar10DataSet(data_dir, subset, use_distortion) - image_batch, label_batch = dataset.make_batch(batch_size) - if num_shards <= 1: - # No GPU available or only 1 GPU. - return [image_batch], [label_batch] - - # Note that passing num=batch_size is safe here, even though - # dataset.batch(batch_size) can, in some cases, return fewer than batch_size - # examples. This is because it does so only when repeating for a limited - # number of epochs, but our dataset repeats forever. - image_batch = tf.unstack(image_batch, num=batch_size, axis=0) - label_batch = tf.unstack(label_batch, num=batch_size, axis=0) - feature_shards = [[] for i in range(num_shards)] - label_shards = [[] for i in range(num_shards)] - for i in xrange(batch_size): - idx = i % num_shards - feature_shards[idx].append(image_batch[i]) - label_shards[idx].append(label_batch[i]) - feature_shards = [tf.parallel_stack(x) for x in feature_shards] - label_shards = [tf.parallel_stack(x) for x in label_shards] - return feature_shards, label_shards - - -def get_experiment_fn(data_dir, - num_gpus, - variable_strategy, - use_distortion_for_training=True): - """Returns an Experiment function. - - Experiments perform training on several workers in parallel, - in other words experiments know how to invoke train and eval in a sensible - fashion for distributed training. Arguments passed directly to this - function are not tunable, all other arguments should be passed within - tf.HParams, passed to the enclosed function. - - Args: - data_dir: str. Location of the data for input_fns. - num_gpus: int. Number of GPUs on each worker. - variable_strategy: String. CPU to use CPU as the parameter server - and GPU to use the GPUs as the parameter server. - use_distortion_for_training: bool. See cifar10.Cifar10DataSet. - Returns: - A function (tf.estimator.RunConfig, tf.contrib.training.HParams) -> - tf.contrib.learn.Experiment. - - Suitable for use by tf.contrib.learn.learn_runner, which will run various - methods on Experiment (train, evaluate) based on information - about the current runner in `run_config`. 
- """ - - def _experiment_fn(run_config, hparams): - """Returns an Experiment.""" - # Create estimator. - train_input_fn = functools.partial( - input_fn, - data_dir, - subset='train', - num_shards=num_gpus, - batch_size=hparams.train_batch_size, - use_distortion_for_training=use_distortion_for_training) - - eval_input_fn = functools.partial( - input_fn, - data_dir, - subset='eval', - batch_size=hparams.eval_batch_size, - num_shards=num_gpus) - - num_eval_examples = cifar10.Cifar10DataSet.num_examples_per_epoch('eval') - if num_eval_examples % hparams.eval_batch_size != 0: - raise ValueError( - 'validation set size must be multiple of eval_batch_size') - - train_steps = hparams.train_steps - eval_steps = num_eval_examples // hparams.eval_batch_size - - classifier = tf.estimator.Estimator( - model_fn=get_model_fn(num_gpus, variable_strategy, - run_config.num_worker_replicas or 1), - config=run_config, - params=hparams) - - # Create experiment. - return tf.contrib.learn.Experiment( - classifier, - train_input_fn=train_input_fn, - eval_input_fn=eval_input_fn, - train_steps=train_steps, - eval_steps=eval_steps) - - return _experiment_fn - - -def main(job_dir, data_dir, num_gpus, variable_strategy, - use_distortion_for_training, log_device_placement, num_intra_threads, - **hparams): - # The env variable is on deprecation path, default is set to off. - os.environ['TF_SYNC_ON_FINISH'] = '0' - os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' - - # Session configuration. - sess_config = tf.ConfigProto( - allow_soft_placement=True, - log_device_placement=log_device_placement, - intra_op_parallelism_threads=num_intra_threads, - gpu_options=tf.GPUOptions(force_gpu_compatible=True)) - - config = cifar10_utils.RunConfig( - session_config=sess_config, model_dir=job_dir) - tf.contrib.learn.learn_runner.run( - get_experiment_fn(data_dir, num_gpus, variable_strategy, - use_distortion_for_training), - run_config=config, - hparams=tf.contrib.training.HParams( - is_chief=config.is_chief, - **hparams)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--data-dir', - type=str, - required=True, - help='The directory where the CIFAR-10 input data is stored.') - parser.add_argument( - '--job-dir', - type=str, - required=True, - help='The directory where the model will be stored.') - parser.add_argument( - '--variable-strategy', - choices=['CPU', 'GPU'], - type=str, - default='CPU', - help='Where to locate variable operations') - parser.add_argument( - '--num-gpus', - type=int, - default=1, - help='The number of gpus used. 
Uses only CPU if set to 0.') - parser.add_argument( - '--num-layers', - type=int, - default=44, - help='The number of layers of the model.') - parser.add_argument( - '--train-steps', - type=int, - default=80000, - help='The number of steps to use for training.') - parser.add_argument( - '--train-batch-size', - type=int, - default=128, - help='Batch size for training.') - parser.add_argument( - '--eval-batch-size', - type=int, - default=100, - help='Batch size for validation.') - parser.add_argument( - '--momentum', - type=float, - default=0.9, - help='Momentum for MomentumOptimizer.') - parser.add_argument( - '--weight-decay', - type=float, - default=2e-4, - help='Weight decay for convolutions.') - parser.add_argument( - '--learning-rate', - type=float, - default=0.1, - help="""\ + with tf.device("/cpu:0"): + use_distortion = subset == "train" and use_distortion_for_training + dataset = cifar10.Cifar10DataSet(data_dir, subset, use_distortion) + image_batch, label_batch = dataset.make_batch(batch_size) + if num_shards <= 1: + # No GPU available or only 1 GPU. + return [image_batch], [label_batch] + + # Note that passing num=batch_size is safe here, even though + # dataset.batch(batch_size) can, in some cases, return fewer than batch_size + # examples. This is because it does so only when repeating for a limited + # number of epochs, but our dataset repeats forever. + image_batch = tf.unstack(image_batch, num=batch_size, axis=0) + label_batch = tf.unstack(label_batch, num=batch_size, axis=0) + feature_shards = [[] for i in range(num_shards)] + label_shards = [[] for i in range(num_shards)] + for i in xrange(batch_size): + idx = i % num_shards + feature_shards[idx].append(image_batch[i]) + label_shards[idx].append(label_batch[i]) + feature_shards = [tf.parallel_stack(x) for x in feature_shards] + label_shards = [tf.parallel_stack(x) for x in label_shards] + return feature_shards, label_shards + + +def get_experiment_fn(data_dir, num_gpus, variable_strategy, use_distortion_for_training=True): + """Returns an Experiment function. + + Experiments perform training on several workers in parallel, + in other words experiments know how to invoke train and eval in a sensible + fashion for distributed training. Arguments passed directly to this + function are not tunable, all other arguments should be passed within + tf.HParams, passed to the enclosed function. + + Args: + data_dir: str. Location of the data for input_fns. + num_gpus: int. Number of GPUs on each worker. + variable_strategy: String. CPU to use CPU as the parameter server + and GPU to use the GPUs as the parameter server. + use_distortion_for_training: bool. See cifar10.Cifar10DataSet. + Returns: + A function (tf.estimator.RunConfig, tf.contrib.training.HParams) -> + tf.contrib.learn.Experiment. + + Suitable for use by tf.contrib.learn.learn_runner, which will run various + methods on Experiment (train, evaluate) based on information + about the current runner in `run_config`. + """ + + def _experiment_fn(run_config, hparams): + """Returns an Experiment.""" + # Create estimator. 
+ train_input_fn = functools.partial( + input_fn, + data_dir, + subset="train", + num_shards=num_gpus, + batch_size=hparams.train_batch_size, + use_distortion_for_training=use_distortion_for_training, + ) + + eval_input_fn = functools.partial( + input_fn, + data_dir, + subset="eval", + batch_size=hparams.eval_batch_size, + num_shards=num_gpus, + ) + + num_eval_examples = cifar10.Cifar10DataSet.num_examples_per_epoch("eval") + if num_eval_examples % hparams.eval_batch_size != 0: + raise ValueError("validation set size must be multiple of eval_batch_size") + + train_steps = hparams.train_steps + eval_steps = num_eval_examples // hparams.eval_batch_size + + classifier = tf.estimator.Estimator( + model_fn=get_model_fn(num_gpus, variable_strategy, run_config.num_worker_replicas or 1), + config=run_config, + params=hparams, + ) + + # Create experiment. + return tf.contrib.learn.Experiment( + classifier, + train_input_fn=train_input_fn, + eval_input_fn=eval_input_fn, + train_steps=train_steps, + eval_steps=eval_steps, + ) + + return _experiment_fn + + +def main( + job_dir, + data_dir, + num_gpus, + variable_strategy, + use_distortion_for_training, + log_device_placement, + num_intra_threads, + **hparams +): + # The env variable is on deprecation path, default is set to off. + os.environ["TF_SYNC_ON_FINISH"] = "0" + os.environ["TF_ENABLE_WINOGRAD_NONFUSED"] = "1" + + # Session configuration. + sess_config = tf.ConfigProto( + allow_soft_placement=True, + log_device_placement=log_device_placement, + intra_op_parallelism_threads=num_intra_threads, + gpu_options=tf.GPUOptions(force_gpu_compatible=True), + ) + + config = cifar10_utils.RunConfig(session_config=sess_config, model_dir=job_dir) + tf.contrib.learn.learn_runner.run( + get_experiment_fn(data_dir, num_gpus, variable_strategy, use_distortion_for_training), + run_config=config, + hparams=tf.contrib.training.HParams(is_chief=config.is_chief, **hparams), + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--data-dir", + type=str, + required=True, + help="The directory where the CIFAR-10 input data is stored.", + ) + parser.add_argument( + "--job-dir", type=str, required=True, help="The directory where the model will be stored." + ) + parser.add_argument( + "--variable-strategy", + choices=["CPU", "GPU"], + type=str, + default="CPU", + help="Where to locate variable operations", + ) + parser.add_argument( + "--num-gpus", + type=int, + default=1, + help="The number of gpus used. Uses only CPU if set to 0.", + ) + parser.add_argument( + "--num-layers", type=int, default=44, help="The number of layers of the model." + ) + parser.add_argument( + "--train-steps", type=int, default=80000, help="The number of steps to use for training." + ) + parser.add_argument( + "--train-batch-size", type=int, default=128, help="Batch size for training." + ) + parser.add_argument( + "--eval-batch-size", type=int, default=100, help="Batch size for validation." + ) + parser.add_argument( + "--momentum", type=float, default=0.9, help="Momentum for MomentumOptimizer." + ) + parser.add_argument( + "--weight-decay", type=float, default=2e-4, help="Weight decay for convolutions." + ) + parser.add_argument( + "--learning-rate", + type=float, + default=0.1, + help="""\ This is the initial learning rate value. The learning rate will decrease during training. 
For more details check the model_fn implementation in this file.\ - """) - parser.add_argument( - '--use-distortion-for-training', - type=bool, - default=True, - help='If doing image distortion for training.') - parser.add_argument( - '--sync', - action='store_true', - default=False, - help="""\ + """, + ) + parser.add_argument( + "--use-distortion-for-training", + type=bool, + default=True, + help="If doing image distortion for training.", + ) + parser.add_argument( + "--sync", + action="store_true", + default=False, + help="""\ If present when running in a distributed environment will run on sync mode.\ - """) - parser.add_argument( - '--num-intra-threads', - type=int, - default=0, - help="""\ + """, + ) + parser.add_argument( + "--num-intra-threads", + type=int, + default=0, + help="""\ Number of threads to use for intra-op parallelism. When training on CPU set to 0 to have the system pick the appropriate number or alternatively set it to the number of physical CPU cores.\ - """) - parser.add_argument( - '--num-inter-threads', - type=int, - default=0, - help="""\ + """, + ) + parser.add_argument( + "--num-inter-threads", + type=int, + default=0, + help="""\ Number of threads to use for inter-op parallelism. If set to 0, the system will pick an appropriate number.\ - """) - parser.add_argument( - '--data-format', - type=str, - default=None, - help="""\ + """, + ) + parser.add_argument( + "--data-format", + type=str, + default=None, + help="""\ If not set, the data format best for the training device is used. Allowed values: channels_first (NCHW) channels_last (NHWC).\ - """) - parser.add_argument( - '--log-device-placement', - action='store_true', - default=False, - help='Whether to log device placement.') - parser.add_argument( - '--batch-norm-decay', - type=float, - default=0.997, - help='Decay for batch norm.') - parser.add_argument( - '--batch-norm-epsilon', - type=float, - default=1e-5, - help='Epsilon for batch norm.') - args = parser.parse_args() - - if args.num_gpus > 0: - assert tf.test.is_gpu_available(), "Requested GPUs but none found." - if args.num_gpus < 0: - raise ValueError( - 'Invalid GPU count: \"--num-gpus\" must be 0 or a positive integer.') - if args.num_gpus == 0 and args.variable_strategy == 'GPU': - raise ValueError('num-gpus=0, CPU must be used as parameter server. Set' - '--variable-strategy=CPU.') - if (args.num_layers - 2) % 6 != 0: - raise ValueError('Invalid --num-layers parameter.') - if args.num_gpus != 0 and args.train_batch_size % args.num_gpus != 0: - raise ValueError('--train-batch-size must be multiple of --num-gpus.') - if args.num_gpus != 0 and args.eval_batch_size % args.num_gpus != 0: - raise ValueError('--eval-batch-size must be multiple of --num-gpus.') - - main(**vars(args)) + """, + ) + parser.add_argument( + "--log-device-placement", + action="store_true", + default=False, + help="Whether to log device placement.", + ) + parser.add_argument( + "--batch-norm-decay", type=float, default=0.997, help="Decay for batch norm." + ) + parser.add_argument( + "--batch-norm-epsilon", type=float, default=1e-5, help="Epsilon for batch norm." + ) + args = parser.parse_args() + + if args.num_gpus > 0: + assert tf.test.is_gpu_available(), "Requested GPUs but none found." + if args.num_gpus < 0: + raise ValueError('Invalid GPU count: "--num-gpus" must be 0 or a positive integer.') + if args.num_gpus == 0 and args.variable_strategy == "GPU": + raise ValueError( + "num-gpus=0, CPU must be used as parameter server. Set--variable-strategy=CPU." 
+ ) + if (args.num_layers - 2) % 6 != 0: + raise ValueError("Invalid --num-layers parameter.") + if args.num_gpus != 0 and args.train_batch_size % args.num_gpus != 0: + raise ValueError("--train-batch-size must be multiple of --num-gpus.") + if args.num_gpus != 0 and args.eval_batch_size % args.num_gpus != 0: + raise ValueError("--eval-batch-size must be multiple of --num-gpus.") + + main(**vars(args)) diff --git a/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_model.py b/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_model.py index d67c233dbb..019f28b82f 100644 --- a/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_model.py +++ b/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_model.py @@ -13,68 +13,64 @@ # limitations under the License. # ============================================================================== """Model class for Cifar10 Dataset.""" -from __future__ import division -from __future__ import print_function - -import tensorflow as tf +from __future__ import division, print_function import model_base +import tensorflow as tf class ResNetCifar10(model_base.ResNet): - """Cifar10 model with ResNetV1 and basic residual block.""" + """Cifar10 model with ResNetV1 and basic residual block.""" - def __init__(self, - num_layers, - is_training, - batch_norm_decay, - batch_norm_epsilon, - data_format='channels_first'): - super(ResNetCifar10, self).__init__( + def __init__( + self, + num_layers, is_training, - data_format, batch_norm_decay, - batch_norm_epsilon - ) - self.n = (num_layers - 2) // 6 - # Add one in case label starts with 1. No impact if label starts with 0. - self.num_classes = 10 + 1 - self.filters = [16, 16, 32, 64] - self.strides = [1, 2, 2] + batch_norm_epsilon, + data_format="channels_first", + ): + super(ResNetCifar10, self).__init__( + is_training, data_format, batch_norm_decay, batch_norm_epsilon + ) + self.n = (num_layers - 2) // 6 + # Add one in case label starts with 1. No impact if label starts with 0. + self.num_classes = 10 + 1 + self.filters = [16, 16, 32, 64] + self.strides = [1, 2, 2] - def forward_pass(self, x, input_data_format='channels_last'): - """Build the core model within the graph.""" - if self._data_format != input_data_format: - if input_data_format == 'channels_last': - # Computation requires channels_first. - x = tf.transpose(x, [0, 3, 1, 2]) - else: - # Computation requires channels_last. - x = tf.transpose(x, [0, 2, 3, 1]) + def forward_pass(self, x, input_data_format="channels_last"): + """Build the core model within the graph.""" + if self._data_format != input_data_format: + if input_data_format == "channels_last": + # Computation requires channels_first. + x = tf.transpose(x, [0, 3, 1, 2]) + else: + # Computation requires channels_last. + x = tf.transpose(x, [0, 2, 3, 1]) - # Image standardization. - x = x / 128 - 1 + # Image standardization. + x = x / 128 - 1 - x = self._conv(x, 3, 16, 1) - x = self._batch_norm(x) - x = self._relu(x) + x = self._conv(x, 3, 16, 1) + x = self._batch_norm(x) + x = self._relu(x) - # Use basic (non-bottleneck) block and ResNet V1 (post-activation). - res_func = self._residual_v1 + # Use basic (non-bottleneck) block and ResNet V1 (post-activation). + res_func = self._residual_v1 - # 3 stages of block stacking. 
- for i in range(3): - with tf.name_scope('stage'): - for j in range(self.n): - if j == 0: - # First block in a stage, filters and strides may change. - x = res_func(x, 3, self.filters[i], self.filters[i + 1], - self.strides[i]) - else: - # Following blocks in a stage, constant filters and unit stride. - x = res_func(x, 3, self.filters[i + 1], self.filters[i + 1], 1) + # 3 stages of block stacking. + for i in range(3): + with tf.name_scope("stage"): + for j in range(self.n): + if j == 0: + # First block in a stage, filters and strides may change. + x = res_func(x, 3, self.filters[i], self.filters[i + 1], self.strides[i]) + else: + # Following blocks in a stage, constant filters and unit stride. + x = res_func(x, 3, self.filters[i + 1], self.filters[i + 1], 1) - x = self._global_avg_pool(x) - x = self._fully_connected(x, self.num_classes) + x = self._global_avg_pool(x) + x = self._fully_connected(x, self.num_classes) - return x + return x diff --git a/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_utils.py b/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_utils.py index d3d29dd82e..56d7d9185f 100644 --- a/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_utils.py +++ b/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_utils.py @@ -13,141 +13,144 @@ # limitations under the License. # ============================================================================== import collections -import six +import six import tensorflow as tf - -from tensorflow.python.platform import tf_logging as logging +from tensorflow.contrib.learn.python.learn import run_config from tensorflow.core.framework import node_def_pb2 from tensorflow.python.framework import device as pydev -from tensorflow.python.training import basic_session_run_hooks -from tensorflow.python.training import session_run_hook -from tensorflow.python.training import training_util -from tensorflow.python.training import device_setter -from tensorflow.contrib.learn.python.learn import run_config +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.training import ( + basic_session_run_hooks, + device_setter, + session_run_hook, + training_util, +) # TODO(b/64848083) Remove once uid bug is fixed -class RunConfig(tf.contrib.learn.RunConfig): - def uid(self, whitelist=None): - """Generates a 'Unique Identifier' based on all internal fields. - Caller should use the uid string to check `RunConfig` instance integrity - in one session use, but should not rely on the implementation details, which - is subject to change. - Args: - whitelist: A list of the string names of the properties uid should not - include. If `None`, defaults to `_DEFAULT_UID_WHITE_LIST`, which - includes most properties user allowed to change. - Returns: - A uid string. - """ - if whitelist is None: - whitelist = run_config._DEFAULT_UID_WHITE_LIST - - state = {k: v for k, v in self.__dict__.items() if not k.startswith('__')} - # Pop out the keys in whitelist. - for k in whitelist: - state.pop('_' + k, None) - - ordered_state = collections.OrderedDict( - sorted(state.items(), key=lambda t: t[0])) - # For class instance without __repr__, some special cares are required. - # Otherwise, the object address will be used. 
- if '_cluster_spec' in ordered_state: - ordered_state['_cluster_spec'] = collections.OrderedDict( - sorted(ordered_state['_cluster_spec'].as_dict().items(), - key=lambda t: t[0]) - ) - return ', '.join( - '%s=%r' % (k, v) for (k, v) in six.iteritems(ordered_state)) +class RunConfig(tf.contrib.learn.RunConfig): + def uid(self, whitelist=None): + """Generates a 'Unique Identifier' based on all internal fields. + Caller should use the uid string to check `RunConfig` instance integrity + in one session use, but should not rely on the implementation details, which + is subject to change. + Args: + whitelist: A list of the string names of the properties uid should not + include. If `None`, defaults to `_DEFAULT_UID_WHITE_LIST`, which + includes most properties user allowed to change. + Returns: + A uid string. + """ + if whitelist is None: + whitelist = run_config._DEFAULT_UID_WHITE_LIST + + state = {k: v for k, v in self.__dict__.items() if not k.startswith("__")} + # Pop out the keys in whitelist. + for k in whitelist: + state.pop("_" + k, None) + + ordered_state = collections.OrderedDict(sorted(state.items(), key=lambda t: t[0])) + # For class instance without __repr__, some special cares are required. + # Otherwise, the object address will be used. + if "_cluster_spec" in ordered_state: + ordered_state["_cluster_spec"] = collections.OrderedDict( + sorted(ordered_state["_cluster_spec"].as_dict().items(), key=lambda t: t[0]) + ) + return ", ".join("%s=%r" % (k, v) for (k, v) in six.iteritems(ordered_state)) class ExamplesPerSecondHook(session_run_hook.SessionRunHook): - """Hook to print out examples per second. + """Hook to print out examples per second. Total time is tracked and then divided by the total number of steps to get the average step time and then batch_size is used to determine the running average of examples per second. The examples per second for the most recent interval is also logged. - """ - - def __init__( - self, - batch_size, - every_n_steps=100, - every_n_secs=None,): - """Initializer for ExamplesPerSecondHook. - - Args: - batch_size: Total batch size used to calculate examples/second from - global time. - every_n_steps: Log stats every n steps. - every_n_secs: Log stats every n seconds. 
""" - if (every_n_steps is None) == (every_n_secs is None): - raise ValueError('exactly one of every_n_steps' - ' and every_n_secs should be provided.') - self._timer = basic_session_run_hooks.SecondOrStepTimer( - every_steps=every_n_steps, every_secs=every_n_secs) - - self._step_train_time = 0 - self._total_steps = 0 - self._batch_size = batch_size - - def begin(self): - self._global_step_tensor = training_util.get_global_step() - if self._global_step_tensor is None: - raise RuntimeError( - 'Global step should be created to use StepCounterHook.') - - def before_run(self, run_context): # pylint: disable=unused-argument - return basic_session_run_hooks.SessionRunArgs(self._global_step_tensor) - - def after_run(self, run_context, run_values): - _ = run_context - - global_step = run_values.results - if self._timer.should_trigger_for_step(global_step): - elapsed_time, elapsed_steps = self._timer.update_last_triggered_step( - global_step) - if elapsed_time is not None: - steps_per_sec = elapsed_steps / elapsed_time - self._step_train_time += elapsed_time - self._total_steps += elapsed_steps - - average_examples_per_sec = self._batch_size * ( - self._total_steps / self._step_train_time) - current_examples_per_sec = steps_per_sec * self._batch_size - # Average examples/sec followed by current examples/sec - logging.info('%s: %g (%g), step = %g', 'Average examples/sec', - average_examples_per_sec, current_examples_per_sec, - self._total_steps) - -def local_device_setter(num_devices=1, - ps_device_type='cpu', - worker_device='/cpu:0', - ps_ops=None, - ps_strategy=None): - if ps_ops == None: - ps_ops = ['Variable', 'VariableV2', 'VarHandleOp'] - - if ps_strategy is None: - ps_strategy = device_setter._RoundRobinStrategy(num_devices) - if not six.callable(ps_strategy): - raise TypeError("ps_strategy must be callable") - - def _local_device_chooser(op): - current_device = pydev.DeviceSpec.from_string(op.device or "") - - node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def - if node_def.op in ps_ops: - ps_device_spec = pydev.DeviceSpec.from_string( - '/{}:{}'.format(ps_device_type, ps_strategy(op))) - - ps_device_spec.merge_from(current_device) - return ps_device_spec.to_string() - else: - worker_device_spec = pydev.DeviceSpec.from_string(worker_device or "") - worker_device_spec.merge_from(current_device) - return worker_device_spec.to_string() - return _local_device_chooser + + def __init__( + self, + batch_size, + every_n_steps=100, + every_n_secs=None, + ): + """Initializer for ExamplesPerSecondHook. + + Args: + batch_size: Total batch size used to calculate examples/second from + global time. + every_n_steps: Log stats every n steps. + every_n_secs: Log stats every n seconds. 
+ """ + if (every_n_steps is None) == (every_n_secs is None): + raise ValueError("exactly one of every_n_steps and every_n_secs should be provided.") + self._timer = basic_session_run_hooks.SecondOrStepTimer( + every_steps=every_n_steps, every_secs=every_n_secs + ) + + self._step_train_time = 0 + self._total_steps = 0 + self._batch_size = batch_size + + def begin(self): + self._global_step_tensor = training_util.get_global_step() + if self._global_step_tensor is None: + raise RuntimeError("Global step should be created to use StepCounterHook.") + + def before_run(self, run_context): # pylint: disable=unused-argument + return basic_session_run_hooks.SessionRunArgs(self._global_step_tensor) + + def after_run(self, run_context, run_values): + _ = run_context + + global_step = run_values.results + if self._timer.should_trigger_for_step(global_step): + elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(global_step) + if elapsed_time is not None: + steps_per_sec = elapsed_steps / elapsed_time + self._step_train_time += elapsed_time + self._total_steps += elapsed_steps + + average_examples_per_sec = self._batch_size * ( + self._total_steps / self._step_train_time + ) + current_examples_per_sec = steps_per_sec * self._batch_size + # Average examples/sec followed by current examples/sec + logging.info( + "%s: %g (%g), step = %g", + "Average examples/sec", + average_examples_per_sec, + current_examples_per_sec, + self._total_steps, + ) + + +def local_device_setter( + num_devices=1, ps_device_type="cpu", worker_device="/cpu:0", ps_ops=None, ps_strategy=None +): + if ps_ops == None: + ps_ops = ["Variable", "VariableV2", "VarHandleOp"] + + if ps_strategy is None: + ps_strategy = device_setter._RoundRobinStrategy(num_devices) + if not six.callable(ps_strategy): + raise TypeError("ps_strategy must be callable") + + def _local_device_chooser(op): + current_device = pydev.DeviceSpec.from_string(op.device or "") + + node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def + if node_def.op in ps_ops: + ps_device_spec = pydev.DeviceSpec.from_string( + "/{}:{}".format(ps_device_type, ps_strategy(op)) + ) + + ps_device_spec.merge_from(current_device) + return ps_device_spec.to_string() + else: + worker_device_spec = pydev.DeviceSpec.from_string(worker_device or "") + worker_device_spec.merge_from(current_device) + return worker_device_spec.to_string() + + return _local_device_chooser diff --git a/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/generate_cifar10_tfrecords.py b/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/generate_cifar10_tfrecords.py index d1a599c31b..ca5fc95019 100644 --- a/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/generate_cifar10_tfrecords.py +++ b/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/generate_cifar10_tfrecords.py @@ -19,100 +19,97 @@ https://www.cs.toronto.edu/~kriz/cifar.html. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import argparse import os import sys - import tarfile + +import tensorflow as tf from six.moves import cPickle as pickle from six.moves import xrange # pylint: disable=redefined-builtin -import tensorflow as tf -CIFAR_FILENAME = 'cifar-10-python.tar.gz' -CIFAR_DOWNLOAD_URL = 'https://www.cs.toronto.edu/~kriz/' + CIFAR_FILENAME -CIFAR_LOCAL_FOLDER = 'cifar-10-batches-py' +CIFAR_FILENAME = "cifar-10-python.tar.gz" +CIFAR_DOWNLOAD_URL = "https://www.cs.toronto.edu/~kriz/" + CIFAR_FILENAME +CIFAR_LOCAL_FOLDER = "cifar-10-batches-py" def download_and_extract(data_dir): - # download CIFAR-10 if not already downloaded. - tf.contrib.learn.datasets.base.maybe_download(CIFAR_FILENAME, data_dir, - CIFAR_DOWNLOAD_URL) - tarfile.open(os.path.join(data_dir, CIFAR_FILENAME), - 'r:gz').extractall(data_dir) + # download CIFAR-10 if not already downloaded. + tf.contrib.learn.datasets.base.maybe_download(CIFAR_FILENAME, data_dir, CIFAR_DOWNLOAD_URL) + tarfile.open(os.path.join(data_dir, CIFAR_FILENAME), "r:gz").extractall(data_dir) def _int64_feature(value): - return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) + return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) def _bytes_feature(value): - return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) + return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) def _get_file_names(): - """Returns the file names expected to exist in the input_dir.""" - file_names = {} - file_names['train'] = ['data_batch_%d' % i for i in xrange(1, 5)] - file_names['validation'] = ['data_batch_5'] - file_names['eval'] = ['test_batch'] - return file_names + """Returns the file names expected to exist in the input_dir.""" + file_names = {} + file_names["train"] = ["data_batch_%d" % i for i in xrange(1, 5)] + file_names["validation"] = ["data_batch_5"] + file_names["eval"] = ["test_batch"] + return file_names def read_pickle_from_file(filename): - with tf.gfile.Open(filename, 'rb') as f: - if sys.version_info >= (3, 0): - data_dict = pickle.load(f, encoding='bytes') - else: - data_dict = pickle.load(f) - return data_dict + with tf.gfile.Open(filename, "rb") as f: + if sys.version_info >= (3, 0): + data_dict = pickle.load(f, encoding="bytes") + else: + data_dict = pickle.load(f) + return data_dict def convert_to_tfrecord(input_files, output_file): - """Converts a file to TFRecords.""" - print('Generating %s' % output_file) - with tf.python_io.TFRecordWriter(output_file) as record_writer: - for input_file in input_files: - data_dict = read_pickle_from_file(input_file) - data = data_dict[b'data'] - labels = data_dict[b'labels'] - num_entries_in_batch = len(labels) - for i in range(num_entries_in_batch): - example = tf.train.Example(features=tf.train.Features( - feature={ - 'image': _bytes_feature(data[i].tobytes()), - 'label': _int64_feature(labels[i]) - })) - record_writer.write(example.SerializeToString()) + """Converts a file to TFRecords.""" + print("Generating %s" % output_file) + with tf.python_io.TFRecordWriter(output_file) as record_writer: + for input_file in input_files: + data_dict = read_pickle_from_file(input_file) + data = data_dict[b"data"] + labels = data_dict[b"labels"] + num_entries_in_batch = len(labels) + for i in range(num_entries_in_batch): + example = tf.train.Example( + features=tf.train.Features( + feature={ + "image": 
_bytes_feature(data[i].tobytes()), + "label": _int64_feature(labels[i]), + } + ) + ) + record_writer.write(example.SerializeToString()) def main(data_dir): - print('Download from {} and extract.'.format(CIFAR_DOWNLOAD_URL)) - download_and_extract(data_dir) - file_names = _get_file_names() - input_dir = os.path.join(data_dir, CIFAR_LOCAL_FOLDER) - for mode, files in file_names.items(): - input_files = [os.path.join(input_dir, f) for f in files] - output_file = os.path.join(data_dir, mode + '.tfrecords') - try: - os.remove(output_file) - except OSError: - pass - # Convert to tf.train.Example and write the to TFRecords. - convert_to_tfrecord(input_files, output_file) - print('Done!') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--data-dir', - type=str, - default='', - help='Directory to download and extract CIFAR-10 to.') - - args = parser.parse_args() - main(args.data_dir) + print("Download from {} and extract.".format(CIFAR_DOWNLOAD_URL)) + download_and_extract(data_dir) + file_names = _get_file_names() + input_dir = os.path.join(data_dir, CIFAR_LOCAL_FOLDER) + for mode, files in file_names.items(): + input_files = [os.path.join(input_dir, f) for f in files] + output_file = os.path.join(data_dir, mode + ".tfrecords") + try: + os.remove(output_file) + except OSError: + pass + # Convert to tf.train.Example and write them to TFRecords. + convert_to_tfrecord(input_files, output_file) + print("Done!") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--data-dir", type=str, default="", help="Directory to download and extract CIFAR-10 to." + ) + + args = parser.parse_args() + main(args.data_dir) diff --git a/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/model_base.py b/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/model_base.py index 35e52b8355..9c468bcde0 100644 --- a/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/model_base.py +++ b/website/docs/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/model_base.py @@ -19,201 +19,189 @@ https://arxiv.org/pdf/1512.03385v1.pdf https://arxiv.org/pdf/1605.07146v1.pdf """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import tensorflow as tf class ResNet(object): - """ResNet model.""" - - def __init__(self, is_training, data_format, batch_norm_decay, batch_norm_epsilon): - """ResNet constructor. - - Args: - is_training: if build training or inference model. - data_format: the data_format used during computation. - one of 'channels_first' or 'channels_last'.
- """ - self._batch_norm_decay = batch_norm_decay - self._batch_norm_epsilon = batch_norm_epsilon - self._is_training = is_training - assert data_format in ('channels_first', 'channels_last') - self._data_format = data_format - - def forward_pass(self, x): - raise NotImplementedError( - 'forward_pass() is implemented in ResNet sub classes') - - def _residual_v1(self, - x, - kernel_size, - in_filter, - out_filter, - stride, - activate_before_residual=False): - """Residual unit with 2 sub layers, using Plan A for shortcut connection.""" - - del activate_before_residual - with tf.name_scope('residual_v1') as name_scope: - orig_x = x - - x = self._conv(x, kernel_size, out_filter, stride) - x = self._batch_norm(x) - x = self._relu(x) - - x = self._conv(x, kernel_size, out_filter, 1) - x = self._batch_norm(x) - - if in_filter != out_filter: - orig_x = self._avg_pool(orig_x, stride, stride) - pad = (out_filter - in_filter) // 2 - if self._data_format == 'channels_first': - orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]]) + """ResNet model.""" + + def __init__(self, is_training, data_format, batch_norm_decay, batch_norm_epsilon): + """ResNet constructor. + + Args: + is_training: if build training or inference model. + data_format: the data_format used during computation. + one of 'channels_first' or 'channels_last'. + """ + self._batch_norm_decay = batch_norm_decay + self._batch_norm_epsilon = batch_norm_epsilon + self._is_training = is_training + assert data_format in ("channels_first", "channels_last") + self._data_format = data_format + + def forward_pass(self, x): + raise NotImplementedError("forward_pass() is implemented in ResNet sub classes") + + def _residual_v1( + self, x, kernel_size, in_filter, out_filter, stride, activate_before_residual=False + ): + """Residual unit with 2 sub layers, using Plan A for shortcut connection.""" + + del activate_before_residual + with tf.name_scope("residual_v1") as name_scope: + orig_x = x + + x = self._conv(x, kernel_size, out_filter, stride) + x = self._batch_norm(x) + x = self._relu(x) + + x = self._conv(x, kernel_size, out_filter, 1) + x = self._batch_norm(x) + + if in_filter != out_filter: + orig_x = self._avg_pool(orig_x, stride, stride) + pad = (out_filter - in_filter) // 2 + if self._data_format == "channels_first": + orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]]) + else: + orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]]) + + x = self._relu(tf.add(x, orig_x)) + + tf.logging.info("image after unit %s: %s", name_scope, x.get_shape()) + return x + + def _residual_v2(self, x, in_filter, out_filter, stride, activate_before_residual=False): + """Residual unit with 2 sub layers with preactivation, plan A shortcut.""" + + with tf.name_scope("residual_v2") as name_scope: + if activate_before_residual: + x = self._batch_norm(x) + x = self._relu(x) + orig_x = x + else: + orig_x = x + x = self._batch_norm(x) + x = self._relu(x) + + x = self._conv(x, 3, out_filter, stride) + + x = self._batch_norm(x) + x = self._relu(x) + x = self._conv(x, 3, out_filter, [1, 1, 1, 1]) + + if in_filter != out_filter: + pad = (out_filter - in_filter) // 2 + orig_x = self._avg_pool(orig_x, stride, stride) + if self._data_format == "channels_first": + orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]]) + else: + orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]]) + + x = tf.add(x, orig_x) + + tf.logging.info("image after unit %s: %s", name_scope, x.get_shape()) + return x + + def _bottleneck_residual_v2( + self, x, 
in_filter, out_filter, stride, activate_before_residual=False + ): + """Bottleneck residual unit with 3 sub layers, plan B shortcut.""" + + with tf.name_scope("bottle_residual_v2") as name_scope: + if activate_before_residual: + x = self._batch_norm(x) + x = self._relu(x) + orig_x = x + else: + orig_x = x + x = self._batch_norm(x) + x = self._relu(x) + + x = self._conv(x, 1, out_filter // 4, stride, is_atrous=True) + + x = self._batch_norm(x) + x = self._relu(x) + # pad when stride isn't unit + x = self._conv(x, 3, out_filter // 4, 1, is_atrous=True) + + x = self._batch_norm(x) + x = self._relu(x) + x = self._conv(x, 1, out_filter, 1, is_atrous=True) + + if in_filter != out_filter: + orig_x = self._conv(orig_x, 1, out_filter, stride, is_atrous=True) + x = tf.add(x, orig_x) + + tf.logging.info("image after unit %s: %s", name_scope, x.get_shape()) + return x + + def _conv(self, x, kernel_size, filters, strides, is_atrous=False): + """Convolution.""" + + padding = "SAME" + if not is_atrous and strides > 1: + pad = kernel_size - 1 + pad_beg = pad // 2 + pad_end = pad - pad_beg + if self._data_format == "channels_first": + x = tf.pad(x, [[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]]) + else: + x = tf.pad(x, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]]) + padding = "VALID" + return tf.layers.conv2d( + inputs=x, + kernel_size=kernel_size, + filters=filters, + strides=strides, + padding=padding, + use_bias=False, + data_format=self._data_format, + ) + + def _batch_norm(self, x): + if self._data_format == "channels_first": + data_format = "NCHW" else: - orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]]) - - x = self._relu(tf.add(x, orig_x)) - - tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) - return x - - def _residual_v2(self, - x, - in_filter, - out_filter, - stride, - activate_before_residual=False): - """Residual unit with 2 sub layers with preactivation, plan A shortcut.""" - - with tf.name_scope('residual_v2') as name_scope: - if activate_before_residual: - x = self._batch_norm(x) - x = self._relu(x) - orig_x = x - else: - orig_x = x - x = self._batch_norm(x) - x = self._relu(x) - - x = self._conv(x, 3, out_filter, stride) - - x = self._batch_norm(x) - x = self._relu(x) - x = self._conv(x, 3, out_filter, [1, 1, 1, 1]) - - if in_filter != out_filter: - pad = (out_filter - in_filter) // 2 - orig_x = self._avg_pool(orig_x, stride, stride) - if self._data_format == 'channels_first': - orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]]) - else: - orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]]) - - x = tf.add(x, orig_x) - - tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) - return x - - def _bottleneck_residual_v2(self, - x, - in_filter, - out_filter, - stride, - activate_before_residual=False): - """Bottleneck residual unit with 3 sub layers, plan B shortcut.""" - - with tf.name_scope('bottle_residual_v2') as name_scope: - if activate_before_residual: - x = self._batch_norm(x) - x = self._relu(x) - orig_x = x - else: - orig_x = x - x = self._batch_norm(x) - x = self._relu(x) - - x = self._conv(x, 1, out_filter // 4, stride, is_atrous=True) - - x = self._batch_norm(x) - x = self._relu(x) - # pad when stride isn't unit - x = self._conv(x, 3, out_filter // 4, 1, is_atrous=True) - - x = self._batch_norm(x) - x = self._relu(x) - x = self._conv(x, 1, out_filter, 1, is_atrous=True) - - if in_filter != out_filter: - orig_x = self._conv(orig_x, 1, out_filter, stride, is_atrous=True) - x = tf.add(x, orig_x) 
- - tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) - return x - - def _conv(self, x, kernel_size, filters, strides, is_atrous=False): - """Convolution.""" - - padding = 'SAME' - if not is_atrous and strides > 1: - pad = kernel_size - 1 - pad_beg = pad // 2 - pad_end = pad - pad_beg - if self._data_format == 'channels_first': - x = tf.pad(x, [[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]]) - else: - x = tf.pad(x, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]]) - padding = 'VALID' - return tf.layers.conv2d( - inputs=x, - kernel_size=kernel_size, - filters=filters, - strides=strides, - padding=padding, - use_bias=False, - data_format=self._data_format) - - def _batch_norm(self, x): - if self._data_format == 'channels_first': - data_format = 'NCHW' - else: - data_format = 'NHWC' - return tf.contrib.layers.batch_norm( - x, - decay=self._batch_norm_decay, - center=True, - scale=True, - epsilon=self._batch_norm_epsilon, - is_training=self._is_training, - fused=True, - data_format=data_format) - - def _relu(self, x): - return tf.nn.relu(x) - - def _fully_connected(self, x, out_dim): - with tf.name_scope('fully_connected') as name_scope: - x = tf.layers.dense(x, out_dim) - - tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) - return x - - def _avg_pool(self, x, pool_size, stride): - with tf.name_scope('avg_pool') as name_scope: - x = tf.layers.average_pooling2d( - x, pool_size, stride, 'SAME', data_format=self._data_format) - - tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) - return x - - def _global_avg_pool(self, x): - with tf.name_scope('global_avg_pool') as name_scope: - assert x.get_shape().ndims == 4 - if self._data_format == 'channels_first': - x = tf.reduce_mean(x, [2, 3]) - else: - x = tf.reduce_mean(x, [1, 2]) - tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) - return x + data_format = "NHWC" + return tf.contrib.layers.batch_norm( + x, + decay=self._batch_norm_decay, + center=True, + scale=True, + epsilon=self._batch_norm_epsilon, + is_training=self._is_training, + fused=True, + data_format=data_format, + ) + + def _relu(self, x): + return tf.nn.relu(x) + + def _fully_connected(self, x, out_dim): + with tf.name_scope("fully_connected") as name_scope: + x = tf.layers.dense(x, out_dim) + + tf.logging.info("image after unit %s: %s", name_scope, x.get_shape()) + return x + + def _avg_pool(self, x, pool_size, stride): + with tf.name_scope("avg_pool") as name_scope: + x = tf.layers.average_pooling2d( + x, pool_size, stride, "SAME", data_format=self._data_format + ) + + tf.logging.info("image after unit %s: %s", name_scope, x.get_shape()) + return x + + def _global_avg_pool(self, x): + with tf.name_scope("global_avg_pool") as name_scope: + assert x.get_shape().ndims == 4 + if self._data_format == "channels_first": + x = tf.reduce_mean(x, [2, 3]) + else: + x = tf.reduce_mean(x, [1, 2]) + tf.logging.info("image after unit %s: %s", name_scope, x.get_shape()) + return x diff --git a/website/versioned_docs/version-0.6.0/userDocs/submarine-sdk/pysubmarine/development.md b/website/versioned_docs/version-0.6.0/userDocs/submarine-sdk/pysubmarine/development.md index 8975ce12c5..d2f18b6794 100644 --- a/website/versioned_docs/version-0.6.0/userDocs/submarine-sdk/pysubmarine/development.md +++ b/website/versioned_docs/version-0.6.0/userDocs/submarine-sdk/pysubmarine/development.md @@ -27,8 +27,10 @@ in its own conda environment by running the following conda create --name submarine-dev python=3.6 conda 
activate submarine-dev -# lint-requirements.txt and test-requirements.txt are in ./submarine-sdk/pysubmarine/github-actions +# Install the auto-format and lint tools (lint-requirements.txt is in ./dev-support/style-check/python) pip install -r lint-requirements.txt + +# test-requirements.txt is in ./submarine-sdk/pysubmarine/github-actions pip install -r test-requirements.txt # Installs pysubmarine from current checkout @@ -52,18 +54,26 @@ The script does the following things: ### Coding Style -- Use [yapf](https://github.com/google/yapf) to format Python code -- yapf style is configured in `.style.yapf` file +- Use [isort](https://github.com/PyCQA/isort) to sort the Python imports and [black](https://github.com/psf/black) to format Python code +- Both styles are configured in `pyproject.toml` - To autoformat code ```bash -./submarine-sdk/pysubmarine/github-actions/auto-format.sh +./dev-support/style-check/python/auto-format.sh ``` +- Use [flake8](https://github.com/PyCQA/flake8) as the linter; its configuration is in `.flake8` - Verify linter pass before submitting a pull request by running: ```bash -./submarine-sdk/pysubmarine/github-actions/lint.sh +./dev-support/style-check/python/lint.sh +``` + +- If you encounter an unexpected format, use the following markers to exclude the block from formatting +```python +# fmt: off + "Unexpected format, formatted by yourself" +# fmt: on ``` ### Unit Testing diff --git a/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/pytorch/with-cifar10-models/cifar10_tutorial.py b/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/pytorch/with-cifar10-models/cifar10_tutorial.py index 02824eca56..c3bb9914a8 100644 --- a/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/pytorch/with-cifar10-models/cifar10_tutorial.py +++ b/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/pytorch/with-cifar10-models/cifar10_tutorial.py @@ -80,21 +80,20 @@ # We transform them to Tensors of normalized range [-1, 1]. transform = transforms.Compose( - [transforms.ToTensor(), - transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) + [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))] +) -trainset = torchvision.datasets.CIFAR10(root='./data', train=True, - download=True, transform=transform) -trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, - shuffle=True, num_workers=2) +trainset = torchvision.datasets.CIFAR10( + root="./data", train=True, download=True, transform=transform +) +trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, shuffle=True, num_workers=2) -testset = torchvision.datasets.CIFAR10(root='./data', train=False, - download=True, transform=transform) -testloader = torch.utils.data.DataLoader(testset, batch_size=4, - shuffle=False, num_workers=2) +testset = torchvision.datasets.CIFAR10( + root="./data", train=False, download=True, transform=transform +) +testloader = torch.utils.data.DataLoader(testset, batch_size=4, shuffle=False, num_workers=2) -classes = ('plane', 'car', 'bird', 'cat', - 'deer', 'dog', 'frog', 'horse', 'ship', 'truck') +classes = ("plane", "car", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck") ######################################################################## # Let us show some of the training images, for fun.
@@ -102,15 +101,14 @@ import matplotlib.pyplot as plt import numpy as np - # functions to show an image def imshow(img): - img = img / 2 + 0.5 # unnormalize - npimg = img.numpy() - plt.imshow(np.transpose(npimg, (1, 2, 0))) - plt.show() + img = img / 2 + 0.5 # unnormalize + npimg = img.numpy() + plt.imshow(np.transpose(npimg, (1, 2, 0))) + plt.show() # get some random training images @@ -120,7 +118,7 @@ def imshow(img): # show images imshow(torchvision.utils.make_grid(images)) # print labels -print(' '.join('%5s' % classes[labels[j]] for j in range(4))) +print(" ".join("%5s" % classes[labels[j]] for j in range(4))) ######################################################################## # 2. Define a Convolutional Neural Network @@ -133,23 +131,23 @@ def imshow(img): class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x net = Net() @@ -174,28 +172,27 @@ def forward(self, x): for epoch in range(2): # loop over the dataset multiple times - running_loss = 0.0 - for i, data in enumerate(trainloader, 0): - # get the inputs - inputs, labels = data + running_loss = 0.0 + for i, data in enumerate(trainloader, 0): + # get the inputs + inputs, labels = data - # zero the parameter gradients - optimizer.zero_grad() + # zero the parameter gradients + optimizer.zero_grad() - # forward + backward + optimize - outputs = net(inputs) - loss = criterion(outputs, labels) - loss.backward() - optimizer.step() + # forward + backward + optimize + outputs = net(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() - # print statistics - running_loss += loss.item() - if i % 2000 == 1999: # print every 2000 mini-batches - print('[%d, %5d] loss: %.3f' % - (epoch + 1, i + 1, running_loss / 2000)) - running_loss = 0.0 + # print statistics + running_loss += loss.item() + if i % 2000 == 1999: # print every 2000 mini-batches + print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / 2000)) + running_loss = 0.0 -print('Finished Training') +print("Finished Training") ######################################################################## # 5. 
Test the network on the test data @@ -215,7 +212,7 @@ def forward(self, x): # print images imshow(torchvision.utils.make_grid(images)) -print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(4))) +print("GroundTruth: ", " ".join("%5s" % classes[labels[j]] for j in range(4))) ######################################################################## # Okay, now let us see what the neural network thinks these examples above are: @@ -229,8 +226,7 @@ def forward(self, x): # So, let's get the index of the highest energy: _, predicted = torch.max(outputs, 1) -print('Predicted: ', ' '.join('%5s' % classes[predicted[j]] - for j in range(4))) +print("Predicted: ", " ".join("%5s" % classes[predicted[j]] for j in range(4))) ######################################################################## # The results seem pretty good. @@ -240,15 +236,14 @@ def forward(self, x): correct = 0 total = 0 with torch.no_grad(): - for data in testloader: - images, labels = data - outputs = net(images) - _, predicted = torch.max(outputs.data, 1) - total += labels.size(0) - correct += (predicted == labels).sum().item() + for data in testloader: + images, labels = data + outputs = net(images) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() -print('Accuracy of the network on the 10000 test images: %d %%' % ( - 100 * correct / total)) +print("Accuracy of the network on the 10000 test images: %d %%" % (100 * correct / total)) ######################################################################## # That looks waaay better than chance, which is 10% accuracy (randomly picking @@ -258,22 +253,21 @@ def forward(self, x): # Hmmm, what are the classes that performed well, and the classes that did # not perform well: -class_correct = list(0. for i in range(10)) -class_total = list(0. for i in range(10)) +class_correct = list(0.0 for i in range(10)) +class_total = list(0.0 for i in range(10)) with torch.no_grad(): - for data in testloader: - images, labels = data - outputs = net(images) - _, predicted = torch.max(outputs, 1) - c = (predicted == labels).squeeze() - for i in range(4): - label = labels[i] - class_correct[label] += c[i].item() - class_total[label] += 1 + for data in testloader: + images, labels = data + outputs = net(images) + _, predicted = torch.max(outputs, 1) + c = (predicted == labels).squeeze() + for i in range(4): + label = labels[i] + class_correct[label] += c[i].item() + class_total[label] += 1 for i in range(10): - print('Accuracy of %5s : %2d %%' % ( - classes[i], 100 * class_correct[i] / class_total[i])) + print("Accuracy of %5s : %2d %%" % (classes[i], 100 * class_correct[i] / class_total[i])) ######################################################################## # Okay, so what next? diff --git a/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10.py b/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10.py index 5e1a70895a..29883d291d 100644 --- a/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10.py +++ b/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10.py @@ -26,88 +26,87 @@ class Cifar10DataSet(object): - """Cifar10 data set. 
- - Described by http://www.cs.toronto.edu/~kriz/cifar.html. - """ - - def __init__(self, data_dir, subset='train', use_distortion=True): - self.data_dir = data_dir - self.subset = subset - self.use_distortion = use_distortion - - def get_filenames(self): - if self.subset in ['train', 'validation', 'eval']: - return [os.path.join(self.data_dir, self.subset + '.tfrecords')] - else: - raise ValueError('Invalid data subset "%s"' % self.subset) - - def parser(self, serialized_example): - """Parses a single tf.Example into image and label tensors.""" - # Dimensions of the images in the CIFAR-10 dataset. - # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the - # input format. - features = tf.parse_single_example( - serialized_example, - features={ - 'image': tf.FixedLenFeature([], tf.string), - 'label': tf.FixedLenFeature([], tf.int64), - }) - image = tf.decode_raw(features['image'], tf.uint8) - image.set_shape([DEPTH * HEIGHT * WIDTH]) - - # Reshape from [depth * height * width] to [depth, height, width]. - image = tf.cast( - tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]), - tf.float32) - label = tf.cast(features['label'], tf.int32) - - # Custom preprocessing. - image = self.preprocess(image) - - return image, label - - def make_batch(self, batch_size): - """Read the images and labels from 'filenames'.""" - filenames = self.get_filenames() - # Repeat infinitely. - dataset = tf.data.TFRecordDataset(filenames).repeat() - - # Parse records. - dataset = dataset.map( - self.parser, num_parallel_calls=batch_size) - - # Potentially shuffle records. - if self.subset == 'train': - min_queue_examples = int( - Cifar10DataSet.num_examples_per_epoch(self.subset) * 0.4) - # Ensure that the capacity is sufficiently large to provide good random - # shuffling. - dataset = dataset.shuffle(buffer_size=min_queue_examples + 3 * batch_size) - - # Batch it up. - dataset = dataset.batch(batch_size) - iterator = dataset.make_one_shot_iterator() - image_batch, label_batch = iterator.get_next() - - return image_batch, label_batch - - def preprocess(self, image): - """Preprocess a single image in [height, width, depth] layout.""" - if self.subset == 'train' and self.use_distortion: - # Pad 4 pixels on each dimension of feature map, done in mini-batch - image = tf.image.resize_image_with_crop_or_pad(image, 40, 40) - image = tf.random_crop(image, [HEIGHT, WIDTH, DEPTH]) - image = tf.image.random_flip_left_right(image) - return image - - @staticmethod - def num_examples_per_epoch(subset='train'): - if subset == 'train': - return 45000 - elif subset == 'validation': - return 5000 - elif subset == 'eval': - return 10000 - else: - raise ValueError('Invalid data subset "%s"' % subset) + """Cifar10 data set. + + Described by http://www.cs.toronto.edu/~kriz/cifar.html. + """ + + def __init__(self, data_dir, subset="train", use_distortion=True): + self.data_dir = data_dir + self.subset = subset + self.use_distortion = use_distortion + + def get_filenames(self): + if self.subset in ["train", "validation", "eval"]: + return [os.path.join(self.data_dir, self.subset + ".tfrecords")] + else: + raise ValueError('Invalid data subset "%s"' % self.subset) + + def parser(self, serialized_example): + """Parses a single tf.Example into image and label tensors.""" + # Dimensions of the images in the CIFAR-10 dataset. + # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the + # input format. 
+ features = tf.parse_single_example( + serialized_example, + features={ + "image": tf.FixedLenFeature([], tf.string), + "label": tf.FixedLenFeature([], tf.int64), + }, + ) + image = tf.decode_raw(features["image"], tf.uint8) + image.set_shape([DEPTH * HEIGHT * WIDTH]) + + # Reshape from [depth * height * width] to [depth, height, width]. + image = tf.cast( + tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]), tf.float32 + ) + label = tf.cast(features["label"], tf.int32) + + # Custom preprocessing. + image = self.preprocess(image) + + return image, label + + def make_batch(self, batch_size): + """Read the images and labels from 'filenames'.""" + filenames = self.get_filenames() + # Repeat infinitely. + dataset = tf.data.TFRecordDataset(filenames).repeat() + + # Parse records. + dataset = dataset.map(self.parser, num_parallel_calls=batch_size) + + # Potentially shuffle records. + if self.subset == "train": + min_queue_examples = int(Cifar10DataSet.num_examples_per_epoch(self.subset) * 0.4) + # Ensure that the capacity is sufficiently large to provide good random + # shuffling. + dataset = dataset.shuffle(buffer_size=min_queue_examples + 3 * batch_size) + + # Batch it up. + dataset = dataset.batch(batch_size) + iterator = dataset.make_one_shot_iterator() + image_batch, label_batch = iterator.get_next() + + return image_batch, label_batch + + def preprocess(self, image): + """Preprocess a single image in [height, width, depth] layout.""" + if self.subset == "train" and self.use_distortion: + # Pad 4 pixels on each dimension of feature map, done in mini-batch + image = tf.image.resize_image_with_crop_or_pad(image, 40, 40) + image = tf.random_crop(image, [HEIGHT, WIDTH, DEPTH]) + image = tf.image.random_flip_left_right(image) + return image + + @staticmethod + def num_examples_per_epoch(subset="train"): + if subset == "train": + return 45000 + elif subset == "validation": + return 5000 + elif subset == "eval": + return 10000 + else: + raise ValueError('Invalid data subset "%s"' % subset) diff --git a/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_main.py b/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_main.py index dbd2418905..91e8e48861 100644 --- a/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_main.py +++ b/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_main.py @@ -25,8 +25,7 @@ """ -from __future__ import division -from __future__ import print_function +from __future__ import division, print_function import argparse import functools @@ -38,484 +37,483 @@ import cifar10_utils import numpy as np import six -from six.moves import xrange # pylint: disable=redefined-builtin import tensorflow as tf +from six.moves import xrange # pylint: disable=redefined-builtin tf.logging.set_verbosity(tf.logging.INFO) def get_model_fn(num_gpus, variable_strategy, num_workers): - """Returns a function that will build the resnet model.""" + """Returns a function that will build the resnet model.""" + + def _resnet_model_fn(features, labels, mode, params): + """Resnet model body. + + Support single host, one or more GPU training. Parameter distribution can + be either one of the following scheme. + 1. 
CPU is the parameter server and manages gradient updates. + 2. Parameters are distributed evenly across all GPUs, and the first GPU + manages gradient updates. + + Args: + features: a list of tensors, one for each tower + labels: a list of tensors, one for each tower + mode: ModeKeys.TRAIN or EVAL + params: Hyperparameters suitable for tuning + Returns: + A EstimatorSpec object. + """ + is_training = mode == tf.estimator.ModeKeys.TRAIN + weight_decay = params.weight_decay + momentum = params.momentum + + tower_features = features + tower_labels = labels + tower_losses = [] + tower_gradvars = [] + tower_preds = [] + + # channels first (NCHW) is normally optimal on GPU and channels last (NHWC) + # on CPU. The exception is Intel MKL on CPU which is optimal with + # channels_last. + data_format = params.data_format + if not data_format: + if num_gpus == 0: + data_format = "channels_last" + else: + data_format = "channels_first" + + if num_gpus == 0: + num_devices = 1 + device_type = "cpu" + else: + num_devices = num_gpus + device_type = "gpu" + + for i in range(num_devices): + worker_device = "/{}:{}".format(device_type, i) + if variable_strategy == "CPU": + device_setter = cifar10_utils.local_device_setter(worker_device=worker_device) + elif variable_strategy == "GPU": + device_setter = cifar10_utils.local_device_setter( + ps_device_type="gpu", + worker_device=worker_device, + ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy( + num_gpus, tf.contrib.training.byte_size_load_fn + ), + ) + with tf.variable_scope("resnet", reuse=bool(i != 0)): + with tf.name_scope("tower_%d" % i) as name_scope: + with tf.device(device_setter): + loss, gradvars, preds = _tower_fn( + is_training, + weight_decay, + tower_features[i], + tower_labels[i], + data_format, + params.num_layers, + params.batch_norm_decay, + params.batch_norm_epsilon, + ) + tower_losses.append(loss) + tower_gradvars.append(gradvars) + tower_preds.append(preds) + if i == 0: + # Only trigger batch_norm moving mean and variance update from + # the 1st tower. Ideally, we should grab the updates from all + # towers but these stats accumulate extremely fast so we can + # ignore the other stats from the other towers without + # significant detriment. + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope) + + # Now compute global loss and gradients. + gradvars = [] + with tf.name_scope("gradient_averaging"): + all_grads = {} + for grad, var in itertools.chain(*tower_gradvars): + if grad is not None: + all_grads.setdefault(var, []).append(grad) + for var, grads in six.iteritems(all_grads): + # Average gradients on the same device as the variables + # to which they apply. + with tf.device(var.device): + if len(grads) == 1: + avg_grad = grads[0] + else: + avg_grad = tf.multiply(tf.add_n(grads), 1.0 / len(grads)) + gradvars.append((avg_grad, var)) + + # Device that runs the ops to apply global gradient updates. 
+ consolidation_device = "/gpu:0" if variable_strategy == "GPU" else "/cpu:0" + with tf.device(consolidation_device): + # Suggested learning rate scheduling from + # https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155 + num_batches_per_epoch = cifar10.Cifar10DataSet.num_examples_per_epoch("train") // ( + params.train_batch_size * num_workers + ) + boundaries = [ + num_batches_per_epoch * x for x in np.array([82, 123, 300], dtype=np.int64) + ] + staged_lr = [params.learning_rate * x for x in [1, 0.1, 0.01, 0.002]] + + learning_rate = tf.train.piecewise_constant( + tf.train.get_global_step(), boundaries, staged_lr + ) + + loss = tf.reduce_mean(tower_losses, name="loss") + + examples_sec_hook = cifar10_utils.ExamplesPerSecondHook( + params.train_batch_size, every_n_steps=10 + ) + + tensors_to_log = {"learning_rate": learning_rate, "loss": loss} + + logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=100) + + train_hooks = [logging_hook, examples_sec_hook] + + optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=momentum) + + if params.sync: + optimizer = tf.train.SyncReplicasOptimizer( + optimizer, replicas_to_aggregate=num_workers + ) + sync_replicas_hook = optimizer.make_session_run_hook(params.is_chief) + train_hooks.append(sync_replicas_hook) + + # Create single grouped train op + train_op = [optimizer.apply_gradients(gradvars, global_step=tf.train.get_global_step())] + train_op.extend(update_ops) + train_op = tf.group(*train_op) + + predictions = { + "classes": tf.concat([p["classes"] for p in tower_preds], axis=0), + "probabilities": tf.concat([p["probabilities"] for p in tower_preds], axis=0), + } + stacked_labels = tf.concat(labels, axis=0) + metrics = {"accuracy": tf.metrics.accuracy(stacked_labels, predictions["classes"])} + + return tf.estimator.EstimatorSpec( + mode=mode, + predictions=predictions, + loss=loss, + train_op=train_op, + training_hooks=train_hooks, + eval_metric_ops=metrics, + ) + + return _resnet_model_fn + + +def _tower_fn( + is_training, + weight_decay, + feature, + label, + data_format, + num_layers, + batch_norm_decay, + batch_norm_epsilon, +): + """Build computation tower (Resnet). + + Args: + is_training: true if is training graph. + weight_decay: weight regularization strength, a float. + feature: a Tensor. + label: a Tensor. + data_format: channels_last (NHWC) or channels_first (NCHW). + num_layers: number of layers, an int. + batch_norm_decay: decay for batch normalization, a float. + batch_norm_epsilon: epsilon for batch normalization, a float. + + Returns: + A tuple with the loss for the tower, the gradients and parameters, and + predictions. + + """ + model = cifar10_model.ResNetCifar10( + num_layers, + batch_norm_decay=batch_norm_decay, + batch_norm_epsilon=batch_norm_epsilon, + is_training=is_training, + data_format=data_format, + ) + logits = model.forward_pass(feature, input_data_format="channels_last") + tower_pred = { + "classes": tf.argmax(input=logits, axis=1), + "probabilities": tf.nn.softmax(logits), + } + + tower_loss = tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=label) + tower_loss = tf.reduce_mean(tower_loss) - def _resnet_model_fn(features, labels, mode, params): - """Resnet model body. + model_params = tf.trainable_variables() + tower_loss += weight_decay * tf.add_n([tf.nn.l2_loss(v) for v in model_params]) - Support single host, one or more GPU training. Parameter distribution can - be either one of the following scheme. - 1. 
CPU is the parameter server and manages gradient updates. - 2. Parameters are distributed evenly across all GPUs, and the first GPU - manages gradient updates. + tower_grad = tf.gradients(tower_loss, model_params) + + return tower_loss, zip(tower_grad, model_params), tower_pred + + +def input_fn(data_dir, subset, num_shards, batch_size, use_distortion_for_training=True): + """Create input graph for model. Args: - features: a list of tensors, one for each tower - labels: a list of tensors, one for each tower - mode: ModeKeys.TRAIN or EVAL - params: Hyperparameters suitable for tuning + data_dir: Directory where TFRecords representing the dataset are located. + subset: one of 'train', 'validate' and 'eval'. + num_shards: num of towers participating in data-parallel training. + batch_size: total batch size for training to be divided by the number of + shards. + use_distortion_for_training: True to use distortions. Returns: - A EstimatorSpec object. + two lists of tensors for features and labels, each of num_shards length. """ - is_training = (mode == tf.estimator.ModeKeys.TRAIN) - weight_decay = params.weight_decay - momentum = params.momentum - - tower_features = features - tower_labels = labels - tower_losses = [] - tower_gradvars = [] - tower_preds = [] - - # channels first (NCHW) is normally optimal on GPU and channels last (NHWC) - # on CPU. The exception is Intel MKL on CPU which is optimal with - # channels_last. - data_format = params.data_format - if not data_format: - if num_gpus == 0: - data_format = 'channels_last' - else: - data_format = 'channels_first' - - if num_gpus == 0: - num_devices = 1 - device_type = 'cpu' - else: - num_devices = num_gpus - device_type = 'gpu' - - for i in range(num_devices): - worker_device = '/{}:{}'.format(device_type, i) - if variable_strategy == 'CPU': - device_setter = cifar10_utils.local_device_setter( - worker_device=worker_device) - elif variable_strategy == 'GPU': - device_setter = cifar10_utils.local_device_setter( - ps_device_type='gpu', - worker_device=worker_device, - ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy( - num_gpus, tf.contrib.training.byte_size_load_fn)) - with tf.variable_scope('resnet', reuse=bool(i != 0)): - with tf.name_scope('tower_%d' % i) as name_scope: - with tf.device(device_setter): - loss, gradvars, preds = _tower_fn( - is_training, weight_decay, tower_features[i], tower_labels[i], - data_format, params.num_layers, params.batch_norm_decay, - params.batch_norm_epsilon) - tower_losses.append(loss) - tower_gradvars.append(gradvars) - tower_preds.append(preds) - if i == 0: - # Only trigger batch_norm moving mean and variance update from - # the 1st tower. Ideally, we should grab the updates from all - # towers but these stats accumulate extremely fast so we can - # ignore the other stats from the other towers without - # significant detriment. - update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, - name_scope) - - # Now compute global loss and gradients. - gradvars = [] - with tf.name_scope('gradient_averaging'): - all_grads = {} - for grad, var in itertools.chain(*tower_gradvars): - if grad is not None: - all_grads.setdefault(var, []).append(grad) - for var, grads in six.iteritems(all_grads): - # Average gradients on the same device as the variables - # to which they apply. - with tf.device(var.device): - if len(grads) == 1: - avg_grad = grads[0] - else: - avg_grad = tf.multiply(tf.add_n(grads), 1. 
/ len(grads)) - gradvars.append((avg_grad, var)) - - # Device that runs the ops to apply global gradient updates. - consolidation_device = '/gpu:0' if variable_strategy == 'GPU' else '/cpu:0' - with tf.device(consolidation_device): - # Suggested learning rate scheduling from - # https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155 - num_batches_per_epoch = cifar10.Cifar10DataSet.num_examples_per_epoch( - 'train') // (params.train_batch_size * num_workers) - boundaries = [ - num_batches_per_epoch * x - for x in np.array([82, 123, 300], dtype=np.int64) - ] - staged_lr = [params.learning_rate * x for x in [1, 0.1, 0.01, 0.002]] - - learning_rate = tf.train.piecewise_constant(tf.train.get_global_step(), - boundaries, staged_lr) - - loss = tf.reduce_mean(tower_losses, name='loss') - - examples_sec_hook = cifar10_utils.ExamplesPerSecondHook( - params.train_batch_size, every_n_steps=10) - - tensors_to_log = {'learning_rate': learning_rate, 'loss': loss} - - logging_hook = tf.train.LoggingTensorHook( - tensors=tensors_to_log, every_n_iter=100) - - train_hooks = [logging_hook, examples_sec_hook] - - optimizer = tf.train.MomentumOptimizer( - learning_rate=learning_rate, momentum=momentum) - - if params.sync: - optimizer = tf.train.SyncReplicasOptimizer( - optimizer, replicas_to_aggregate=num_workers) - sync_replicas_hook = optimizer.make_session_run_hook(params.is_chief) - train_hooks.append(sync_replicas_hook) - - # Create single grouped train op - train_op = [ - optimizer.apply_gradients( - gradvars, global_step=tf.train.get_global_step()) - ] - train_op.extend(update_ops) - train_op = tf.group(*train_op) - - predictions = { - 'classes': - tf.concat([p['classes'] for p in tower_preds], axis=0), - 'probabilities': - tf.concat([p['probabilities'] for p in tower_preds], axis=0) - } - stacked_labels = tf.concat(labels, axis=0) - metrics = { - 'accuracy': - tf.metrics.accuracy(stacked_labels, predictions['classes']) - } - - return tf.estimator.EstimatorSpec( - mode=mode, - predictions=predictions, - loss=loss, - train_op=train_op, - training_hooks=train_hooks, - eval_metric_ops=metrics) - - return _resnet_model_fn - - -def _tower_fn(is_training, weight_decay, feature, label, data_format, - num_layers, batch_norm_decay, batch_norm_epsilon): - """Build computation tower (Resnet). - - Args: - is_training: true if is training graph. - weight_decay: weight regularization strength, a float. - feature: a Tensor. - label: a Tensor. - data_format: channels_last (NHWC) or channels_first (NCHW). - num_layers: number of layers, an int. - batch_norm_decay: decay for batch normalization, a float. - batch_norm_epsilon: epsilon for batch normalization, a float. - - Returns: - A tuple with the loss for the tower, the gradients and parameters, and - predictions. 
- - """ - model = cifar10_model.ResNetCifar10( - num_layers, - batch_norm_decay=batch_norm_decay, - batch_norm_epsilon=batch_norm_epsilon, - is_training=is_training, - data_format=data_format) - logits = model.forward_pass(feature, input_data_format='channels_last') - tower_pred = { - 'classes': tf.argmax(input=logits, axis=1), - 'probabilities': tf.nn.softmax(logits) - } - - tower_loss = tf.losses.sparse_softmax_cross_entropy( - logits=logits, labels=label) - tower_loss = tf.reduce_mean(tower_loss) - - model_params = tf.trainable_variables() - tower_loss += weight_decay * tf.add_n( - [tf.nn.l2_loss(v) for v in model_params]) - - tower_grad = tf.gradients(tower_loss, model_params) - - return tower_loss, zip(tower_grad, model_params), tower_pred - - -def input_fn(data_dir, - subset, - num_shards, - batch_size, - use_distortion_for_training=True): - """Create input graph for model. - - Args: - data_dir: Directory where TFRecords representing the dataset are located. - subset: one of 'train', 'validate' and 'eval'. - num_shards: num of towers participating in data-parallel training. - batch_size: total batch size for training to be divided by the number of - shards. - use_distortion_for_training: True to use distortions. - Returns: - two lists of tensors for features and labels, each of num_shards length. - """ - with tf.device('/cpu:0'): - use_distortion = subset == 'train' and use_distortion_for_training - dataset = cifar10.Cifar10DataSet(data_dir, subset, use_distortion) - image_batch, label_batch = dataset.make_batch(batch_size) - if num_shards <= 1: - # No GPU available or only 1 GPU. - return [image_batch], [label_batch] - - # Note that passing num=batch_size is safe here, even though - # dataset.batch(batch_size) can, in some cases, return fewer than batch_size - # examples. This is because it does so only when repeating for a limited - # number of epochs, but our dataset repeats forever. - image_batch = tf.unstack(image_batch, num=batch_size, axis=0) - label_batch = tf.unstack(label_batch, num=batch_size, axis=0) - feature_shards = [[] for i in range(num_shards)] - label_shards = [[] for i in range(num_shards)] - for i in xrange(batch_size): - idx = i % num_shards - feature_shards[idx].append(image_batch[i]) - label_shards[idx].append(label_batch[i]) - feature_shards = [tf.parallel_stack(x) for x in feature_shards] - label_shards = [tf.parallel_stack(x) for x in label_shards] - return feature_shards, label_shards - - -def get_experiment_fn(data_dir, - num_gpus, - variable_strategy, - use_distortion_for_training=True): - """Returns an Experiment function. - - Experiments perform training on several workers in parallel, - in other words experiments know how to invoke train and eval in a sensible - fashion for distributed training. Arguments passed directly to this - function are not tunable, all other arguments should be passed within - tf.HParams, passed to the enclosed function. - - Args: - data_dir: str. Location of the data for input_fns. - num_gpus: int. Number of GPUs on each worker. - variable_strategy: String. CPU to use CPU as the parameter server - and GPU to use the GPUs as the parameter server. - use_distortion_for_training: bool. See cifar10.Cifar10DataSet. - Returns: - A function (tf.estimator.RunConfig, tf.contrib.training.HParams) -> - tf.contrib.learn.Experiment. - - Suitable for use by tf.contrib.learn.learn_runner, which will run various - methods on Experiment (train, evaluate) based on information - about the current runner in `run_config`. 
- """ - - def _experiment_fn(run_config, hparams): - """Returns an Experiment.""" - # Create estimator. - train_input_fn = functools.partial( - input_fn, - data_dir, - subset='train', - num_shards=num_gpus, - batch_size=hparams.train_batch_size, - use_distortion_for_training=use_distortion_for_training) - - eval_input_fn = functools.partial( - input_fn, - data_dir, - subset='eval', - batch_size=hparams.eval_batch_size, - num_shards=num_gpus) - - num_eval_examples = cifar10.Cifar10DataSet.num_examples_per_epoch('eval') - if num_eval_examples % hparams.eval_batch_size != 0: - raise ValueError( - 'validation set size must be multiple of eval_batch_size') - - train_steps = hparams.train_steps - eval_steps = num_eval_examples // hparams.eval_batch_size - - classifier = tf.estimator.Estimator( - model_fn=get_model_fn(num_gpus, variable_strategy, - run_config.num_worker_replicas or 1), - config=run_config, - params=hparams) - - # Create experiment. - return tf.contrib.learn.Experiment( - classifier, - train_input_fn=train_input_fn, - eval_input_fn=eval_input_fn, - train_steps=train_steps, - eval_steps=eval_steps) - - return _experiment_fn - - -def main(job_dir, data_dir, num_gpus, variable_strategy, - use_distortion_for_training, log_device_placement, num_intra_threads, - **hparams): - # The env variable is on deprecation path, default is set to off. - os.environ['TF_SYNC_ON_FINISH'] = '0' - os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' - - # Session configuration. - sess_config = tf.ConfigProto( - allow_soft_placement=True, - log_device_placement=log_device_placement, - intra_op_parallelism_threads=num_intra_threads, - gpu_options=tf.GPUOptions(force_gpu_compatible=True)) - - config = cifar10_utils.RunConfig( - session_config=sess_config, model_dir=job_dir) - tf.contrib.learn.learn_runner.run( - get_experiment_fn(data_dir, num_gpus, variable_strategy, - use_distortion_for_training), - run_config=config, - hparams=tf.contrib.training.HParams( - is_chief=config.is_chief, - **hparams)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--data-dir', - type=str, - required=True, - help='The directory where the CIFAR-10 input data is stored.') - parser.add_argument( - '--job-dir', - type=str, - required=True, - help='The directory where the model will be stored.') - parser.add_argument( - '--variable-strategy', - choices=['CPU', 'GPU'], - type=str, - default='CPU', - help='Where to locate variable operations') - parser.add_argument( - '--num-gpus', - type=int, - default=1, - help='The number of gpus used. 
Uses only CPU if set to 0.') - parser.add_argument( - '--num-layers', - type=int, - default=44, - help='The number of layers of the model.') - parser.add_argument( - '--train-steps', - type=int, - default=80000, - help='The number of steps to use for training.') - parser.add_argument( - '--train-batch-size', - type=int, - default=128, - help='Batch size for training.') - parser.add_argument( - '--eval-batch-size', - type=int, - default=100, - help='Batch size for validation.') - parser.add_argument( - '--momentum', - type=float, - default=0.9, - help='Momentum for MomentumOptimizer.') - parser.add_argument( - '--weight-decay', - type=float, - default=2e-4, - help='Weight decay for convolutions.') - parser.add_argument( - '--learning-rate', - type=float, - default=0.1, - help="""\ + with tf.device("/cpu:0"): + use_distortion = subset == "train" and use_distortion_for_training + dataset = cifar10.Cifar10DataSet(data_dir, subset, use_distortion) + image_batch, label_batch = dataset.make_batch(batch_size) + if num_shards <= 1: + # No GPU available or only 1 GPU. + return [image_batch], [label_batch] + + # Note that passing num=batch_size is safe here, even though + # dataset.batch(batch_size) can, in some cases, return fewer than batch_size + # examples. This is because it does so only when repeating for a limited + # number of epochs, but our dataset repeats forever. + image_batch = tf.unstack(image_batch, num=batch_size, axis=0) + label_batch = tf.unstack(label_batch, num=batch_size, axis=0) + feature_shards = [[] for i in range(num_shards)] + label_shards = [[] for i in range(num_shards)] + for i in xrange(batch_size): + idx = i % num_shards + feature_shards[idx].append(image_batch[i]) + label_shards[idx].append(label_batch[i]) + feature_shards = [tf.parallel_stack(x) for x in feature_shards] + label_shards = [tf.parallel_stack(x) for x in label_shards] + return feature_shards, label_shards + + +def get_experiment_fn(data_dir, num_gpus, variable_strategy, use_distortion_for_training=True): + """Returns an Experiment function. + + Experiments perform training on several workers in parallel, + in other words experiments know how to invoke train and eval in a sensible + fashion for distributed training. Arguments passed directly to this + function are not tunable, all other arguments should be passed within + tf.HParams, passed to the enclosed function. + + Args: + data_dir: str. Location of the data for input_fns. + num_gpus: int. Number of GPUs on each worker. + variable_strategy: String. CPU to use CPU as the parameter server + and GPU to use the GPUs as the parameter server. + use_distortion_for_training: bool. See cifar10.Cifar10DataSet. + Returns: + A function (tf.estimator.RunConfig, tf.contrib.training.HParams) -> + tf.contrib.learn.Experiment. + + Suitable for use by tf.contrib.learn.learn_runner, which will run various + methods on Experiment (train, evaluate) based on information + about the current runner in `run_config`. + """ + + def _experiment_fn(run_config, hparams): + """Returns an Experiment.""" + # Create estimator. 
+ train_input_fn = functools.partial( + input_fn, + data_dir, + subset="train", + num_shards=num_gpus, + batch_size=hparams.train_batch_size, + use_distortion_for_training=use_distortion_for_training, + ) + + eval_input_fn = functools.partial( + input_fn, + data_dir, + subset="eval", + batch_size=hparams.eval_batch_size, + num_shards=num_gpus, + ) + + num_eval_examples = cifar10.Cifar10DataSet.num_examples_per_epoch("eval") + if num_eval_examples % hparams.eval_batch_size != 0: + raise ValueError("validation set size must be multiple of eval_batch_size") + + train_steps = hparams.train_steps + eval_steps = num_eval_examples // hparams.eval_batch_size + + classifier = tf.estimator.Estimator( + model_fn=get_model_fn(num_gpus, variable_strategy, run_config.num_worker_replicas or 1), + config=run_config, + params=hparams, + ) + + # Create experiment. + return tf.contrib.learn.Experiment( + classifier, + train_input_fn=train_input_fn, + eval_input_fn=eval_input_fn, + train_steps=train_steps, + eval_steps=eval_steps, + ) + + return _experiment_fn + + +def main( + job_dir, + data_dir, + num_gpus, + variable_strategy, + use_distortion_for_training, + log_device_placement, + num_intra_threads, + **hparams +): + # The env variable is on deprecation path, default is set to off. + os.environ["TF_SYNC_ON_FINISH"] = "0" + os.environ["TF_ENABLE_WINOGRAD_NONFUSED"] = "1" + + # Session configuration. + sess_config = tf.ConfigProto( + allow_soft_placement=True, + log_device_placement=log_device_placement, + intra_op_parallelism_threads=num_intra_threads, + gpu_options=tf.GPUOptions(force_gpu_compatible=True), + ) + + config = cifar10_utils.RunConfig(session_config=sess_config, model_dir=job_dir) + tf.contrib.learn.learn_runner.run( + get_experiment_fn(data_dir, num_gpus, variable_strategy, use_distortion_for_training), + run_config=config, + hparams=tf.contrib.training.HParams(is_chief=config.is_chief, **hparams), + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--data-dir", + type=str, + required=True, + help="The directory where the CIFAR-10 input data is stored.", + ) + parser.add_argument( + "--job-dir", type=str, required=True, help="The directory where the model will be stored." + ) + parser.add_argument( + "--variable-strategy", + choices=["CPU", "GPU"], + type=str, + default="CPU", + help="Where to locate variable operations", + ) + parser.add_argument( + "--num-gpus", + type=int, + default=1, + help="The number of gpus used. Uses only CPU if set to 0.", + ) + parser.add_argument( + "--num-layers", type=int, default=44, help="The number of layers of the model." + ) + parser.add_argument( + "--train-steps", type=int, default=80000, help="The number of steps to use for training." + ) + parser.add_argument( + "--train-batch-size", type=int, default=128, help="Batch size for training." + ) + parser.add_argument( + "--eval-batch-size", type=int, default=100, help="Batch size for validation." + ) + parser.add_argument( + "--momentum", type=float, default=0.9, help="Momentum for MomentumOptimizer." + ) + parser.add_argument( + "--weight-decay", type=float, default=2e-4, help="Weight decay for convolutions." + ) + parser.add_argument( + "--learning-rate", + type=float, + default=0.1, + help="""\ This is the initial learning rate value. The learning rate will decrease during training. 
For more details check the model_fn implementation in this file.\ - """) - parser.add_argument( - '--use-distortion-for-training', - type=bool, - default=True, - help='If doing image distortion for training.') - parser.add_argument( - '--sync', - action='store_true', - default=False, - help="""\ + """, + ) + parser.add_argument( + "--use-distortion-for-training", + type=bool, + default=True, + help="If doing image distortion for training.", + ) + parser.add_argument( + "--sync", + action="store_true", + default=False, + help="""\ If present when running in a distributed environment will run on sync mode.\ - """) - parser.add_argument( - '--num-intra-threads', - type=int, - default=0, - help="""\ + """, + ) + parser.add_argument( + "--num-intra-threads", + type=int, + default=0, + help="""\ Number of threads to use for intra-op parallelism. When training on CPU set to 0 to have the system pick the appropriate number or alternatively set it to the number of physical CPU cores.\ - """) - parser.add_argument( - '--num-inter-threads', - type=int, - default=0, - help="""\ + """, + ) + parser.add_argument( + "--num-inter-threads", + type=int, + default=0, + help="""\ Number of threads to use for inter-op parallelism. If set to 0, the system will pick an appropriate number.\ - """) - parser.add_argument( - '--data-format', - type=str, - default=None, - help="""\ + """, + ) + parser.add_argument( + "--data-format", + type=str, + default=None, + help="""\ If not set, the data format best for the training device is used. Allowed values: channels_first (NCHW) channels_last (NHWC).\ - """) - parser.add_argument( - '--log-device-placement', - action='store_true', - default=False, - help='Whether to log device placement.') - parser.add_argument( - '--batch-norm-decay', - type=float, - default=0.997, - help='Decay for batch norm.') - parser.add_argument( - '--batch-norm-epsilon', - type=float, - default=1e-5, - help='Epsilon for batch norm.') - args = parser.parse_args() - - if args.num_gpus > 0: - assert tf.test.is_gpu_available(), "Requested GPUs but none found." - if args.num_gpus < 0: - raise ValueError( - 'Invalid GPU count: \"--num-gpus\" must be 0 or a positive integer.') - if args.num_gpus == 0 and args.variable_strategy == 'GPU': - raise ValueError('num-gpus=0, CPU must be used as parameter server. Set' - '--variable-strategy=CPU.') - if (args.num_layers - 2) % 6 != 0: - raise ValueError('Invalid --num-layers parameter.') - if args.num_gpus != 0 and args.train_batch_size % args.num_gpus != 0: - raise ValueError('--train-batch-size must be multiple of --num-gpus.') - if args.num_gpus != 0 and args.eval_batch_size % args.num_gpus != 0: - raise ValueError('--eval-batch-size must be multiple of --num-gpus.') - - main(**vars(args)) + """, + ) + parser.add_argument( + "--log-device-placement", + action="store_true", + default=False, + help="Whether to log device placement.", + ) + parser.add_argument( + "--batch-norm-decay", type=float, default=0.997, help="Decay for batch norm." + ) + parser.add_argument( + "--batch-norm-epsilon", type=float, default=1e-5, help="Epsilon for batch norm." + ) + args = parser.parse_args() + + if args.num_gpus > 0: + assert tf.test.is_gpu_available(), "Requested GPUs but none found." + if args.num_gpus < 0: + raise ValueError('Invalid GPU count: "--num-gpus" must be 0 or a positive integer.') + if args.num_gpus == 0 and args.variable_strategy == "GPU": + raise ValueError( + "num-gpus=0, CPU must be used as parameter server. Set--variable-strategy=CPU." 
+ ) + if (args.num_layers - 2) % 6 != 0: + raise ValueError("Invalid --num-layers parameter.") + if args.num_gpus != 0 and args.train_batch_size % args.num_gpus != 0: + raise ValueError("--train-batch-size must be multiple of --num-gpus.") + if args.num_gpus != 0 and args.eval_batch_size % args.num_gpus != 0: + raise ValueError("--eval-batch-size must be multiple of --num-gpus.") + + main(**vars(args)) diff --git a/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_model.py b/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_model.py index d67c233dbb..019f28b82f 100644 --- a/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_model.py +++ b/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_model.py @@ -13,68 +13,64 @@ # limitations under the License. # ============================================================================== """Model class for Cifar10 Dataset.""" -from __future__ import division -from __future__ import print_function - -import tensorflow as tf +from __future__ import division, print_function import model_base +import tensorflow as tf class ResNetCifar10(model_base.ResNet): - """Cifar10 model with ResNetV1 and basic residual block.""" + """Cifar10 model with ResNetV1 and basic residual block.""" - def __init__(self, - num_layers, - is_training, - batch_norm_decay, - batch_norm_epsilon, - data_format='channels_first'): - super(ResNetCifar10, self).__init__( + def __init__( + self, + num_layers, is_training, - data_format, batch_norm_decay, - batch_norm_epsilon - ) - self.n = (num_layers - 2) // 6 - # Add one in case label starts with 1. No impact if label starts with 0. - self.num_classes = 10 + 1 - self.filters = [16, 16, 32, 64] - self.strides = [1, 2, 2] + batch_norm_epsilon, + data_format="channels_first", + ): + super(ResNetCifar10, self).__init__( + is_training, data_format, batch_norm_decay, batch_norm_epsilon + ) + self.n = (num_layers - 2) // 6 + # Add one in case label starts with 1. No impact if label starts with 0. + self.num_classes = 10 + 1 + self.filters = [16, 16, 32, 64] + self.strides = [1, 2, 2] - def forward_pass(self, x, input_data_format='channels_last'): - """Build the core model within the graph.""" - if self._data_format != input_data_format: - if input_data_format == 'channels_last': - # Computation requires channels_first. - x = tf.transpose(x, [0, 3, 1, 2]) - else: - # Computation requires channels_last. - x = tf.transpose(x, [0, 2, 3, 1]) + def forward_pass(self, x, input_data_format="channels_last"): + """Build the core model within the graph.""" + if self._data_format != input_data_format: + if input_data_format == "channels_last": + # Computation requires channels_first. + x = tf.transpose(x, [0, 3, 1, 2]) + else: + # Computation requires channels_last. + x = tf.transpose(x, [0, 2, 3, 1]) - # Image standardization. - x = x / 128 - 1 + # Image standardization. + x = x / 128 - 1 - x = self._conv(x, 3, 16, 1) - x = self._batch_norm(x) - x = self._relu(x) + x = self._conv(x, 3, 16, 1) + x = self._batch_norm(x) + x = self._relu(x) - # Use basic (non-bottleneck) block and ResNet V1 (post-activation). - res_func = self._residual_v1 + # Use basic (non-bottleneck) block and ResNet V1 (post-activation). 
+ res_func = self._residual_v1 - # 3 stages of block stacking. - for i in range(3): - with tf.name_scope('stage'): - for j in range(self.n): - if j == 0: - # First block in a stage, filters and strides may change. - x = res_func(x, 3, self.filters[i], self.filters[i + 1], - self.strides[i]) - else: - # Following blocks in a stage, constant filters and unit stride. - x = res_func(x, 3, self.filters[i + 1], self.filters[i + 1], 1) + # 3 stages of block stacking. + for i in range(3): + with tf.name_scope("stage"): + for j in range(self.n): + if j == 0: + # First block in a stage, filters and strides may change. + x = res_func(x, 3, self.filters[i], self.filters[i + 1], self.strides[i]) + else: + # Following blocks in a stage, constant filters and unit stride. + x = res_func(x, 3, self.filters[i + 1], self.filters[i + 1], 1) - x = self._global_avg_pool(x) - x = self._fully_connected(x, self.num_classes) + x = self._global_avg_pool(x) + x = self._fully_connected(x, self.num_classes) - return x + return x diff --git a/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_utils.py b/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_utils.py index d3d29dd82e..56d7d9185f 100644 --- a/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_utils.py +++ b/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/cifar10_utils.py @@ -13,141 +13,144 @@ # limitations under the License. # ============================================================================== import collections -import six +import six import tensorflow as tf - -from tensorflow.python.platform import tf_logging as logging +from tensorflow.contrib.learn.python.learn import run_config from tensorflow.core.framework import node_def_pb2 from tensorflow.python.framework import device as pydev -from tensorflow.python.training import basic_session_run_hooks -from tensorflow.python.training import session_run_hook -from tensorflow.python.training import training_util -from tensorflow.python.training import device_setter -from tensorflow.contrib.learn.python.learn import run_config +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.training import ( + basic_session_run_hooks, + device_setter, + session_run_hook, + training_util, +) # TODO(b/64848083) Remove once uid bug is fixed -class RunConfig(tf.contrib.learn.RunConfig): - def uid(self, whitelist=None): - """Generates a 'Unique Identifier' based on all internal fields. - Caller should use the uid string to check `RunConfig` instance integrity - in one session use, but should not rely on the implementation details, which - is subject to change. - Args: - whitelist: A list of the string names of the properties uid should not - include. If `None`, defaults to `_DEFAULT_UID_WHITE_LIST`, which - includes most properties user allowed to change. - Returns: - A uid string. - """ - if whitelist is None: - whitelist = run_config._DEFAULT_UID_WHITE_LIST - - state = {k: v for k, v in self.__dict__.items() if not k.startswith('__')} - # Pop out the keys in whitelist. 
- for k in whitelist: - state.pop('_' + k, None) - - ordered_state = collections.OrderedDict( - sorted(state.items(), key=lambda t: t[0])) - # For class instance without __repr__, some special cares are required. - # Otherwise, the object address will be used. - if '_cluster_spec' in ordered_state: - ordered_state['_cluster_spec'] = collections.OrderedDict( - sorted(ordered_state['_cluster_spec'].as_dict().items(), - key=lambda t: t[0]) - ) - return ', '.join( - '%s=%r' % (k, v) for (k, v) in six.iteritems(ordered_state)) +class RunConfig(tf.contrib.learn.RunConfig): + def uid(self, whitelist=None): + """Generates a 'Unique Identifier' based on all internal fields. + Caller should use the uid string to check `RunConfig` instance integrity + in one session use, but should not rely on the implementation details, which + is subject to change. + Args: + whitelist: A list of the string names of the properties uid should not + include. If `None`, defaults to `_DEFAULT_UID_WHITE_LIST`, which + includes most properties user allowed to change. + Returns: + A uid string. + """ + if whitelist is None: + whitelist = run_config._DEFAULT_UID_WHITE_LIST + + state = {k: v for k, v in self.__dict__.items() if not k.startswith("__")} + # Pop out the keys in whitelist. + for k in whitelist: + state.pop("_" + k, None) + + ordered_state = collections.OrderedDict(sorted(state.items(), key=lambda t: t[0])) + # For class instance without __repr__, some special cares are required. + # Otherwise, the object address will be used. + if "_cluster_spec" in ordered_state: + ordered_state["_cluster_spec"] = collections.OrderedDict( + sorted(ordered_state["_cluster_spec"].as_dict().items(), key=lambda t: t[0]) + ) + return ", ".join("%s=%r" % (k, v) for (k, v) in six.iteritems(ordered_state)) class ExamplesPerSecondHook(session_run_hook.SessionRunHook): - """Hook to print out examples per second. + """Hook to print out examples per second. Total time is tracked and then divided by the total number of steps to get the average step time and then batch_size is used to determine the running average of examples per second. The examples per second for the most recent interval is also logged. - """ - - def __init__( - self, - batch_size, - every_n_steps=100, - every_n_secs=None,): - """Initializer for ExamplesPerSecondHook. - - Args: - batch_size: Total batch size used to calculate examples/second from - global time. - every_n_steps: Log stats every n steps. - every_n_secs: Log stats every n seconds. 
""" - if (every_n_steps is None) == (every_n_secs is None): - raise ValueError('exactly one of every_n_steps' - ' and every_n_secs should be provided.') - self._timer = basic_session_run_hooks.SecondOrStepTimer( - every_steps=every_n_steps, every_secs=every_n_secs) - - self._step_train_time = 0 - self._total_steps = 0 - self._batch_size = batch_size - - def begin(self): - self._global_step_tensor = training_util.get_global_step() - if self._global_step_tensor is None: - raise RuntimeError( - 'Global step should be created to use StepCounterHook.') - - def before_run(self, run_context): # pylint: disable=unused-argument - return basic_session_run_hooks.SessionRunArgs(self._global_step_tensor) - - def after_run(self, run_context, run_values): - _ = run_context - - global_step = run_values.results - if self._timer.should_trigger_for_step(global_step): - elapsed_time, elapsed_steps = self._timer.update_last_triggered_step( - global_step) - if elapsed_time is not None: - steps_per_sec = elapsed_steps / elapsed_time - self._step_train_time += elapsed_time - self._total_steps += elapsed_steps - - average_examples_per_sec = self._batch_size * ( - self._total_steps / self._step_train_time) - current_examples_per_sec = steps_per_sec * self._batch_size - # Average examples/sec followed by current examples/sec - logging.info('%s: %g (%g), step = %g', 'Average examples/sec', - average_examples_per_sec, current_examples_per_sec, - self._total_steps) - -def local_device_setter(num_devices=1, - ps_device_type='cpu', - worker_device='/cpu:0', - ps_ops=None, - ps_strategy=None): - if ps_ops == None: - ps_ops = ['Variable', 'VariableV2', 'VarHandleOp'] - - if ps_strategy is None: - ps_strategy = device_setter._RoundRobinStrategy(num_devices) - if not six.callable(ps_strategy): - raise TypeError("ps_strategy must be callable") - - def _local_device_chooser(op): - current_device = pydev.DeviceSpec.from_string(op.device or "") - - node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def - if node_def.op in ps_ops: - ps_device_spec = pydev.DeviceSpec.from_string( - '/{}:{}'.format(ps_device_type, ps_strategy(op))) - - ps_device_spec.merge_from(current_device) - return ps_device_spec.to_string() - else: - worker_device_spec = pydev.DeviceSpec.from_string(worker_device or "") - worker_device_spec.merge_from(current_device) - return worker_device_spec.to_string() - return _local_device_chooser + + def __init__( + self, + batch_size, + every_n_steps=100, + every_n_secs=None, + ): + """Initializer for ExamplesPerSecondHook. + + Args: + batch_size: Total batch size used to calculate examples/second from + global time. + every_n_steps: Log stats every n steps. + every_n_secs: Log stats every n seconds. 
+ """ + if (every_n_steps is None) == (every_n_secs is None): + raise ValueError("exactly one of every_n_steps and every_n_secs should be provided.") + self._timer = basic_session_run_hooks.SecondOrStepTimer( + every_steps=every_n_steps, every_secs=every_n_secs + ) + + self._step_train_time = 0 + self._total_steps = 0 + self._batch_size = batch_size + + def begin(self): + self._global_step_tensor = training_util.get_global_step() + if self._global_step_tensor is None: + raise RuntimeError("Global step should be created to use StepCounterHook.") + + def before_run(self, run_context): # pylint: disable=unused-argument + return basic_session_run_hooks.SessionRunArgs(self._global_step_tensor) + + def after_run(self, run_context, run_values): + _ = run_context + + global_step = run_values.results + if self._timer.should_trigger_for_step(global_step): + elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(global_step) + if elapsed_time is not None: + steps_per_sec = elapsed_steps / elapsed_time + self._step_train_time += elapsed_time + self._total_steps += elapsed_steps + + average_examples_per_sec = self._batch_size * ( + self._total_steps / self._step_train_time + ) + current_examples_per_sec = steps_per_sec * self._batch_size + # Average examples/sec followed by current examples/sec + logging.info( + "%s: %g (%g), step = %g", + "Average examples/sec", + average_examples_per_sec, + current_examples_per_sec, + self._total_steps, + ) + + +def local_device_setter( + num_devices=1, ps_device_type="cpu", worker_device="/cpu:0", ps_ops=None, ps_strategy=None +): + if ps_ops == None: + ps_ops = ["Variable", "VariableV2", "VarHandleOp"] + + if ps_strategy is None: + ps_strategy = device_setter._RoundRobinStrategy(num_devices) + if not six.callable(ps_strategy): + raise TypeError("ps_strategy must be callable") + + def _local_device_chooser(op): + current_device = pydev.DeviceSpec.from_string(op.device or "") + + node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def + if node_def.op in ps_ops: + ps_device_spec = pydev.DeviceSpec.from_string( + "/{}:{}".format(ps_device_type, ps_strategy(op)) + ) + + ps_device_spec.merge_from(current_device) + return ps_device_spec.to_string() + else: + worker_device_spec = pydev.DeviceSpec.from_string(worker_device or "") + worker_device_spec.merge_from(current_device) + return worker_device_spec.to_string() + + return _local_device_chooser diff --git a/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/generate_cifar10_tfrecords.py b/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/generate_cifar10_tfrecords.py index d1a599c31b..ca5fc95019 100644 --- a/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/generate_cifar10_tfrecords.py +++ b/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/generate_cifar10_tfrecords.py @@ -19,100 +19,97 @@ https://www.cs.toronto.edu/~kriz/cifar.html. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import argparse import os import sys - import tarfile + +import tensorflow as tf from six.moves import cPickle as pickle from six.moves import xrange # pylint: disable=redefined-builtin -import tensorflow as tf -CIFAR_FILENAME = 'cifar-10-python.tar.gz' -CIFAR_DOWNLOAD_URL = 'https://www.cs.toronto.edu/~kriz/' + CIFAR_FILENAME -CIFAR_LOCAL_FOLDER = 'cifar-10-batches-py' +CIFAR_FILENAME = "cifar-10-python.tar.gz" +CIFAR_DOWNLOAD_URL = "https://www.cs.toronto.edu/~kriz/" + CIFAR_FILENAME +CIFAR_LOCAL_FOLDER = "cifar-10-batches-py" def download_and_extract(data_dir): - # download CIFAR-10 if not already downloaded. - tf.contrib.learn.datasets.base.maybe_download(CIFAR_FILENAME, data_dir, - CIFAR_DOWNLOAD_URL) - tarfile.open(os.path.join(data_dir, CIFAR_FILENAME), - 'r:gz').extractall(data_dir) + # download CIFAR-10 if not already downloaded. + tf.contrib.learn.datasets.base.maybe_download(CIFAR_FILENAME, data_dir, CIFAR_DOWNLOAD_URL) + tarfile.open(os.path.join(data_dir, CIFAR_FILENAME), "r:gz").extractall(data_dir) def _int64_feature(value): - return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) + return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) def _bytes_feature(value): - return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) + return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) def _get_file_names(): - """Returns the file names expected to exist in the input_dir.""" - file_names = {} - file_names['train'] = ['data_batch_%d' % i for i in xrange(1, 5)] - file_names['validation'] = ['data_batch_5'] - file_names['eval'] = ['test_batch'] - return file_names + """Returns the file names expected to exist in the input_dir.""" + file_names = {} + file_names["train"] = ["data_batch_%d" % i for i in xrange(1, 5)] + file_names["validation"] = ["data_batch_5"] + file_names["eval"] = ["test_batch"] + return file_names def read_pickle_from_file(filename): - with tf.gfile.Open(filename, 'rb') as f: - if sys.version_info >= (3, 0): - data_dict = pickle.load(f, encoding='bytes') - else: - data_dict = pickle.load(f) - return data_dict + with tf.gfile.Open(filename, "rb") as f: + if sys.version_info >= (3, 0): + data_dict = pickle.load(f, encoding="bytes") + else: + data_dict = pickle.load(f) + return data_dict def convert_to_tfrecord(input_files, output_file): - """Converts a file to TFRecords.""" - print('Generating %s' % output_file) - with tf.python_io.TFRecordWriter(output_file) as record_writer: - for input_file in input_files: - data_dict = read_pickle_from_file(input_file) - data = data_dict[b'data'] - labels = data_dict[b'labels'] - num_entries_in_batch = len(labels) - for i in range(num_entries_in_batch): - example = tf.train.Example(features=tf.train.Features( - feature={ - 'image': _bytes_feature(data[i].tobytes()), - 'label': _int64_feature(labels[i]) - })) - record_writer.write(example.SerializeToString()) + """Converts a file to TFRecords.""" + print("Generating %s" % output_file) + with tf.python_io.TFRecordWriter(output_file) as record_writer: + for input_file in input_files: + data_dict = read_pickle_from_file(input_file) + data = data_dict[b"data"] + labels = data_dict[b"labels"] + num_entries_in_batch = len(labels) + for i in range(num_entries_in_batch): + example = tf.train.Example( + features=tf.train.Features( + feature={ + "image": 
_bytes_feature(data[i].tobytes()), + "label": _int64_feature(labels[i]), + } + ) + ) + record_writer.write(example.SerializeToString()) def main(data_dir): - print('Download from {} and extract.'.format(CIFAR_DOWNLOAD_URL)) - download_and_extract(data_dir) - file_names = _get_file_names() - input_dir = os.path.join(data_dir, CIFAR_LOCAL_FOLDER) - for mode, files in file_names.items(): - input_files = [os.path.join(input_dir, f) for f in files] - output_file = os.path.join(data_dir, mode + '.tfrecords') - try: - os.remove(output_file) - except OSError: - pass - # Convert to tf.train.Example and write the to TFRecords. - convert_to_tfrecord(input_files, output_file) - print('Done!') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--data-dir', - type=str, - default='', - help='Directory to download and extract CIFAR-10 to.') - - args = parser.parse_args() - main(args.data_dir) + print("Download from {} and extract.".format(CIFAR_DOWNLOAD_URL)) + download_and_extract(data_dir) + file_names = _get_file_names() + input_dir = os.path.join(data_dir, CIFAR_LOCAL_FOLDER) + for mode, files in file_names.items(): + input_files = [os.path.join(input_dir, f) for f in files] + output_file = os.path.join(data_dir, mode + ".tfrecords") + try: + os.remove(output_file) + except OSError: + pass + # Convert to tf.train.Example and write the to TFRecords. + convert_to_tfrecord(input_files, output_file) + print("Done!") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--data-dir", type=str, default="", help="Directory to download and extract CIFAR-10 to." + ) + + args = parser.parse_args() + main(args.data_dir) diff --git a/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/model_base.py b/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/model_base.py index 35e52b8355..9c468bcde0 100644 --- a/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/model_base.py +++ b/website/versioned_docs/version-0.6.0/userDocs/yarn/docker/tensorflow/with-cifar10-models/ubuntu-18.04/cifar10_estimator_tf_1.13.1/model_base.py @@ -19,201 +19,189 @@ https://arxiv.org/pdf/1512.03385v1.pdf https://arxiv.org/pdf/1605.07146v1.pdf """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import tensorflow as tf class ResNet(object): - """ResNet model.""" - - def __init__(self, is_training, data_format, batch_norm_decay, batch_norm_epsilon): - """ResNet constructor. - - Args: - is_training: if build training or inference model. - data_format: the data_format used during computation. - one of 'channels_first' or 'channels_last'. 
- """ - self._batch_norm_decay = batch_norm_decay - self._batch_norm_epsilon = batch_norm_epsilon - self._is_training = is_training - assert data_format in ('channels_first', 'channels_last') - self._data_format = data_format - - def forward_pass(self, x): - raise NotImplementedError( - 'forward_pass() is implemented in ResNet sub classes') - - def _residual_v1(self, - x, - kernel_size, - in_filter, - out_filter, - stride, - activate_before_residual=False): - """Residual unit with 2 sub layers, using Plan A for shortcut connection.""" - - del activate_before_residual - with tf.name_scope('residual_v1') as name_scope: - orig_x = x - - x = self._conv(x, kernel_size, out_filter, stride) - x = self._batch_norm(x) - x = self._relu(x) - - x = self._conv(x, kernel_size, out_filter, 1) - x = self._batch_norm(x) - - if in_filter != out_filter: - orig_x = self._avg_pool(orig_x, stride, stride) - pad = (out_filter - in_filter) // 2 - if self._data_format == 'channels_first': - orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]]) + """ResNet model.""" + + def __init__(self, is_training, data_format, batch_norm_decay, batch_norm_epsilon): + """ResNet constructor. + + Args: + is_training: if build training or inference model. + data_format: the data_format used during computation. + one of 'channels_first' or 'channels_last'. + """ + self._batch_norm_decay = batch_norm_decay + self._batch_norm_epsilon = batch_norm_epsilon + self._is_training = is_training + assert data_format in ("channels_first", "channels_last") + self._data_format = data_format + + def forward_pass(self, x): + raise NotImplementedError("forward_pass() is implemented in ResNet sub classes") + + def _residual_v1( + self, x, kernel_size, in_filter, out_filter, stride, activate_before_residual=False + ): + """Residual unit with 2 sub layers, using Plan A for shortcut connection.""" + + del activate_before_residual + with tf.name_scope("residual_v1") as name_scope: + orig_x = x + + x = self._conv(x, kernel_size, out_filter, stride) + x = self._batch_norm(x) + x = self._relu(x) + + x = self._conv(x, kernel_size, out_filter, 1) + x = self._batch_norm(x) + + if in_filter != out_filter: + orig_x = self._avg_pool(orig_x, stride, stride) + pad = (out_filter - in_filter) // 2 + if self._data_format == "channels_first": + orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]]) + else: + orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]]) + + x = self._relu(tf.add(x, orig_x)) + + tf.logging.info("image after unit %s: %s", name_scope, x.get_shape()) + return x + + def _residual_v2(self, x, in_filter, out_filter, stride, activate_before_residual=False): + """Residual unit with 2 sub layers with preactivation, plan A shortcut.""" + + with tf.name_scope("residual_v2") as name_scope: + if activate_before_residual: + x = self._batch_norm(x) + x = self._relu(x) + orig_x = x + else: + orig_x = x + x = self._batch_norm(x) + x = self._relu(x) + + x = self._conv(x, 3, out_filter, stride) + + x = self._batch_norm(x) + x = self._relu(x) + x = self._conv(x, 3, out_filter, [1, 1, 1, 1]) + + if in_filter != out_filter: + pad = (out_filter - in_filter) // 2 + orig_x = self._avg_pool(orig_x, stride, stride) + if self._data_format == "channels_first": + orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]]) + else: + orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]]) + + x = tf.add(x, orig_x) + + tf.logging.info("image after unit %s: %s", name_scope, x.get_shape()) + return x + + def _bottleneck_residual_v2( + self, x, 
in_filter, out_filter, stride, activate_before_residual=False + ): + """Bottleneck residual unit with 3 sub layers, plan B shortcut.""" + + with tf.name_scope("bottle_residual_v2") as name_scope: + if activate_before_residual: + x = self._batch_norm(x) + x = self._relu(x) + orig_x = x + else: + orig_x = x + x = self._batch_norm(x) + x = self._relu(x) + + x = self._conv(x, 1, out_filter // 4, stride, is_atrous=True) + + x = self._batch_norm(x) + x = self._relu(x) + # pad when stride isn't unit + x = self._conv(x, 3, out_filter // 4, 1, is_atrous=True) + + x = self._batch_norm(x) + x = self._relu(x) + x = self._conv(x, 1, out_filter, 1, is_atrous=True) + + if in_filter != out_filter: + orig_x = self._conv(orig_x, 1, out_filter, stride, is_atrous=True) + x = tf.add(x, orig_x) + + tf.logging.info("image after unit %s: %s", name_scope, x.get_shape()) + return x + + def _conv(self, x, kernel_size, filters, strides, is_atrous=False): + """Convolution.""" + + padding = "SAME" + if not is_atrous and strides > 1: + pad = kernel_size - 1 + pad_beg = pad // 2 + pad_end = pad - pad_beg + if self._data_format == "channels_first": + x = tf.pad(x, [[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]]) + else: + x = tf.pad(x, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]]) + padding = "VALID" + return tf.layers.conv2d( + inputs=x, + kernel_size=kernel_size, + filters=filters, + strides=strides, + padding=padding, + use_bias=False, + data_format=self._data_format, + ) + + def _batch_norm(self, x): + if self._data_format == "channels_first": + data_format = "NCHW" else: - orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]]) - - x = self._relu(tf.add(x, orig_x)) - - tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) - return x - - def _residual_v2(self, - x, - in_filter, - out_filter, - stride, - activate_before_residual=False): - """Residual unit with 2 sub layers with preactivation, plan A shortcut.""" - - with tf.name_scope('residual_v2') as name_scope: - if activate_before_residual: - x = self._batch_norm(x) - x = self._relu(x) - orig_x = x - else: - orig_x = x - x = self._batch_norm(x) - x = self._relu(x) - - x = self._conv(x, 3, out_filter, stride) - - x = self._batch_norm(x) - x = self._relu(x) - x = self._conv(x, 3, out_filter, [1, 1, 1, 1]) - - if in_filter != out_filter: - pad = (out_filter - in_filter) // 2 - orig_x = self._avg_pool(orig_x, stride, stride) - if self._data_format == 'channels_first': - orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]]) - else: - orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]]) - - x = tf.add(x, orig_x) - - tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) - return x - - def _bottleneck_residual_v2(self, - x, - in_filter, - out_filter, - stride, - activate_before_residual=False): - """Bottleneck residual unit with 3 sub layers, plan B shortcut.""" - - with tf.name_scope('bottle_residual_v2') as name_scope: - if activate_before_residual: - x = self._batch_norm(x) - x = self._relu(x) - orig_x = x - else: - orig_x = x - x = self._batch_norm(x) - x = self._relu(x) - - x = self._conv(x, 1, out_filter // 4, stride, is_atrous=True) - - x = self._batch_norm(x) - x = self._relu(x) - # pad when stride isn't unit - x = self._conv(x, 3, out_filter // 4, 1, is_atrous=True) - - x = self._batch_norm(x) - x = self._relu(x) - x = self._conv(x, 1, out_filter, 1, is_atrous=True) - - if in_filter != out_filter: - orig_x = self._conv(orig_x, 1, out_filter, stride, is_atrous=True) - x = tf.add(x, orig_x) 
- - tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) - return x - - def _conv(self, x, kernel_size, filters, strides, is_atrous=False): - """Convolution.""" - - padding = 'SAME' - if not is_atrous and strides > 1: - pad = kernel_size - 1 - pad_beg = pad // 2 - pad_end = pad - pad_beg - if self._data_format == 'channels_first': - x = tf.pad(x, [[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]]) - else: - x = tf.pad(x, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]]) - padding = 'VALID' - return tf.layers.conv2d( - inputs=x, - kernel_size=kernel_size, - filters=filters, - strides=strides, - padding=padding, - use_bias=False, - data_format=self._data_format) - - def _batch_norm(self, x): - if self._data_format == 'channels_first': - data_format = 'NCHW' - else: - data_format = 'NHWC' - return tf.contrib.layers.batch_norm( - x, - decay=self._batch_norm_decay, - center=True, - scale=True, - epsilon=self._batch_norm_epsilon, - is_training=self._is_training, - fused=True, - data_format=data_format) - - def _relu(self, x): - return tf.nn.relu(x) - - def _fully_connected(self, x, out_dim): - with tf.name_scope('fully_connected') as name_scope: - x = tf.layers.dense(x, out_dim) - - tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) - return x - - def _avg_pool(self, x, pool_size, stride): - with tf.name_scope('avg_pool') as name_scope: - x = tf.layers.average_pooling2d( - x, pool_size, stride, 'SAME', data_format=self._data_format) - - tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) - return x - - def _global_avg_pool(self, x): - with tf.name_scope('global_avg_pool') as name_scope: - assert x.get_shape().ndims == 4 - if self._data_format == 'channels_first': - x = tf.reduce_mean(x, [2, 3]) - else: - x = tf.reduce_mean(x, [1, 2]) - tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) - return x + data_format = "NHWC" + return tf.contrib.layers.batch_norm( + x, + decay=self._batch_norm_decay, + center=True, + scale=True, + epsilon=self._batch_norm_epsilon, + is_training=self._is_training, + fused=True, + data_format=data_format, + ) + + def _relu(self, x): + return tf.nn.relu(x) + + def _fully_connected(self, x, out_dim): + with tf.name_scope("fully_connected") as name_scope: + x = tf.layers.dense(x, out_dim) + + tf.logging.info("image after unit %s: %s", name_scope, x.get_shape()) + return x + + def _avg_pool(self, x, pool_size, stride): + with tf.name_scope("avg_pool") as name_scope: + x = tf.layers.average_pooling2d( + x, pool_size, stride, "SAME", data_format=self._data_format + ) + + tf.logging.info("image after unit %s: %s", name_scope, x.get_shape()) + return x + + def _global_avg_pool(self, x): + with tf.name_scope("global_avg_pool") as name_scope: + assert x.get_shape().ndims == 4 + if self._data_format == "channels_first": + x = tf.reduce_mean(x, [2, 3]) + else: + x = tf.reduce_mean(x, [1, 2]) + tf.logging.info("image after unit %s: %s", name_scope, x.get_shape()) + return x
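Side note for readers of the model_base.py hunk above: `_residual_v1` implements the "plan A" ResNet shortcut, padding the channel dimension with zeros when `in_filter != out_filter` instead of projecting with a 1x1 convolution. The standalone NumPy sketch below is illustrative only and is not part of this patch; it reproduces just the channel-padding arithmetic for the channels_last layout (the average-pooling of the spatial dimensions that precedes it in `_residual_v1` is omitted).

    import numpy as np

    # Illustrative sketch of the "plan A" shortcut padding in _residual_v1:
    # when the filter count grows, the skip connection is zero-padded on the
    # channel axis by (out_filter - in_filter) // 2 on each side.
    in_filter, out_filter = 16, 32
    pad = (out_filter - in_filter) // 2          # 8 zero channels on each side

    orig_x = np.zeros((8, 16, 16, in_filter))    # NHWC ("channels_last") activations
    shortcut = np.pad(orig_x, [(0, 0), (0, 0), (0, 0), (pad, pad)])
    assert shortcut.shape == (8, 16, 16, out_filter)

This mirrors the `tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]])` branch in the hunk above; the other branch applies the same padding on axis 1 for the channels_first layout.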