diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..1d72255 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,86 @@ + +cmake_minimum_required (VERSION 2.8...3.23) + +Project(workloads NONE) +set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/script/") +include(prerequisite) + +add_custom_target(bom) +add_custom_target(kpi) +enable_testing() + +if(NOT DEFINED PLATFORM) + execute_process(COMMAND bash -c "head -n 1 '${CMAKE_SOURCE_DIR}/workload/platforms'" OUTPUT_VARIABLE PLATFORM OUTPUT_STRIP_TRAILING_WHITESPACE) +else() + execute_process(COMMAND bash -c "grep -E '^${PLATFORM}$' '${CMAKE_SOURCE_DIR}/workload/platforms' | head -n 1" OUTPUT_VARIABLE platform OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT platform) + message(FATAL_ERROR "Platform ${PLATFORM} not recognized!") + endif() + set(PLATFORM "${platform}") +endif() + +if(PLATFORM MATCHES "GRAVITON") + set(IMAGEARCH "linux/arm64") +else() + set(IMAGEARCH "linux/amd64") +endif() + +if(REGISTRY) + if (NOT ${REGISTRY} MATCHES "/$") + set(REGISTRY "${REGISTRY}/") + endif() +endif() + +set(ENABLE_BUILD ON) + +if (NOT DEFINED TIMEOUT) + set(TIMEOUT "28800,600") +endif() + +if (NOT DEFINED RELEASE) + set(RELEASE ":latest") +elseif (NOT ${RELEASE} MATCHES "^:") + set(RELEASE ":${RELEASE}") +endif() + +if (NOT BACKEND) + set(BACKEND "kubernetes") +endif() + +if (EXISTS "${PROJECT_SOURCE_DIR}/script/${BACKEND}.cmake") + include(${BACKEND}) +endif() + +add_subdirectory("script/march") +add_subdirectory(stack) +add_subdirectory(workload) + +message("") +message("This script will build third party components licensed under various open source licenses into your container images. The terms under which those components may be used and distributed can be found with the license document that is provided with those components. Please familiarize yourself with those terms to ensure your distribution of those components complies with the terms of those licenses.") +message("") +message("-- Setting: PLATFORM=${PLATFORM}, ARCH=${IMAGEARCH}") +if(NOT DEFINED REGISTRY_AUTH) + message("-- Setting: REGISTRY=${REGISTRY}") +else() + message("-- Setting: REGISTRY=${REGISTRY}, AUTH=${REGISTRY_AUTH}") +endif() +message("-- Setting: RELEASE=${RELEASE}") +message("-- Setting: TIMEOUT=${TIMEOUT}") +message("-- Setting: BACKEND=${BACKEND}") +if(ACCEPT_LICENSE) + message("-- Setting: ACCEPT_LICENSE=${ACCEPT_LICENSE}") +endif() +if(COMMAND show_backend_settings) + show_backend_settings() +endif() +if(BENCHMARK) + message("-- Setting: BENCHMARK=${BENCHMARK}") +endif() +if(NOT ENABLE_BUILD) + message("") + message("Build is disabled as ${DEFAULT_REGISTRY} is readonly.") + message("") +endif() +message("") + +execute_process(COMMAND bash -c "ln -s -r -f '${PROJECT_SOURCE_DIR}'/script/benchmark/*.sh ." WORKING_DIRECTORY "${CMAKE_BINARY_DIR}") diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f682f4e..04f153e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,57 +1,29 @@ -# Contributing +## Welcome to the Service Framework Workload Repository Contributing Guide -### License +### Evaluate Workloads - is licensed under the terms in [LICENSE]. By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms. +Follow the [README](README.md#prerequisite) instructions to setup [local](doc/setup-docker.md), [remote](doc/setup-cumulus.md), or [Cloud](doc/setup-cumulus.md) systems to evaluate any [supported workloads](worklod/README.md#list-of-workloads). 
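+
+For example, a typical evaluation session for the `dummy` workload looks like the following sketch (the same flow is described in the [README](README.md)):
+
+```
+mkdir -p build
+cd build
+cmake ..
+cd workload/dummy
+make
+ctest -V
+```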
-### Sign your work +You can choose to build the workloads and evaluate the workload execution with `ctest`, which manage the workload test cases. You can run any subset of the test cases or all of them. -Please use the sign-off line at the end of the patch. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify -the below (from [developercertificate.org](http://developercertificate.org/)): +### Submit Issues -``` -Developer Certificate of Origin -Version 1.1 +If you spot a problem with the repository, submit an issue at the **github issues**. -Copyright (C) 2004, 2006 The Linux Foundation and its contributors. -660 York Street, Suite 102, -San Francisco, CA 94110 USA +### Contribute to Workload Development -Everyone is permitted to copy and distribute verbatim copies of this -license document, but changing it is not allowed. +Here is a list of references you can follow for workload development: +- A workload consists of a few critical pieces of scripts or manifests, documented in [Workload Elements](doc/workload.md): + - [`CMakeLists.txt`](doc/cmakelists.txt.md) + - [`build.sh`](doc/build.md) + - [`Dockerfiles`](doc/dockerfile.md) + - [`cluster-config.yaml.m4`](doc/cluster-config.md) + - [`kubernetes-config.yaml.m4`](doc/kuernetes-config.md) + - [`validate.sh`](doc/validate.md) + - [`kpi.sh`](doc/kpi.md) +- The best way to start a new workload development is by copying the [dummy](workload/dummy) workload and then modifying it to your needs. -Developer's Certificate of Origin 1.1 +### Submit Contributions -By making a contribution to this project, I certify that: +Thanks for your contribution to the Service Framework workload repository. Whether you plan to modify an existing workload or to create a new workload, please fork the SF workload repository to your own private work place. Make modifications there. Then submit a merge request. The branches of the main repository are reserved for release-related activities. -(a) The contribution was created in whole or in part by me and I - have the right to submit it under the open source license - indicated in the file; or - -(b) The contribution is based upon previous work that, to the best - of my knowledge, is covered under an appropriate open source - license and I have the right under that license to submit that - work with modifications, whether created in whole or in part - by me, under the same open source license (unless I am - permitted to submit under a different license), as indicated - in the file; or - -(c) The contribution was provided directly to me by some other - person who certified (a), (b) or (c) and I have not modified - it. - -(d) I understand and agree that this project and the contribution - are public and that a record of the contribution (including all - personal information I submit with it, including my sign-off) is - maintained indefinitely and may be redistributed consistent with - this project or the open source license(s) involved. -``` - -Then you just add a line to every git commit message: - - Signed-off-by: Joe Smith - -Use your real name (sorry, no pseudonyms or anonymous contributions.) - -If you set your `user.name` and `user.email` git configs, you can sign your -commit automatically with `git commit -s`. 
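+
+A typical contribution flow is sketched below; the fork URL and branch name are placeholders for your own:
+
+```
+git clone https://github.com/<your-account>/<your-fork>.git
+cd <your-fork>
+git checkout -b my-workload-change
+# modify an existing workload or add a new one
+git commit -a -m "Describe your change"
+git push origin my-workload-change
+# open a merge request against the main repository
+```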
diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..afdf0b1 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright (c)2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..1f6ed29 --- /dev/null +++ b/README.md @@ -0,0 +1,33 @@ +## Introduction + +This is the **Workload Services Framework** repository. The repository contains a set of workloads that can be used to exercise multiple platforms. Each workload is a complete and standalone implementation that can be built and run collectively or individually. See the list of supported workloads under the [workload](workload/README.md) directory. + +### Prerequisite + +- Sync your system date/time. It is required by docker credential authorization. +- Install `cmake`, `make`, `m4`, and `gawk`. +- Setup [docker](doc/setup-docker.md), [Kubernetes](doc/setup-kubernetes.md), or [cumulus](doc/setup-cumulus.md). [docker](doc/setup-docker.md) is the minimum requirement and can be used for single-container workload validation. [Kubernetes](doc/setup-kubernetes.md) can be used for multiple-node workload validation. Setup [cumulus](doc/setup-cumulus.md) for remote worker validation. + +### Build & Evaluate Workload + +Evaluate a workload as follows: + +``` +mkdir -p build +cd build +cmake .. +cd workload/dummy +make +ctest -V +./list-kpi.sh logs* +``` + +> It takes a long time to rebuild all workload images. It is recommended that you only rebuild the workloads of interest by going to the workload sub-directory to make and test. + +You can optionally specify a `REGISTRY` value, `cmake -DREGISTRY=XYZ ..` to ask the build process to push the images to the docker registry. Please `docker login` beforehand if your docker registry requires authentication. A docker registry is optional except in the case of Kubernetes on-premises validation. + +### See Also + +- [Build Options](doc/cmake.md) +- [Test Options](doc/ctest.md) +- [Develop New Workload](doc/workload.md) diff --git a/doc/build.md b/doc/build.md new file mode 100644 index 0000000..924084a --- /dev/null +++ b/doc/build.md @@ -0,0 +1,29 @@ + +The `build.sh` script performs the workload build. Since the docker build process is [standardized](dockerfile.md), there is usually no need to customize it. You can use the following template as is: + +``` +#!/bin/bash -e + +DIR="$(dirname "$(readlink -f "$0")")" +. "$DIR"/../../script/build.sh +``` + +In some cases, the `script/buid.sh` can be customized as follows: +- **`FIND_OPTIONS`**: Specify any custom arguments to the `find` program to locate the set of Dockerfiles for building the docker images and for listing the BOMs. +- **`DOCKER_CONTEXT`**: Optionally specify a or an array of relative directory names where the Dockerfiles are located. By default, the Dockerfiles are assumed to be located directly under the workload directory. + +### Build Dependencies + +If your workload depends on some common software stacks, simply invoke the corresponding software stack's `build.sh`. 
For example, if your image depends on `QAT-Setup`, your `build.sh` can be something like below: + +``` +#!/bin/bash -e + +DIR="$(dirname "$(readlink -f "$0")")" + +# build QAT-Setup +STACK="qat_setup" "$DIR"/../../stack/QAT-Setup/build.sh $@ + +# build our image(s) +. "$DIR"/../../script/build.sh +``` diff --git a/doc/cluster-config.md b/doc/cluster-config.md new file mode 100644 index 0000000..a44792b --- /dev/null +++ b/doc/cluster-config.md @@ -0,0 +1,78 @@ + +The `cluster-config.yaml` manifest describes the machine specification to run the workloads. The specification is still evolving and subject to change. + +The following example describes a 3-node cluster to be used in some workload: + +``` +cluster: + - labels: {} + - labels: {} + - labels: {} +``` + +The `cluster-config.yaml` consists of the following sections: + +- **`cluster`**: This section defines the post-Sil cluster configurations. + +### cluster.labels + +The `cluster.labels` section describes any must have system level setup that a workload must use. The setup is specified in terms of a set of Kubernetes node labels as follows: + +| Label | Description | +|:-----:|:------------| +|
`HAS-SETUP-DISK`
| This set of labels specifies that SSD disks be mounted on the worker node(s).
See also: [Storage Setup](setup-storage.md). | +|
`HAS-SETUP-MODULE`
| This set of labels specifies the kernel modules that the workload must use.
See also: [Module Setup](setup-module.md). | +|
`HAS-SETUP-HUGEPAGE`
| This set of labels specifies the kernel hugepage settings.
See also: [Hugepage Setup](setup-hugepage.md) | + +The label value is either `required` or `preferred` as follows: + +``` +cluster: +- labels: + HAS-SETUP-HUGEPAGE-2048kB-2048: required +``` + +### cluster.cpuinfo + +The `cluster.cpuinfo` section describes any CPU-related constraints that a workload must use. The cpuinfo section is currently declarative and is not enforced. + +``` +cluster: +- cpuinfo: + flags: + - "avx512f" +``` + +where the CPU flags must match what are shown by `lscpu` or `cat /proc/cpuinfo`. + +### cluster.meminfo + +The `cluster.meminfo` section describes any memory constraints that a workload must use. The meminfo section is currently declarative and is not enforced. + +> Please also use the Kubernetes [resource constraints](https://kubernetes.io/docs/tasks/configure-pod-container/assign-memory-resource) to specify the workload memory requirements.) + +``` +cluster: +- meminfo: + available: 128 +``` + +where the available memory is in the unit of GBytes. + +### kubernetes + +The `kubernetes` section describes the Kubernetes configurations. This section is currently optionally enforced. +- `cni`: Specify the CNI plugin: `flannel` or `calico`. +- `cni-options`: Specify the CNI option: `vxlan` (calico). +- `kubelet-options`: Specify the kubelet options as described in [`KubeletConfiguration`](https://kubernetes.io/docs/reference/config-api/kubelet-config.v1beta1/#kubelet-config-k8s-io-v1beta1-KubeletConfiguration). +- `kubevirt`: Specify whether to enable kubevirt: `true/false`. + +``` +kubernetes: + cni: flannel + kubelet-options: + runtimeRequestTimeout: 10m +``` + +> Note that CNIs may behave differently on CSPs. Calico BGP and VXLAN work on AWS but only VXLAN works on GCP and AZure. + diff --git a/doc/cmake.md b/doc/cmake.md new file mode 100644 index 0000000..199dbec --- /dev/null +++ b/doc/cmake.md @@ -0,0 +1,40 @@ + +### Customize the Build Process: + +You can use the following build options to customize the build process: + +- **PLATFORM**: Specify the platform name. The only supported platform is `ICX`. +- **REGISTRY**: Specify the privacy docker registry URL. All built images will be pushed to the specified docker registry. + > `REGISTRY` must end with forward slash `/` +- **RELEASE**: Specify the release version. All built images will be tagged with it. Defaults to `:latest` + > `RELEASE` must begin with colon `:` +- **REGISTRY_AUTH**: Specify the registry authentication method. The only supported value is `docker`, which uses the docker configuration file. +- **BACKEND**: Specify the validation backend: [`docker`](setup-docker.md), [`kubernetes`](setup-kubernetes.md), or [`cumulus`](setup-cumulus.md). +- **CUMULUS_OPTIONS**: Specify the `cumulus` options. +- **TIMEOUT**: Specify the validation timeout, which contains the execution timeout and docker pull timeout. Default to 28800,300 seconds. +- **BENCHMARK**: Specify a workload pattern. Workloads not matching the pattern will be disabled. + +Build examples: + +```bash +cd build +cmake -DREGISTRY=xxyyzz.com:1234 .. +``` + +### Command Make Targets + +- **bom**: Print out the BOM list of each workload. +- **kpi**: Print out the KPI of each workload. +- **clean**: Purge the `logs`. + +```bash +cd build +cmake .. 
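+# configure the build tree first; the bom, kpi, and clean targets are then available: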
+make bom +``` + +### See Also + +- [Docker Engine](setup-docker.md) +- [Kubernetes Cluster](setup-kubernetes.md) +- [Cumulus Setup](setup-cumulus.md) diff --git a/doc/cmakelists.md b/doc/cmakelists.md new file mode 100644 index 0000000..46b5173 --- /dev/null +++ b/doc/cmakelists.md @@ -0,0 +1,56 @@ + +### Workload CMakeLists.txt + +The `CMakeLists.txt` defines actions in `cmake`: build and test. Let us start with a simple example: + +``` +add_workload("dummy") +add_testcase(${workload}) +``` + +The `add_workload` function defines `cmake` build rules for a workload. The name must be lower cased as a convention and does not contain any special characters with the exception of `_`. It is recommedded to append the version info to indicate the implementation versioning. The function also defines a parent-scope variable `workload` with the same name that any subsequent function can use. + +The `add_testcase` function defines a test case. You may define multiple test cases, each with a unique name and some configuration parameters. Internally, this gets routed to the `validate.sh` script with the specified parameters. (There is no argument in the above example.) The validation results are saved to the corresponding `logs-$workload` directory under the build tree. + +Note that the name of any `cmake` target must be unique across all workloads. Thus it is usually a concatenation of platform, feature, workload and configuration. + +### Special Test Cases + +The test case name should be descriptive for the workload test conditions. For example, use `_1n` to indicate that the workload runs on a single worker node, and `_3n` to indicate that the workload runs on multiple worker nodes. + +The following test case suffixes are reserved: +- `_gated`: A test case suffixed with `_gated` is designed for CI commit validation. The test case is expected to be a quick test of the workload software stack. To improve CI efficiency, design the test case such that the workload completes within 5 minutes. + +### Licensing Terms + +If the workload requires the user to agree to any license terms, use the `check_license` function. The function prompts the user for license agreement and then saves the decision. If the user denies the license terms, the workload will be skipped during the build process. If there are multiple license terms, you can write as many `check_license` functions as needed. + +``` +check_license("media.xiorg.com" "Please agree to the license terms for downloading datasets from xiorg.com") +add_workload("foo" LICENSE "media.xiorg.com") +``` + +### Fixed SUT + +If the workload can only run on specific SUT (System Under Test), specify the SUT constraints as part of the `add_workload` function as follows: + +``` +add_workload("foo" SUT azure) +``` + +where the `azure` SUT must be defined with `script/cumulus/cumulus-config..yaml`. + +### Software Stack CMakeLists.txt + +`CMakeLists.txt` for software stacks defines the software stack build and test targets. Let us start with a simple example: + +``` +add_stack("foo_setup") +add_testcase(${stack}) +``` + +The `add_stack` function defines `cmake` build rules for a workload. The name must be lower cased as a convention and does not contain any special characters with the exception of `_`. It is recommedded to append the version info to indicate the implementation versioning. The function also defines a parent-scope variable `stack` with the same name that any subsequent function can use. + +The `add_testcase` function defines a test case. 
You may define multiple test cases, each with a unique name and some configuration parameters. Internally, this gets routed to the `validate.sh` script with the specified parameters. (There is no argument in the above example.) The validation results are saved to the corresponding `logs-$stack` directory under the build tree. + +Note that the name of any `cmake` target must be unique across all workloads. Thus it is usually a concatenation of platform, feature, stack and configuration. diff --git a/doc/ctest.md b/doc/ctest.md new file mode 100644 index 0000000..dea3ae7 --- /dev/null +++ b/doc/ctest.md @@ -0,0 +1,205 @@ + +### Run Test + +Use `ctest` to run a single test or batch of tests. You can do this at the top level `build` directory or under each workload directory. In the latter case, only the tests of the workload will be executed. + +``` +cd build +cd workload/dummy +ctest +``` + +### CTest Options + +There are extensive list of options in `ctest` to control how tests can be executed. See the `ctest` manpage. The followings are most common options. + +- *`-R`*: Select tests based on a regular expression string. +- *`-E`*: Exclude tests based on a regular expression string. +- *`-V`*: Show test execution with details. +- *`-N`*: Dry-run the tests only. + +Example: list tests with `boringssl` in name excluding those with `_gated` +``` +ctest -R boringssl -E _gated -N +``` + +Example: run only `test_static_boringssl` (exact match) +``` +ctest -R '^test_static_boringssl$' +``` + +### Customize Configurations + +It is possible to specify a global test configuration file to overwrite any configuration parameter of a test case: + +``` +TEST_CONFIG=$(pwd)/test_config.yaml ctest -V +``` + +where `TEST_CONFIG` specifies the test configuration file. + +The configuration file uses the following format: + +``` +*_dummy_pi: + SCALE: 3000 +``` + +where `*_dummy_pi` specifies the test case name. You can use `*` to specify a wildcard match. The subsection underneath specifies the configuration variables and values. Any parameters specified in each test case [`validate.sh`](validate.md) can be overwritten. + +Use with caution as overwritting configuration parameters may lead to invalid parameter combinations. + +### Benchmark Scripts + +A set of utility scripts are linked under your workload build directory to make it easy for workload benchmark activities. + +- **`ctest.sh`**: This is an extended ctest script extending the following features, besides what ctest supports: + +``` +Usage: [options] +--nohup Run ctest in the daemon mode for long benchmark +--loop Run the benchmark multiple times sequentially. +--run Run the benchmark multiple times on the same SUT(s), only for cumulus. +--burst Run the benchmark multiple times simultaneously. +--test-config Specify the test-config file. +--set Set the workload parameter values during loop and burst iterations. +--stop Kill all ctest sessions. +--continue Ignore any errors and continue the loop and burst iterations. +--prepare-sut Prepare cloud SUT instances for reuse. +--reuse-sut Reuse previously prepared cloud SUT instances. +--cleanup-sut Cleanup cloud SUT instances. +--dry-run Generate the testcase configurations and then exit. 
+``` + +The followings are some examples: + +``` +# run aws test cases 5 times sequentially +./ctest.sh -R aws --loop=5 --nohup + +# run aws test cases 5 times simultaneously +./ctest.sh -R aws --burst=5 --nohup + +# run aws test cases 4 times simultaneously with the SCALE value +# incremented linearly as 1000, 1300, 1600, 1900 in each iteration. +# "..." uses three previous values to deduce the increment. +./ctest.sh -R aws --set "SCALE=1000 1300 1600 ...2000" --burst=4 --nohup + +# run aws test cases 4 times simultaneously with the SCALE value +# incremented linearly as 1000, 1600, 1000, 1600 in each iteration. +# "..." uses three previous values to deduce the increment. +# "|200" means the values must be divisible by 200. +./ctest.sh -R aws --set "SCALE=1000 1300 1600 ...2000 |200" --burst=4 --nohup + +# run aws test cases 4 times simultaneously with the SCALE value +# incremented linearly as 1000, 1600, 2000, 1000 in each iteration. +# "..." uses three previous values to deduce the increment. +# "8000|" means the values must be a factor of 8000. +./ctest.sh -R aws --set "SCALE=1000 1200 1400 ...2000 8000|" --burst=4 --nohup + +# run aws test cases 4 times simultaneously with the SCALE value +# incremented exponentially as 1000, 2000, 4000, 8000 in each iteration. +# "..." uses three previous values to deduce the multiplication factor. +./ctest.sh -R aws --set "SCALE=1000 2000 4000 ...10000" --burst=4 --nohup + +# run aws test cases 6 times simultaneously with the SCALE value +# enumerated repeatedly as 1000, 1500, 1700, 1000, 1500, 1700 in each iteration. +./ctest.sh -R aws --set "SCALE=1000 1500 1700" --burst=6 --nohup + +# run aws test cases 6 times simultaneously with the SCALE and BATCH_SIZE values +# enumerated seperately as (1000,1), (1500,2), (1700,4), (1000,8) in each +# iteration. Values are repeated as needed. +./ctest.sh -R aws --set "SCALE=1000 1500 1700" --set BATCH_SIZE="1 2 4 8" --burst=6 --nohup + +# run aws test cases 8 times simultaneously with the SCALE and BATCH_SIZE values +# permutated as (1000,1), (1000,2), (1000,4), (1000,8), (1500,1), (1500, 2), +# (1500, 4), (1500, 8) in each iteration. +./ctest.sh -R aws --set "SCALE=1000 1500 1700/BATCH_SIZE=1 2 4 8" --burst=8 --nohup + +# for cloud instances, it is possible to test different machine types by +# enumerating the AWS_MACHINE_TYPE values (or similar GCP_MACHINE_TYPE): +./ctest.sh -R aws --set "AWS_MACHINE_TYPE=m6i.xlarge m6i.2xlarge m6i.4xlarge" --loop 3 --nohup +``` + +See Also: [Cloud SUT Reuse](#cloud-sut-reuse) + +- **`list-kpi.sh`**: Scan the ctest logs files and export the KPI data. + +``` +Usage: [options] [logs-directory] +--primary List only the primary KPI. +--all List all KPIs. +--outlier Remove outliers beyond N-stdev. +--params List workload configurations. +--format list|xls-ai|xls-inst + Specify the output format. +--var1 Specify the spread sheet variable 1. +--var2 Specify the spread sheet variable 2. +--var3 Specify the spread sheet variable 3. +--var4 Specify the spread sheet variable 4. +--phost node1 Specify the hostname for identifying the primary instance type, in the multi-node workload scenario. +--pinst CPU.Microarchitecture + Specify the SVRInfo field name for identifying the primary instance name. +--filter _(real|throughput) + Specify a trim filter to shorten spreadsheet name. +--file Specify the spread sheet filename. 
+``` + +> The `xls-ai` option writes the KPI data in the `kpi-report.xls` spread sheet as follows: + + + +> where `--var1=batch_size` `--var2=cores_per_instance` `--var3='*Throughput'` `--var4=Throughput_`. + +> The `xls-inst` option writes the KPI data in the `kpi-report.xls` spread sheet as follows: + + + +> where `--phost=node1`. + +> The `xls-table` option writes the KPI data in the `kpi-report.xls` spread sheet as follows: + + + +> where `--var1=scale`, `--var2=sleep_time`. Optionally, you can specify `--var3` and `--var4` variables for multiple tables in the same spreadsheet. + +### Cloud SUT Reuse + +With the cumulus backend, it is possible to reuse the Cloud SUT instances during the benchmark process. This is especially useful in tuning parameters for any workload. + +To reuse any SUT instances, you need to first prepare (provision) the Cloud instances, using the `ctest.sh` `--prepare-sut` command as follows: + +``` +./ctest.sh -R aws_kafka_3n_pkm -V --prepare-sut +``` + +The `--prepare-sut` command provisions and prepares the Cloud instances suitable for running the `aws_kafka_3n_pkm` test case. The preparation includes installing docker/Kubernetes and labeling the worker nodes. The SUT details are stored under the `sut-logs-aws_kafka_3n_pkm` directory. + +Next, you can run any iterations of the test cases, reusing the prepared SUT instances with the `--reuse-sut` command, as follows: + +``` +./ctest.sh -R aws_kafka_3n_pkm -V --reuse-sut +``` + +> If `--reuse-sut` is set, `--burst` is disabled. + +Finally, to cleanup the SUT instances, use the `--cleanup-sut` command: + +``` +./ctest.sh -R aws_kafka_3n_pkm -V --cleanup-sut +``` + +SUT reuse is subject to the following limitations: +- The SUT instances are provisioned and prepared for a specific test case. Different test cases cannot share SUT instances. +- It is possible to change workload parameters, provided that such changes do not: + - The changes do not affect the worker node numbers. + - The changes do not affect the worker node machine types, disk storage, or network topologies. + - The changes do not affect worker node labeling. + - The changes do not introduce any new container images. + +--- + +Please cleanup the Cloud instances after use. You can also use the cumulus [cloud cleanup](setup-cumulus.md#cleanup-cloud-resources) procedure to completely cleanup any Cloud resources. + +--- + diff --git a/doc/dockerfile.md b/doc/dockerfile.md new file mode 100644 index 0000000..41b5e3d --- /dev/null +++ b/doc/dockerfile.md @@ -0,0 +1,86 @@ + + +The workload Dockerfile must meet certain requirements to facilitate image build, validation execution and data collection. + +### Use Template + +You can use `m4` template in constructing Dockerfles, which avoids duplication of identical steps. Any files with the `.m4` suffix will be replaced with the corresponding files without the suffix, during the build process. + +### Set Build Order + +If there are multiple Dockerfiles under the workload directory, the build order is determined by the filename pattern of the Dockerfile: `Dockerfile.[1-9].`. The bigger the number in the middle of the filename, the earlier that the build script builds the Dockerfile. If there are two Dockerfiles with the same number, the build order is platform specific. + +### Specify Image Name + +The first line of the Dockerfile is used to specify the docker image name, as follows: + +Final image: +``` +# resnet_50 +... 
+``` + +Intermediate image: +``` +## resnet_50_model +``` + +Any final images will be pushed to the docker registry. Any intermediate images will be left on the build machine. As a convention, the image name uses the following pattern: `[-]-`. The platform prefix is a must have if the image is platform specific, and optional if the image can run on any platform. + +### List Ingredients + +Any significent ingredients used in the workload must be marked with the `ARG` statement, so that we can easily list ingredients of a workload: + +``` +ARG IPP_CRYPTO_VER="ippcp_2020u3" +ARG IPP_CRYPTO_REPO=https://github.com/intel/ipp-crypto.git +... +``` + +The following `ARG` suffixes are supported: +- **_REPO/_REPOSITORY**: Specify the ingredient source repository location. +- **_VER/_VERSION**: Specify the ingredient version. +- **_IMG/_IMAGE**: Specify an ingredient docker image. +- **_PKG/_PACKAGE**: Specify an ingredient OS package, such as deb or rpm. + +> _VER and the corresponding _REPO/_PKACAGE/_IMAGE must be in a pair and in order to properly show up in the Wiki ingredient table. For example, if you define `OS_VER`, then there should be a following `OS_IMAGE` definition. + +### Export Status & Logs + +It is the workload developer's responsibility to design how to start the workload and how to stop the workload. However, it is a common requirement for the validation runtime to reliably collect execution logs and any telemetry data for analyzing the results. + +#### Export to FIFO + +The workload image must create a fifo `/export-logs` and then archive (1) the workload exit code (in `status`) and (2) any workload-specifc logs to the fifo. The workload exit code is mandatory. Workload logs can be used to generate KPIs. + +``` +RUN mkfifo /export-logs +CMD (; echo $? > status) 2>&1 | tee output.logs && \ + tar cf /export-logs status output.logs && \ + sleep infinity +``` + +#### Import from FIFO + +The validation backend (script/validate.sh) imports the logs data through the fifo, as follows: + +``` +# docker +docker exec cat /export-logs | tar xf - +``` +``` +# kubernetes +kubectl exec cat /export-logs | tar xf - +``` + +The above command blocks if the workload execution is in progress and exits after the execution is completed (thus it is time for cleanup.) + +### Reserved Feature + +Do not use `ENTRYPOINT` in the Dockerfile. This is a reserved feature for future extension. + +### See Also + +- [How to Create a Workload](workload.md) +- [Provisioning Specification](cluster-config.md) + diff --git a/doc/image/ss-ai.png b/doc/image/ss-ai.png new file mode 100644 index 0000000..b27b0f5 Binary files /dev/null and b/doc/image/ss-ai.png differ diff --git a/doc/image/ss-inst.png b/doc/image/ss-inst.png new file mode 100644 index 0000000..79ac2b8 Binary files /dev/null and b/doc/image/ss-inst.png differ diff --git a/doc/image/ss-table.png b/doc/image/ss-table.png new file mode 100644 index 0000000..1918977 Binary files /dev/null and b/doc/image/ss-table.png differ diff --git a/doc/kpi.md b/doc/kpi.md new file mode 100644 index 0000000..8e6b5e7 --- /dev/null +++ b/doc/kpi.md @@ -0,0 +1,28 @@ + +The `kpi.sh` script parses the validation output and exports a set of key/value pairs to represent the workload performance. 
+ +The following is some example of the KPI data: +``` +# this is a test ## Optional comments +## threads: 4 ## Tunable parameters overwrite +throughput: 123.45 ## Simple key/value +throughput (op/s): 123.45 ## Key, unit (in parentheses) and value +*throughput (images/s): 123.45 ## Primary KPI for regression reporting +``` + +To avoid introducing additional software dependencies, it is recommended to use `gawk` to parse the validation logs and format the output. + +The validation output is assumed to be stored at 1 layer under the current directory. The `kpi.sh` example is as follows: + +``` +#!/bin/bash -e + +awk ' +{ + # KPI parsing script +} +' */output.logs 2>/dev/null || true +``` + +where `2>/dev/null` supresses any error message if `*/output.logs` does not exist, and `||true` makes the `kpi.sh` always returns an ok status. + diff --git a/doc/kubernetes-config.md b/doc/kubernetes-config.md new file mode 100644 index 0000000..359f928 --- /dev/null +++ b/doc/kubernetes-config.md @@ -0,0 +1,87 @@ + +The `kubernetes-config.yaml` script is a manifest that describes how the workload container(s) should be scheduled (to the machine cluster described by `cluster-config.yaml`.) This is the standard Kubernetes script. + +``` +include(config.m4) +... +spec: +... + spec: + containers: + - name: database + image: IMAGENAME(wordpress5mt-defn(`DATABASE')) +... +``` + +where the `IMAGENAME` macro expands the image name to include the `REGISTRY` prefix and the `RELEASE` versions. + +#### About `imagePullPolicy` + +To ensure that the validation runs always on the latest code, it is recommended to use `imagePullPolicy: Always`. However, this requires to use a private docker registry. In local development, `imagePullPolicy: IfNotPresent` is desired. The `config.m4` utility provides a macro, `IMAGEPOLICY`, to switch between `Always` and `IfNotPresent` depending on the `REGISTRY` setting. + +``` +... + spec: + containers: + - name: database + image: IMAGENAME(wordpress5mt-defn(`DATABASE')) + imagePullPolicy: IMAGEPOLICY(Always) +... +``` + +Not all docker images are built equally. Some are less frequently updated and less sensitive to performance. Thus it is preferrable to use `imagePullPolicy: IfNotPresent` in all cases. + +#### About `podAntiAffinity` + +To spread the pods onto different nodes, use `podAntiAffinity` as follows: + +``` +... + metadata: + labels: + app: foo + spec: + PODANTIAFFINITY(preferred,app,foo) +... +``` + +where the convenient macro `PODANTIAFFINITY` expands to + +``` +... + metadata: + labels: + app: foo + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - foo + topologyKey: "kubernetes.io/hostname" +... +``` + +#### About `CLUSTER_WORKERS` + +Some workload uses the Kubernetes operator to launch new Kubernetes pods during the workload execution. It is critical to restrict any newly launched pods to be within the cluster workers that the workload is assigned to run. Define the `CLUSTER_WORKERS` environment variable as follows to retrieve the information about the list of worker node IP addresses. + +``` + spec: + containers: + - name: foo + image: IMAGENAME(foo) + imagePullPolicy: IMAGEPOLICY(Always) + env: + - name: CLUSTER_WORKERS + value: "" +``` + +The value will be replaced with the list of worker node IP addresses, separated by `,`, if the workload is restricted to a set of worker nodes. 
The value will remain unchanged if there is no such restriction. + diff --git a/doc/readme.md b/doc/readme.md new file mode 100644 index 0000000..49d7e1a --- /dev/null +++ b/doc/readme.md @@ -0,0 +1,26 @@ + +The workload README should have the following sections: +- Introduction: Introduce the workload and any background information. +- Test Case: Describe the test cases. +- Docker Image: Describe the docker images and example usages. +- KPI: Describe the KPI definitions and the meanings of the values. +- [System Setup](#system-setup): Describe the system setup. +- [Index Info](#index-info): List workload indexing information. +- See Also: Add any workload-related references. + +### Index Info + +The following information must be provided: + +- Name: The workload friendly name. +- Category: The category of the workload: one of `DataServices`, `ML/DL/AI`, `HPC`, `Media`, `Networking`, `Synthetic`, `uServices`. +- Platform: A list of supported [platform](../workload/platforms) names. +- keywords: A list of related keywords. + +### System Setup + +Include (but not limited to) the following information: +- The minimum system setup. +- The recommendded system setup. +- Any guideline in tuning the workload performance. + diff --git a/doc/setup-auth.md b/doc/setup-auth.md new file mode 100644 index 0000000..691e118 --- /dev/null +++ b/doc/setup-auth.md @@ -0,0 +1,55 @@ + +### Introduction + +A private docker registry is optional in most of the validation senarios except if you want to run the workloads on an on-premesis Kubernetes cluster, or you explicitly setup a docker registry to store any newly built workload images. + +This document describes how to authenticate to a docker registry if the registry requires authentication. Skip this document if there is no authentication requriement. + +### `REGISTRY_AUTH` + +The [`cmake`](cmake.md) `REGISTRY_AUTH` option specifies how to authenticate to a private docker registry. Currently, `docker` is the only supported value, which uses the docker authentication mechanism. + +Enable the `REGISTRY_AUTH` option: + +``` +cmake -DREGISTRY= -DREGISTRY_AUTH=docker .. +``` + +With the above command, the validation scripts will upload the docker authentication information specified in `.docker/config.json` as a Kubernetes `imagePullSecret` to the validation cluster, on-premesis or in Cloud. + +> `CredHelpers` or `CredStore` in `.docker/config.json` is not suppoerted. + +### Authenticate to Cloud Private Registry + +#### Amazon Elastic Container Registry + +``` +make aws +$ aws ecr get-login-password --region | docker login --username AWS --password-stdin .dkr.ecr..amazonaws.com +$ exit +``` + +> Note that the build script will auto-create the image repository namespaces. + +#### Google Cloud Container Registry + +``` +make gcp +$ gcloud iam service-accounts create +$ gcloud projects add-iam-policy-binding --member "serviceAccount:@.iam.gserviceaccount.com" --role "roles/ROLE" +$ gcloud iam service-accounts keys create keyfile.json --iam-account @.iam.gserviceaccount.com +$ gcloud auth print-access-token | docker login -u oauth2accesstoken --password-stdin +$ exit +``` + +> Note that the Oauth2 access token will expire in an hour. 
+ + +#### Azure Container Registry: + +``` +make azure +$ az acr login --name --expose-token --output tsv --query accessToken | docker login -username 00000000-0000-0000-0000-000000000000 --password-stdin +$ exit +``` + diff --git a/doc/setup-containerd.md b/doc/setup-containerd.md new file mode 100644 index 0000000..815051a --- /dev/null +++ b/doc/setup-containerd.md @@ -0,0 +1,57 @@ + +### Introduction + +Starting Kubernetes v1.20, Kubernetes deprecated docker as a runtime and used `containerd` instead. It is a prerequisite to install `containerd` before installing Kubernetes. + +#### Installation + +Install `containerd` from your OS packages: + +``` +apt-get install containerd # Ubuntu or Debian +yum install containerd # Centos +``` + +#### Setup Proxy + +``` +sudo mkdir -p /etc/systemd/system/containerd.service.d +printf "[Service]\nEnvironment=\"HTTP_PROXY=$http_proxy\" \"HTTPS_PROXY=$https_proxy\" \"NO_PROXY=$no_proxy\"\n" | sudo tee /etc/systemd/system/containerd.service.d/proxy.conf +sudo systemctl daemon-reload +sudo systemctl restart containerd +``` + +#### Setup Configuration Files + +``` +containerd config default | sudo tee /etc/containerd/config.toml +sudo systemctl restart containerd +``` + +#### Setup Insecure Registries + +On-Premisis workload validation based on Kuberenetes requires to use a docker registry. If you need to setup any insecure registries with `containerd`, modify the `containerd` configuration as follows, assuming your private registry is `foo.com:5000`: + +``` +sudo sed -i 's|config_path =.*|config_path = "/etc/containerd/certs.d"|' /etc/containerd/config.toml +sudo mkdir -p /etc/containerd/foo.com:5000 +cat | sudo tee /etc/containerd/certs.d/foo.com:5000/hosts.toml <.yaml`](../script/cumulus), where `` is the Cloud vendor name. You can customize as needed. +- If you are behind a corporate firewall, please update the proxy settings in [`ssh_config`](../script/cumulus/ssh_config) accordingly. + +#### Configure Cloud Account + +``` +make aws # or make -C ../.. aws, if under build/workload/ +$ aws configure # please specify a region +$ exit +``` + +``` +make azure # or make -C ../.. azure, if under build/workload/ +$ az login +$ exit +``` + +``` +make gcp # or make -C ../.. gcp, if under build/workload/ +$ gcloud init --no-browser +$ exit +``` + +``` +make tencent # or make -C ../.. tencent, if under build/workload/ +$ tccli configure # please specify a region +$ exit +``` + +``` +make alicloud # make -C ../.. alicloud, if under build/workload/ +$ aliyun configure # please specify a region +$ exit +``` + +#### Run Workload(s) Through Cumulus + +``` +cd workload/ +make +ctest -N +``` + +#### Cleanup Cloud Resources + +If your cumulus validation is interrupted for any reason, the Cloud resource may remain active. You can explicitly cleanup any Cloud resources as follows: + +``` +make -C ../.. aws +$ cleanup +$ exit +``` + +``` +make -C ../.. gcp +$ cleanup +$ exit +``` + +``` +make -C ../.. azure +$ cleanup +$ exit +``` + +``` +make -C ../.. tencent +$ cleanup +$ exit +``` + +``` +make -C ../.. alicloud +$ cleanup +$ exit +``` + +#### Use A Cloud Private Registry + +A Cloud private registry is a convenient option to store workload images. During the Cloud validation, the SUTs (System Under Test) can directly pull images from the docker registry without transfering any images. Here we assume the Cloud registry and the SUTs are in the same region. 
+ +Add the following flag in the `flags` section of `script/cumulus/cumulus-config..yaml` to indicate that the SUTs can directly access to the registry. The cumulus backend will then skip transfering workload images during validation: + +``` + flags: + skopeo_sut_accessible_registries: "" +``` + +See Also: [Private Registry Authentication](setup-auth.md) + + +### Setup Cumulus for On-Premesis Validation + +- Setup a [Kubernetes](setup-kubernetes.md#Setup-Kubernetes) cluster. Customize [`cumulus-config.static.yaml`](../script/cumulus/cumulus-config.static.yaml) to specify your cluster information. +- Run the [`setup-sut.sh`](../script/cumulus/script/setup-sut.sh) script to setup the SUT (System Under Test) hosts as follows: + +``` +./setup-sut.sh user@host1 [user@host2...] +``` + +> The script requires sudo permission on the SUT hosts. + +### Cumulus Options + +Use the following options to customize the cumulus validation: + +- Set the default to use `docker` in validation wherever possible: + +``` +cmake -DCUMULUS_OPTIONS=--docker-run .. +``` + +- Set the dry-run mode. Configure the workload but skip the execution stage: + +``` +cmake -DCUMULUS_OPTIONS=--dry-run .. +``` + +### Telemetry Tracing + +You can enable telemetry tracing via `sar`, `emon`, and/or `collectd` as follows: +- **`sar`**: Add `--sar` to `CUMULUS_OPTIONS`. +- **`emon`**: Add `--emon --edp_publish --emon_post_process_skip` to `CUMULUS_OPTIONS`. +- **`collectd`**: Add `--collectd` to `CUMULUS_OPTIONS`. + +``` +cmake -DCUMULUS_OPTIONS=--collectd .. +cd workload/ +ctest -N +``` + +For On-Cloud validation, there is no additional setup. For On-Premesis validation, you need to perform additional setup for each telemetry tracing mechniasm: + +#### Setup `sar` On-Prem + +- Install the `sar` utility on your worker nodes. + +#### Setup `EMON` On-Prem + +On your worker nodes, +- Create a `/opt/pkb` folder with the right ownership: + +``` +sudo mkdir -p /opt/pkb +sudo chown $(id -u):$(id -g) /opt/pkb +``` + +- Download and install [EMON](https://www.intel.com/content/dam/develop/public/us/en/documents/emon-user-guide-nov-2019.pdf) to `/opt/emon/emon_files`. +> Note it is critical that the installation location is `/opt/emon/emon_files`. + +- Add your worker username to the `vtune` group. + +``` +sudo usermod -aG vtune $(id -gn) +``` + +##### For `EDP` post process capabilities + +On your worker nodes install `python3` and add the following pip packages: + +``` +sudo python3 -m pip install xlsxwriter pandas numpy pytz defusedxml tdigest dataclasses +``` + +#### Setup Collectd On-Prem + +On your worker nodes, +- Install `flex`, `bison`, `autoconf`, `automake` and `libtool`. +- Download [collectd](https://github.com/collectd/collectd) and compile it as follows: + +``` +sudo mkdir -p /opt/pkb +sudo chown -R $(id -u).$(id -g) /opt/pkb +sudo mkdir -p /opt/collectd +sudo chown -R $(id -u).$(id -g) /opt/collectd + +git clone https://github.com/collectd/collectd.git +cd collectd +./build.sh +./configure --prefix=/opt/collectd/collectd +make +make install +``` + +- Copy [collectd.conf](../script/cumulus/collectd.conf) to `/opt/collectd/collectd/etc`. + +### Setup SVRINFO + +The `svrinfo` utility is used to retrieve system-level information at the beginning of any validation run. Since `svrinfo` is under NDA only, the use of `svrinfo` is optional and by default disabled. + +To setup `svrinfo`, copy the `svrinfo` tarball under `script/cumulus/pkb/perfkitbenchmarker/data/svrinfo`. Remake. 
Then turn on the `svrinfo` option as follows: + +``` +cmake -DCUMULUS_OPTIONS=--svrinfo .. +``` + +### Cumulus Debugging + +Enable the cumulus debugging mode as follows: + +- Specify break points in `CUMULUS_OPTIONS`: + +``` +cmake -DCUMULUS_OPTIONS=--dpt_debug=[,] .. +``` + +where `` can be one of more of the following strings: +- `PrepareStage`: Pause when the workload is about to setup the host environment. +- `SetupVM`: Pause when the workload is about to setup external VMs. +- `RunStage`: Pause when the workload is about to start the workload execution. +- `CleanupStage`: Pause when the workload is about to cleanup. +- `ScheduleExec`: Pause when the workload is about to schedule execution. +- `ExtractLogs`: Pause when the workload is about to extract logs. +- `ExtractKPI`: Pause when the workload is about to extract KPIs. +- `ScheduleExecFailed`: Pause when scheduling execution is failed. +- `ExtractLogsFailed`: Pause when extracting logs is failed. +- `ExtractKPIFailed`: Pause when extracting KPI is failed. + +Start the workload validation as usual (ctest), cumulus will pause at the specified breakpoints. You can start a new shell and login to the cumulus container as follows: + +``` +./debug.sh +$ +``` + +Now you can `ssh` to the remote worker and start debugging. To resume validation, simply create an empty signalling file `Resume` under `/tmp/pkb/runs//` as follows: + +``` +> touch /tmp/pkb/runs/784d84f59e3d/ResumeRunStage +``` + + +### See Also + +- [TCP TIME_WAIT Reuse](https://github.com/intel/Updates-for-OSS-Performance/blob/main/time_wait.md) +- [Unsuitable CPU Speed Policy](https://github.com/intel/Updates-for-OSS-Performance/blob/main/cpufreq.md) diff --git a/doc/setup-docker.md b/doc/setup-docker.md new file mode 100644 index 0000000..531ab28 --- /dev/null +++ b/doc/setup-docker.md @@ -0,0 +1,33 @@ + +The `docker` engine is a prerequsite to build the workload images. It is also one of the validation backends that can be used to run single-container workloads on your local machine. + +### Setup Docker + +Follow the instructions to install the `docker` engine on your local system. The docker version `20.10.10` or later is required for full features. + +``` +curl -fsSL https://get.docker.com -o get-docker.sh +sh get-docker.sh +``` + +> It is recommended that you complete the [post-installation steps](https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user) to manage `docker` as a non-root user. + +### Setup Proxies + +If you are behind a firewall, complete the following steps to setup the proxies: + +``` +sudo mkdir -p /etc/systemd/system/docker.service.d +printf "[Service]\nEnvironment=\"HTTP_PROXY=$http_proxy\" \"HTTPS_PROXY=$https_proxy\" \"NO_PROXY=$no_proxy\"\n" | sudo tee /etc/systemd/system/docker.service.d/proxy.conf +sudo systemctl daemon-reload +sudo systemctl restart docker +``` + +### Docker Login + +Login to your dockerhub account so that you can pull images from dockerhub. + +### See Also + +- [Docker Setup](https://docs.docker.com/engine/install/#server) + diff --git a/doc/setup-hugepage.md b/doc/setup-hugepage.md new file mode 100644 index 0000000..044e6bb --- /dev/null +++ b/doc/setup-hugepage.md @@ -0,0 +1,43 @@ + +### Hugepage Setup + +Workloads that require to use hugepages must specify a `HAS-SETUP-HUGEPAGE` label in the format of `HAS-SETUP-HUGEPAGE--`, where `` is the hugepage size and `` is the #pages required. 
+The `<size>` value must exactly match, case sensitively, one of the hugepage sizes listed under `/sys/kernel/mm/hugepages`. For example, to request 1024 pages of 2MB hugepages, use `HAS-SETUP-HUGEPAGE-2048kB-1024`.
+
+### System Setup
+
+Setup hugepages through the kernel boot parameters, as follows:
+
+```
+sudo grubby --update-kernel=DEFAULT --args="hugepages=1024"
+```
+
+Then reboot the machine for the hugepages to take effect.
+
+For Ubuntu, edit `/etc/default/grub` (for example, `sudo vi /etc/default/grub`) and add the number of huge pages to `GRUB_CMDLINE_LINUX`, like this:
+
+```
+GRUB_CMDLINE_LINUX="hugepages=1024"
+```
+
+Then run `sudo update-grub` and reboot.
+
+To verify the changes, run `cat /proc/meminfo | grep Huge`.
+
+---
+
+Kubernetes only recognizes hugepages if they are preallocated through boot parameters.
+
+---
+
+### Node Labels
+
+To avoid creating a lot of node labels, it is recommended to specify the number of pages only in powers of 2. Label the worker node(s) with the following node labels:
+- `HAS-SETUP-HUGEPAGE-2048kB-512=yes` Optional
+- `HAS-SETUP-HUGEPAGE-2048kB-1024=yes` Optional
+- `HAS-SETUP-HUGEPAGE-2048kB-2048=yes` Optional
+- `HAS-SETUP-HUGEPAGE-2048kB-4096=yes` Optional
+
+### See Also
+
+- [Manage HugePages](https://kubernetes.io/docs/tasks/manage-hugepages/scheduling-hugepages)
+
diff --git a/doc/setup-kubernetes.md b/doc/setup-kubernetes.md
new file mode 100644
index 0000000..66a5aee
--- /dev/null
+++ b/doc/setup-kubernetes.md
@@ -0,0 +1,45 @@
+
+`Kubernetes` is the default validation backend for running single- or multi-container workloads on your local cluster of machines.
+
+### Prerequisite
+
+Starting with v1.20, Kubernetes deprecated `docker` as a container runtime in favor of `containerd`. Follow the [instructions](setup-containerd.md) to install and configure `containerd` on your system.
+
+### Setup Kubernetes
+
+Follow the [Ubuntu](https://phoenixnap.com/kb/install-kubernetes-on-ubuntu)/[CentOS](https://phoenixnap.com/kb/how-to-install-kubernetes-on-centos) instructions to setup a Kubernetes cluster. For full features, please install Kubernetes v1.21 or later.
+
+---
+
+You can build the workloads and run the workloads on the same machine by setting up a single-node Kubernetes cluster:
+
+```
+kubectl taint node --all node-role.kubernetes.io/master-
+kubectl taint node --all node-role.kubernetes.io/control-plane- # >= v1.20
+```
+
+---
+
+### Setup Node Feature Discovery
+
+Install node feature discovery as follows:
+
+```
+kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default
+```
+
+### Setup arm64 Emulation
+
+You can setup any worker node as an arm64 emulator. To do so, run the [`setup.sh`](../script/march/setup.sh) script on each such worker node:
+
+```
+script/march/setup.sh
+```
+
+### See Also
+
+- [Docker Setup](setup-docker.md)
+- [Kubernetes Setup](setup-kubernetes.md)
+- [Private Registry Authentication](setup-auth.md)
+- [Cumulus Setup](setup-cumulus.md)
+- [`cluster-config.yaml`](cluster-config.md)
diff --git a/doc/setup-module.md b/doc/setup-module.md
new file mode 100644
index 0000000..7181bbb
--- /dev/null
+++ b/doc/setup-module.md
@@ -0,0 +1,20 @@
+
+### Module Setup
+
+The `HAS-SETUP-MODULE` labels request the installation of kernel modules that are part of the OS distribution but are not installed by default during boot.
+
+The label should be specified in the format of `HAS-SETUP-MODULE-<module>`, where `<module>` is the module name with `_` replaced by `-`.
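+
+As an illustrative sketch (the node name `worker-1` is hypothetical), requesting the `msa` module maps to a node label such as:
+
+```
+kubectl label node worker-1 HAS-SETUP-MODULE-MSA=yes
+```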
+ +### System Setup + +The kernel module can be installed as follows: + +``` +sudo modprobe .ko +``` + +### Node Labels + +Add a node label to the worker node(s): +- `HAS-SETUP-MODULE-MSA`: Optional + diff --git a/doc/setup-storage.md b/doc/setup-storage.md new file mode 100644 index 0000000..5c5146c --- /dev/null +++ b/doc/setup-storage.md @@ -0,0 +1,10 @@ + +### Storage Setup + +#### SSD Disks + +Certain workloads require to use scratch disk(s) as cache storage. The workers must be equipped with the right SSD disks. + +Label the worker nodes with the following node labels: +- `HAS-SETUP-DISK-MOUNT-1`: The worker node must have a SSD disk with the size of at least 500GB. The SSD disk must be mounted under `/mnt/disk1`. + diff --git a/doc/stack.md b/doc/stack.md new file mode 100644 index 0000000..cea2428 --- /dev/null +++ b/doc/stack.md @@ -0,0 +1,14 @@ + +### Software Stack Elements + +A software stack is the underlying software layers that a workload is constructed upon. Software stack consists of the following elements, some described in this document and others in the linked document. + +- **[Dockerfiles](dockerfile.md)**: A software stack may contain one or many Dockerfiles. +- **[CMakeLists.txt](cmakelists.md)**: A manifest to configure `cmake`. +- **[build.sh](build.md)**: A script for building the workload docker image(s). + +### See Also + +- [Dockerfile Requirements](dockerfile.md) + + diff --git a/doc/template.md b/doc/template.md new file mode 100644 index 0000000..2d6aa0c --- /dev/null +++ b/doc/template.md @@ -0,0 +1,40 @@ + +There is a template system, based on `m4`, built into the workload build process. You can use the template system to simplify the workload recipe development by encapsulating any duplicated steps. + +To use the template system, create a (or more) `.m4` files under your workload folder, and put +any shared templates `.m4` under the `template` folder under the workload, feature, platform or the top directory. During the build process, those `.m4` files will be translated to the corresponding files without the `.m4` suffix, before docker build. + +The following sample uses `ippmb.m4` to encapsulate the IPP library installation steps: + +``` +# SPR/Crypto/WordPress/Docker.1.nginx.m4 +... +include(ippmb.m4) +... +``` + +where `ippmb.m4` will be expanded to: + +``` +# SPR/Crypto/template/ippmb.m4 +ARG IPP_CRYPTO_VERSION="ippcp_2020u3" +ARG IPP_CRYPTO_REPO=https://github.com/intel/ipp-crypto.git +RUN git clone -b ${IPP_CRYPTO_VERSION} --depth 1 ${IPP_CRYPTO_REPO} && \ + cd /ipp-crypto/sources/ippcp/crypto_mb && \ + cmake . -B"../build" \ + -DOPENSSL_INCLUDE_DIR=/usr/local/include/openssl \ + -DOPENSSL_LIBRARIES=/usr/local/lib64 \ + -DOPENSSL_ROOT_DIR=/usr/local/bin/openssl && \ + cd ../build && \ + make crypto_mb && \ + make install +``` + +### Pre-defined Variables: + +- **PLATFORM**: The platform name that the workload is defined for. +- **FEATURE**: The hero feature name that the workload is defined under. +- **WORKLOAD**: The workload name. +- **REGISTRY**: The private registry. +- **RELEASE**: The release version. + diff --git a/doc/validate.md b/doc/validate.md new file mode 100644 index 0000000..1e4fbbd --- /dev/null +++ b/doc/validate.md @@ -0,0 +1,55 @@ + +The `validate.sh` script initiates the workload execution, with a typical `validate.sh` shown as follows: + +``` +#!/bin/bash -e + +# Read test case configuration parameters +... + +# Logs Setting +DIR=$(dirname $(readlink -f "$0")) +. 
"$DIR/../../script/overwrite.sh" + +# Workload Setting +WORKLOAD_PARAMS="" + +# Docker Setting +DOCKER_IMAGE="$DIR/Dockerfile" +DOCKER_OPTIONS="" + +# Kubernetes Setting +RECONFIG_OPTIONS="-DCONFIG=$CONFIG" +JOB_FILTER="job-name=benchmark" + +. "$DIR/../../script/validate.sh" +``` + +The `validate.sh` saves any validation results to the current directory. + +> The following script variables are reserved. Avoid overwriting their values in `validate.sh`: +> `PLATFORM`, `WORKLOAD`, `TESTCASE`, `REGISTRY`, `RELEASE`, +> `IMAGEARCH`, `TIMEOUT`, and `SCRIPT`. + +### Validation Parameters + +- **`WORKLOAD_PARAMS`**: Specify the workload configuration parameters as an array variable of `key:value` pairs. The configuration parameters will be shown as software configuration metadata associated with the workload. +- **`WORKLOAD_TAGS`**: Specify any workload related tags as a space separated string. +- **`DOCKER_IMAGE`**: If the workload is a single-container workload and support docker run, specify either the docker image name or the `Dockerfile` used to compile the docker image. If the workload does not support docker run, leave the variable value empty. +- **`DOCKER_DATASET`**: Specify a set of dataset images as an array variable. The dataset image(s) will be volume-mounted to the running docker image. +- **`DOCKER_OPTIONS`**: Specify any docker run options, if the workload supports docker run. +- **`RECONFIG_OPTIONS`**: Specify any `m4` configuration parameters when `kubernetes-config.yaml.m4` and `cumulus-config.yaml.m4` are configured. This applies to the `kuberentes` and `cumulus` validation. +- **`JOB_FILTER`**: Specify which job/deployment is used to monitor the validation progress and after validation completion, retrieve the validation logs. +- **`SCRIPT_ARGS`**: Specify the script arguments for the `kpi.sh` or `setup.sh`. + +### Event Tracing Parameters + +- **`EVENT_TRACE_PARAMS`**: Specify the event tracing parameters: + - `roi`: Specify the ROI-based trace parameters: `roi,,`. For example, the trace parameters can be `roi,begin region of interest,end region of interest`. The workload must be instrumented to print these phrases in the console output. + - `time`: Specify a time-based trace parameters: `time,,`. For example, if the trace parameters are `time,30,10`, the trace collection starts 30 seconds after the workload containers become ready and the collection duration is 10 seconds. + - For short-ROI workloads (less than a few seconds), it is recommended that you specify the `EVENT_TRACE_PARAMS` value as an empty string, meaning that the trace ROI should be the entirety of the workload execution, which ensures that the trace collection catches the short duration of the workload execution. + +> Between `roi` and `time`, use `roi` if possible and use `time` as the last resort if the workload does not output anything meaningful to indicate a ROI. + +> Note that none of the event tracing mechanisms is timing accurate. You need to define the event trace parameter values with a high timing tolerance, at least in seconds. + diff --git a/doc/workload.md b/doc/workload.md new file mode 100644 index 0000000..d1be0b5 --- /dev/null +++ b/doc/workload.md @@ -0,0 +1,19 @@ + +### Workload Elements + +A workload consists of the following elements, some described in this document and others in the linked document: + +- **[Dockerfiles](dockerfile.md)**: A workload may contain one or many Dockerfiles. +- **[CMakeLists.txt](cmakelists.md)**: A manifest to configure `cmake`. 
+- **[build.sh](build.md)**: A script for building the workload docker image(s). +- **[validate.sh](validate.md)**: A script for executing the workload. +- **[kpi.sh](kpi.md)**: A script for extracting KPI data out of the workload execution logs. +- **[cluster-config.yaml.m4](cluster-config.md)**: A manifest to describe how to provision a machine or a set of machines for running the workload. +- **[kubernetes-config.yaml.m4](kubernetes-config.md)**: A manifest to describe how to schedule the containers to the cluster for Kubernetes. +- **[README](readme.md)**: A README to describe the workload. + +### See Also + +- [Dockerfile Requirements](dockerfile.md) +- [Provisioning Specification](cluster-config.md) + diff --git a/script/benchmark/ctest.sh b/script/benchmark/ctest.sh new file mode 100755 index 0000000..8a53e2d --- /dev/null +++ b/script/benchmark/ctest.sh @@ -0,0 +1,289 @@ +#!/bin/bash -e + +if [ "$#" -eq 0 ]; then + echo "Usage: [options]" + echo "--loop Run the ctest commands sequentially." + echo "--burst Run the ctest commands simultaneously." + echo "--run Run the ctest commands on same SUT (only with cumulus)." + echo "--test-config Specify the test-config yaml." + echo "--nohup Run the script as a daemon." + echo "--stop Kill all ctest sessions." + echo "--set Set variable values between burst and loop iterations." + echo "--continue Ignore any error and continue the burst and loop iterations." + echo "--prepare-sut Prepare cloud SUT for reuse." + echo "--reuse-sut Reuse the cloud SUT previously prepared." + echo "--cleanup-sut Cleanup cloud SUT." + echo "--dry-run Generate the testcase configurations and then exit." + echo "ctest options apply" + echo "" + echo " accepts the following formats:" + echo "VAR=str1 str2 str3 Enumerate the variable values." + echo "VAR=1 3 5 ...20 [|7] Increment variable values linearly, with mod optionally." + echo "VAR=1 2 4 ...32 [35|] Increment variable values exponentially, with mod optionally." + echo "VAR1=n1 n2/VAR2=n1 n2 Permutate variable 1 and 2." + echo "The values are repeated if insufficient to cover the loops." 
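+  # Illustrative invocations (the variable name and test filters below are hypothetical;
+  # standard ctest options such as -R, -V and -N pass through):
+  #   ./ctest.sh --loop=3 -R throughput -V
+  #   ./ctest.sh --burst=2 --continue -R sanity
+  #   ./ctest.sh --set "BATCH_SIZE=1 2 4 ...32" --loop=6 -N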
+ exit 3 +fi + +run_as_nohup="" +args=() +for var in "$@"; do + case "$var" in + --nohup) + run_as_nohup="1" + ;; + --stop) + kill -9 -a $(ps auxwww | grep ctest | awk '{print$2}') 2> /dev/null || echo -n "" + exit 0 + ;; + *) + args+=("$var") + ;; + esac +done + +if [ -n "$run_as_nohup" ]; then + nohup "$0" "${args[@]}" > nohup.out 2>&1 & + echo "tail -f nohup.out to monitor progress" + exit 0 +fi + +run=1 +burst=1 +loop=1 +step=1 +args=() +steps=() +contf=0 +test_config="$(readlink -f "$TEST_CONFIG" || echo "")" +prepare_sut=0 +sut=() +cleanup_sut=0 +reuse_sut=0 +dry_run=0 +for var in "$@"; do + case "$var" in + --loop=*) + loop="${var/--loop=/}" + ;; + --loop) + loop="-1" + ;; + --burst=*) + burst="${var/--burst=/}" + ;; + --burst) + burst="-1" + ;; + --run=*) + run="${var/--run=/}" + ;; + --run) + run="-1" + ;; + --prepare-sut) + prepare_sut=1 + ;; + --set=*) + steps+=("${var/--set=/}") + ;; + --set) + step="-1" + ;; + --test-config=*) + test_config="$(readlink -f "${var/--test-config=/}")" + ;; + --test-config) + test_config="-1" + ;; + --continue) + contf=1 + ;; + --cleanup-sut) + cleanup_sut=1 + ;; + --reuse-sut) + reuse_sut=1 + ;; + --dry-run) + dry_run=1 + ;; + *) + if [ "$loop" = "-1" ]; then + loop="$var" + elif [ "$burst" = "-1" ]; then + burst="$var" + elif [ "$run" = "-1" ]; then + run="$var" + elif [ "$step" = "-1" ]; then + steps+=("$var") + step=1 + elif [ "$test_config" = "-1" ]; then + test_config="$(readlink -f "$var")" + else + args+=("$var") + fi + ;; + esac +done + +if [ "$loop" = "-1" ]; then + loop=1 +fi + +if [ "$burst" = "-1" ]; then + burst=1 +fi + +if [ "$run" = "-1" ]; then + run=1 +fi + +if [ $prepare_sut = 1 ] || [ $cleanup_sut = 1 ]; then + loop=1 + burst=1 + run=1 +fi + +if [ $reuse_sut = 1 ]; then + burst=1 +fi + +readarray -t values < <(for step1 in "${steps[@]}"; do + echo "$step1" | tr '/' '\n' | awk '{ + split($1,kv,"=") + $1=kv[2] + if ($NF~/^\|[0-9]+$/ || $NF~/^[0-9]+\|$/) { + modstr=$NF + NF=NF-1 + } else { + modstr="" + } + if (($NF ~ /^\.\.\./) && (NF>3)) { + stop=gensub(/^\.\.\./,"",1,$NF) + $NF="" + current=$(NF-1) + delta=current-$(NF-2) + if ($(NF-2)-$(NF-3)==delta) { + for(i=NF-1;(current+0) <= (stop+0);i++) { + $i=current + current+=delta + } + } + if ($(NF-3)!=0 && $(NF-2)!=0) { + factor=$(NF-1)/$(NF-2) + if ($(NF-3)*factor==$(NF-2)) { + for(i=NF-1;(current+0) <= (stop+0);i++) { + $i=current + current*=factor + } + } + } + } + if (modstr~/^\|[0-9]+/) { + modval=gensub(/\|/,"",1,modstr) + j=0 + for(i=1;i<=NF;i++) { + if (($i % modval)==0) { + j++ + if (i!=j) $j=$i + } + } + NF=j + } + if (modstr~/^[0-9]+\|$/) { + modval=gensub(/\|/,"",1,modstr) + j=0 + for(i=1;i<=NF;i++) { + if ($i!=0) { + if ((modval % $i)==0) { + j++ + if (i!=j) $j=$i + } + } + } + NF=j + } + for(k=1;k<=NF;k++) + vars[kv[1]][k]=$k + } + END { + nk=1 + for(k in vars) { + nk*=length(vars[k]) + } + nk1=nk + nk2=1 + for(k in vars) { + varn=length(vars[k]) + nk1=nk1/varn + printf "%s ", k + for(r=0;r> $tmp + for var1 in "${values[@]}"; do + values1=($(echo "$var1" | tr ' ' '\n')) + key1="${values1[0]}" + val1="${values1[$(( (((loop1-1)*burst+burst1-1) % (${#values1[@]}-1))+1 ))]}" + echo "$key1: $val1" + echo " $key1: \"$val1\"" >> $tmp + done + tmp_files+=($tmp) + fi + ( + export CTESTSH_OPTIONS="$CTESTSH_OPTIONS --run_stage_iterations=$run" + [ $prepare_sut = 1 ] && export CTESTSH_OPTIONS="$CTESTSH_OPTIONS --run_stage=provision,prepare" + [ $cleanup_sut = 1 ] && export CTESTSH_OPTIONS="$CTESTSH_OPTIONS --cleanup-sut" + [ $reuse_sut = 1 ] && export 
CTESTSH_OPTIONS="$CTESTSH_OPTIONS --reuse-sut" + [ $dry_run = 1 ] && export CTESTSH_OPTIONS="$CTESTSH_OPTIONS --dry-run" + set -x + ctest "${args[@]}" + ) & + pids+=($!) + done + if [ $contf = 1 ]; then + wait ${pids[@]} || true + else + wait ${pids[@]} + fi + remove_tmp_files +done diff --git a/script/benchmark/debug.sh b/script/benchmark/debug.sh new file mode 100755 index 0000000..3e4565d --- /dev/null +++ b/script/benchmark/debug.sh @@ -0,0 +1,13 @@ +#!/bin/bash -e + +owner="${1:-$( (git config user.name || id -un) 2> /dev/null)-}" +cmd="docker ps -f name=$(echo $owner | tr 'A-Z' 'a-z' | tr -c -d 'a-z0-9-' | sed 's|^\(.\{12\}\).*$|\1|')" +if [ "$($cmd | wc -l)" -ne 2 ]; then + echo "None or multiple ctest instances detected:" + echo "" + $cmd --format '{{.ID}} {{.Names}}' + echo "" + echo "Please identify the instance with: ./debug.sh " + exit 3 +fi +docker exec -u pkb -it $($cmd --format '{{.ID}}') bash diff --git a/script/benchmark/kpi-list.awk b/script/benchmark/kpi-list.awk new file mode 100755 index 0000000..07d3f2d --- /dev/null +++ b/script/benchmark/kpi-list.awk @@ -0,0 +1,79 @@ +#!/usr/bin/gawk + +/^#svrinfo[:-]/ { + next +} +{ + print $0 +} +/^*/ { + kpi=$NF + $NF="" + n[$0]=n[$0]+1 + kpis[$0][n[$0]]=kpi +} +END { + print "" + for (x in n) { + sum[x]=0 + sumsq[x]=0 + for (y in kpis[x]) { + sum[x]+=kpis[x][y] + sumsq[x]+=kpis[x][y]^2 + } + average=sum[x]/n[x] + stdev=sqrt((sumsq[x]-sum[x]^2/n[x])/n[x]) + + print "avg "x,average + print "std "x,stdev + + average=sum[x]/n[x] + stdev=sqrt((sumsq[x]-sum[x]^2/n[x])/n[x]) + + asort(kpis[x], kpis1, "@val_num_asc") + if(n[x]%2) { + k=(n[x]+1)/2 + print "med "x,kpis1[k] + } else { + k=n[x]/2+1 + print "med "x,kpis1[k] + } + + r=0 + if (outlier>0) { + for (y in kpis[x]) { + if ((kpis[x][y]>average+outlier*stdev)||(kpis[x][y]0) { + print "removed "r" outlier(s)" + + sum[x]=0 + sumsq[x]=0 + n[x]=0 + for (y in kpis[x]) { + sum[x]+=kpis[x][y] + sumsq[x]+=kpis[x][y]^2 + n[x]=n[x]+1 + } + + asort(kpis[x], kpis1, "@val_num_asc") + if(n[x]%2) { + k=(n[x]+1)/2 + print "med "x,kpis1[k] + } else { + k=n[x]/2+1 + print "med "x,kpis1[k] + } + + average=sum[x]/n[x] + stdev=sqrt((sumsq[x]-sum[x]^2/n[x])/n[x]) + print "avg "x,average + print "std "x,stdev + } + } +} diff --git a/script/benchmark/kpi-xls-ai.awk b/script/benchmark/kpi-xls-ai.awk new file mode 100755 index 0000000..264e2c9 --- /dev/null +++ b/script/benchmark/kpi-xls-ai.awk @@ -0,0 +1,217 @@ +#!/usr/bin/gawk + +BEGIN { + if (var1 == "default") var1="batch_size" + if (var2 == "default") var2="cores_per_instance" + if (var3 == "default") var3="*Throughput" + if (var4 == "default") var4="Throughput_" +} + +function get_value() { + if ($NF*1 == $NF) return $NF + if ($(NF-1)*1 == $(NF-1)) return $(NF-1) + print "Unable to extract value: "$0 > "/dev/stderr" + exit 3 +} + +/^#svrinfo: / { + product=$3 +} + +/\/itr-[0-9]*:$/{ + name=gensub(/^.*-logs-(.*)\/itr-.*$/,"\\1",1) + itr=gensub(/^.*\/itr-([0-9]+):$/,"\\1",1) +} + +index($0,var1)==1 || ($1=="#" && index($2,var1)==1) { + var1v=gensub(/"(.*)"/,"\\1",1,$NF) +} + +index($0,var2)==1 || ($1=="#" && index($2,var2)==1) { + var2v=gensub(/"(.*)"/,"\\1",1,$NF) +} + +index($0,var3)==1 { + var3v[name][product][var1v][var2v][++var3vct[name][product][var1v][var2v]]=get_value() + n=length(var3v[name][product][var1v][var2v]) + if (n>var34n[name][product][var1v][var2v]) + var34n[name][product][var1v][var2v]=n +} + +index($0,var4)==1 { + idx=gensub(/ *([0-9]+).*$/,"\\1",1,substr($0,length(var4)+1)) + 
var4v[name][product][var1v][var2v][idx][++var4vct[name][product][var1v][var2v][idx]]=get_value() + n=length(var4v[name][product][var1v][var2v][idx]) + if (n>var34n[name][product][var1v][var2v]) + var34n[name][product][var1v][var2v]=n +} + +END { + add_xls_header() + + print "" + print "" + + print "" + print "" escape(var1) "" + for (ws in var3v) { + for (p in var3v[ws]) { + ws_p=ws"-"p + print "" escape(ws_name_ex(ws_p)) "" + for (v1 in var3v[ws][p]) { + v1s[v1][ws_p]=0 + for (v2 in var3v[ws][p][v1]) { + var3m=length(var3v[ws][p][v1][v2])>0?median(var3v[ws][p][v1][v2]):0 + if (var3m>v1s[v1][ws_p]) v1s[v1][ws_p]=var3m + } + } + } + } + print "" + + n1=asorti(v1s,v1sp,"@ind_num_asc") + for (v1=1;v1<=n1;v1++) { + print "" + print "" v1sp[v1]*1 "" + for (ws in var3v) { + for (p in var3v[ws]) { + print "" v1s[v1sp[v1]][ws"-"p]*1 "" + } + } + print "" + } + print "
" + print "
" + + for (ws in var34n) { + for (p in var34n[ws]) { + print "" + print "" + n1=asorti(var34n[ws][p], var1sp, "@ind_num_asc") + + th=1 + for(v1=1;v1<=n1;v1++) { + th++ + n2=asorti(var34n[ws][p][var1sp[v1]], var2sp, "@ind_num_asc") + for (v2=1;v2<=n2;v2++) { + th++ + n3=var34n[ws][p][var1sp[v1]][var2sp[v2]] + for (i=1;i<=n3;i++) { + print "" + th++ + } + } + th++ + } + + th=1 + print "" + for (v1=1;v1<=n1;v1++) { + print "" escape(var1) "" + print "" var1sp[v1]*1 "" + th++ + + for (v2 in var34n[ws][p][var1sp[v1]]) + th+=var34n[ws][p][var1sp[v1]][v2]+1 + th++ + } + print "" + + print "" + th=1 + for(v1=1;v1<=n1;v1++) { + print "" escape(var2) "" + th++ + + n2=asorti(var34n[ws][p][var1sp[v1]], var2sp, "@ind_num_asc") + for (v2=1;v2<=n2;v2++) { + print "" var2sp[v2]*1 "" + th+=var34n[ws][p][var1sp[v1]][var2sp[v2]]+1 + } + th++ + } + print "" + + print "" + th=1 + for(v1=1;v1<=n1;v1++) { + print "" escape(gensub(/^\*/,"",1,var3)) "" + th++ + + n2=asorti(var34n[ws][p][var1sp[v1]], var2sp, "@ind_num_asc") + for (v2=1;v2<=n2;v2++) { + var3m=length(var3v[ws][p][var1sp[v1]][var2sp[v2]])>0?median(var3v[ws][p][var1sp[v1]][var2sp[v2]]):0 + print "" var3m*1 "" + th++ + + n3=var34n[ws][p][var1sp[v1]][var2sp[v2]] + for (i=1;i<=n3;i++) { + vi=(length(var3v[ws][p][var1sp[v1]][var2sp[v2]])>0)?var3v[ws][p][var1sp[v1]][var2sp[v2]][i]:0 + if (vi==var3m) { + print "" var3m*1 "" + var4i[ws][var1sp[v1]][var2sp[v2]]=i + } else { + print "" vi*1 "" + } + th++ + } + } + th++ + } + print "" + + print "" + th=1 + cn=0 + for(v1=1;v1<=n1;v1++) { + print "count" + th++ + + n2=asorti(var34n[ws][p][var1sp[v1]], var2sp, "@ind_num_asc") + for (v2=1;v2<=n2;v2++) { + count=length(var4v[ws][p][var1sp[v1]][var2sp[v2]]) + print "" count*1 "" + if (count>cn) cn=count + th+=var34n[ws][p][var1sp[v1]][var2sp[v2]]+1 + } + th++ + } + print "" + + for (c=1;c<=cn;c++) { + print "" + + th=2 + for(v1=1;v1<=n1;v1++) { + n2=asorti(var34n[ws][p][var1sp[v1]], var2sp, "@ind_num_asc") + for (v2=1;v2<=n2;v2++) { + n4=var34n[ws][p][var1sp[v1]][var2sp[v2]] + if (length(var4v[ws][p][var1sp[v1]][var2sp[v2]][c])>0) { + var4ii=var4i[ws][var1sp[v1]][var2sp[v2]] + print "" var4v[ws][p][var1sp[v1]][var2sp[v2]][c][var4ii]*1 "" + + for(i=1;i<=n4;i++) { + if (i==var4ii) { + print "" var4v[ws][p][var1sp[v1]][var2sp[v2]][c][i]*1 "" + } else { + print "" var4v[ws][p][var1sp[v1]][var2sp[v2]][c][i]*1 "" + } + } + } + th+=n4+1 + } + th+=2 + } + print "" + } + + print "
" + print "
" + } + + # write svrinfo + if (length(svrinfo_values[ws])>0) + add_svrinfo(ws) + } + print "" +} diff --git a/script/benchmark/kpi-xls-inst.awk b/script/benchmark/kpi-xls-inst.awk new file mode 100755 index 0000000..693ceb3 --- /dev/null +++ b/script/benchmark/kpi-xls-inst.awk @@ -0,0 +1,96 @@ +#!/usr/bin/gawk + +/^#svrinfo: / { + name=gensub(/^.*-logs-(.*)\/runs\/.*$/,"\\1",1,$2) + product=$3 +} + +/\/itr-[0-9]*:$/ { + name=gensub(/^.*-logs-(.*)\/itr-.*$/,"\\1",1) +} + +(!/^#/) && /.*: *[0-9.-]+ *$/ { + k=gensub(/^(.*):.*$/, "\\1", 1) + v=gensub(/^.*: *([0-9.-]+) *$/, "\\1", 1) + kpis[name][product][k][++kpisct[name][product][k]]=v + kpis_uniq[name][k]=1 +} + +END { + add_xls_header(1) + + for (ws in kpis) { + nk=asorti(kpis_uniq[ws], ksp, "@ind_str_asc") + if(nk>24) nk=24 + np=asorti(kpis[ws], psp, "@ind_str_asc") + + print "" + print "" + + th=2 + for (p=1;p<=np;p++) { + ith[p]=th + nk1=0 + for (k=1;k<=nk;k++) { + nk1n=length(kpis[ws][psp[p]][ksp[k]]) + if (nk1n>nk1) nk1=nk1n + } + for (k=1;k<=nk1;k++) + print "" + + th+=nk1+1 + } + ith[p]=th + + print "" + print "Instance Type" + for (p=1;p<=np;p++) { + print "" escape(psp[p]) "" + } + print "" + + # calculate median + for (p=1;p<=np;p++) { + kn[p]=length(kpis[ws][psp[p]][ksp[1]]) + m=median(kpis[ws][psp[p]][ksp[1]]) + kii[p]=0 + for (i=1;i<=kn[p];i++) + if (m==kpis[ws][psp[p]][ksp[1]][i]) + kii[p]=i + } + + # kpis + for(k=1;k<=nk;k++) { + print "" + print "" escape(ksp[k]) "" + for(p=1;p<=np;p++) { + print "" kpis[ws][psp[p]][ksp[k]][kii[p]]*1 "" + for(i=1;i<=kn[p];i++) { + style=(i==kii[p])?"-median":"" + print "" kpis[ws][psp[p]][ksp[k]][i]*1 "" + } + } + print "" + } + + # empty KPI lines + for(k=nk+1;k<=23;k++) { + print "" + print "" + for(p=1;p<=np;p++) { + print "" + for(i=1;i<=kn[p];i++) + print "" + } + print "" + } + + add_svrinfo_ex(ws, psp, ith) + + print "
" + print "
" + + add_svrinfo(ws) + } + print "" +} diff --git a/script/benchmark/kpi-xls-table.awk b/script/benchmark/kpi-xls-table.awk new file mode 100755 index 0000000..6507f3e --- /dev/null +++ b/script/benchmark/kpi-xls-table.awk @@ -0,0 +1,123 @@ +#!/usr/bin/gawk + +BEGIN { + name="default" + if (var1 == "default") var1="batch_size" + if (var2 == "default") var2="cores_per_instance" + if (var3 == "default") var3="" + if (var4 == "default") var4="" + var3v="" + var4v="" +} + +/^#svrinfo: / { + name=gensub("^.*logs-(.*)[/]runs[/].*$","\\1",1,$2) +} + +/[/]itr-[0-9]*:$/ { + name=gensub("^.*logs-(.*)[/]itr-.*$","\\1",1) +} + +index($0,var1)==1 || ($1=="#" && index($2,var1)==1) { + var1v=gensub(/"/,"","g",$NF) +} + +index($0,var2)==1 || ($1=="#" && index($2,var2)==1) { + var2v=gensub(/"/,"","g",$NF) +} + +(index($0,var3)==1 || ($1=="#" && index($2,var3)==1)) && length(var3)>0 { + var3v=var3": "gensub(/"/,"","g",$NF) +} + +(index($0,var4)==1 || ($1=="#" && index($2,var4)==1)) && length(var4)>0 { + var4v=var4": "gensub(/"/,"","g",$NF) +} + +/^[*]/ { + primary_kpi[name]=gensub(/^[*](.*):.*/,"\\1",1,$0) + var34v="" + if (length(var3)>0) var34v=var3v + if (length(var4)>0) { + if (length(var34v)>0) + var34v=var34v", "var4v + else + var34v=var4v + } + idx= ++ikpis[name][var34v][var2v][var1v] + kpis[name][var34v][var2v][var1v][idx]=$NF + if (idx > var1v_num[name][var1v]) + var1v_num[name][var1v]=idx +} + +END { + add_xls_header(1) + + for (ws in kpis) { + ntables=asorti(kpis[ws], tables, "@ind_str_asc") + + print "" + print "" + + th=3 + var1v_nsp=asorti(var1v_num[ws], var1v_sp, "@ind_num_asc") + for (v1=1;v1<=var1v_nsp;v1++) { + ith[v1]=th + nk=var1v_num[ws][var1v_sp[v1]] + for (k=1;k<=nk;k++) + print "" + th+=nk+1 + } + + for (t=1;t<=ntables;t++) { + print "" + print "" tables[t] "" + print "" + var34=tables[t] + + print "" + print "" var1 "" + print "" + + print "" + print "" primary_kpi[ws] "" + for (v1=1;v1<=var1v_nsp;v1++) { + style=(var1v_sp[v1]==var1v_sp[v1]*1)?"Number":"String" + print "" var1v_sp[v1] "" + } + print "" + + var2v_nsp=asorti(kpis[ws][var34], var2v_sp, "@ind_num_asc") + for (v2=1;v2<=var2v_nsp;v2++) { + print "" + if (v2==1) { + print "" var2 "" + } + style=(var2v_sp[v2]==var2v_sp[v2]*1)?"Number":"String" + print "" var2v_sp[v2] "" + for (v1=1;v1<=var1v_nsp;v1++) { + n=length(kpis[ws][var34][var2v_sp[v2]][var1v_sp[v1]]) + if (n>0) { + m=median(kpis[ws][var34][var2v_sp[v2]][var1v_sp[v1]]) + print "" m "" + for (n1=1;n1<=n;n1++) { + n1v=kpis[ws][var34][var2v_sp[v2]][var1v_sp[v1]][n1] + style=(m""==n1v"")?"-median":"" + print "" n1v "" + } + } + } + print "" + } + print "" + print "" + } + + print "
" + print "
" + + if (length(svrinfo_values[ws])>0) + add_svrinfo(ws) + } + print "" +} diff --git a/script/benchmark/list-kpi.sh b/script/benchmark/list-kpi.sh new file mode 100755 index 0000000..236513d --- /dev/null +++ b/script/benchmark/list-kpi.sh @@ -0,0 +1,222 @@ +#!/bin/bash -e + +DIR="$(dirname "$(readlink -f "$0")")" + +print_help () { + echo "Usage: [options] logsdir" + echo "--primary List only the primary KPI." + echo "--all List all KPIs." + echo "--params Print out all configuration parameters." + echo "--outlier Drop samples beyond N-stdev." + echo "--format Specify the output format: list, xls-ai, xls-inst, or xls-table." + echo "--var[1-9] value Specify spreadsheet variables." + echo "--phost name Specify the primary hostname for identifying instance type." + echo "--pinst name Specify the svrinfo field for identifying instance type name." + echo "--file filename Specify the spreadsheet filename." + echo "--filter filter Specify the trim filter to shorten the worksheet name." + exit 0 +} + +phost="node1" +pinst="System.Product Name" +prefixes=() +primary=1 +outlier=0 +printvar=0 +format="list" +xlsfile="kpi-report.xls" +params=0 +var1="default" +var2="default" +var3="default" +var4="default" +filter="_(tensorflow|throughput|inference|benchmark|real)" +for var in "$@"; do + case "$var" in + --primary) + primary=1 + ;; + --all) + primary="" + ;; + --outlier=*) + outlier="${var#--outlier=}" + ;; + --outlier) + outlier="-1" + ;; + --params|--params=true) + params=1 + ;; + --params=false) + params=0 + ;; + --format=*) + format="${var#--format=}" + ;; + --format) + format="-1" + ;; + --var1=*) + var1="${var#--var1=}" + ;; + --var1) + var1="-1" + ;; + --var2=*) + var2="${var#--var2=}" + ;; + --var2) + var2="-1" + ;; + --var3=*) + var3="${var#--var3=}" + ;; + --var3) + var3="-1" + ;; + --var4=*) + var4="${var#--var4=}" + ;; + --var4) + var4="-1" + ;; + --phost=*) + phost="${var#--phost=}" + ;; + --phost) + phost="-1" + ;; + --pinst=*) + pinst="${var#--pinst=}" + ;; + --pinst) + pinst="-1" + ;; + --filter=*) + filter="${var#--filter=}" + ;; + --filter) + filter="-1" + ;; + --file=*) + xlsfile="${var#--file=}" + ;; + --file) + xlsfile="" + ;; + --help) + print_help + ;; + *) + if [ "$outlier" = "-1" ]; then + outlier="$var" + elif [ "$format" = "-1" ]; then + format="$var" + elif [ "$var1" = "-1" ]; then + var1="$var" + elif [ "$var2" = "-1" ]; then + var2="$var" + elif [ "$var3" = "-1" ]; then + var3="$var" + elif [ "$var4" = "-1" ]; then + var4="$var" + elif [ "$phost" = "-1" ]; then + phost="$var" + elif [ "$pinst" = "-1" ]; then + pinst="$var" + elif [ "$filter" = "-1" ]; then + filter="$var" + elif [ -z "$xlsfile" ]; then + xlsfile="$var" + else + prefixes+=("$var") + fi + ;; + esac +done + +if [ "$outlier" = "-1" ]; then + outlier=0 +fi + +if [ "$format" != "list" ]; then + primary="" + params=1 +fi + +if [ ${#prefixes[@]} -eq 0 ]; then + print_help +fi + +for logsdir1 in ${prefixes[@]}; do + if [ -r "$logsdir1/kpi.sh" ] && [ -r "$logsdir1"/cumulus-config.yaml ]; then + pinstv="" + for svrinfo in "$logsdir1"/runs/*/pkb-*-svrinfo/*.json; do + if [ -r "$svrinfo" ]; then + pinstv="$(sed 's/^/#svrinfo- /' "$logsdir1"/runs/*/pkb-*-svrinfo/*.json | awk -v name=$phost -v pinst="$pinst" -f "$DIR/svrinfo-json.awk" -f "$DIR/svrinfo-inst.awk")" + break + fi + done + for svrinfo in "$logsdir1"/runs/*/pkb-*-svrinfo/*.json; do + if [ -r "$svrinfo" ]; then + echo "#svrinfo: $svrinfo $pinstv" + sed 's/^/#svrinfo- /' "$svrinfo" + echo + fi + done + script_args="$(awk '/dpt_script_args:/{$1="";print 
gensub(/"/,"","g")}' "$logsdir1/cumulus-config.yaml")" + if [ -d "$logsdir1/itr-1" ]; then + for itrdir1 in "$logsdir1"/itr-*; do + echo "$itrdir1:" + if [ $params -eq 1 ]; then + awk '/dpt_tunables:/{$1="";print gensub(/"/,"","g")}' "$logsdir1/cumulus-config.yaml" | tr ';' '\n' | sed 's/^ *\([^:]*\):\(.*\)$/# \1: "\2"/' + fi + chmod a+rx "$itrdir1/kpi.sh" + if [ -n "$primary" ]; then + ( cd "$itrdir1" && ./kpi.sh $script_args | grep -E "^\*" ) || true + else + ( cd "$itrdir1" && ./kpi.sh $script_args ) || true + fi + done + else + echo "$logsdir1:" + if [ $params -eq 1 ]; then + awk '/dpt_tunables:/{$1="";print gensub(/"/,"","g")}' "$logsdir1/cumulus-config.yaml" | tr ';' '\n' | sed 's/^ *\([^:]*\):\(.*\)$/# \1: "\2"/' + fi + chmod a+rx "$logsdir1/kpi.sh" + if [ -n "$primary" ]; then + ( cd "$logsdir1" && ./kpi.sh $script_args | grep -E "^\*" ) || true + else + ( cd "$logsdir1" && ./kpi.sh $script_args ) || true + fi + fi + elif [ -r "$logsdir1/kpi.sh" ] && [ -r "$logsdir1"/workload-config.yaml ]; then + script_args="$(awk '/^script_args:/{$1="";print gensub(/"/,"","g")}' "$logsdir1/workload-config.yaml")" + echo "$logsdir1:" + if [ $params -eq 1 ]; then + awk '/^tunables:/{$1="";print gensub(/"/,"","g")}' "$logsdir1/workload-config.yaml" | tr ';' '\n' | sed 's/^ *\([^:]*\):\(.*\)$/# \1: "\2"/' + fi + chmod a+rx "$logsdir1/kpi.sh" + if [ -n "$primary" ]; then + ( cd "$logsdir1" && ./kpi.sh $script_args | grep -E "^\*" ) || true + else + ( cd "$logsdir1" && ./kpi.sh $script_args ) || true + fi + fi +done | ( + case "$format" in + list) + awk -v outlier=$outlier -f "$DIR/kpi-list.awk" + ;; + xls-ai) + awk -v outlier=$outlier -v var1="$var1" -v var2="$var2" -v var3="$var3" -v var4="$var4" -v filter="$filter" -f "$DIR/xlsutil.awk" -f "$DIR/svrinfo-json.awk" -f "$DIR/svrinfo-xls.awk" -f "$DIR/kpi-xls-ai.awk" > "$xlsfile" + ;; + xls-inst) + awk -v var1="$var1" -v phost=$phost -v filter="$filter" -f "$DIR/xlsutil.awk" -f "$DIR/svrinfo-json.awk" -f "$DIR/svrinfo-xls.awk" -f "$DIR/kpi-xls-inst.awk" > "$xlsfile" + ;; + xls-table) + awk -v var1="$var1" -v var2="$var2" -v var3="$var3" -v var4="$var4" -v filter="$filter" -f "$DIR/xlsutil.awk" -f "$DIR/svrinfo-json.awk" -f "$DIR/svrinfo-xls.awk" -f "$DIR/kpi-xls-table.awk" > "$xlsfile" + ;; + esac +) diff --git a/script/benchmark/svrinfo-inst.awk b/script/benchmark/svrinfo-inst.awk new file mode 100755 index 0000000..c8b0128 --- /dev/null +++ b/script/benchmark/svrinfo-inst.awk @@ -0,0 +1,14 @@ +#!/usr/bin/gawk + +END { + split(pinst, pinst_fields, ".") + for (p in svrinfo_values[svrinfo_ws]) { + for (ip in svrinfo_values[svrinfo_ws][p]) { + if (svrinfo_values[svrinfo_ws][p][ip]["Host"]["Name"][1] == name) { + print svrinfo_values[svrinfo_ws][p][ip][pinst_fields[1]][pinst_fields[2]][1] + break + } + } + } +} + diff --git a/script/benchmark/svrinfo-json.awk b/script/benchmark/svrinfo-json.awk new file mode 100755 index 0000000..0134afe --- /dev/null +++ b/script/benchmark/svrinfo-json.awk @@ -0,0 +1,58 @@ +#!/usr/bin/gawk + +BEGIN { + svrinfo_values_start=0 + svrinfo_names_start=0 + svrinfo_host_start=0 + svrinfo_ws="default" + svrinfo_product="default" +} + +/^#svrinfo: / { + svrinfo_ws=gensub("^.*logs-(.*)[/]runs[/].*$","\\1",1,$2) + svrinfo_product=$3 +} + +/^#svrinfo-\s*"Name":\s*".*",*\s*$/ && !svrinfo_values_start && !svrinfo_names_start { + v=gensub(/^#svrinfo-\s*"Name":\s*"(.*)",*\s*$/, "\\1", 1, $0) + if (svrinfo_host_start) { + svrinfo_ip=v + } else { + svrinfo_group=v + } +} + +/^#svrinfo-\s*"AllHostValues":/ { + svrinfo_host_start=1 +} + 
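+# Note on the parsing below: each svrinfo table is emitted as a "ValueNames"
+# list (the column names) followed by "Values" rows; each value line is mapped
+# back to its column name round-robin (svrinfo_nvalues % svrinfo_nnames) and
+# grouped into per-row indexes for later spreadsheet generation.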
+svrinfo_names_start && !svrinfo_values_start && /^#svrinfo-\s*".*",*\s*$/ { + svrinfo_names[++svrinfo_nnames]=gensub(/^#svrinfo-\s*"(.*)",*\s*$/, "\\1", 1, $0) +} + +svrinfo_names_start && !svrinfo_values_start && /^#svrinfo-\s*]\s*$/ { + svrinfo_names_start=0 +} + +/^#svrinfo-\s*"ValueNames":/ { + svrinfo_names_start=1 + svrinfo_nnames=0 +} + +svrinfo_values_start && /^#svrinfo-\s*".*",*\s*$/ { + n=svrinfo_names[(svrinfo_nvalues%svrinfo_nnames)+1] + i=int(svrinfo_nvalues/svrinfo_nnames)+1 + ++svrinfo_nvalues + svrinfo_values[svrinfo_ws][svrinfo_product][svrinfo_ip][svrinfo_group][n][i]=gensub(/^#svrinfo-\s*"(.*)",*\s*$/,"\\1",1,$0) +} + +svrinfo_values_start && /^#svrinfo-\s*}\s*$/ { + svrinfo_values_start=0 + svrinfo_host_start=0 +} + +/^#svrinfo-\s*"Values":/ { + svrinfo_values_start=1 + svrinfo_nvalues=0 +} + diff --git a/script/benchmark/svrinfo-xls.awk b/script/benchmark/svrinfo-xls.awk new file mode 100755 index 0000000..08f442b --- /dev/null +++ b/script/benchmark/svrinfo-xls.awk @@ -0,0 +1,387 @@ +#!/usr/bin/gawk + +function add_svrinfo_cell(vv) { + t=(vv==vv*1)?"Number":"String" + print "" escape(vv) "" +} + +function add_svrinfo_row(ws, g, k) { + print "" + add_svrinfo_cell(g"."k) + for (p in svrinfo_values[ws]) + for (s in svrinfo_values[ws][p]) + add_svrinfo_cell(svrinfo_values[ws][p][s][g][k][1]) + print "" +} + +function add_svrinfo_isa_summary(ws, g) { + print "" + add_svrinfo_cell(g) + for (p in svrinfo_values[ws]) { + for (s in svrinfo_values[ws][p]) { + vv="" + for (k in svrinfo_values[ws][p][s][g]) + if (svrinfo_values[ws][p][s][g][k][1] == "Yes") + vv=vv", "gensub(/-.*/,"",1,k) + add_svrinfo_cell(gensub(/^, /,"",1,vv)) + } + } + print "" +} + +function add_svrinfo_accelerator_summary(ws, g) { + print "" + add_svrinfo_cell(g) + for (p in svrinfo_values[ws]) { + for (s in svrinfo_values[ws][p]) { + vv="" + for (k in svrinfo_values[ws][p][s][g]) + if (svrinfo_values[ws][p][s][g][k][1]>0) + vv=vv", "k":"svrinfo_values[ws][p][s][g][k][1] + add_svrinfo_cell(gensub(/^, /,"",1,vv)) + } + } + print "" +} + +function add_svrinfo_nic_summary(ws, g, n, m) { + n1=0 + for (p in svrinfo_values[ws]) { + for (s in svrinfo_values[ws][p]) { + n2=length(svrinfo_values[ws][p][s][g][m]) + if (n2>n1) n1=n2 + } + } + for (n2=1;n2<=n1;n2++) { + print "" + add_svrinfo_cell((n2==1)?g:"") + for (p in svrinfo_values[ws]) { + for (s in svrinfo_values[ws][p]) { + vv="" + n3=0 + for (i in svrinfo_values[ws][p][s][g][m]) { + n3++ + if (n3==n2) { + vv=svrinfo_values[ws][p][s][g][n][i]": "svrinfo_values[ws][p][s][g][m][i] + break + } + } + add_svrinfo_cell(vv) + } + } + print "" + } +} + +function add_svrinfo_disk_summary(ws, g, n, m) { + n1=0 + for (p in svrinfo_values[ws]) { + for (s in svrinfo_values[ws][p]) { + n2=0 + for (i in svrinfo_values[ws][p][s][g][m]) + if (length(svrinfo_values[ws][p][s][g][m][i])>0) n2++ + if (n2>n1) n1=n2 + } + } + + for (n2=1;n2<=n1;n2++) { + print "" + add_svrinfo_cell((n2==1)?g:"") + for (p in svrinfo_values[ws]) { + for (s in svrinfo_values[ws][p]) { + n3=0 + vv="" + for (i in svrinfo_values[ws][p][s][g][m]) { + if (length(svrinfo_values[ws][p][s][g][m][i])>0) n3++ + if (n3==n2) { + vv=svrinfo_values[ws][p][s][g][n][i]": "svrinfo_values[ws][p][s][g][m][i] + break + } + } + add_svrinfo_cell(vv) + } + } + print "" + } +} + +function add_svrinfo_security_summary(ws, g) { + n1=0 + for (p in svrinfo_values[ws]) { + for (s in svrinfo_values[ws][p]) { + n2=length(svrinfo_values[ws][p][s][g]) + if (n2>n1) n1=n2 + } + } + for (n2=1;n2<=n1;n2++) { + print "" + g1=(n2==1)?g:"" 
+ add_svrinfo_cell(g1) + for (p in svrinfo_values[ws]) { + for (s in svrinfo_values[ws][p]) { + vv="" + n3=0 + for (k in svrinfo_values[ws][p][s][g]) { + n3++ + if (n3==n2) { + vv=k": "gensub(/\s*[(].*[)].*/,"",1,svrinfo_values[ws][p][s][g][k][1]) + break; + } + } + add_svrinfo_cell(vv) + } + } + print "" + } +} + +function find_svrinfo_phost(ws, p) { + if (length(svrinfo_values[ws][p])==1) + for (s in svrinfo_values[ws][p]) + return s + for (s in svrinfo_values[ws][p]) + for (i in svrinfo_values[ws][p][s]["Host"]["Name"]) + if (svrinfo_values[ws][p][s]["Host"]["Name"][i] == phost) + return s + return "" +} + +function add_svrinfo_cell_ex(i, vv) { + style=(vv==vv*1)?"Number":"String" + print "" escape(vv) "" +} + +function add_svrinfo_row_ex(ws, psp, ith, g, k) { + print "" + add_svrinfo_cell(g"."k) + np=length(psp) + for (p1=1;p1<=np;p1++) { + s=find_svrinfo_phost(ws, psp[p1]) + add_svrinfo_cell_ex(ith[p1], svrinfo_values[ws][psp[p1]][s][g][k][1]) + } + print "" +} + +function add_svrinfo_nic_summary_ex(ws, psp, ith, g, n, m) { + np=length(psp) + n1=0 + for (p1=1;p1<=np;p1++) { + s=find_svrinfo_phost(ws, psp[p1]) + n2=length(svrinfo_values[ws][psp[p1]][s][g][m]) + if (n2>n1) n1=n2 + } + for (n2=1;n2<=n1;n2++) { + print "" + add_svrinfo_cell((n2==1)?g:"") + for (p1=1;p1<=np;p1++) { + s=find_svrinfo_phost(ws, psp[p1]) + vv="" + n3=0 + for (i in svrinfo_values[ws][psp[p1]][s][g][m]) { + n3++ + if (n3==n2) { + vv=svrinfo_values[ws][psp[p1]][s][g][n][i]": "svrinfo_values[ws][psp[p1]][s][g][m][i] + break + } + } + add_svrinfo_cell_ex(ith[p1], vv) + } + print "" + } +} + +function add_svrinfo_security_summary_ex(ws, psp, ith, g) { + np=length(psp) + n1=0 + for (p1=1;p1<=np;p1++) { + s=find_svrinfo_phost(ws, psp[p1]) + n2=length(svrinfo_values[ws][psp[p1]][s][g]) + if (n2>n1) n1=n2 + } + for (n2=1;n2<=n1;n2++) { + print "" + add_svrinfo_cell((n2==1)?g:"") + for (p1=1;p1<=np;p1++) { + s=find_svrinfo_phost(ws, psp[p1]) + vv="" + n3=0 + for (k in svrinfo_values[ws][psp[p1]][s][g]) { + n3++ + if (n3==n2) { + vv=k": "gensub(/\s*[(].*[)].*/,"",1,svrinfo_values[ws][psp[p1]][s][g][k][1]) + break + } + } + add_svrinfo_cell_ex(ith[p1], vv) + } + print "" + } +} + +function add_svrinfo_disk_summary_ex(ws, psp, ith, g, n, m) { + np=length(psp) + n1=0 + for (p1=1;p1<=np;p1++) { + s=find_svrinfo_phost(ws, psp[p1]) + n2=0 + for (i in svrinfo_values[ws][psp[p1]][s][g][m]) + if (length(svrinfo_values[ws][psp[p1]][s][g][m][i])>0) n2++ + if (n2>n1) n1=n2 + } + + for (n2=1;n2<=n1;n2++) { + print "" + g1=(n2==1)?g:"" + add_svrinfo_cell(g) + for (p1=1;p1<=np;p1++) { + s=find_svrinfo_phost(ws, psp[p1]) + n3=0 + vv="" + for (i in svrinfo_values[ws][psp[p1]][s][g][m]) { + if (length(svrinfo_values[ws][psp[p1]][s][g][m][i])>0) n3++ + if (n3==n2) { + vv=svrinfo_values[ws][psp[p1]][s][g][n][i]":"svrinfo_values[ws][psp[p1]][s][g][m][i] + break + } + } + add_svrinfo_cell(vv) + } + print "" + } +} + +function add_svrinfo(ws) { + print "" + print "" + + add_svrinfo_row(ws, "Host", "Name") + add_svrinfo_row(ws, "Host", "Time") + add_svrinfo_row(ws, "System", "Manufacturer") + add_svrinfo_row(ws, "System", "Product Name") + add_svrinfo_row(ws, "System", "Version") + add_svrinfo_row(ws, "System", "Serial #") + add_svrinfo_row(ws, "System", "UUID") + + add_svrinfo_row(ws, "Baseboard", "Manifacturer") + add_svrinfo_row(ws, "Baseboard", "Product Name") + add_svrinfo_row(ws, "Baseboard", "Version") + add_svrinfo_row(ws, "Baseboard", "Serial #") + + add_svrinfo_row(ws, "Chassis", "Manufacturer") + add_svrinfo_row(ws, "Chassis", "Type") 
+ add_svrinfo_row(ws, "Chassis", "Version") + add_svrinfo_row(ws, "Chassis", "Serial #") + + add_svrinfo_row(ws, "BIOS", "Vendor") + add_svrinfo_row(ws, "BIOS", "Version") + add_svrinfo_row(ws, "BIOS", "Release Date") + + add_svrinfo_row(ws, "Operating System", "OS") + add_svrinfo_row(ws, "Operating System", "Kernel") + add_svrinfo_row(ws, "Operating System", "Microcode") + + add_svrinfo_row(ws, "Software Version", "GCC") + add_svrinfo_row(ws, "Software Version", "GLIBC") + add_svrinfo_row(ws, "Software Version", "Binutils") + add_svrinfo_row(ws, "Software Version", "Python") + add_svrinfo_row(ws, "Software Version", "Python3") + add_svrinfo_row(ws, "Software Version", "Java") + add_svrinfo_row(ws, "Software Version", "OpenSSL") + + add_svrinfo_row(ws, "CPU", "CPU Model") + add_svrinfo_row(ws, "CPU", "Architecture") + add_svrinfo_row(ws, "CPU", "Microarchitecture") + add_svrinfo_row(ws, "CPU", "Family") + add_svrinfo_row(ws, "CPU", "Model") + add_svrinfo_row(ws, "CPU", "Stepping") + add_svrinfo_row(ws, "CPU", "Base Frequency") + add_svrinfo_row(ws, "CPU", "Maximum Frequency") + add_svrinfo_row(ws, "CPU", "All-core Maximum Frequency") + add_svrinfo_row(ws, "CPU", "CPUs") + add_svrinfo_row(ws, "CPU", "On-line CPU List") + add_svrinfo_row(ws, "CPU", "Hyperthreading") + add_svrinfo_row(ws, "CPU", "Cores per Socket") + add_svrinfo_row(ws, "CPU", "Sockets") + add_svrinfo_row(ws, "CPU", "NUMA Nodes") + add_svrinfo_row(ws, "CPU", "NUMA CPU List") + add_svrinfo_row(ws, "CPU", "CHA Count") + add_svrinfo_row(ws, "CPU", "L1d Cache") + add_svrinfo_row(ws, "CPU", "L1i Cache") + add_svrinfo_row(ws, "CPU", "L2 Cache") + add_svrinfo_row(ws, "CPU", "L3 Cache") + add_svrinfo_row(ws, "CPU", "Memory Channels") + add_svrinfo_row(ws, "CPU", "Prefetchers") + add_svrinfo_row(ws, "CPU", "Intel Turbo Boost") + add_svrinfo_row(ws, "CPU", "PPINs") + + add_svrinfo_isa_summary(ws, "ISA") + add_svrinfo_accelerator_summary(ws, "Accelerator") + + add_svrinfo_row(ws, "Power", "TDP") + add_svrinfo_row(ws, "Power", "Power & Perf Policy") + add_svrinfo_row(ws, "Power", "Frequency Governer") + add_svrinfo_row(ws, "Power", "Frequency Driver") + add_svrinfo_row(ws, "Power", "MAX C-State") + + add_svrinfo_row(ws, "Memory", "Installed Memory") + add_svrinfo_row(ws, "Memory", "MemTotal") + add_svrinfo_row(ws, "Memory", "MemFree") + add_svrinfo_row(ws, "Memory", "MemAvailable") + add_svrinfo_row(ws, "Memory", "Buffers") + add_svrinfo_row(ws, "Memory", "Cached") + add_svrinfo_row(ws, "Memory", "HugePages_Total") + add_svrinfo_row(ws, "Memory", "Hugepagesize") + add_svrinfo_row(ws, "Memory", "Transparent Huge Pages") + add_svrinfo_row(ws, "Memory", "Automatic NUMA Balancing") + add_svrinfo_row(ws, "Memory", "Populated Memory Channels") + + add_svrinfo_row(ws, "GPU", "Manufacturer") + add_svrinfo_row(ws, "GPU", "Model") + + add_svrinfo_nic_summary(ws, "NIC", "Name", "Model") + add_svrinfo_nic_summary(ws, "Network IRQ Mapping", "Interface", "CPU:IRQs CPU:IRQs ...") + add_svrinfo_disk_summary(ws, "Disk", "NAME", "MODEL") + add_svrinfo_security_summary(ws, "Vulnerability") + + add_svrinfo_row(ws, "PMU", "cpu_cycles") + add_svrinfo_row(ws, "PMU", "instructions") + add_svrinfo_row(ws, "PMU", "ref_cycles") + add_svrinfo_row(ws, "PMU", "topdown_slots") + print "
" + print "
" +} + +function add_svrinfo_ex(ws, psp, ith) { + add_svrinfo_row_ex(ws, psp, ith, "Host", "Name") + add_svrinfo_row_ex(ws, psp, ith, "Host", "Time") + add_svrinfo_row_ex(ws, psp, ith, "System", "Manufacturer") + add_svrinfo_row_ex(ws, psp, ith, "System", "Product Name") + add_svrinfo_row_ex(ws, psp, ith, "BIOS", "Version") + add_svrinfo_row_ex(ws, psp, ith, "Operating System", "OS") + add_svrinfo_row_ex(ws, psp, ith, "Operating System", "Kernel") + add_svrinfo_row_ex(ws, psp, ith, "Operating System", "Microcode") + add_svrinfo_row_ex(ws, psp, ith, "CPU", "CPU Model") + add_svrinfo_row_ex(ws, psp, ith, "CPU", "Base Frequency") + add_svrinfo_row_ex(ws, psp, ith, "CPU", "Maximum Frequency") + add_svrinfo_row_ex(ws, psp, ith, "CPU", "All-core Maximum Frequency") + add_svrinfo_row_ex(ws, psp, ith, "CPU", "CPUs") + add_svrinfo_row_ex(ws, psp, ith, "CPU", "Cores per Socket") + add_svrinfo_row_ex(ws, psp, ith, "CPU", "Sockets") + add_svrinfo_row_ex(ws, psp, ith, "CPU", "NUMA Nodes") + add_svrinfo_row_ex(ws, psp, ith, "CPU", "Prefetchers") + add_svrinfo_row_ex(ws, psp, ith, "CPU", "Intel Turbo Boost") + add_svrinfo_row_ex(ws, psp, ith, "CPU", "PPINs") + add_svrinfo_row_ex(ws, psp, ith, "Power", "Power & Perf Policy") + add_svrinfo_row_ex(ws, psp, ith, "Power", "TDP") + add_svrinfo_row_ex(ws, psp, ith, "Power", "Frequency Driver") + add_svrinfo_row_ex(ws, psp, ith, "Power", "Frequency Governer") + add_svrinfo_row_ex(ws, psp, ith, "Power", "MAX C-State") + add_svrinfo_row_ex(ws, psp, ith, "Memory", "Installed Memory") + add_svrinfo_row_ex(ws, psp, ith, "Memory", "Hugepagesize") + add_svrinfo_row_ex(ws, psp, ith, "Memory", "Transparent Huge Pages") + add_svrinfo_row_ex(ws, psp, ith, "Memory", "Automatic NUMA Balancing") + add_svrinfo_nic_summary_ex(ws, psp, ith, "NIC", "Name", "Model") + add_svrinfo_nic_summary_ex(ws, psp, ith, "Network IRQ Mapping", "Interface", "CPU:IRQs CPU:IRQs ...") + add_svrinfo_disk_summary_ex(ws, psp, ith, "Disk", "NAME", "MODEL") + add_svrinfo_security_summary_ex(ws, psp, ith, "Vulnerability") +} diff --git a/script/benchmark/xlsutil.awk b/script/benchmark/xlsutil.awk new file mode 100755 index 0000000..693741c --- /dev/null +++ b/script/benchmark/xlsutil.awk @@ -0,0 +1,98 @@ +#!/usr/bin/gawk + +function median(v) { + n=asort(v, v_sorted, "@val_num_asc") + if (n%2 == 0) { + return v_sorted[n/2] + } else { + return v_sorted[(n+1)/2] + } +} + +function escape(text) { + text=gensub(//,"\\>","g",text) + return text +} + +function add_xls_header(align_left) { + print "" + print "" + print "" + + print "" + print "" + print "" + print "" + print "" + print "" + print "" + print "" +} + +function ws_name_ex(a) { + a1=gensub(filter,"","g",a) + if (length(a1)>26) a1=substr(a1,length(a1)-26) + return gensub(/^[^-_]+[-_](.*)$/,"\\1",1,a1) +} + +function ws_name(a) { + a1=ws_name_ex(a) + if (ws_uniq[a1] == "") { + ws_uniq[a1]=a + return a1 + } + print "Worksheet name conflict: "a1 > "/dev/stderr" + print "previous: "ws_uniq[a1] > "/dev/stderr" + print "new: "a > "/dev/stderr" + exit 3 +} + diff --git a/script/build.sh b/script/build.sh new file mode 100644 index 0000000..461a292 --- /dev/null +++ b/script/build.sh @@ -0,0 +1,91 @@ +#!/bin/bash -e + +PLATFORM=${PLATFORM:-SPR} +IMAGEARCH=${IMAGEARCH:-linux/amd64} +BACKEND=${BACKEND:-docker} +RELEASE=${RELEASE:-:latest} + +with_arch () { + if [[ "$IMAGEARCH" = "linux/amd64" ]]; then + echo $1 + else + echo $1-${IMAGEARCH/*\//} + fi +} + +docker_push () { + case "$1" in + *.dkr.ecr.*.amazonaws.com/*) + REGISTRY= "$SCRIPT/cumulus/shell.sh" 
aws -v "$SCRIPT/cumulus:/mnt:ro" -- /mnt/script/create-private-repository-aws.sh $1 || true + ;; + esac + docker -D push $1 +} + +# template substitution +if [[ "$DIR" = */workload/* ]]; then + find "$DIR" -name "*.m4" ! -name "*-config.yaml.m4" ! -path "*/template/*" -exec /bin/bash -c 'f="{}" && cd "'$DIR'" && m4 -Itemplate -I../../template -DPLATFORM='$PLATFORM' -DIMAGEARCH='$IMAGEARCH' -DWORKLOAD='$WORKLOAD' -DREGISTRY='$REGISTRY' -DBACKEND='$BACKEND' -DRELEASE='$RELEASE' "$f" > "${f/.m4/}"' \; +elif [[ "$DIR" = */stack/* ]]; then + find "$DIR" -name "*.m4" ! -name "*-config.yaml.m4" ! -path "*/template/*" -exec /bin/bash -c 'f="{}" && cd "'$DIR'" && m4 -Itemplate -I../../template -DPLATFORM='$PLATFORM' -DIMAGEARCH='$IMAGEARCH' -DSTACK='$STACK' -DREGISTRY='$REGISTRY' -DBACKEND='$BACKEND' -DRELEASE='$RELEASE' "$f" > "${f/.m4/}"' \; +fi + +if [ "${#DOCKER_CONTEXT[@]}" -eq 0 ]; then + DOCKER_CONTEXT=("${DOCKER_CONTEXT:-.}") +fi + +if [ "$1" = "--bom" ]; then + [[ "$DIR" = *"/workload/"* ]] && echo "# ${DIR/*\/workload/workload}" + [[ "$DIR" = *"/stack/"* ]] && echo "# ${DIR/*\/stack/stack}" + for dc in "${DOCKER_CONTEXT[@]}"; do + for pat in '.9.*' '.8.*' '.7.*' '.6.*' '.5.*' '.4.*' '.3.*' '.2.*' '.1.*' ''; do + find "$DIR/$dc" -maxdepth 1 -mindepth 1 -name "Dockerfile$pat" $FIND_OPTIONS ! -name "*.m4" -print 2> /dev/null | ( + while IFS= read df; do + image=$(with_arch $(head -n 2 "$df" | grep -E '^#+ ' | tail -n 1 | cut -d' ' -f2)) + header=$(head -n 2 "$df" | grep -E '^#+ ' | tail -n 1 | cut -d' ' -f1) + [ -n "$REGISTRY" ] && [ "$header" = "#" ] && image="$REGISTRY$image$RELEASE" + echo "$header image: $image" + while IFS= read line; do + if [[ "$line" = "ARG "*=* ]]; then + var="$(echo ${line/ARG /} | tr -d '"' | cut -f1 -d=)" + value="$(echo ${line/ARG /} | tr -d '"' | cut -f2- -d=)" + eval "$var=\"$value\"" + eval "value=\"$value\"" + + case "$line" in + *_VER=*|*_VERSION=*|*_REPO=*|*_REPOSITORY=*|*_IMG=*|*_IMAGE=*|*_PKG=*|*_PACKAGE=*) + echo "ARG $var=$value" + ;; + esac + fi + done < "$df" + done + ) + done + done +else + build_options="$(env | cut -f1 -d= | grep -iE '_proxy$' | sed 's/^/--build-arg /' | tr '\n' ' ')" + build_options="$build_options --build-arg RELEASE=$RELEASE" + + if [ "$IMAGEARCH" != "linux/amd64" ]; then + build_options="$build_options --platform $IMAGEARCH" + fi + + build_options="$build_options --build-arg BUILDKIT_INLINE_CACHE=1" + for dc in "${DOCKER_CONTEXT[@]}"; do + for pat in '.9.*' '.8.*' '.7.*' '.6.*' '.5.*' '.4.*' '.3.*' '.2.*' '.1.*' ''; do + for dockerfile in $(find "$DIR/$dc" -maxdepth 1 -mindepth 1 -name "Dockerfile$pat" $FIND_OPTIONS -print 2>/dev/null); do + + image=$(with_arch $(head -n 2 "$dockerfile" | grep -E '^#+ ' | tail -n 1 | cut -d' ' -f2)) + header=$(head -n 2 "$dockerfile" | grep -E '^#+ ' | tail -n 1 | cut -d' ' -f1) + IMAGE="$REGISTRY$image$RELEASE" + ( + cd "$DIR/$dc" + DOCKER_BUILDKIT=1 docker build $BUILD_OPTIONS $build_options --cache-from $REGISTRY$image$RELEASE -t $image -t $image$RELEASE $([ -n "$REGISTRY" ] && [ "$header" = "#" ] && echo -t $IMAGE) -f "$dockerfile" . + ) + if [ -n "$REGISTRY" ] && [ "$header" = "#" ]; then + docker_push $IMAGE + fi + done + done + done +fi diff --git a/script/check-license.sh b/script/check-license.sh new file mode 100755 index 0000000..37edcbc --- /dev/null +++ b/script/check-license.sh @@ -0,0 +1,40 @@ +#!/bin/bash -e + +if [ ${#@} = 0 ]; then + echo "Usage: : ..." 
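+  # Illustrative usage sketch (the names below are hypothetical): the first
+  # argument is a cache file of previously accepted licenses; each remaining
+  # argument is a "<name>:<license text>" pair, e.g.:
+  #   ./check-license.sh .check-license 'foo:Foo End User License Agreement ...'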
+ exit 3 +fi + +check_access () { + printf -- "$1\n" + reply='n' + read -p 'Type "accept" to proceed or anything else to skip: ' + [ "$REPLY" = "accept" ] || [ "$REPLY" = "ACCEPT" ] +} + +check_history () { + grep -qFx "$1" "$2" 2> /dev/null +} + +cache="$1" +shift +while [ ${#@} -gt 0 ]; do + name="${1/:*/}" + text="${1#$name:}" + shift + if check_history "$name" "$cache"; then + access=OK + elif check_access "$text"; then + access=OK + echo "$name" >> "$cache" + else + access=denied + fi + if [ $access != OK ]; then + echo + echo "Access to $name denied. Build aborted." + echo + exit 1 + fi +done +exit 0 diff --git a/script/component.cmake b/script/component.cmake new file mode 100644 index 0000000..238eecb --- /dev/null +++ b/script/component.cmake @@ -0,0 +1,90 @@ + +function(check_license name license_text) + if(NOT "/${CMAKE_CURRENT_SOURCE_DIR}//" MATCHES "/[^/]*${BENCHMARK}[^/]*/") + return() + endif() + + if(ENABLE_BUILD) + if (NOT ";${license_list}" MATCHES ";${name}:") + set(license_list "${license_list};${name}:${license_text}" PARENT_SCOPE) + endif() + endif() +endfunction() + +function(add_component_build type name) + set(component ${name} PARENT_SCOPE) + + if(NOT "/${CMAKE_CURRENT_SOURCE_DIR}//" MATCHES "/[^/]*${BENCHMARK}[^/]*/") + return() + elseif(BENCHMARK) + message("BENCHMARK enabled ${type} ${name}") + endif() + + set(license_reqs "") + set(sut_reqs "") + set(req_mode "LICENSE") + foreach(arg1 ${ARGN}) + if(arg1 STREQUAL "LICENSE") + set(req_mode "LICENSE") + elseif(arg1 STREQUAL "SUT") + set(req_mode "SUT") + else() + if(req_mode STREQUAL "LICENSE") + foreach(item ${license_list}) + if((item MATCHES "^${arg1}:") AND (NOT ACCEPT_LICENSE STREQUAL "ALL") AND (NOT " ${ACCEPT_LICENSE} " MATCHES " ${arg1} ")) + set(license_reqs "${license_reqs} '${item}'") + endif() + endforeach() + elseif(req_mode STREQUAL "SUT") + set(sut_reqs "${sut_reqs};${arg1}") + endif() + endif() + endforeach() + set(sut_reqs "${sut_reqs}" PARENT_SCOPE) + string(REPLACE "\n" "\\n" license_reqs "${license_reqs}") + + string(TOUPPER ${type} typeu) + add_custom_target(bom_${name} COMMAND bash -c "echo BOM of ${PLATFORM}/${name}:" COMMAND bash -c "PLATFORM=${PLATFORM} IMAGEARCH=${IMAGEARCH} ${typeu}=${name} BACKEND=${BACKEND} RELEASE=${RELEASE} REGISTRY=${REGISTRY} SCRIPT='${PROJECT_SOURCE_DIR}/script' '${CMAKE_CURRENT_SOURCE_DIR}/build.sh' --bom" VERBATIM) + add_dependencies(bom bom_${name}) + + if(ENABLE_BUILD) + add_custom_target(build_${name} ALL COMMAND bash -c "'${PROJECT_SOURCE_DIR}/script/check-license.sh' '${CMAKE_CURRENT_BINARY_DIR}/.check-license' ${license_reqs} && PLATFORM=${PLATFORM} IMAGEARCH=${IMAGEARCH} ${typeu}=${name} BACKEND=${BACKEND} RELEASE=${RELEASE} REGISTRY=${REGISTRY} SCRIPT='${PROJECT_SOURCE_DIR}/script' '${CMAKE_CURRENT_SOURCE_DIR}/build.sh'" VERBATIM) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES "${CMAKE_CURRENT_BINARY_DIR}/.check-license") + + if(COMMAND add_backend_dependencies) + add_backend_dependencies(${type} ${name}) + endif() + + if(NOT IMAGEARCH STREQUAL "linux/amd64") + add_dependencies(build_${name} build_march) + endif() + endif() + execute_process(COMMAND bash -c "ln -s -r -f '${PROJECT_SOURCE_DIR}'/script/benchmark/*.sh ." 
WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}") +endfunction() + +function(add_component_testcase type component name) + if(NOT "/${CMAKE_CURRENT_SOURCE_DIR}//" MATCHES "/[^/]*${BENCHMARK}[^/]*/") + return() + endif() + + function(add_testcase_1 name backend) + string(REPLACE ";" " " argstr "${ARGN}") + + string(TOUPPER ${type} typeu) + add_test(NAME test_${name} COMMAND bash -c "rm -rf $TEST_PREFIX''logs-${name} && mkdir -p $TEST_PREFIX''logs-${name} && cd $TEST_PREFIX''logs-${name} && TESTCASE=test_${name} PLATFORM=${PLATFORM} IMAGEARCH=${IMAGEARCH} ${typeu}=${component} RELEASE=${RELEASE} REGISTRY=${REGISTRY} TIMEOUT=${TIMEOUT} ${backend} SCRIPT='${PROJECT_SOURCE_DIR}/script' REGISTRY_AUTH=${REGISTRY_AUTH} '${CMAKE_CURRENT_SOURCE_DIR}/validate.sh' ${argstr}" WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}") + + add_custom_target(kpi_${component}_${name} COMMAND "${CMAKE_SOURCE_DIR}/script/make-kpi.sh" "logs-${name}" WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" VERBATIM) + add_dependencies(kpi kpi_${component}_${name}) + + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES "${CMAKE_CURRENT_BINARY_DIR}/logs-${name}") + endfunction() + + if(COMMAND add_backend_testcase) + add_backend_testcase(${type} ${component} ${name} ${ARGN}) + else() + if(NOT sut_reqs) + add_testcase_1(${name} "BACKEND=${BACKEND}" ${ARGN}) + endif() + endif() +endfunction() + diff --git a/script/cumulus.cmake b/script/cumulus.cmake new file mode 100644 index 0000000..81e5289 --- /dev/null +++ b/script/cumulus.cmake @@ -0,0 +1,53 @@ + +function(show_backend_settings) + message("-- Setting: CUMULUS_OPTIONS=${CUMULUS_OPTIONS}") + message("-- Setting: CUMULUS_SUT=${CUMULUS_SUT}") +endfunction() + +function(add_backend_dependencies type name) + add_dependencies(build_${name} build_cumulus) +endfunction() + +function(add_backend_testcase type component name) + if(";${sut_reqs};" STREQUAL ";;") + string(REPLACE " " ";" cumulus_sut "${CUMULUS_SUT}") + set(sut_list "") + foreach(sut1 ${cumulus_sut}) + if(NOT sut1 MATCHES "^-") + set(sut_list "${sut_list};${sut1}") + endif() + endforeach() + else() + set(sut_list "${sut_reqs}") + endif() + foreach(sut1 ${sut_list}) + if(sut1 AND (" ${CUMULUS_SUT} " MATCHES " ${sut1} ")) + string(REGEX REPLACE "^-" "" sut2 "${sut1}") + add_testcase_1("${sut2}_${name}" "BACKEND=${BACKEND} CUMULUS_OPTIONS='${CUMULUS_OPTIONS}' CUMULUS_CONFIG_IN='${PROJECT_SOURCE_DIR}/script/cumulus/cumulus-config.${sut1}.yaml'" ${ARGN}) + endif() + endforeach() +endfunction() + +################ TOP-LEVEL-CMAKE ########################### + +execute_process(COMMAND bash -c "echo \"\"$(find -name 'cumulus-config.*.yaml' | sed 's|.*/cumulus-config.\\(.*\\).yaml$|\\1|')" OUTPUT_VARIABLE sut_all OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/script/cumulus") +execute_process(COMMAND bash -c "echo \"\"$(find -name 'cumulus-config.*.yaml' ! 
-exec grep -q cloud: '{}' \\; -print | sed 's|.*/cumulus-config.\\(.*\\).yaml$|\\1|')" OUTPUT_VARIABLE CUMULUS_SUT_STATIC OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/script/cumulus") + +if (NOT CUMULUS_OPTIONS) + set(CUMULUS_OPTIONS "--docker-run --nosvrinfo") +endif() + +if ((NOT DEFINED CUMULUS_SUT) OR (CUMULUS_SUT STREQUAL "")) + set(CUMULUS_SUT "${sut_all}") +endif() + +string(REPLACE " " ";" configs "${CUMULUS_SUT}") +foreach(config ${configs}) + if(NOT " ${sut_all} " MATCHES "${config}") + message(FATAL_ERROR "Failed to locate cumulus config: ${config}") + endif() +endforeach() + +add_subdirectory(script/cumulus) +execute_process(COMMAND bash -c "ln -s -r -f '${PROJECT_SOURCE_DIR}'/script/cumulus/script/setup-sut.sh ." WORKING_DIRECTORY "${CMAKE_BINARY_DIR}") + diff --git a/script/cumulus/CMakeLists.txt b/script/cumulus/CMakeLists.txt new file mode 100644 index 0000000..c1344cc --- /dev/null +++ b/script/cumulus/CMakeLists.txt @@ -0,0 +1,12 @@ +if(ENABLE_BUILD) + add_custom_target(build_cumulus bash -c "PLATFORM=X64 IMAGEARCH=linux/amd64 BACKEND=${BACKEND} RELEASE=${RELEASE} REGISTRY=${REGISTRY} CUMULUS_SUT='${CUMULUS_SUT}' CUMULUS_OPTIONS='${CUMULUS_OPTIONS}' SCRIPT='${PROJECT_SOURCE_DIR}/script' '${CMAKE_CURRENT_SOURCE_DIR}/build.sh'") +else() + add_custom_target(build_cumulus) +endif() + +execute_process(COMMAND bash -c "mkdir -p '${CMAKE_CURRENT_SOURCE_DIR}/.docker' > /dev/null; touch '${CMAKE_CURRENT_SOURCE_DIR}/.gitconfig' 2>/dev/null; chmod 600 '${CMAKE_CURRENT_SOURCE_DIR}'/ssh_config" OUTPUT_QUIET ERROR_QUIET) + +foreach(cloud aws gcp azure tencent alicloud) + add_custom_target(${cloud} COMMAND bash -c "REGISTRY=${REGISTRY} RELEASE=${RELEASE} '${CMAKE_CURRENT_SOURCE_DIR}/shell.sh' ${cloud} -v '${CMAKE_CURRENT_SOURCE_DIR}:/home' -v '${CMAKE_CURRENT_SOURCE_DIR}:/root' -t -- bash || true" VERBATIM) +endforeach() + diff --git a/script/cumulus/Dockerfile.1.alicloud b/script/cumulus/Dockerfile.1.alicloud new file mode 100644 index 0000000..9d66884 --- /dev/null +++ b/script/cumulus/Dockerfile.1.alicloud @@ -0,0 +1,14 @@ +# cumulus-alicloud + +# Copyright (c) 2022 Intel Corporation +# SPDX-License-Identifier: Apache License 2.0 + +ARG RELEASE +FROM cumulus-cloud${RELEASE} + +ARG ALIYUN_CLI_VER=3.0.104 +ARG ALIYUN_CLI_PACKAGE=https://github.com/aliyun/aliyun-cli/releases/download/v${ALIYUN_CLI_VER}/aliyun-cli-linux-${ALIYUN_CLI_VER}-amd64.tgz +RUN curl -L ${ALIYUN_CLI_PACKAGE} | tar -xz -C /usr/local/bin + +# Add cleanup script +COPY script/cleanup-alicloud.sh /usr/local/bin/cleanup diff --git a/script/cumulus/Dockerfile.1.aws b/script/cumulus/Dockerfile.1.aws new file mode 100644 index 0000000..a32ff9b --- /dev/null +++ b/script/cumulus/Dockerfile.1.aws @@ -0,0 +1,14 @@ +# cumulus-aws + +# Copyright (c) 2022 Intel Corporation +# SPDX-License-Identifier: Apache License 2.0 + +ARG RELEASE +FROM cumulus-cloud${RELEASE} + +# Install AWS CLI +RUN python3 -m pip install --no-cache-dir -r /PerfKitBenchmarker/perfkitbenchmarker/providers/aws/requirements.txt + +# Add cleanup script +COPY script/cleanup-aws.sh /usr/local/bin/cleanup + diff --git a/script/cumulus/Dockerfile.1.azure b/script/cumulus/Dockerfile.1.azure new file mode 100644 index 0000000..f5c8c07 --- /dev/null +++ b/script/cumulus/Dockerfile.1.azure @@ -0,0 +1,19 @@ +# cumulus-azure + +# Copyright (c) 2022 Intel Corporation +# SPDX-License-Identifier: Apache License 2.0 + +ARG RELEASE +FROM cumulus-cloud${RELEASE} + +# Install AZure CLI +RUN apt-get update && apt-get install -y 
ca-certificates curl apt-transport-https gnupg && \ + apt-get clean && rm -rf /var/lib/apt/lists/* +RUN curl -sL https://packages.microsoft.com/keys/microsoft.asc | \ + gpg --dearmor > /etc/apt/trusted.gpg.d/microsoft.gpg && \ + echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ $(grep VERSION_CODENAME /etc/os-release | cut -f2 -d=) main" > /etc/apt/sources.list.d/azure-cli.list +RUN apt-get update && apt-get install -y azure-cli && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +# Add clean script +COPY script/cleanup-azure.sh /usr/local/bin/cleanup diff --git a/script/cumulus/Dockerfile.1.gcp b/script/cumulus/Dockerfile.1.gcp new file mode 100644 index 0000000..adc5c1e --- /dev/null +++ b/script/cumulus/Dockerfile.1.gcp @@ -0,0 +1,18 @@ +# cumulus-gcp + +# Copyright (c) 2022 Intel Corporation +# SPDX-License-Identifier: Apache License 2.0 + +ARG RELEASE +FROM cumulus-cloud${RELEASE} + +# Install GCP CLI +RUN apt-get update && apt-get install -y apt-transport-https ca-certificates gnupg && \ + apt-get clean && rm -rf /var/lib/apt/lists/* +RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ + echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" > /etc/apt/sources.list.d/google-cloud-sdk.list +RUN apt-get update && apt-get install -y google-cloud-cli && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +# Add cleanup script +COPY script/cleanup-gcp.sh /usr/local/bin/cleanup diff --git a/script/cumulus/Dockerfile.1.tencent b/script/cumulus/Dockerfile.1.tencent new file mode 100644 index 0000000..ae547f8 --- /dev/null +++ b/script/cumulus/Dockerfile.1.tencent @@ -0,0 +1,13 @@ +# cumulus-tencent + +# Copyright (c) 2022 Intel Corporation +# SPDX-License-Identifier: Apache License 2.0 + +ARG RELEASE +FROM cumulus-cloud${RELEASE} + +# Install TCCLI +RUN python3 -m pip install --no-cache-dir -r /PerfKitBenchmarker/perfkitbenchmarker/providers/tencent/requirements.txt + +# Add cleanup script +COPY script/cleanup-tencent.sh /usr/local/bin/cleanup diff --git a/script/cumulus/Dockerfile.2.cloud b/script/cumulus/Dockerfile.2.cloud new file mode 100644 index 0000000..1373b50 --- /dev/null +++ b/script/cumulus/Dockerfile.2.cloud @@ -0,0 +1,11 @@ +## cumulus-cloud + +# Copyright (c) 2022 Intel Corporation +# SPDX-License-Identifier: Apache License 2.0 + +ARG RELEASE +FROM cumulus-static${RELEASE} + +# Install docker CLI +COPY --from=docker:20.10.17-dind /usr/local/bin/docker /usr/local/bin/ + diff --git a/script/cumulus/Dockerfile.3.static b/script/cumulus/Dockerfile.3.static new file mode 100644 index 0000000..6f69b29 --- /dev/null +++ b/script/cumulus/Dockerfile.3.static @@ -0,0 +1,56 @@ +# syntax=docker/dockerfile:1 +# cumulus-static + +# Copyright (c) 2022 Intel Corporation +# SPDX-License-Identifier: Apache License 2.0 + +ARG OS_VER=22.04 +ARG OS_IMAGE=ubuntu + +FROM ${OS_IMAGE}:${OS_VER} +## Security updates +RUN apt-get update && apt-get upgrade -y libsqlite3-0 rsync perl-modules libgnutls30 tar passwd libpcre2-8-0 libpcre3 patch openssh-client libtinfo6 libk5crypto3 libgmp10 libc-bin git-man coreutils && apt-get clean && rm -rf /var/lib/apt/lists/* + +## Prerequisites +RUN apt-get update && apt-get install -y curl git rsync bzip2 openssl skopeo sudo file && apt-get clean && rm -rf /var/lib/apt/lists/* + +# Install miniconda +ARG CONDA_DIR=/opt/conda +ARG CONDA_VER=py39_4.12.0 +ARG CONDA_REPO=https://repo.anaconda.com/miniconda +RUN curl 
-o ~/miniconda.sh ${CONDA_REPO}/Miniconda3-${CONDA_VER}-Linux-x86_64.sh && bash ~/miniconda.sh -b -p ${CONDA_DIR} && rm -f ~/miniconda.sh + +# Put conda in path so we can use python3 and pip3 installed in CONDA_DIR +ENV PATH=${CONDA_DIR}/bin:$PATH + +## Cumulus patches +COPY pkb/ /PerfKitBenchmarker/ +RUN python3 -m pip install --no-cache-dir -r PerfKitBenchmarker/requirements.txt kazoo paramiko lxml +WORKDIR /home + +## svrinfo NDA +## Download the package to script/cumulus/pkb/data/svrinfo +RUN [ -e /PerfKitBenchmarker/perfkitbenchmarker/data/svrinfo/svrinfo* ] && \ + sed -i "/DEFINE_string('svrinfo_tarball'/s|None|'$(ls -1 /PerfKitBenchmarker/perfkitbenchmarker/data/svrinfo/svrinfo*)'|" /PerfKitBenchmarker/perfkitbenchmarker/traces/svrinfo.py || true + +#### +ARG USER=pkb +ARG GROUP=pkb +RUN groupadd -f ${GROUP} && useradd -d /home -M ${USER} -g ${GROUP} +RUN echo "${USER} ALL=(ALL:ALL) NOPASSWD: ALL" >> /etc/sudoers +RUN groupadd -f docker && usermod -aG docker pkb +#### + +RUN apt-get update && apt-get install -y netcat gawk inetutils-ping && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +# Install gosu +ARG GOSU_VER=1.14 +ARG GOSU_REPO=https://github.com/tianon/gosu/releases/download/${GOSU_VER}/gosu-amd64 +RUN curl -o /usr/local/bin/gosu -SL ${GOSU_REPO} && \ + curl -o /usr/local/bin/gosu.asc ${GOSU_REPO}.asc && \ + chmod +x /usr/local/bin/gosu + +# entry point +COPY entrypoint.sh / +ENTRYPOINT [ "/entrypoint.sh" ] diff --git a/script/cumulus/build.sh b/script/cumulus/build.sh new file mode 100755 index 0000000..6ce02e8 --- /dev/null +++ b/script/cumulus/build.sh @@ -0,0 +1,18 @@ +#!/bin/bash -e + +DIR="$( cd "$( dirname "$0" )" &> /dev/null && pwd )" + +clouds="$( +for x in $CUMULUS_SUT; do + grep cloud: "$DIR"/cumulus-config.$x.yaml +done | awk '{a[$NF]=1}END{for(x in a)print x}' +)" + +FIND_OPTIONS="-name Dockerfile.*.static" +[[ "$clouds" = *AWS* ]] && FIND_OPTIONS="$FIND_OPTIONS -o -name Dockerfile.*.aws -o -name Dockerfile.*.cloud" +[[ "$clouds" = *GCP* ]] && FIND_OPTIONS="$FIND_OPTIONS -o -name Dockerfile.*.gcp -o -name Dockerfile.*.cloud" +[[ "$clouds" = *Azure* ]] && FIND_OPTIONS="$FIND_OPTIONS -o -name Dockerfile.*.azure -o -name Dockerfile.*.cloud" +[[ "$clouds" = *Tencent* ]] && FIND_OPTIONS="$FIND_OPTIONS -o -name Dockerfile.*.tencent -o -name Dockerfile.*.cloud" +[[ "$clouds" = *AliCloud* ]] && FIND_OPTIONS="$FIND_OPTIONS -o -name Dockerfile.*.alicloud -o -name Dockerfile.*.cloud" +FIND_OPTIONS="( $FIND_OPTIONS )" +. 
$DIR/../build.sh diff --git a/script/cumulus/cumulus-config.alicloud.yaml b/script/cumulus/cumulus-config.alicloud.yaml new file mode 100644 index 0000000..099d6bd --- /dev/null +++ b/script/cumulus/cumulus-config.alicloud.yaml @@ -0,0 +1,57 @@ +cloud_worker: &cloud_worker + AliCloud: + machine_type: ecs.g7.large +# https://ecs-buy.aliyun.com/instanceTypes + zone: cn-shanghai-m + +cloud_disk_mount_1: &cloud_disk_mount_1 + AliCloud: + mount_point: /mnt/disk1 + disk_type: ephemeral_ssd + disk_size: 500 + num_striped_disks: 1 + +cloud_controller: &cloud_controller + AliCloud: + machine_type: ecs.g7.large +# https://ecs-buy.aliyun.com/instanceTypes + zone: cn-shanghai-m + +docker_pt: + vm_groups: + worker: + vm_count: 1 + os_type: "ubuntu2204" + vm_spec: *cloud_worker + disk_spec: *cloud_disk_mount_1 + controller: + vm_count: 1 + os_type: "ubuntu2204" + vm_spec: *cloud_controller + flags: + dpt_docker_image: "" + dpt_docker_dataset: "" + dpt_docker_options: "" + dpt_kubernetes_yaml: "" + dpt_kubernetes_job: "" + dpt_logs_dir: "" + dpt_timeout: "28800,300" + dpt_name: "" + dpt_script_args: "" + dpt_cluster_yaml: "" + dpt_params: "" + dpt_tunables: "" + dpt_registry_map: "" + dpt_namespace: "" + ali_system_disk_size: 500 + ali_system_disk_type: cloud_essd + ssh_options: "-o TCPKeepAlive=yes" + sar_flags: "-B -b -d -p -H -I ALL -m ALL -n ALL -q -r ALL -u ALL -P ALL -v -W -w" + docker_dist_repo: "http://mirrors.aliyun.com/docker-ce/linux/ubuntu" + docker_registry_mirrors: "https://registry.cn-hangzhou.aliyuncs.com" + k8s_repo_key_url: "http://mirrors.aliyun.com/kubernetes/apt/doc/apt-key.gpg" + k8s_repo_url: "http://mirrors.aliyun.com/kubernetes/apt" + k8s_kubeadm_options: "--image-repository=registry.aliyuncs.com/google_containers" + k8s_image_mirrors: "docker.io/xmchen/node-feature-discovery:v0.10.1,k8s.gcr.io/nfd/node-feature-discovery:v0.10.1" + enable_rsync: true + cloud: AliCloud diff --git a/script/cumulus/cumulus-config.aws.yaml b/script/cumulus/cumulus-config.aws.yaml new file mode 100644 index 0000000..eb7ac95 --- /dev/null +++ b/script/cumulus/cumulus-config.aws.yaml @@ -0,0 +1,49 @@ +cloud_worker: &cloud_worker + AWS: + machine_type: m5.16xlarge + zone: us-east-2a + boot_disk_size: 500 + +cloud_disk_mount_1: &cloud_disk_mount_1 + AWS: + mount_point: /mnt/disk1 + disk_type: io2 + disk_size: 512 + iops: 25600 + num_striped_disks: 1 + +cloud_controller: &cloud_controller + AWS: + machine_type: m5.4xlarge + zone: us-east-2a + boot_disk_size: 500 + +docker_pt: + vm_groups: + worker: + vm_count: 1 + os_type: "ubuntu2204" + vm_spec: *cloud_worker + disk_spec: *cloud_disk_mount_1 + controller: + vm_count: 1 + os_type: "ubuntu2204" + vm_spec: *cloud_controller + flags: + dpt_docker_image: "" + dpt_docker_dataset: "" + dpt_docker_options: "" + dpt_kubernetes_yaml: "" + dpt_kubernetes_job: "" + dpt_logs_dir: "" + dpt_timeout: "28800,300" + dpt_name: "" + dpt_script_args: "" + dpt_cluster_yaml: "" + dpt_params: "" + dpt_tunables: "" + dpt_registry_map: "" + dpt_namespace: "" + ssh_options: "-o TCPKeepAlive=yes" + sar_flags: "-B -b -d -p -H -I ALL -m ALL -n ALL -q -r ALL -u ALL -P ALL -v -W -w" + cloud: AWS diff --git a/script/cumulus/cumulus-config.azure.yaml b/script/cumulus/cumulus-config.azure.yaml new file mode 100644 index 0000000..a3d8539 --- /dev/null +++ b/script/cumulus/cumulus-config.azure.yaml @@ -0,0 +1,48 @@ +cloud_worker: &cloud_worker + Azure: + machine_type: Standard_D8s_v3 + zone: eastus + boot_disk_size: 500 + +cloud_disk_mount_1: &cloud_disk_mount_1 + Azure: + mount_point: 
/mnt/disk1 + disk_type: Standard_LRS + disk_size: 500 + num_striped_disks: 1 + +cloud_controller: &cloud_controller + Azure: + machine_type: Standard_D8s_v3 + zone: eastus + boot_disk_size: 500 + +docker_pt: + vm_groups: + worker: + vm_count: 1 + os_type: "ubuntu2204" + vm_spec: *cloud_worker + disk_spec: *cloud_disk_mount_1 + controller: + vm_count: 1 + os_type: "ubuntu2204" + vm_spec: *cloud_controller + flags: + dpt_docker_image: "" + dpt_docker_dataset: "" + dpt_docker_options: "" + dpt_kubernetes_yaml: "" + dpt_kubernetes_job: "" + dpt_logs_dir: "" + dpt_timeout: "28800,300" + dpt_name: "" + dpt_script_args: "" + dpt_cluster_yaml: "" + dpt_params: "" + dpt_tunables: "" + dpt_registry_map: "" + dpt_namespace: "" + ssh_options: "-o TCPKeepAlive=yes" + sar_flags: "-B -b -d -p -H -I ALL -m ALL -n ALL -q -r ALL -u ALL -P ALL -v -W -w" + cloud: Azure diff --git a/script/cumulus/cumulus-config.gcp.yaml b/script/cumulus/cumulus-config.gcp.yaml new file mode 100644 index 0000000..840a8da --- /dev/null +++ b/script/cumulus/cumulus-config.gcp.yaml @@ -0,0 +1,51 @@ +cloud_worker: &cloud_worker + GCP: + machine_type: n2-standard-16 + zone: us-west1-b + +cloud_disk_mount_1: &cloud_disk_mount_1 + GCP: + mount_point: /mnt/disk1 + disk_type: pd-ssd + disk_size: 500 + num_striped_disks: 1 + +cloud_controller: &cloud_controller + GCP: + machine_type: n2-standard-16 + zone: us-west1-b + +docker_pt: + vm_groups: + worker: + vm_count: 1 + os_type: "ubuntu2204" + vm_spec: *cloud_worker + disk_spec: *cloud_disk_mount_1 + controller: + vm_count: 1 + os_type: "ubuntu2204" + vm_spec: *cloud_controller + flags: + dpt_docker_image: "" + dpt_docker_dataset: "" + dpt_docker_options: "" + dpt_kubernetes_yaml: "" + dpt_kubernetes_job: "" + dpt_logs_dir: "" + dpt_timeout: "28800,300" + dpt_name: "" + dpt_script_args: "" + dpt_cluster_yaml: "" + dpt_params: "" + dpt_tunables: "" + dpt_registry_map: "" + dpt_namespace: "" + gce_subnet_region: us-west1 + gce_boot_disk_size: "500" +# The GCP higher bandwidth feature works with N2, N2D, C2, or C2D series && >=32 VCPUs. 
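+# To enable it, uncomment the two gce_* flags below and switch machine_type to a
+# qualifying shape (illustrative example: n2-standard-32, which meets the >=32 vCPU requirement).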
+# gce_egress_bandwidth_tier: "TIER_1" +# gce_nic_type: "GVNIC" + ssh_options: "-o TCPKeepAlive=yes" + sar_flags: "-B -b -d -p -H -I ALL -m ALL -n ALL -q -r ALL -u ALL -P ALL -v -W -w" + cloud: GCP diff --git a/script/cumulus/cumulus-config.static.yaml b/script/cumulus/cumulus-config.static.yaml new file mode 100644 index 0000000..db2c5ac --- /dev/null +++ b/script/cumulus/cumulus-config.static.yaml @@ -0,0 +1,48 @@ +static_vms: + - &vm1 + ip_address: 10.219.170.167 + user_name: raspadmin + ssh_private_key: ~/.ssh/id_rsa + internal_ip: 10.219.170.167 + ssh_port: 22 + install_packages: false + os_type: "centos8" + +vm_spec: &default_single_core + GCP: + machine_type: n1-standard-1 + +docker_pt: + vm_groups: + worker: + vm_spec: *default_single_core + vm_count: 1 + os_type: "centos8" + static_vms: + - *vm1 + controller: + vm_spec: *default_single_core + vm_count: 1 + os_type: "centos8" + static_vms: + - *vm1 + flags: + dpt_docker_image: "" + dpt_docker_dataset: "" + dpt_docker_options: "" + dpt_kubernetes_yaml: "" + dpt_kubernetes_job: "" + dpt_logs_dir: "" + dpt_timeout: "28800,300" + dpt_name: "" + dpt_script_args: "" + dpt_cluster_yaml: "" + dpt_params: "" + dpt_tunables: "" + dpt_registry_map: "" + dpt_namespace: "" + dpt_trace_mode: "" +# enable the following flags to speed up svrinfo, only for functional validation +# svrinfo_flags: "--format all" + trace_vm_groups: "worker" + sar_flags: "-B -b -d -p -H -I ALL -m ALL -n ALL -q -r ALL -u ALL -P ALL -v -W -w" diff --git a/script/cumulus/cumulus-config.tencent.yaml b/script/cumulus/cumulus-config.tencent.yaml new file mode 100644 index 0000000..8de80a4 --- /dev/null +++ b/script/cumulus/cumulus-config.tencent.yaml @@ -0,0 +1,55 @@ +cloud_worker: &cloud_worker + Tencent: + machine_type: S5.LARGE8 + zone: ap-shanghai-2 + +cloud_disk_mount_1: &cloud_disk_mount_1 + Tencent: + mount_point: /mnt/disk1 + disk_type: LOCAL_SSD + disk_size: 500 + num_striped_disks: 1 + +cloud_controller: &cloud_controller + Tencent: + machine_type: S5.LARGE8 + zone: ap-shanghai-2 + +docker_pt: + vm_groups: + worker: + vm_count: 1 + os_type: "ubuntu2004" + vm_spec: *cloud_worker + disk_spec: *cloud_disk_mount_1 + controller: + vm_count: 1 + os_type: "ubuntu2004" + vm_spec: *cloud_controller + flags: + dpt_docker_image: "" + dpt_docker_dataset: "" + dpt_docker_options: "" + dpt_kubernetes_yaml: "" + dpt_kubernetes_job: "" + dpt_logs_dir: "" + dpt_timeout: "" + dpt_name: "" + dpt_script_args: "" + dpt_cluster_yaml: "" + dpt_params: "" + dpt_tunables: "" + dpt_registry_map: "" + dpt_namespace: "" + tencent_boot_disk_size: "500" + tencent_boot_disk_type: "CLOUD_SSD" + ssh_options: "-o TCPKeepAlive=yes" + sar_flags: "-B -b -d -p -H -I ALL -m ALL -n ALL -q -r ALL -u ALL -P ALL -v -W -w" + docker_dist_repo: "http://mirrors.aliyun.com/docker-ce/linux/ubuntu" + docker_registry_mirrors: "https://registry.cn-hangzhou.aliyuncs.com" + k8s_repo_key_url: "http://mirrors.aliyun.com/kubernetes/apt/doc/apt-key.gpg" + k8s_repo_url: "http://mirrors.aliyun.com/kubernetes/apt" + k8s_kubeadm_options: "--image-repository=registry.aliyuncs.com/google_containers" + k8s_image_mirrors: "docker.io/xmchen/node-feature-discovery:v0.10.1,k8s.gcr.io/nfd/node-feature-discovery:v0.10.1" + enable_rsync: true + cloud: Tencent diff --git a/script/cumulus/entrypoint.sh b/script/cumulus/entrypoint.sh new file mode 100755 index 0000000..cbe6943 --- /dev/null +++ b/script/cumulus/entrypoint.sh @@ -0,0 +1,23 @@ +#!/bin/bash -e + +if [ -n "$DOCKER_GID" ]; then + if grep -q -E '^docker:' /etc/group; then + if 
[ "$DOCKER_GID" != "$(getent group docker | cut -f3 -d:)" ]; then + groupmod -g $DOCKER_GID -o docker > /dev/null || true + fi + fi +fi + +if [ -n "$PKB_GID" ]; then + if [ "$PKB_GID" != "$(id -g pkb)" ]; then + groupmod -g $PKB_GID -o pkb > /dev/null || true + fi + if [ -n "$PKB_UID" ]; then + if [ "$PKB_UID" != "$(id -u pkb)" ]; then + usermod -u $PKB_UID -g $PKB_GID -o pkb > /dev/null || true + fi + fi +fi + +####INSERT#### +exec gosu pkb "$@" diff --git a/script/cumulus/pkb/perfkitbenchmarker/__init__.py b/script/cumulus/pkb/perfkitbenchmarker/__init__.py new file mode 100644 index 0000000..181b117 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Import absl.app.""" +from absl import app diff --git a/script/cumulus/pkb/perfkitbenchmarker/app_service.py b/script/cumulus/pkb/perfkitbenchmarker/app_service.py new file mode 100644 index 0000000..32984e1 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/app_service.py @@ -0,0 +1,175 @@ +"""Module containing class for BaseAppService and BaseAppServiceSpec.""" +import threading +import time + +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import resource +from perfkitbenchmarker import sample +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.configs import option_decoders +from perfkitbenchmarker.configs import spec + +FLAGS = flags.FLAGS +flags.DEFINE_string('appservice', None, + 'Type of app service. e.g. AppEngine') +flags.DEFINE_string('appservice_region', None, + 'Region of deployed app service.') +flags.DEFINE_string('appservice_backend', None, + 'Backend instance type of app service uses.') +flags.DEFINE_string('app_runtime', None, + 'Runtime environment of app service uses. ' + 'e.g. 
python, java') +flags.DEFINE_string('app_type', None, + 'Type of app packages builders should built.') +flags.DEFINE_integer('appservice_count', 1, + 'Copies of applications to launch.') + + +def GetAppServiceSpecClass(service): + return spec.GetSpecClass( + BaseAppServiceSpec, SERVICE=service) + + +class BaseAppServiceSpec(spec.BaseSpec): + """Storing various data about app service.""" + + SPEC_TYPE = 'BaseAppServiceSpec' + SPEC_ATTRS = ['SERVICE'] + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + super(BaseAppServiceSpec, cls)._ApplyFlags(config_values, flag_values) + if flag_values['appservice_region'] .present: + config_values['appservice_region'] = flag_values.appservice_region + if flag_values['appservice_backend'].present: + config_values['appservice_backend'] = flag_values.appservice_backend + if flag_values['appservice'].present: + config_values['appservice'] = flag_values.appservice + + @classmethod + def _GetOptionDecoderConstructions(cls): + result = super(BaseAppServiceSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'appservice_region': (option_decoders.StringDecoder, { + 'default': None, 'none_ok': True}), + 'appservice_backend': (option_decoders.StringDecoder, { + 'default': None, 'none_ok': True}), + 'appservice': (option_decoders.StringDecoder, { + 'default': None, 'none_ok': True}) + }) + return result + + +def GetAppServiceClass(service): + return resource.GetResourceClass( + BaseAppService, SERVICE=service) + + +class BaseAppService(resource.BaseResource): + """Base class for representing an App instance.""" + + RESOURCE_TYPE = 'BaseAppService' + REQUIRED_ATTRS = ['SERVICE'] + POLL_INTERVAL = 1 + + _appservice_counter = 0 + _appservice_counter_lock = threading.Lock() + + def __init__(self, base_app_service_spec): + super(BaseAppService, self).__init__() + with self._appservice_counter_lock: + self.appservice_number = self._appservice_counter + self.name = 'pkb-%s-%s' % (FLAGS.run_uri, self.appservice_number) + BaseAppService._appservice_counter += 1 + self.region = base_app_service_spec.appservice_region + self.backend = base_app_service_spec.appservice_backend + self.builder = None + # update metadata + self.metadata.update({'backend': self.backend, + 'region': self.region, + 'concurrency': 'default'}) + self.samples = [] + + def _UpdateDependencies(self): + """Update dependencies for AppService.""" + self.builder.Mutate() + + def _Update(self): + raise NotImplementedError() + + def Update(self): + """Update a deployed app instance.""" + + @vm_util.Retry(poll_interval=self.POLL_INTERVAL, fuzz=0, + timeout=self.READY_TIMEOUT, + retryable_exceptions=( + errors.Resource.RetryableCreationError,)) + def WaitUntilReady(): + if not self._IsReady(): + raise errors.Resource.RetryableCreationError('Not yet ready') + + if self.user_managed: + return + self._UpdateDependencies() + self.update_start_time = time.time() + self._Update() + self.update_end_time = time.time() + WaitUntilReady() + self.update_ready_time = time.time() + self.samples.append( + sample.Sample('update latency', + self.update_end_time - self.update_start_time, + 'seconds', {})) + self.samples.append( + sample.Sample('update ready latency', + self.update_ready_time - self.update_start_time, + 'seconds', {})) + + def Invoke(self, args=None): + """Invoke a deployed app instance. + + Args: + args: dict. Arguments passed to app. 
+ """ + raise NotImplementedError() + + def _CreateDependencies(self): + """Builds app package.""" + if self.builder: + self.builder.Create() + + def _DeleteDependencies(self): + """Delete app package.""" + if self.builder: + self.builder.Delete() + + def SetBuilder(self, builder=None, **kwargs): + """Set builder for AppService.""" + if builder: + self.builder = builder + + def GetLifeCycleMetrics(self): + """Export internal lifecycle metrics.""" + if self.builder: + self.metadata.update(self.builder.GetResourceMetadata()) + + for s in self.samples: + s.metadata.update(self.metadata) + return self.samples + + def _PostCreate(self): + """Method called after _CreateResource.""" + if self.builder: + self.metadata.update(self.builder.GetResourceMetadata()) + + def Create(self): + super(BaseAppService, self).Create() + self.samples.append( + sample.Sample('create latency', + self.create_end_time - self.create_start_time, + 'seconds', {})) + self.samples.append( + sample.Sample('create ready latency', + self.resource_ready_time - self.create_start_time, + 'seconds', {})) diff --git a/script/cumulus/pkb/perfkitbenchmarker/archive.py b/script/cumulus/pkb/perfkitbenchmarker/archive.py new file mode 100644 index 0000000..be62d05 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/archive.py @@ -0,0 +1,74 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Archive a run directory to GCS or S3.""" + +import datetime +import logging +import os +import posixpath +import subprocess +import tarfile + +from perfkitbenchmarker.providers.aws.util import AWS_PATH + + +def ArchiveRun(run_temp_directory, target_bucket, + prefix='', + gsutil_path='gsutil', + aws_path=AWS_PATH): + """Archive a run directory to GCS or S3. + + Args: + run_temp_directory: str. directory to archive. + target_bucket: str. Either a gs:// or s3:// path to an extant bucket. + prefix: str. prefix for the file. + gsutil_path: str. Path to the gsutil tool. + aws_path: str. Path to the aws command line tool. + + Raises: + ValueError: when directory or target_bucket does not exist. + subprocess.CalledProcessError: subprocess call failed. 
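+
+  Example (illustrative; the directory and bucket name below are placeholders):
+    ArchiveRun('/tmp/perfkitbenchmarker/runs/run_uri0', 'gs://my-archive-bucket',
+               prefix='pkb-')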
+ """ + if not os.path.isdir(run_temp_directory): + raise ValueError('{0} is not a directory.'.format(run_temp_directory)) + + tar_file_name = '{}{}.tar.gz'.format( + prefix, datetime.datetime.now().strftime('%Y%m%d%H%M%S')) + + prefix_len = 5 + prefixes = { + 's3://': [aws_path, 's3', 'cp'], + 'gs://': [gsutil_path, 'cp'] + } + + assert all(len(key) == prefix_len for key in prefixes), prefixes + + try: + cmd = (prefixes[target_bucket[:prefix_len]] + + ['-', posixpath.join(target_bucket, tar_file_name)]) + except KeyError: + raise ValueError('Unsupported bucket name: {0}'.format(target_bucket)) + + logging.info('Streaming %s to %s\n%s', run_temp_directory, tar_file_name, + ' '.join(cmd)) + p = subprocess.Popen(cmd, stdin=subprocess.PIPE) + + with p.stdin: + with tarfile.open(mode='w:gz', fileobj=p.stdin) as tar: + tar.add(run_temp_directory, os.path.basename(run_temp_directory)) + + status = p.wait() + if status: + raise subprocess.CalledProcessError(status, cmd) diff --git a/script/cumulus/pkb/perfkitbenchmarker/background_tasks.py b/script/cumulus/pkb/perfkitbenchmarker/background_tasks.py new file mode 100644 index 0000000..4ac494e --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/background_tasks.py @@ -0,0 +1,687 @@ +# Copyright 2016 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Background tasks that propagate PKB thread context. + +TODO(skschneider): Many of the threading module flaws have been corrected in +Python 3. When PKB switches to Python 3, this module can be simplified. + +PKB tries its best to clean up provisioned resources upon SIGINT. By default, +Python raises a KeyboardInterrupt upon a SIGINT, but none of the built-in +threading module classes are designed to handle a KeyboardInterrupt very well: + +- threading.Lock has an atomic acquire method that cannot be interrupted and + hangs forever if the same thread tries to acquire twice. Its release method + can be called by any thread but raises thread.error if an unacquired Lock is + released. + +- More complicated classes (threading.RLock, threading.Event, threading.Thread, + Queue.Queue) use internal Locks in such a way that a KeyboardInterrupt can + cause a thread that has acquired a Lock to jump out of its current action + without releasing the Lock. For example, in the below code, a + KeyboardInterrupt can be raised immediately after the acquire call but before + entering the try block: + lock.acquire() + try: + ... + except: + lock.release() + +Taken together, this means that there is a possibility to leave an internal Lock +acquired, and when later cleanup steps on the same or different thread attempt +to acquire the Lock, they will hang forever, unresponsive to even a second +KeyboardInterrupt. A KeyboardInterrupt during Thread.start() or Thread.join() +can even trigger an unbalanced acquire on a global lock used to keep track of +active threads, so that later attempts to start or join any Thread will hang +forever. 
+ +While it would take a significant and impractical redesign of PKB's code to +completely eliminate any risk of deadlock following a KeyboardInterrupt, the +code in this module is designed to allow interrupting parallel tasks while +keeping the risk of deadlock low. +""" + + +import abc +from collections import deque +import ctypes +import functools +import logging +import os +import signal +import threading +import time +import traceback +from concurrent import futures + +from perfkitbenchmarker import context +from perfkitbenchmarker import errors +from absl import flags +from perfkitbenchmarker import log_util +import six +from six.moves import queue +from six.moves import range +from six.moves import zip + + +# For situations where an interruptable wait is necessary, a loop of waits with +# long timeouts is used instead. This is because some of Python's built-in wait +# methods are non-interruptable without a timeout. +_LONG_TIMEOUT = 1000. + +# Constants used for polling waits. See _WaitForCondition. +_WAIT_MIN_RECHECK_DELAY = 0.001 # 1 ms +_WAIT_MAX_RECHECK_DELAY = 0.050 # 50 ms + +# Values sent to child threads that have special meanings. +_THREAD_STOP_PROCESSING = 0 +_THREAD_WAIT_FOR_KEYBOARD_INTERRUPT = 1 + +# The default value for max_concurrent_threads. +MAX_CONCURRENT_THREADS = 200 + +# The default value is set in pkb.py. It is the greater of +# MAX_CONCURRENT_THREADS or the value passed to --num_vms. This is particularly +# important for the cluster_boot benchmark where we want to launch all of the +# VMs in parallel. +flags.DEFINE_integer( + 'max_concurrent_threads', None, 'Maximum number of concurrent threads to ' + 'use when running a benchmark.') +FLAGS = flags.FLAGS + + +def _GetCallString(target_arg_tuple): + """Returns the string representation of a function call.""" + target, args, kwargs = target_arg_tuple + while isinstance(target, functools.partial): + args = target.args + args + inner_kwargs = target.keywords.copy() + inner_kwargs.update(kwargs) + kwargs = inner_kwargs + target = target.func + arg_strings = [str(a) for a in args] + arg_strings.extend(['{0}={1}'.format(k, v) for k, v in six.iteritems(kwargs)]) + return '{0}({1})'.format(getattr(target, '__name__', target), + ', '.join(arg_strings)) + + +def _WaitForCondition(condition_callback, timeout=None): + """Waits until the specified callback returns a value that evaluates True. + + Similar to the threading.Condition.wait method that is the basis of most + threading class wait routines. Polls the condition, starting with frequent + checks but extending the delay between checks upon each failure. + + Args: + condition_callback: Callable that returns a value that evaluates True to end + the wait or evaluates False to continue the wait. + timeout: Optional float. Number of seconds to wait before giving up. If + provided, the condition is still checked at least once before giving up. + If not provided, the wait does not time out. + + Returns: + True if condition_callback returned a value that evaluated True. False if + condition_callback did not return a value that evaluated True before the + timeout. 
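+
+  Example (illustrative; mirrors how this helper is used later in this module
+  to wait for a worker thread to exit):
+    _WaitForCondition(lambda: not thread.is_alive(), timeout=60)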
+ """ + deadline = None if timeout is None else time.time() + timeout + delay = _WAIT_MIN_RECHECK_DELAY + while True: + if condition_callback(): + return True + remaining_time = (_WAIT_MAX_RECHECK_DELAY if deadline is None + else deadline - time.time()) + if remaining_time <= 0: + return False + time.sleep(delay) + delay = min(delay * 2, remaining_time, _WAIT_MAX_RECHECK_DELAY) + + +class _SingleReaderQueue(object): + """Queue to which multiple threads write but from which only one thread reads. + + A lightweight substitute for the Queue.Queue class that does not use + internal Locks. + + Gets are interruptable but depend on polling. + """ + + def __init__(self): + self._deque = deque() + + def Get(self, timeout=None): + if not _WaitForCondition(lambda: self._deque, timeout): + raise queue.Empty + return self._deque.popleft() + + def Put(self, item): + self._deque.append(item) + + +class _NonPollingSingleReaderQueue(object): + """Queue to which multiple threads write but from which only one thread reads. + + Uses a threading.Lock to implement a non-interruptable Get that does not poll + and is therefore easier on CPU usage. The reader waits for items by acquiring + the Lock, and writers release the Lock to signal that items have been written. + """ + + def __init__(self): + self._deque = deque() + self._lock = threading.Lock() + self._lock.acquire() + + def _WaitForItem(self): + self._lock.acquire() + + def _SignalAvailableItem(self): + try: + self._lock.release() + except threading.ThreadError: + pass + + def Get(self): + while True: + self._WaitForItem() + if self._deque: + item = self._deque.popleft() + if self._deque: + self._SignalAvailableItem() + return item + + def Put(self, item): + self._deque.append(item) + self._SignalAvailableItem() + + +class _BackgroundTaskThreadContext(object): + """Thread-specific information that can be inherited by a background task. + + Attributes: + benchmark_spec: BenchmarkSpec of the benchmark currently being executed. + log_context: ThreadLogContext of the parent thread. + """ + + def __init__(self): + self.benchmark_spec = context.GetThreadBenchmarkSpec() + self.log_context = log_util.GetThreadLogContext() + + def CopyToCurrentThread(self): + """Sets the thread context of the current thread.""" + log_util.SetThreadLogContext(log_util.ThreadLogContext(self.log_context)) + context.SetThreadBenchmarkSpec(self.benchmark_spec) + + +class _BackgroundTask(object): + """Base class for a task executed in a child thread or process. + + Attributes: + target: Function that is invoked in the child thread or process. + args: Series of unnamed arguments to be passed to the target. + kwargs: dict. Keyword arguments to be passed to the target. + context: _BackgroundTaskThreadContext. Thread-specific state to be inherited + from parent to child thread. + return_value: Return value if the call was executed successfully, or None + otherwise. + traceback: The traceback string if the call raised an exception, or None + otherwise. 
+ """ + + def __init__(self, target, args, kwargs, thread_context): + self.target = target + self.args = args + self.kwargs = kwargs + self.context = thread_context + self.return_value = None + self.traceback = None + + def Run(self): + """Sets the current thread context and executes the target.""" + self.context.CopyToCurrentThread() + try: + self.return_value = self.target(*self.args, **self.kwargs) + except Exception: + self.traceback = traceback.format_exc() + + +class _BackgroundTaskManager(six.with_metaclass(abc.ABCMeta, object)): + """Base class for a context manager that manages state for background tasks. + + Attributes: + tasks: list of _BackgroundTask instances. Contains one _BackgroundTask per + started task, in the order that they were started. + """ + + def __init__(self, max_concurrency): + self._max_concurrency = max_concurrency + self.tasks = [] + + def __enter__(self): + return self + + def __exit__(self, *unused_args, **unused_kwargs): + pass + + @abc.abstractmethod + def StartTask(self, target, args, kwargs, thread_context): + """Creates and starts a _BackgroundTask. + + The created task is appended to self.tasks. + + Args: + target: Function that is invoked in the child thread or process. + args: Series of unnamed arguments to be passed to the target. + kwargs: dict. Keyword arguments to be passed to the target. + thread_context: _BackgroundTaskThreadContext. Thread-specific state to be + inherited from parent to child thread. + """ + raise NotImplementedError() + + @abc.abstractmethod + def AwaitAnyTask(self): + """Waits for any of the started tasks to complete. + + Returns: + int. Index of the task that completed in self.tasks. + """ + raise NotImplementedError() + + @abc.abstractmethod + def HandleKeyboardInterrupt(self): + """Called by the parent thread if a KeyboardInterrupt occurs. + + Ensures that any child thread also receives a KeyboardInterrupt, and then + waits for each child thread to stop executing. + """ + raise NotImplementedError() + + +def _ExecuteBackgroundThreadTasks(worker_id, task_queue, response_queue): + """Executes tasks received on a task queue. + + Executed in a child Thread by _BackgroundThreadTaskManager. + + Args: + worker_id: int. Identifier for the child thread relative to other child + threads. + task_queue: _NonPollingSingleReaderQueue. Queue from which input is read. + Each value in the queue can be one of three types of values. If it is a + (task_id, _BackgroundTask) pair, the task is executed on this thread. + If it is _THREAD_STOP_PROCESSING, the thread stops executing. If it is + _THREAD_WAIT_FOR_KEYBOARD_INTERRUPT, the thread waits for a + KeyboardInterrupt. + response_queue: _SingleReaderQueue. Queue to which output is written. It + receives worker_id when this thread's bootstrap code has completed and + receives a (worker_id, task_id) pair for each task completed on this + thread. + """ + try: + response_queue.Put(worker_id) + while True: + task_tuple = task_queue.Get() + if task_tuple == _THREAD_STOP_PROCESSING: + break + elif task_tuple == _THREAD_WAIT_FOR_KEYBOARD_INTERRUPT: + while True: + time.sleep(_WAIT_MAX_RECHECK_DELAY) + task_id, task = task_tuple + task.Run() + response_queue.Put((worker_id, task_id)) + except KeyboardInterrupt: + # TODO(skschneider): Detect when the log would be unhelpful (e.g. if the + # current thread was spinning in the _THREAD_WAIT_FOR_KEYBOARD_INTERRUPT + # sub-loop). Only log in helpful cases, like when the task is interrupted. 
+ logging.debug('Child thread %s received a KeyboardInterrupt from its ' + 'parent.', worker_id, exc_info=True) + + +class _BackgroundThreadTaskManager(_BackgroundTaskManager): + """Manages state for background tasks started in child threads.""" + + def __init__(self, *args, **kwargs): + super(_BackgroundThreadTaskManager, self).__init__(*args, **kwargs) + self._response_queue = _SingleReaderQueue() + self._task_queues = [] + self._threads = [] + self._available_worker_ids = list(range(self._max_concurrency)) + uninitialized_worker_ids = set(self._available_worker_ids) + for worker_id in self._available_worker_ids: + task_queue = _NonPollingSingleReaderQueue() + self._task_queues.append(task_queue) + thread = threading.Thread( + target=_ExecuteBackgroundThreadTasks, + args=(worker_id, task_queue, self._response_queue)) + thread.daemon = True + self._threads.append(thread) + thread.start() + # Wait for each Thread to finish its bootstrap code. Starting all the + # threads upfront like this and reusing them for later calls minimizes the + # risk of a KeyboardInterrupt interfering with any of the Lock interactions. + for _ in self._threads: + worker_id = self._response_queue.Get() + uninitialized_worker_ids.remove(worker_id) + assert not uninitialized_worker_ids, uninitialized_worker_ids + + def __exit__(self, *unused_args, **unused_kwargs): + # Shut down worker threads. + for task_queue in self._task_queues: + task_queue.Put(_THREAD_STOP_PROCESSING) + for thread in self._threads: + _WaitForCondition(lambda: not thread.is_alive()) + + def StartTask(self, target, args, kwargs, thread_context): + assert self._available_worker_ids, ('StartTask called when no threads were ' + 'available') + task = _BackgroundTask(target, args, kwargs, thread_context) + task_id = len(self.tasks) + self.tasks.append(task) + worker_id = self._available_worker_ids.pop() + self._task_queues[worker_id].Put((task_id, task)) + + def AwaitAnyTask(self): + worker_id, task_id = self._response_queue.Get() + self._available_worker_ids.append(worker_id) + return task_id + + def HandleKeyboardInterrupt(self): + # Raise a KeyboardInterrupt in each child thread. + for thread in self._threads: + ctypes.pythonapi.PyThreadState_SetAsyncExc( + ctypes.c_long(thread.ident), ctypes.py_object(KeyboardInterrupt)) + # Wake threads up from possible non-interruptable wait states so they can + # actually see the KeyboardInterrupt. + for task_queue, thread in zip(self._task_queues, self._threads): + task_queue.Put(_THREAD_WAIT_FOR_KEYBOARD_INTERRUPT) + for thread in self._threads: + _WaitForCondition(lambda: not thread.is_alive()) + + +def _ExecuteProcessTask(task): + """Function invoked in another process by _BackgroundProcessTaskManager. + + Executes a specified task function and returns the result or exception + traceback. + + TODO(skschneider): Rework this helper function when moving to Python 3.5 or + when the backport of concurrent.futures.ProcessPoolExecutor is able to + preserve original traceback. + + Args: + task: _BackgroundTask to execute. + + Returns: + (result, traceback) tuple. The first element is the return value from the + task function, or None if the function raised an exception. The second + element is the exception traceback string, or None if the function + succeeded. + """ + def handle_sigint(signum, frame): + # Ignore any new SIGINTs since we are already tearing down. 
+ signal.signal(signal.SIGINT, signal.SIG_IGN) + # Execute the default SIGINT handler which throws a KeyboardInterrupt + # in the main thread of the process. + signal.default_int_handler(signum, frame) + signal.signal(signal.SIGINT, handle_sigint) + task.Run() + return task.return_value, task.traceback + + +class _BackgroundProcessTaskManager(_BackgroundTaskManager): + """Manages states for background tasks started in child processes. + + TODO(skschneider): This class uses futures.ProcessPoolExecutor. We have been + using this executor since before issues regarding KeyboardInterrupt were + fully explored. The only consumer of this class is RunParallelProcesses, and + currently the uses for RunParallelProcesses are limited. In the future, this + class should also be redesigned for protection against KeyboardInterrupt. + """ + + def __init__(self, *args, **kwargs): + super(_BackgroundProcessTaskManager, self).__init__(*args, **kwargs) + self._active_futures = {} + self._executor = futures.ProcessPoolExecutor(self._max_concurrency) + + def __enter__(self): + self._executor.__enter__() + return self + + def __exit__(self, *args, **kwargs): + # Note: This invokes a non-interruptable wait. + return self._executor.__exit__(*args, **kwargs) + + def StartTask(self, target, args, kwargs, thread_context): + task = _BackgroundTask(target, args, kwargs, thread_context) + task_id = len(self.tasks) + self.tasks.append(task) + future = self._executor.submit(_ExecuteProcessTask, task) + self._active_futures[future] = task_id + + def AwaitAnyTask(self): + completed_tasks = None + while not completed_tasks: + completed_tasks, _ = futures.wait( + self._active_futures, timeout=_LONG_TIMEOUT, + return_when=futures.FIRST_COMPLETED) + future = completed_tasks.pop() + task_id = self._active_futures.pop(future) + task = self.tasks[task_id] + task.return_value, task.traceback = future.result() + return task_id + + def HandleKeyboardInterrupt(self): + # If this thread received an interrupt signal, then processes started with + # a ProcessPoolExecutor will also have received an interrupt without any + # extra work needed from this class. Only need to wait for child processes. + # Note: This invokes a non-interruptable wait. + self._executor.shutdown(wait=True) + + +def _RunParallelTasks(target_arg_tuples, max_concurrency, get_task_manager, + parallel_exception_class, post_task_delay=0): + """Executes function calls concurrently in separate threads or processes. + + Args: + target_arg_tuples: list of (target, args, kwargs) tuples. Each tuple + contains the function to call and the arguments to pass it. + max_concurrency: int or None. The maximum number of concurrent new + threads or processes. + get_task_manager: Callable that accepts an int max_concurrency arg and + returns a _TaskManager. + parallel_exception_class: Type of exception to raise upon an exception in + one of the called functions. + post_task_delay: Delay in seconds between parallel task invocations. + + Returns: + list of function return values in the order corresponding to the order of + target_arg_tuples. + + Raises: + parallel_exception_class: When an exception occurred in any of the called + functions. 
+ """ + thread_context = _BackgroundTaskThreadContext() + max_concurrency = min(max_concurrency, len(target_arg_tuples)) + error_strings = [] + started_task_count = 0 + active_task_count = 0 + with get_task_manager(max_concurrency) as task_manager: + try: + while started_task_count < len(target_arg_tuples) or active_task_count: + if (started_task_count < len(target_arg_tuples) and + active_task_count < max_concurrency): + # Start a new task. + target, args, kwargs = target_arg_tuples[started_task_count] + task_manager.StartTask(target, args, kwargs, thread_context) + started_task_count += 1 + active_task_count += 1 + if post_task_delay: + time.sleep(post_task_delay) + continue + + # Wait for a task to complete. + task_id = task_manager.AwaitAnyTask() + active_task_count -= 1 + # If the task failed, it may still be a long time until all remaining + # tasks complete. Log the failure immediately before continuing to wait + # for other tasks. + stacktrace = task_manager.tasks[task_id].traceback + if stacktrace: + msg = ('Exception occurred while calling {0}:{1}{2}'.format( + _GetCallString(target_arg_tuples[task_id]), os.linesep, + stacktrace)) + logging.error(msg) + error_strings.append(msg) + + except KeyboardInterrupt: + logging.error( + 'Received KeyboardInterrupt while executing parallel tasks. Waiting ' + 'for %s tasks to clean up.', active_task_count) + task_manager.HandleKeyboardInterrupt() + raise + + if error_strings: + # TODO(skschneider): Combine errors.VmUtil.ThreadException and + # errors.VmUtil.CalledProcessException so this can be a single exception + # type. + raise parallel_exception_class( + 'The following exceptions occurred during parallel execution:' + '{0}{1}'.format(os.linesep, os.linesep.join(error_strings))) + results = [task.return_value for task in task_manager.tasks] + assert len(target_arg_tuples) == len(results), (target_arg_tuples, results) + return results + + +def RunParallelThreads(target_arg_tuples, max_concurrency, post_task_delay=0): + """Executes function calls concurrently in separate threads. + + Args: + target_arg_tuples: list of (target, args, kwargs) tuples. Each tuple + contains the function to call and the arguments to pass it. + max_concurrency: int or None. The maximum number of concurrent new + threads. + post_task_delay: Delay in seconds between parallel task invocations. + + Returns: + list of function return values in the order corresponding to the order of + target_arg_tuples. + + Raises: + errors.VmUtil.ThreadException: When an exception occurred in any of the + called functions. + """ + return _RunParallelTasks( + target_arg_tuples, max_concurrency, _BackgroundThreadTaskManager, + errors.VmUtil.ThreadException, post_task_delay) + + +def RunThreaded(target, + thread_params, + max_concurrent_threads=None, + post_task_delay=0): + """Runs the target method in parallel threads. + + The method starts up threads with one arg from thread_params as the first arg. + + Args: + target: The method to invoke in the thread. + thread_params: A thread is launched for each value in the list. The items + in the list can either be a singleton or a (args, kwargs) tuple/list. + Usually this is a list of VMs. + max_concurrent_threads: The maximum number of concurrent threads to allow. + post_task_delay: Delay in seconds between commands. + + Returns: + List of the same length as thread_params. Contains the return value from + each threaded function call in the corresponding order as thread_params. + + Raises: + ValueError: when thread_params is not valid. 
+ errors.VmUtil.ThreadException: When an exception occurred in any of the + called functions. + + Example 1: # no args other than list. + args = [self.CreateVm() + for x in range(0, 10)] + RunThreaded(MyThreadedTargetMethod, args) + + Example 2: # using args only to pass to the thread: + args = [((self.CreateVm(), i, 'somestring'), {}) + for i in range(0, 10)] + RunThreaded(MyThreadedTargetMethod, args) + + Example 3: # using args & kwargs to pass to the thread: + args = [((self.CreateVm(),), {'num': i, 'name': 'somestring'}) + for i in range(0, 10)] + RunThreaded(MyThreadedTargetMethod, args) + """ + if max_concurrent_threads is None: + max_concurrent_threads = ( + FLAGS.max_concurrent_threads or MAX_CONCURRENT_THREADS) + + if not isinstance(thread_params, list): + raise ValueError('Param "thread_params" must be a list') + + if not thread_params: + # Nothing to do. + return [] + + if not isinstance(thread_params[0], tuple): + target_arg_tuples = [(target, (arg,), {}) for arg in thread_params] + elif (not isinstance(thread_params[0][0], tuple) or + not isinstance(thread_params[0][1], dict)): + raise ValueError('If Param is a tuple, the tuple must be (tuple, dict)') + else: + target_arg_tuples = [(target, args, kwargs) + for args, kwargs in thread_params] + + return RunParallelThreads(target_arg_tuples, + max_concurrency=max_concurrent_threads, + post_task_delay=post_task_delay) + + +def RunParallelProcesses(target_arg_tuples, max_concurrency, + post_process_delay=0): + """Executes function calls concurrently in separate processes. + + Args: + target_arg_tuples: list of (target, args, kwargs) tuples. Each tuple + contains the function to call and the arguments to pass it. + max_concurrency: int or None. The maximum number of concurrent new + processes. If None, it will default to the number of processors on the + machine. + post_process_delay: Delay in seconds between parallel process invocations. + + Returns: + list of function return values in the order corresponding to the order of + target_arg_tuples. + + Raises: + errors.VmUtil.CalledProcessException: When an exception occurred in any + of the called functions. + """ + def handle_sigint(signum, frame): + # Ignore any SIGINTS in the parent process, but let users know + # that the child processes are getting cleaned up. + logging.error('Got SIGINT while executing parallel tasks. ' + 'Waiting for tasks to clean up.') + old_handler = None + try: + old_handler = signal.signal(signal.SIGINT, handle_sigint) + ret_val = _RunParallelTasks( + target_arg_tuples, max_concurrency, _BackgroundProcessTaskManager, + errors.VmUtil.CalledProcessException, + post_task_delay=post_process_delay) + finally: + if old_handler: + signal.signal(signal.SIGINT, old_handler) + return ret_val diff --git a/script/cumulus/pkb/perfkitbenchmarker/background_workload.py b/script/cumulus/pkb/perfkitbenchmarker/background_workload.py new file mode 100644 index 0000000..95eb803 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/background_workload.py @@ -0,0 +1,133 @@ +# Copyright 2016 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module containing classes for background workloads.""" + +from typing import List +from perfkitbenchmarker import os_types +from perfkitbenchmarker import vm_util +import six + +BACKGROUND_WORKLOADS: List['BaseBackgroundWorkload'] = [] + +BACKGROUND_IPERF_PORT = 20001 +BACKGROUND_IPERF_SECONDS = 2147483647 + + +class AutoRegisterBackgroundWorkloadMeta(type): + """Metaclass which allows BackgroundWorkloads to be auto-registered.""" + + def __init__(cls, name, bases, dct): + super(AutoRegisterBackgroundWorkloadMeta, cls).__init__(name, bases, dct) + BACKGROUND_WORKLOADS.append(cls) # pytype: disable=container-type-mismatch + + +class BaseBackgroundWorkload( + six.with_metaclass(AutoRegisterBackgroundWorkloadMeta, object)): + """Baseclass for background workloads.""" + + EXCLUDED_OS_TYPES = [] + + @staticmethod + def IsEnabled(vm): + """Returns true if this background workload is enabled on this VM.""" + del vm # Unused + return False + + @staticmethod + def Prepare(vm): + """Prepares the background workload on this VM.""" + pass + + @staticmethod + def Start(vm): + """Starts the background workload on this VM.""" + pass + + @staticmethod + def Stop(vm): + """Stops the background workload on this VM.""" + pass + + +class CpuWorkload(BaseBackgroundWorkload): + """Workload that runs sysbench in the background.""" + + EXCLUDED_OS_TYPES = os_types.WINDOWS_OS_TYPES + + @staticmethod + def IsEnabled(vm): + """Returns true if this background workload is enabled on this VM.""" + return bool(vm.background_cpu_threads) + + @staticmethod + def Prepare(vm): + """Prepares the background workload on this VM.""" + vm.Install('sysbench') + + @staticmethod + def Start(vm): + """Starts the background workload on this VM.""" + vm.RemoteCommand( + 'nohup sysbench --num-threads=%s --test=cpu --cpu-max-prime=10000000 ' + 'run 1> /dev/null 2> /dev/null &' % vm.background_cpu_threads) + + @staticmethod + def Stop(vm): + """Stops the background workload on this VM.""" + vm.RemoteCommand('pkill -9 sysbench') + + +class NetworkWorkload(BaseBackgroundWorkload): + """Workload that runs iperf in the background.""" + + EXCLUDED_OS_TYPES = os_types.WINDOWS_OS_TYPES + + @staticmethod + def IsEnabled(vm): + """Returns true if this background workload is enabled on this VM.""" + return bool(vm.background_network_mbits_per_sec) + + @staticmethod + def Prepare(vm): + """Prepares the background workload on this VM.""" + vm.Install('iperf') + + @staticmethod + def Start(vm): + """Starts the background workload on this VM.""" + vm.AllowPort(BACKGROUND_IPERF_PORT) + vm.RemoteCommand('nohup iperf --server --port %s &> /dev/null &' % + BACKGROUND_IPERF_PORT) + stdout, _ = vm.RemoteCommand('pgrep iperf -n') + vm.server_pid = stdout.strip() + + if vm.background_network_ip_type == vm_util.IpAddressSubset.EXTERNAL: + ip_address = vm.ip_address + else: + ip_address = vm.internal_ip + iperf_cmd = ('nohup iperf --client %s --port %s --time %s -u -b %sM ' + '&> /dev/null &' % (ip_address, BACKGROUND_IPERF_PORT, + BACKGROUND_IPERF_SECONDS, + vm.background_network_mbits_per_sec)) + + vm.RemoteCommand(iperf_cmd) + stdout, _ = 
vm.RemoteCommand('pgrep iperf -n') + vm.client_pid = stdout.strip() + + @staticmethod + def Stop(vm): + """Stops the background workload on this VM.""" + vm.RemoteCommand('kill -9 ' + vm.client_pid) + vm.RemoteCommand('kill -9 ' + vm.server_pid) diff --git a/script/cumulus/pkb/perfkitbenchmarker/beam_benchmark_helper.py b/script/cumulus/pkb/perfkitbenchmarker/beam_benchmark_helper.py new file mode 100644 index 0000000..6b7b3ce --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/beam_benchmark_helper.py @@ -0,0 +1,326 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helper methods for Apache Beam benchmarks. + +This file contains methods which are common to all Beam benchmarks and +executions. +""" + +import fnmatch +import os + +from absl import flags +from perfkitbenchmarker import dpb_service +from perfkitbenchmarker import errors +from perfkitbenchmarker import vm_util + +BEAM_JAVA_SDK = 'java' +BEAM_PYTHON_SDK = 'python' + +flags.DEFINE_string('gradle_binary', None, + 'Set to use a different gradle binary than gradle wrapper ' + 'from the repository') +flags.DEFINE_string('beam_location', None, + 'Location of already checked out Beam codebase.') +flags.DEFINE_string('beam_it_module', None, + 'Gradle module containing integration test. Use full ' + 'module starting and separated by colon, like :sdk:python') +flags.DEFINE_boolean('beam_prebuilt', False, + 'Set this to indicate that the repo in beam_location ' + 'does not need to be rebuilt before being used') +flags.DEFINE_integer('beam_it_timeout', 600, 'Integration Test Timeout.') +flags.DEFINE_string('git_binary', 'git', 'Path to git binary.') +flags.DEFINE_string('beam_version', None, + 'Version of Beam to download. Use tag from Github ' + 'as value. If not specified, will use HEAD.') +flags.DEFINE_enum('beam_sdk', None, [BEAM_JAVA_SDK, BEAM_PYTHON_SDK], + 'Which BEAM SDK is used to build the benchmark pipeline.') +flags.DEFINE_string('beam_python_attr', 'IT', + 'Test decorator that is used in Beam Python to filter a ' + 'specific category.') +flags.DEFINE_string('beam_python_sdk_location', None, + 'Python SDK tar ball location. It is a required option to ' + 'run Python pipeline.') + +flags.DEFINE_string('beam_extra_properties', None, + 'Allows to specify list of key-value pairs that will be ' + 'forwarded to target mvn command as system properties') + +flags.DEFINE_string('beam_runner', 'dataflow', 'Defines runner which will be used in tests') +flags.DEFINE_string('beam_runner_option', None, + 'Overrides any pipeline options to specify the runner.') + +flags.DEFINE_string('beam_filesystem', None, + 'Defines filesystem which will be used in tests. 
' + 'If not specified it will use runner\'s local filesystem.') + +FLAGS = flags.FLAGS + +SUPPORTED_RUNNERS = [dpb_service.DATAFLOW] + +BEAM_REPO_LOCATION = 'https://github.com/apache/beam.git' + +DEFAULT_PYTHON_TAR_PATTERN = 'apache-beam-*.tar.gz' + + +def AddRunnerArgument(command, runner_name): + if runner_name is None or runner_name == 'direct': + command.append('-DintegrationTestRunner=direct') + + if runner_name == 'dataflow': + command.append('-DintegrationTestRunner=dataflow') + + +def AddRunnerPipelineOption(beam_pipeline_options, runner_name, + runner_option_override): + """Add runner to pipeline options.""" + runner_pipeline_option = '' + + if runner_name == 'dataflow': + runner_pipeline_option = ('"--runner=TestDataflowRunner"') + + if runner_name == 'direct': + runner_pipeline_option = ('"--runner=DirectRunner"') + + if runner_option_override: + runner_pipeline_option = '--runner=' + runner_option_override + + if len(runner_pipeline_option) > 0: + beam_pipeline_options.append(runner_pipeline_option) + + +def AddFilesystemArgument(command, filesystem_name): + if filesystem_name == 'hdfs': + command.append('-Dfilesystem=hdfs') + + +def AddExtraProperties(command, extra_properties): + if not extra_properties: + return + + if 'integrationTestPipelineOptions=' in extra_properties: + raise ValueError('integrationTestPipelineOptions must not be in ' + 'beam_extra_properties') + + extra_properties = extra_properties.rstrip(']').lstrip('[').split(',') + extra_properties = [p.rstrip('" ').lstrip('" ') for p in extra_properties] + for p in extra_properties: + command.append('-D{}'.format(p)) + + +def AddPythonAttributes(command, attributes): + if attributes: + command.append('-Dattr={}'.format(attributes)) + + +def AddTaskArgument(command, task_name, module): + if not task_name or not module: + raise ValueError('task_name and module should not be empty.') + command.append('{}:{}'.format(module, task_name)) + + +def InitializeBeamRepo(benchmark_spec): + """Ensures environment is prepared for running Beam benchmarks. + + In the absence of FLAGS.beam_location, initializes the beam source code base + by checking out the repository from github. Specific branch selection is + supported. + + Args: + benchmark_spec: The PKB spec for the benchmark to run. 
+ """ + if benchmark_spec.dpb_service.SERVICE_TYPE not in SUPPORTED_RUNNERS: + raise NotImplementedError('Unsupported Runner') + + vm_util.GenTempDir() + if FLAGS.beam_location is None: + git_clone_command = [FLAGS.git_binary, 'clone', BEAM_REPO_LOCATION] + if FLAGS.beam_version: + git_clone_command.append('--branch={}'.format(FLAGS.beam_version)) + git_clone_command.append('--single-branch') + + vm_util.IssueCommand(git_clone_command, cwd=vm_util.GetTempDir()) + + elif not os.path.exists(FLAGS.beam_location): + raise errors.Config.InvalidValue('Directory indicated by beam_location ' + 'does not exist: {}.'.format( + FLAGS.beam_location)) + + _PrebuildBeam() + + +def _PrebuildBeam(): + """Rebuild beam if it was not build earlier.""" + if not FLAGS.beam_prebuilt: + + gradle_prebuild_tasks = ['clean', 'assemble'] + gradle_prebuild_flags = ['--stacktrace', '--info'] + build_command = [_GetGradleCommand()] + build_command.extend(gradle_prebuild_flags) + + for task in gradle_prebuild_tasks: + AddTaskArgument(build_command, task, FLAGS.beam_it_module) + AddRunnerArgument(build_command, FLAGS.beam_runner) + AddFilesystemArgument(build_command, FLAGS.beam_filesystem) + AddExtraProperties(build_command, FLAGS.beam_extra_properties) + + vm_util.IssueCommand(build_command, timeout=1500, cwd=_GetBeamDir()) + + +def BuildBeamCommand(benchmark_spec, classname, job_arguments): + """Constructs a Beam execution command for the benchmark. + + Args: + benchmark_spec: The PKB spec for the benchmark to run. + classname: The classname of the class to run. + job_arguments: The additional job arguments provided for the run. + + Returns: + cmd: Array containing the built command. + beam_dir: The directory in which to run the command. + """ + if benchmark_spec.service_type not in SUPPORTED_RUNNERS: + raise NotImplementedError('Unsupported Runner') + + base_dir = _GetBeamDir() + + if FLAGS.beam_sdk == BEAM_JAVA_SDK: + cmd = _BuildGradleCommand(classname, job_arguments) + elif FLAGS.beam_sdk == BEAM_PYTHON_SDK: + cmd = _BuildPythonCommand(benchmark_spec, classname, job_arguments) + else: + raise NotImplementedError('Unsupported Beam SDK: %s.' % FLAGS.beam_sdk) + + return cmd, base_dir + + +def _BuildGradleCommand(classname, job_arguments): + """Constructs a Gradle command for the benchmark. + + Args: + classname: The classname of the class to run. + job_arguments: The additional job arguments provided for the run. + + Returns: + cmd: Array containing the built command. + """ + cmd = [] + + gradle_executable = _GetGradleCommand() + + if not vm_util.ExecutableOnPath(gradle_executable): + raise errors.Setup.MissingExecutableError( + 'Could not find required executable "%s"' % gradle_executable) + + cmd.append(gradle_executable) + AddTaskArgument(cmd, 'integrationTest', FLAGS.beam_it_module) + cmd.append('--tests={}'.format(classname)) + + beam_args = job_arguments if job_arguments else [] + + AddRunnerArgument(cmd, FLAGS.beam_runner) + AddRunnerPipelineOption(beam_args, FLAGS.beam_runner, + FLAGS.beam_runner_option) + AddFilesystemArgument(cmd, FLAGS.beam_filesystem) + AddExtraProperties(cmd, FLAGS.beam_extra_properties) + + cmd.append('-DintegrationTestPipelineOptions=' + '[{}]'.format(','.join(beam_args))) + + cmd.append('--stacktrace') + cmd.append('--info') + cmd.append('--scan') + + return cmd + + +def _BuildPythonCommand(benchmark_spec, classname, job_arguments): + """Constructs Gradle command for Python benchmark. + + Python integration tests can be invoked from Gradle task + `integrationTest`. 
How Python Gradle command constructed + is different from Java. We can use following system properties + in commandline: + + -Dtests: fully qualified class/module name of the test to run. + e.g. apache_beam.examples.wordcount_it_test:WordCountIT + -Dattr: a set of tests that are annotated by this attribute tag. + -DpipelineOptions: a set of pipeline options needed to run Beam job + + Args: + benchmark_spec: The PKB spec for the benchmark to run. + classname: The fully qualified class/module name of the test to run. + job_arguments: The additional job arguments provided for the run. + + Returns: + cmd: Array holds the execution command. + """ + + cmd = [] + + gradle_executable = _GetGradleCommand() + + if not vm_util.ExecutableOnPath(gradle_executable): + raise errors.Setup.MissingExecutableError( + 'Could not find required executable "%s"' % gradle_executable) + + cmd.append(gradle_executable) + AddTaskArgument(cmd, 'integrationTest', FLAGS.beam_it_module) + cmd.append('-Dtests={}'.format(classname)) + AddPythonAttributes(cmd, FLAGS.beam_python_attr) + + beam_args = job_arguments if job_arguments else [] + if benchmark_spec.service_type == dpb_service.DATAFLOW: + beam_args.append('"--runner={}"'.format(FLAGS.beam_runner)) + + sdk_location = FLAGS.beam_python_sdk_location + if not sdk_location: + tar_list = _FindFiles(_GetBeamPythonDir(), DEFAULT_PYTHON_TAR_PATTERN) + if not tar_list: + raise RuntimeError('No python sdk tar file is available.') + else: + sdk_location = tar_list[0] + beam_args.append('"--sdk_location={}"'.format(sdk_location)) + cmd.append('-DpipelineOptions={}'.format(' '.join(beam_args))) + + cmd.append('--info') + cmd.append('--scan') + + return cmd + + +def _GetGradleCommand(): + return FLAGS.gradle_binary or os.path.join(_GetBeamDir(), 'gradlew') + + +def _GetBeamDir(): + # TODO: This is temporary, find a better way. + return FLAGS.beam_location or os.path.join(vm_util.GetTempDir(), 'beam') + + +def _GetBeamPythonDir(): + return os.path.join(_GetBeamDir(), 'sdks/python') + + +def _FindFiles(base_path, pattern): + if not os.path.exists(base_path): + raise RuntimeError('No such directory: %s' % base_path) + + results = [] + for root, _, files in os.walk(base_path): + for f in files: + if fnmatch.fnmatch(f, pattern): + results.append(os.path.join(root, f)) + return results diff --git a/script/cumulus/pkb/perfkitbenchmarker/beam_pipeline_options.py b/script/cumulus/pkb/perfkitbenchmarker/beam_pipeline_options.py new file mode 100644 index 0000000..8ac754e --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/beam_pipeline_options.py @@ -0,0 +1,153 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
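+
+# A hypothetical --beam_options_config_file layout, sketched only from the
+# parsing code in this module (ReadPipelineOptionConfigFile,
+# GetStaticPipelineOptions, EvaluateDynamicPipelineOptions); the concrete
+# option names and values below are illustrative assumptions, not part of any
+# shipped configuration:
+#
+#   static_pipeline_options:
+#     - tempRoot: gs://my-bucket/tmp
+#   dynamic_pipeline_options:
+#     - name: kafkaBootstrapServer
+#       type: NodePortIp
+#       podLabel: app=kafka
+#       format: "{{NodePortIp}}:9092"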
+ +import logging +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import kubernetes_helper +import yaml + +FLAGS = flags.FLAGS + + +def GetStaticPipelineOptions(options_list): + """ + Takes the dictionary loaded from the yaml configuration file and returns it + in a form consistent with the others in GenerateAllPipelineOptions: a list of + (pipeline_option_name, pipeline_option_value) tuples. + + The options in the options_list are a dict: + Key is the name of the pipeline option to pass to beam + Value is the value of the pipeline option to pass to beam + """ + options = [] + for option in options_list: + if len(list(option.keys())) != 1: + raise Exception('Each item in static_pipeline_options should only have' + ' 1 key/value') + option_kv = list(option.items())[0] + options.append((option_kv[0], option_kv[1])) + return options + + +def EvaluateDynamicPipelineOptions(dynamic_options): + """ + Takes the user's dynamic args and retrieves the information to fill them in. + + dynamic_args is a python map of argument name -> {type, kubernetesSelector, *format} + returns a list of tuples containing (argName, argValue) + + if optional format it passed, argValue is equal to format with "{{type}}" being replaced with actual value. + """ + filledOptions = [] + for optionDescriptor in dynamic_options: + fillType = optionDescriptor['type'] + optionName = optionDescriptor['name'] + valueFormat = optionDescriptor.get('format') + + if not fillType: + raise errors.Config.InvalidValue( + 'For dynamic arguments, you must provide a "type"') + + if fillType == 'NodePortIp': + argValue = RetrieveNodePortIp(optionDescriptor) + elif fillType == 'LoadBalancerIp': + argValue = RetrieveLoadBalancerIp(optionDescriptor) + elif fillType == 'TestValue': + argValue = optionDescriptor['value'] + else: + raise errors.Config.InvalidValue( + 'Unknown dynamic argument type: %s' % (fillType)) + + if valueFormat: + argValue = valueFormat.replace("{{" + fillType + "}}", argValue) + filledOptions.append((optionName, argValue)) + + return filledOptions + + +def GenerateAllPipelineOptions(it_args, it_options, static_pipeline_options, + dynamic_pipeline_options): + """ + :param it_args: options list passed in via FLAGS.beam_it_args + :param it_options: options list passed in via FLAGS.beam_it_options + :param static_pipeline_options: options list loaded from the yaml config file + :param dynamic_pipeline_options: options list loaded from the yaml config file + :return: a list of values of the form "\"--option_name=value\"" + """ + # beam_it_options are in [--option=value,--option2=val2] form + user_option_list = [] + if it_options is not None and len(it_options) > 0: + user_option_list = it_options.rstrip(']').lstrip('[').split(',') + user_option_list = [option.rstrip('" ').lstrip('" ') + for option in user_option_list] + + # Add static options from the benchmark_spec + benchmark_spec_option_list = ( + EvaluateDynamicPipelineOptions(dynamic_pipeline_options)) + benchmark_spec_option_list.extend( + GetStaticPipelineOptions(static_pipeline_options)) + option_list = ['--{}={}'.format(t[0], t[1]) + for t in benchmark_spec_option_list] + + # beam_it_args is the old way of passing parameters + args_list = [] + if it_args is not None and len(it_args) > 0: + args_list = it_args.split(',') + + return ['"{}"'.format(arg) + for arg in args_list + user_option_list + option_list] + + +def ReadPipelineOptionConfigFile(): + """ + Reads the path to the config file from FLAGS, then loads the static and + 
dynamic pipeline options from it.
+  """
+  dynamic_pipeline_options = []
+  static_pipeline_options = []
+  if FLAGS.beam_options_config_file:
+    with open(FLAGS.beam_options_config_file, 'r') as fileStream:
+      config = yaml.safe_load(fileStream)
+      if config['static_pipeline_options']:
+        static_pipeline_options = config['static_pipeline_options']
+      if config['dynamic_pipeline_options']:
+        dynamic_pipeline_options = config['dynamic_pipeline_options']
+  return static_pipeline_options, dynamic_pipeline_options
+
+
+def RetrieveNodePortIp(argDescriptor):
+  jsonSelector = argDescriptor['podLabel']
+  if not jsonSelector:
+    raise errors.Config.InvalidValue('For NodePortIp arguments, you must'
+                                     ' provide a "podLabel"')
+  ip = kubernetes_helper.GetWithWaitForContents(
+      'pods', '', jsonSelector, '.items[0].status.podIP')
+  if len(ip) == 0:
+    raise errors.Error('Could not retrieve NodePort IP address')
+  logging.info("Using NodePort IP Address: " + ip)
+  return ip
+
+
+def RetrieveLoadBalancerIp(argDescriptor):
+  serviceName = argDescriptor['serviceName']
+  if not serviceName:
+    raise errors.Config.InvalidValue('For LoadBalancerIp arguments, you must'
+                                     ' provide a "serviceName"')
+  ip = kubernetes_helper.GetWithWaitForContents(
+      'svc', serviceName, '', '.status.loadBalancer.ingress[0].ip')
+  if len(ip) == 0:
+    raise errors.Error('Could not retrieve LoadBalancer IP address')
+  logging.info("Using LoadBalancer IP Address: " + ip)
+  return ip
diff --git a/script/cumulus/pkb/perfkitbenchmarker/benchmark_lookup.py b/script/cumulus/pkb/perfkitbenchmarker/benchmark_lookup.py
new file mode 100644
index 0000000..10335f8
--- /dev/null
+++ b/script/cumulus/pkb/perfkitbenchmarker/benchmark_lookup.py
@@ -0,0 +1,48 @@
+# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Function to lookup modules from benchmark names.
+
+BenchmarkModule: Returns a benchmark module given its name.
+
+This module works around a circular import issue where we cannot import
+benchmark_sets.py directly into virtual_machine.py. After SetUpPKB is called,
+benchmark_lookup.BenchmarkModule is equivalent to
+benchmark_sets.BenchmarkModule.
+"""
+
+from perfkitbenchmarker import errors
+
+_global_benchmark_module_function = None
+
+
+def SetBenchmarkModuleFunction(function):
+  """Sets the function called by BenchmarkModule; See benchmark_sets.py."""
+  global _global_benchmark_module_function
+  _global_benchmark_module_function = function
+
+
+def BenchmarkModule(benchmark_name):
+  """Finds the module for a benchmark by name.
+
+  Args:
+    benchmark_name: The name of the benchmark.
+
+  Returns:
+    The benchmark's module, or None if the benchmark is invalid.
+ """ + if not _global_benchmark_module_function: + raise errors.Setup.InvalidSetupError( + 'Cannot call benchmark_lookup.py; Was SetUpPKB called?') + return _global_benchmark_module_function(benchmark_name) diff --git a/script/cumulus/pkb/perfkitbenchmarker/benchmark_sets.py b/script/cumulus/pkb/perfkitbenchmarker/benchmark_sets.py new file mode 100644 index 0000000..5d7f427 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/benchmark_sets.py @@ -0,0 +1,477 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Benchmark set specific functions and definitions.""" + + +import collections +import copy +import itertools + +from absl import flags +from perfkitbenchmarker import configs +from perfkitbenchmarker import linux_benchmarks +from perfkitbenchmarker import linux_packages +from perfkitbenchmarker import os_types +from perfkitbenchmarker import windows_benchmarks +from perfkitbenchmarker import windows_packages +import six +from six.moves import zip + +FLAGS = flags.FLAGS + +flags.DEFINE_string('flag_matrix', None, + 'The name of the flag matrix to run.') +flags.DEFINE_string('flag_zip', None, + 'The name of the flag zip to run.') +flags.DEFINE_integer('num_benchmark_copies', 1, + 'The number of copies of each benchmark config to run.') + +MESSAGE = 'message' +BENCHMARK_LIST = 'benchmark_list' +STANDARD_SET = 'standard_set' + +BENCHMARK_SETS = { + STANDARD_SET: { + MESSAGE: ('The standard_set is a community agreed upon set of ' + 'benchmarks to measure Cloud performance.'), + BENCHMARK_LIST: [ + 'aerospike', + 'block_storage_workload', + 'cassandra_stress', + 'cluster_boot', + 'copy_throughput', + 'coremark', + 'fio', + 'hadoop_terasort', + 'hpcc', + 'iperf', + 'mesh_network', + 'mongodb_ycsb', + 'netperf', + 'object_storage_service', + 'ping', + 'redis_memtier', + 'speccpu2006', + 'sysbench', + 'unixbench', + ] + }, + 'arm_set': { + MESSAGE: 'ARM benchmark set.', + BENCHMARK_LIST: [STANDARD_SET] + }, + 'alicloud_set': { + MESSAGE: 'AliCloud benchmark set.', + BENCHMARK_LIST: [STANDARD_SET] + }, + 'broadcom_set': { + MESSAGE: 'Broadcom benchmark set.', + BENCHMARK_LIST: [STANDARD_SET] + }, + 'canonical_set': { + MESSAGE: 'Canonical benchmark set.', + BENCHMARK_LIST: [STANDARD_SET] + }, + 'centurylinkcloud_set': { + MESSAGE: + 'This benchmark set is supported on CenturyLink Cloud.', + BENCHMARK_LIST: [ + 'cassandra_stress', + 'copy_throughput', + 'hpcc', + 'iperf', + 'mesh_network', + 'mongodb_ycsb', + 'ping', + 'redis_memtier', + 'sysbench', + 'unixbench', + ] + }, + 'cisco_set': { + MESSAGE: 'Cisco benchmark set.', + BENCHMARK_LIST: [STANDARD_SET] + }, + 'cloudharmony_set': { + MESSAGE: 'CloudHarmony benchmark set.', + BENCHMARK_LIST: [ + 'speccpu2006', + 'unixbench', + ] + }, + 'cloudspectator_set': { + MESSAGE: 'CloudSpectator benchmark set.', + BENCHMARK_LIST: [STANDARD_SET] + }, + 'google_set': { + MESSAGE: ('This benchmark set is maintained by Google Cloud Platform ' + 'Performance Team.'), + BENCHMARK_LIST: 
[ + 'aerospike_ycsb', + 'bidirectional_network', + 'block_storage_workload', + 'cassandra_stress', + 'cassandra_ycsb', + 'cluster_boot', + 'copy_throughput', + 'fio', + 'gpu_pcie_bandwidth', + 'hadoop_terasort', + 'horovod', + 'hpcc', + 'hpcg', + 'inception3', + 'iperf', + 'mesh_network', + 'mlperf', + 'mlperf_multiworkers', + 'mnist', + 'mongodb_ycsb', + 'multichase', + 'mxnet', + 'netperf', + 'object_storage_service', + 'oldisim', + 'pgbench', + 'ping', + 'redis_ycsb', + 'resnet', + 'stencil2d', + 'speccpu2006', + 'sysbench', + 'tensorflow', + 'tensorflow_serving', + 'tomcat_wrk', + 'unixbench', + ] + }, + 'intel_set': { + MESSAGE: + 'Intel benchmark set.', + BENCHMARK_LIST: [ + 'fio', + 'iperf', + 'unixbench', + 'hpcc', + 'cluster_boot', + 'redis_memtier', + 'cassandra_stress', + 'object_storage_service', + 'sysbench', + ] + }, + 'kubernetes_set': { + MESSAGE: + 'Kubernetes benchmark set.', + BENCHMARK_LIST: [ + 'block_storage_workload', + 'cassandra_ycsb', + 'cassandra_stress', + 'cluster_boot', + 'fio', + 'iperf', + 'mesh_network', + 'mongodb_ycsb', + 'netperf', + 'redis_memtier', + 'sysbench', + ] + }, + 'mellanox_set': { + MESSAGE: 'Mellanox benchmark set.', + BENCHMARK_LIST: [STANDARD_SET] + }, + 'microsoft_set': { + MESSAGE: 'Microsoft benchmark set.', + BENCHMARK_LIST: [STANDARD_SET] + }, + 'qualcomm_technologies_set': { + MESSAGE: 'Qualcomm Technologies, Inc. benchmark set.', + BENCHMARK_LIST: [STANDARD_SET] + }, + 'rackspace_set': { + MESSAGE: + 'Rackspace benchmark set.', + BENCHMARK_LIST: [ + 'aerospike', + 'block_storage_workload', + 'cassandra_stress', + 'cluster_boot', + 'copy_throughput', + 'fio', + 'hpcc', + 'iperf', + 'mesh_network', + 'mongodb_ycsb', + 'netperf', + 'oldisim', + 'ping', + 'redis_memtier', + 'silo', + 'sysbench', + 'unixbench', + ] + }, + 'red_hat_set': { + MESSAGE: 'Red Hat benchmark set.', + BENCHMARK_LIST: [STANDARD_SET] + }, + 'tradeworx_set': { + MESSAGE: 'Tradeworx Inc. benchmark set.', + BENCHMARK_LIST: [STANDARD_SET] + }, + 'thesys_technologies_set': { + MESSAGE: 'Thesys Technologies LLC. benchmark set.', + BENCHMARK_LIST: [STANDARD_SET] + }, + 'stanford_set': { + MESSAGE: 'Stanford University benchmark set.', + BENCHMARK_LIST: [STANDARD_SET, 'oldisim'] + }, + 'mit_set': { + MESSAGE: 'Massachusetts Institute of Technology benchmark set.', + BENCHMARK_LIST: [STANDARD_SET, 'silo'] + }, + 'cloudsuite_set': { + MESSAGE: + 'CloudSuite benchmark set.', + BENCHMARK_LIST: [ + 'cloudsuite_data_analytics', + 'cloudsuite_data_caching', + 'cloudsuite_graph_analytics', + 'cloudsuite_in_memory_analytics', + 'cloudsuite_media_streaming', + 'cloudsuite_web_search', + 'cloudsuite_web_serving', + ] + } +} + + +class FlagMatrixNotFoundException(Exception): + pass + + +class FlagZipNotFoundException(Exception): + pass + + +def _GetValidBenchmarks(): + """Returns a dict mapping valid benchmark names to their modules.""" + if FLAGS.os_type in os_types.CONTAINER_OS_TYPES: + return {'cluster_boot': linux_benchmarks.VALID_BENCHMARKS['cluster_boot']} + elif FLAGS.os_type in os_types.WINDOWS_OS_TYPES: + return windows_benchmarks.VALID_BENCHMARKS + return linux_benchmarks.VALID_BENCHMARKS + + +def _GetValidPackages(): + """Returns a dict mapping valid package names to their modules.""" + if FLAGS.os_type in os_types.CONTAINER_OS_TYPES: + return {} + elif FLAGS.os_type in os_types.WINDOWS_OS_TYPES: + return windows_packages.PACKAGES + return linux_packages.PACKAGES + + +def BenchmarkModule(benchmark_name): + """Finds the module for a benchmark by name. 
+ + Args: + benchmark_name: The name of the benchmark. + + Returns: + The benchmark's module, or None if the benchmark is invalid. + """ + valid_benchmarks = _GetValidBenchmarks() + return valid_benchmarks.get(benchmark_name) + + +def PackageModule(package_name): + """Finds the module for a package by name. + + Args: + package_name: The name of the package. + + Returns: + The package's module, or None if the package_name is invalid. + """ + packages = _GetValidPackages() + return packages.get(package_name) + + +def _GetBenchmarksFromUserConfig(user_config): + """Returns a list of benchmark module, config tuples.""" + benchmarks = user_config.get('benchmarks', []) + valid_benchmarks = _GetValidBenchmarks() + benchmark_config_list = [] + + for entry in benchmarks: + name, user_config = entry.popitem() + try: + benchmark_module = valid_benchmarks[name] + except KeyError: + raise ValueError('Benchmark "%s" not valid on os_type "%s"' % + (name, FLAGS.os_type)) + benchmark_config_list.append((benchmark_module, user_config)) + + return benchmark_config_list + + +def _GetConfigForAxis(benchmark_config, flag_config): + config = copy.copy(benchmark_config) + config_local_flags = config.get('flags', {}) + config['flags'] = copy.deepcopy(configs.GetConfigFlags()) + config['flags'].update(config_local_flags) + for setting in flag_config: + config['flags'].update(setting) + return config + + +def _AssertZipAxesHaveSameLength(axes): + expected_length = len(axes[0]) + for axis in axes[1:]: + if len(axis) != expected_length: + raise ValueError('flag_zip axes must all be the same length') + + +def _AssertFlagMatrixAndZipDefsExist(benchmark_config, + flag_matrix_name, + flag_zip_name): + """Asserts that specified flag_matrix and flag_zip exist. + + Both flag_matrix_name and flag_zip_name can be None, meaning that the user + (or the benchmark_config) did not specify them. + + Args: + benchmark_config: benchmark_config + flag_matrix_name: name of the flag_matrix_def specified by the user via a + flag, specified in the benchmark_config, or None. + flag_zip_name: name of the flag_zip_def specified by the user via a flag, + specified in the benchmark_config, or None. + + Raises: + FlagMatrixNotFoundException: if flag_matrix_name is not None, and is not + found in the flag_matrix_defs section of the benchmark_config. + FlagZipNotFoundException: if flag_zip_name is not None, and is not + found in the flag_zip_defs section of the benchmark_config. + """ + if (flag_matrix_name and + flag_matrix_name not in + benchmark_config.get('flag_matrix_defs', {})): + raise FlagMatrixNotFoundException('No flag_matrix with name {0}' + .format(flag_matrix_name)) + if (flag_zip_name and + flag_zip_name not in + benchmark_config.get('flag_zip_defs', {})): + raise FlagZipNotFoundException('No flag_zip with name {0}' + .format(flag_zip_name)) + + +def GetBenchmarksFromFlags(): + """Returns a list of benchmarks to run based on the benchmarks flag. + + If no benchmarks (or sets) are specified, this will return the standard set. + If multiple sets or mixes of sets and benchmarks are specified, this will + return the union of all sets and individual benchmarks. 
+ + Raises: + ValueError: when benchmark_name is not valid for os_type supplied + """ + user_config = configs.GetUserConfig() + benchmark_config_list = _GetBenchmarksFromUserConfig(user_config) + if benchmark_config_list and not FLAGS['benchmarks'].present: + return benchmark_config_list + + benchmark_queue = collections.deque(FLAGS.benchmarks) + benchmark_names = [] + benchmark_set = set() + + while benchmark_queue: + benchmark = benchmark_queue.popleft() + if benchmark in benchmark_set: + continue + benchmark_set.add(benchmark) + if benchmark in BENCHMARK_SETS: + benchmark_queue.extendleft(BENCHMARK_SETS[benchmark][BENCHMARK_LIST]) + else: + benchmark_names.append(benchmark) + + valid_benchmarks = _GetValidBenchmarks() + + # create a list of module, config tuples to return + benchmark_config_list = [] + for benchmark_name in benchmark_names: + benchmark_config = user_config.get(benchmark_name, {}) + benchmark_name = benchmark_config.get('name', benchmark_name) + benchmark_module = valid_benchmarks.get(benchmark_name) + + if benchmark_module is None: + raise ValueError('Benchmark "%s" not valid on os_type "%s"' % + (benchmark_name, FLAGS.os_type)) + + flag_matrix_name = ( + FLAGS.flag_matrix or benchmark_config.get('flag_matrix', None) + ) + flag_zip_name = ( + FLAGS.flag_zip or benchmark_config.get('flag_zip', None) + ) + _AssertFlagMatrixAndZipDefsExist(benchmark_config, + flag_matrix_name, + flag_zip_name) + + # We need to remove the 'flag_matrix', 'flag_matrix_defs', 'flag_zip', + # 'flag_zip_defs', and 'flag_matrix_filters' keys from the config + # dictionary since they aren't actually part of the config spec and will + # cause errors if they are left in. + benchmark_config.pop('flag_matrix', None) + benchmark_config.pop('flag_zip', None) + + flag_matrix = benchmark_config.pop( + 'flag_matrix_defs', {}).get(flag_matrix_name, {}) + flag_matrix_filter = benchmark_config.pop( + 'flag_matrix_filters', {}).get(flag_matrix_name, {}) + flag_zip = benchmark_config.pop( + 'flag_zip_defs', {}).get(flag_zip_name, {}) + + zipped_axes = [] + crossed_axes = [] + if flag_zip: + flag_axes = [] + for flag, values in six.iteritems(flag_zip): + flag_axes.append([{flag: v} for v in values]) + + _AssertZipAxesHaveSameLength(flag_axes) + + for flag_config in zip(*flag_axes): + config = _GetConfigForAxis(benchmark_config, flag_config) + zipped_axes.append((benchmark_module, config)) + + crossed_axes.append([benchmark_tuple[1]['flags'] for + benchmark_tuple in zipped_axes]) + + for flag, values in sorted(six.iteritems(flag_matrix)): + crossed_axes.append([{flag: v} for v in values]) + + for flag_config in itertools.product(*crossed_axes): + config = _GetConfigForAxis(benchmark_config, flag_config) + if (flag_matrix_filter and not eval( + flag_matrix_filter, {}, config['flags'])): + continue + + benchmark_config_list.extend([(benchmark_module, config)] * + FLAGS.num_benchmark_copies) + + return benchmark_config_list diff --git a/script/cumulus/pkb/perfkitbenchmarker/benchmark_spec.py b/script/cumulus/pkb/perfkitbenchmarker/benchmark_spec.py new file mode 100644 index 0000000..7f903cb --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/benchmark_spec.py @@ -0,0 +1,1065 @@ +# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Container for all data required for a benchmark to run.""" + + +import contextlib +import copy +import datetime +import importlib +import logging +import os +import pickle +import threading +import uuid + +from absl import flags +from perfkitbenchmarker import benchmark_status +from perfkitbenchmarker import capacity_reservation +from perfkitbenchmarker import cloud_tpu +from perfkitbenchmarker import container_service +from perfkitbenchmarker import context +from perfkitbenchmarker import data_discovery_service +from perfkitbenchmarker import disk +from perfkitbenchmarker import dpb_service +from perfkitbenchmarker import edw_service +from perfkitbenchmarker import errors +from perfkitbenchmarker import flag_util +from perfkitbenchmarker import messaging_service +from perfkitbenchmarker import nfs_service +from perfkitbenchmarker import non_relational_db +from perfkitbenchmarker import os_types +from perfkitbenchmarker import placement_group +from perfkitbenchmarker import provider_info +from perfkitbenchmarker import providers +from perfkitbenchmarker import relational_db +from perfkitbenchmarker import smb_service +from perfkitbenchmarker import spark_service +from perfkitbenchmarker import stages +from perfkitbenchmarker import static_virtual_machine as static_vm +from perfkitbenchmarker import virtual_machine +from perfkitbenchmarker import vm_util +from perfkitbenchmarker import vpn_service +from perfkitbenchmarker.configs import freeze_restore_spec +from perfkitbenchmarker.providers.gcp import gcp_spanner +import six +from six.moves import range +import six.moves._thread +import six.moves.copyreg + + +def PickleLock(lock): + return UnPickleLock, (lock.locked(),) + + +def UnPickleLock(locked, *args): + lock = threading.Lock() + if locked: + if not lock.acquire(False): + raise pickle.UnpicklingError('Cannot acquire lock') + return lock + +six.moves.copyreg.pickle(six.moves._thread.LockType, PickleLock) + +SUPPORTED = 'strict' +NOT_EXCLUDED = 'permissive' +SKIP_CHECK = 'none' +# GCP labels only allow hyphens (-), underscores (_), lowercase characters, and +# numbers and International characters. +# metadata allow all characters and numbers. +METADATA_TIME_FORMAT = '%Y%m%dt%H%M%Sz' +FLAGS = flags.FLAGS + +flags.DEFINE_enum('cloud', providers.GCP, providers.VALID_CLOUDS, + 'Name of the cloud to use.') +flags.DEFINE_string('scratch_dir', None, + 'Base name for all scratch disk directories in the VM. ' + 'Upon creation, these directories will have numbers ' + 'appended to them (for example /scratch0, /scratch1, etc).') +flags.DEFINE_string('startup_script', None, + 'Script to run right after vm boot.') +flags.DEFINE_string('postrun_script', None, + 'Script to run right after run stage.') +flags.DEFINE_integer('create_and_boot_post_task_delay', None, + 'Delay in seconds to delay in between boot tasks.') +# pyformat: disable +flags.DEFINE_enum('benchmark_compatibility_checking', SUPPORTED, + [SUPPORTED, NOT_EXCLUDED, SKIP_CHECK], + 'Method used to check compatibility between the benchmark ' + ' and the cloud. 
' + SUPPORTED + ' runs the benchmark only' + ' if the cloud provider has declared it supported. ' + + NOT_EXCLUDED + ' runs the benchmark unless it has been' + ' declared not supported by the cloud provider. ' + SKIP_CHECK + + ' does not do the compatibility' + ' check.') +# pyformat: enable + + +class BenchmarkSpec(object): + """Contains the various data required to make a benchmark run.""" + + total_benchmarks = 0 + + def __init__(self, benchmark_module, benchmark_config, benchmark_uid): + """Initialize a BenchmarkSpec object. + + Args: + benchmark_module: The benchmark module object. + benchmark_config: BenchmarkConfigSpec. The configuration for the + benchmark. + benchmark_uid: An identifier unique to this run of the benchmark even + if the same benchmark is run multiple times with different configs. + """ + self.config = benchmark_config + self.control_traces = False + self.name = benchmark_module.BENCHMARK_NAME + self.uid = benchmark_uid + self.status = benchmark_status.SKIPPED + self.failed_substatus = None + self.status_detail = None + BenchmarkSpec.total_benchmarks += 1 + self.sequence_number = BenchmarkSpec.total_benchmarks + self.vms = [] + self.regional_networks = {} + self.networks = {} + self.custom_subnets = {k: { + 'cloud': v.cloud, + 'cidr': v.cidr} for (k, v) in self.config.vm_groups.items()} + self.firewalls = {} + self.networks_lock = threading.Lock() + self.firewalls_lock = threading.Lock() + self.vm_groups = {} + self.container_specs = benchmark_config.container_specs or {} + self.container_registry = None + self.deleted = False + self.uuid = '%s-%s' % (FLAGS.run_uri, uuid.uuid4()) + self.always_call_cleanup = True + self.spark_service = None + self.dpb_service = None + self.container_cluster = None + self.relational_db = None + self.non_relational_db = None + self.spanner = None + self.tpus = [] + self.tpu_groups = {} + self.edw_service = None + self.nfs_service = None + self.smb_service = None + self.messaging_service = None + self.data_discovery_service = None + self.app_groups = {} + self._zone_index = 0 + self.additional_private_addresses_count = 0 + self.capacity_reservations = [] + self.placement_group_specs = benchmark_config.placement_group_specs or {} + self.placement_groups = {} + self.vms_to_boot = ( + self.config.vm_groups if self.config.relational_db is None else + relational_db.VmsToBoot(self.config.relational_db.vm_groups)) + self.vpc_peering = self.config.vpc_peering + + self.vpn_service = None + self.vpns = {} # dict of vpn's + self.vpn_gateways = {} # dict of vpn gw's + self.vpn_gateways_lock = threading.Lock() + self.vpns_lock = threading.Lock() + + self.restore_spec = None + self.freeze_path = None + + # Intel collector contributions + self.workload_name = None + self.software_config_metadata = {} + self.tunable_parameters_metadata = {} + self.sut_vm_group = None + self.s3_archive_url = None + self.s3_reports = [] + # End Intel collector contributions + + # Modules can't be pickled, but functions can, so we store the functions + # necessary to run the benchmark. + self.BenchmarkPrepare = benchmark_module.Prepare + self.BenchmarkRun = benchmark_module.Run + self.BenchmarkCleanup = benchmark_module.Cleanup + + # Each benchmark may implement a GetUsage method if it has + # additional information to display + if hasattr(benchmark_module, 'GetUsage'): + self.BenchmarkGetUsage = benchmark_module.GetUsage + + # Set the current thread's BenchmarkSpec object to this one. 
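+    # (A guess at the companion accessor: other parts of PKB presumably read
+    # this spec back via something like context.GetThreadBenchmarkSpec();
+    # that name is an assumption here, see context.py for the actual getter.)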
+ context.SetThreadBenchmarkSpec(self) + + def __repr__(self): + return '%s(%r)' % (self.__class__, self.__dict__) + + def __str__(self): + return( + 'Benchmark name: {0}\nFlags: {1}' + .format(self.name, self.config.flags)) + + @contextlib.contextmanager + def RedirectGlobalFlags(self): + """Redirects flag reads and writes to the benchmark-specific flags object. + + Within the enclosed code block, reads and writes to the flags.FLAGS object + are redirected to a copy that has been merged with config-provided flag + overrides specific to this benchmark run. + """ + with self.config.RedirectFlags(FLAGS): + yield + + def _InitializeFromSpec( + self, attribute_name: str, + resource_spec: freeze_restore_spec.FreezeRestoreSpec) -> bool: + """Initializes the BenchmarkSpec attribute from the restore_spec. + + Args: + attribute_name: The attribute to restore. + resource_spec: The spec class corresponding to the resource to be + restored. + + Returns: + True if successful, False otherwise. + """ + if not hasattr(self, 'restore_spec'): + return False + if not self.restore_spec or not hasattr(self.restore_spec, attribute_name): + return False + if not resource_spec.enable_freeze_restore: + return False + logging.info('Getting %s instance from restore_spec', attribute_name) + frozen_resource = copy.copy(getattr(self.restore_spec, attribute_name)) + setattr(self, attribute_name, frozen_resource) + return True + + def ConstructContainerCluster(self): + """Create the container cluster.""" + if self.config.container_cluster is None: + return + cloud = self.config.container_cluster.cloud + cluster_type = self.config.container_cluster.type + providers.LoadProvider(cloud) + container_cluster_class = container_service.GetContainerClusterClass( + cloud, cluster_type) + self.container_cluster = container_cluster_class( + self.config.container_cluster) + + def ConstructContainerRegistry(self): + """Create the container registry.""" + if self.config.container_registry is None: + return + cloud = self.config.container_registry.cloud + providers.LoadProvider(cloud) + container_registry_class = container_service.GetContainerRegistryClass( + cloud) + self.container_registry = container_registry_class( + self.config.container_registry) + + def ConstructDpbService(self): + """Create the dpb_service object and create groups for its vms.""" + if self.config.dpb_service is None: + return + dpb_service_spec = self.config.dpb_service + dpb_service_cloud = dpb_service_spec.worker_group.cloud + dpb_service_spec.worker_group.vm_count = dpb_service_spec.worker_count + providers.LoadProvider(dpb_service_cloud) + + dpb_service_type = dpb_service_spec.service_type + dpb_service_class = dpb_service.GetDpbServiceClass(dpb_service_cloud, + dpb_service_type) + self.dpb_service = dpb_service_class(dpb_service_spec) + + # If the dpb service is un-managed, the provisioning needs to be handed + # over to the vm creation module. + if dpb_service_type in [ + dpb_service.UNMANAGED_DPB_SVC_YARN_CLUSTER, + dpb_service.UNMANAGED_SPARK_CLUSTER + ]: + # Ensure non cluster vms are not present in the spec. + if self.vms_to_boot: + raise Exception('Invalid Non cluster vm group {0} when benchmarking ' + 'unmanaged dpb service'.format(self.vms_to_boot)) + + base_vm_spec = dpb_service_spec.worker_group + base_vm_spec.vm_spec.zone = self.dpb_service.dpb_service_zone + + if dpb_service_spec.worker_count: + self.vms_to_boot['worker_group'] = dpb_service_spec.worker_group + # else we have a single node cluster. 
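+      # Rough illustration (an assumption, not from any config docs): with
+      # worker_count=2 the unmanaged cluster boots one master_group vm and
+      # two worker_group vms; with worker_count=0 only the single
+      # master_group vm created below is booted and serves as the whole
+      # cluster.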
+ + master_group_spec = copy.copy(base_vm_spec) + master_group_spec.vm_count = 1 + self.vms_to_boot['master_group'] = master_group_spec + + def ConstructRelationalDb(self): + """Create the relational db and create groups for its vms.""" + if self.config.relational_db is None: + return + cloud = self.config.relational_db.cloud + is_managed_db = self.config.relational_db.is_managed_db + engine = self.config.relational_db.engine + providers.LoadProvider(cloud) + relational_db_class = ( + relational_db.GetRelationalDbClass(cloud, is_managed_db, engine)) + self.relational_db = relational_db_class(self.config.relational_db) + + def ConstructNonRelationalDb(self) -> None: + """Initializes the non_relational db.""" + db_spec: non_relational_db.BaseNonRelationalDbSpec = self.config.non_relational_db + if not db_spec: + return + # Initialization from restore spec + if self._InitializeFromSpec('non_relational_db', db_spec): + return + # Initialization from benchmark config spec + logging.info('Constructing non_relational_db instance with spec: %s.', + db_spec) + service_type = db_spec.service_type + non_relational_db_class = non_relational_db.GetNonRelationalDbClass( + service_type) + self.non_relational_db = non_relational_db_class.FromSpec(db_spec) + + def ConstructSpanner(self) -> None: + """Initializes the spanner instance.""" + spanner_spec: gcp_spanner.SpannerSpec = self.config.spanner + if not spanner_spec: + return + # Initialization from restore spec + if self._InitializeFromSpec('spanner', spanner_spec): + return + # Initialization from benchmark config spec + logging.info('Constructing spanner instance with spec: %s.', spanner_spec) + spanner_class = gcp_spanner.GetSpannerClass(spanner_spec.service_type) + self.spanner = spanner_class.FromSpec(spanner_spec) + + def ConstructTpuGroup(self, group_spec): + """Constructs the BenchmarkSpec's cloud TPU objects.""" + if group_spec is None: + return + cloud = group_spec.cloud + providers.LoadProvider(cloud) + tpu_class = cloud_tpu.GetTpuClass(cloud) + return tpu_class(group_spec) + + def ConstructTpu(self): + """Constructs the BenchmarkSpec's cloud TPU objects.""" + tpu_group_specs = self.config.tpu_groups + + for group_name, group_spec in sorted(six.iteritems(tpu_group_specs)): + tpu = self.ConstructTpuGroup(group_spec) + + self.tpu_groups[group_name] = tpu + self.tpus.append(tpu) + + def ConstructEdwService(self): + """Create the edw_service object.""" + if self.config.edw_service is None: + return + # Load necessary modules from the provider to account for dependencies + # TODO(saksena): Replace with + # providers.LoadProvider(string.lower(FLAGS.cloud)) + providers.LoadProvider( + edw_service.TYPE_2_PROVIDER.get(self.config.edw_service.type)) + # Load the module for the edw service based on type + edw_service_type = self.config.edw_service.type + edw_service_module = importlib.import_module( + edw_service.TYPE_2_MODULE.get(edw_service_type)) + # The edw_service_type in certain cases may be qualified with a hosting + # cloud eg. snowflake_aws,snowflake_gcp, etc. + # However the edw_service_class_name in all cases will still be cloud + # agnostic eg. Snowflake. + edw_service_class_name = edw_service_type.split('_')[0] + edw_service_class = getattr( + edw_service_module, + edw_service_class_name[0].upper() + edw_service_class_name[1:]) + # Check if a new instance needs to be created or restored from snapshot + self.edw_service = edw_service_class(self.config.edw_service) + + def ConstructNfsService(self): + """Construct the NFS service object. 
+ + Creates an NFS Service only if an NFS disk is found in the disk_specs. + """ + if self.nfs_service: + logging.info('NFS service already created: %s', self.nfs_service) + return + for group_spec in self.vms_to_boot.values(): + if not group_spec.disk_spec or not group_spec.vm_count: + continue + disk_spec = group_spec.disk_spec + if disk_spec.disk_type != disk.NFS: + continue + # Choose which nfs_service to create. + if disk_spec.nfs_ip_address: + self.nfs_service = nfs_service.StaticNfsService(disk_spec) + elif disk_spec.nfs_managed: + cloud = group_spec.cloud + providers.LoadProvider(cloud) + nfs_class = nfs_service.GetNfsServiceClass(cloud) + self.nfs_service = nfs_class(disk_spec, group_spec.vm_spec.zone) + else: + self.nfs_service = nfs_service.UnmanagedNfsService(disk_spec, + self.vms[0]) + logging.debug('NFS service %s', self.nfs_service) + break + + def ConstructSmbService(self): + """Construct the SMB service object. + + Creates an SMB Service only if an SMB disk is found in the disk_specs. + """ + if self.smb_service: + logging.info('SMB service already created: %s', self.smb_service) + return + for group_spec in self.vms_to_boot.values(): + if not group_spec.disk_spec or not group_spec.vm_count: + continue + disk_spec = group_spec.disk_spec + if disk_spec.disk_type != disk.SMB: + continue + + cloud = group_spec.cloud + providers.LoadProvider(cloud) + smb_class = smb_service.GetSmbServiceClass(cloud) + self.smb_service = smb_class(disk_spec, group_spec.vm_spec.zone) + logging.debug('SMB service %s', self.smb_service) + break + + def ConstructVirtualMachineGroup(self, group_name, group_spec): + """Construct the virtual machine(s) needed for a group.""" + vms = [] + + vm_count = group_spec.vm_count + disk_count = group_spec.disk_count + + # First create the Static VM objects. + if group_spec.static_vms: + specs = [ + spec for spec in group_spec.static_vms + if (FLAGS.static_vm_tags is None or spec.tag in FLAGS.static_vm_tags) + ][:vm_count] + for vm_spec in specs: + static_vm_class = static_vm.GetStaticVmClass(vm_spec.os_type) + vms.append(static_vm_class(vm_spec)) + + os_type = group_spec.os_type + cloud = group_spec.cloud + + # This throws an exception if the benchmark is not + # supported. + self._CheckBenchmarkSupport(cloud) + + # Then create the remaining VM objects using VM and disk specs. + + if group_spec.disk_spec: + disk_spec = group_spec.disk_spec + # disk_spec.disk_type may contain legacy values that were + # copied from FLAGS.scratch_disk_type into + # FLAGS.data_disk_type at the beginning of the run. We + # translate them here, rather than earlier, because here is + # where we know what cloud we're using and therefore we're + # able to pick the right translation table. + disk_spec.disk_type = disk.WarnAndTranslateDiskTypes( + disk_spec.disk_type, cloud) + else: + disk_spec = None + + if group_spec.placement_group_name: + group_spec.vm_spec.placement_group = self.placement_groups[ + group_spec.placement_group_name] + + for _ in range(vm_count - len(vms)): + # Assign a zone to each VM sequentially from the --zones flag. 
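+      # Hypothetical walk-through: with --zones=us-east1-b,us-east1-c and
+      # three VMs in this group, the assignments below are us-east1-b,
+      # us-east1-c, then us-east1-b again (the index wraps around the
+      # combined zone list).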
+ if FLAGS.zones or FLAGS.extra_zones or FLAGS.zone: + zone_list = FLAGS.zones + FLAGS.extra_zones + FLAGS.zone + group_spec.vm_spec.zone = zone_list[self._zone_index] + self._zone_index = (self._zone_index + 1 + if self._zone_index < len(zone_list) - 1 else 0) + # you might need this info in certain providers, e.g Kubernetes + group_spec.vm_spec.group_name = group_name + if group_spec.cidr: # apply cidr range to all vms in vm_group + group_spec.vm_spec.cidr = group_spec.cidr + vm = self._CreateVirtualMachine(group_spec.vm_spec, os_type, cloud) + if disk_spec and not vm.is_static: + if disk_spec.disk_type == disk.LOCAL and disk_count is None: + disk_count = vm.max_local_disks + vm.disk_specs = [copy.copy(disk_spec) for _ in range(disk_count)] + # In the event that we need to create multiple disks from the same + # DiskSpec, we need to ensure that they have different mount points. + if (disk_count > 1 and disk_spec.mount_point): + for i, vm_disk_spec in enumerate(vm.disk_specs): + vm_disk_spec.mount_point += str(i) + vm.vm_group = group_name + vms.append(vm) + + return vms + + def ConstructCapacityReservations(self): + """Construct capacity reservations for each VM group.""" + if not FLAGS.use_capacity_reservations: + return + for vm_group in six.itervalues(self.vm_groups): + cloud = vm_group[0].CLOUD + providers.LoadProvider(cloud) + capacity_reservation_class = capacity_reservation.GetResourceClass( + cloud) + self.capacity_reservations.append( + capacity_reservation_class(vm_group)) + + def _CheckBenchmarkSupport(self, cloud): + """Throw an exception if the benchmark isn't supported.""" + + if FLAGS.benchmark_compatibility_checking == SKIP_CHECK: + return + + provider_info_class = provider_info.GetProviderInfoClass(cloud) + benchmark_ok = provider_info_class.IsBenchmarkSupported(self.name) + if FLAGS.benchmark_compatibility_checking == NOT_EXCLUDED: + if benchmark_ok is None: + benchmark_ok = True + + if not benchmark_ok: + raise ValueError('Provider {0} does not support {1}. Use ' + '--benchmark_compatibility_checking=none ' + 'to override this check.'.format( + provider_info_class.CLOUD, self.name)) + + def _ConstructJujuController(self, group_spec): + """Construct a VirtualMachine object for a Juju controller.""" + juju_spec = copy.copy(group_spec) + juju_spec.vm_count = 1 + jujuvms = self.ConstructVirtualMachineGroup('juju', juju_spec) + if len(jujuvms): + jujuvm = jujuvms.pop() + jujuvm.is_controller = True + return jujuvm + return None + + def ConstructVirtualMachines(self): + """Constructs the BenchmarkSpec's VirtualMachine objects.""" + + self.ConstructPlacementGroups() + + vm_group_specs = self.vms_to_boot + + clouds = {} + for group_name, group_spec in sorted(six.iteritems(vm_group_specs)): + vms = self.ConstructVirtualMachineGroup(group_name, group_spec) + + if group_spec.os_type == os_types.JUJU: + # The Juju VM needs to be created first, so that subsequent units can + # be properly added under its control. + if group_spec.cloud in clouds: + jujuvm = clouds[group_spec.cloud] + else: + jujuvm = self._ConstructJujuController(group_spec) + clouds[group_spec.cloud] = jujuvm + + for vm in vms: + vm.controller = clouds[group_spec.cloud] + + jujuvm.units.extend(vms) + if jujuvm and jujuvm not in self.vms: + self.vms.extend([jujuvm]) + self.vm_groups['%s_juju_controller' % group_spec.cloud] = [jujuvm] + + self.vm_groups[group_name] = vms + self.vms.extend(vms) + # If we have a spark service, it needs to access the master_group and + # the worker group. 
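+    # (Assumed shape for a typical two-group config: after the loop above,
+    # self.vm_groups might look like {'master_group': [vm_0], 'worker_group':
+    # [vm_1, vm_2]}, with self.vms holding the same VMs flattened; the group
+    # names here are only an example.)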
+ if (self.config.spark_service and + self.config.spark_service.service_type == spark_service.PKB_MANAGED): + for group_name in 'master_group', 'worker_group': + self.spark_service.vms[group_name] = self.vm_groups[group_name] + + # In the case of an un-managed yarn cluster, for hadoop software + # installation, the dpb service instance needs access to constructed + # master group and worker group. + if (self.config.dpb_service and self.config.dpb_service.service_type in [ + dpb_service.UNMANAGED_DPB_SVC_YARN_CLUSTER, + dpb_service.UNMANAGED_SPARK_CLUSTER]): + self.dpb_service.vms['master_group'] = self.vm_groups['master_group'] + if self.config.dpb_service.worker_count: + self.dpb_service.vms['worker_group'] = self.vm_groups['worker_group'] + else: # single node cluster + self.dpb_service.vms['worker_group'] = [] + + def ConstructPlacementGroups(self): + for placement_group_name, placement_group_spec in six.iteritems( + self.placement_group_specs): + self.placement_groups[placement_group_name] = self._CreatePlacementGroup( + placement_group_spec, placement_group_spec.CLOUD) + + def ConstructSparkService(self): + """Create the spark_service object and create groups for its vms.""" + if self.config.spark_service is None: + return + + spark_spec = self.config.spark_service + # Worker group is required, master group is optional + cloud = spark_spec.worker_group.cloud + if spark_spec.master_group: + cloud = spark_spec.master_group.cloud + providers.LoadProvider(cloud) + service_type = spark_spec.service_type + spark_service_class = spark_service.GetSparkServiceClass( + cloud, service_type) + self.spark_service = spark_service_class(spark_spec) + # If this is Pkb managed, the benchmark spec needs to adopt vms. + if service_type == spark_service.PKB_MANAGED: + for name, group_spec in [('master_group', spark_spec.master_group), + ('worker_group', spark_spec.worker_group)]: + if name in self.vms_to_boot: + raise Exception('Cannot have a vm group {0} with a {1} spark ' + 'service'.format(name, spark_service.PKB_MANAGED)) + self.vms_to_boot[name] = group_spec + + def ConstructVPNService(self): + """Create the VPNService object.""" + if self.config.vpn_service is None: + return + self.vpn_service = vpn_service.VPNService(self.config.vpn_service) + + def ConstructMessagingService(self): + """Create the messaging_service object. + + Assumes VMs are already constructed. 
+ """ + if self.config.messaging_service is None: + return + cloud = self.config.messaging_service.cloud + delivery = self.config.messaging_service.delivery + providers.LoadProvider(cloud) + messaging_service_class = messaging_service.GetMessagingServiceClass( + cloud, delivery + ) + self.messaging_service = messaging_service_class.FromSpec( + self.config.messaging_service) + self.messaging_service.setVms(self.vm_groups) + + def ConstructDataDiscoveryService(self): + """Create the data_discovery_service object.""" + if not self.config.data_discovery_service: + return + cloud = self.config.data_discovery_service.cloud + service_type = self.config.data_discovery_service.service_type + providers.LoadProvider(cloud) + data_discovery_service_class = ( + data_discovery_service.GetDataDiscoveryServiceClass(cloud, service_type) + ) + self.data_discovery_service = data_discovery_service_class.FromSpec( + self.config.data_discovery_service) + + def Prepare(self): + targets = [(vm.PrepareBackgroundWorkload, (), {}) for vm in self.vms] + vm_util.RunParallelThreads(targets, len(targets)) + + def Provision(self): + """Prepares the VMs and networks necessary for the benchmark to run.""" + should_restore = hasattr(self, 'restore_spec') and self.restore_spec + # Create capacity reservations if the cloud supports it. Note that the + # capacity reservation class may update the VMs themselves. This is true + # on AWS, because the VM needs to be aware of the capacity reservation id + # before its Create() method is called. Furthermore, if the user does not + # specify an AWS zone, but a region instead, the AwsCapacityReservation + # class will make a reservation in a zone that has sufficient capacity. + # In this case the VM's zone attribute, and the VMs network instance + # need to be updated as well. + if self.capacity_reservations: + vm_util.RunThreaded(lambda res: res.Create(), self.capacity_reservations) + + # Sort networks into a guaranteed order of creation based on dict key. + # There is a finite limit on the number of threads that are created to + # provision networks. Until support is added to provision resources in an + # order based on dependencies, this key ordering can be used to avoid + # deadlock by placing dependent networks later and their dependencies + # earlier. + networks = [ + self.networks[key] for key in sorted(six.iterkeys(self.networks)) + ] + + vm_util.RunThreaded(lambda net: net.Create(), networks) + + # VPC peering is currently only supported for connecting 2 VPC networks + if self.vpc_peering: + if len(networks) > 2: + raise errors.Error( + 'Networks of size %d are not currently supported.' 
% + (len(networks))) + # Ignore Peering for one network + elif len(networks) == 2: + networks[0].Peer(networks[1]) + + if self.container_registry: + self.container_registry.Create() + for container_spec in six.itervalues(self.container_specs): + if container_spec.static_image: + continue + container_spec.image = self.container_registry.GetOrBuild( + container_spec.image) + + if self.container_cluster: + self.container_cluster.Create() + + # do after network setup but before VM created + if self.nfs_service and self.nfs_service.CLOUD != nfs_service.UNMANAGED: + self.nfs_service.Create() + if self.smb_service: + self.smb_service.Create() + + for placement_group_object in self.placement_groups.values(): + placement_group_object.Create() + + if self.vms: + + # We separate out creating, booting, and preparing the VMs into two phases + # so that we don't slow down the creation of all the VMs by running + # commands on the VMs that booted. + vm_util.RunThreaded( + self.CreateAndBootVm, + self.vms, + post_task_delay=FLAGS.create_and_boot_post_task_delay) + if self.nfs_service and self.nfs_service.CLOUD == nfs_service.UNMANAGED: + self.nfs_service.Create() + vm_util.RunThreaded(self.PrepareVmAfterBoot, self.vms) + + sshable_vms = [ + vm for vm in self.vms if vm.OS_TYPE not in os_types.WINDOWS_OS_TYPES + ] + sshable_vm_groups = {} + for group_name, group_vms in six.iteritems(self.vm_groups): + sshable_vm_groups[group_name] = [ + vm for vm in group_vms + if vm.OS_TYPE not in os_types.WINDOWS_OS_TYPES + ] + vm_util.GenerateSSHConfig(sshable_vms, sshable_vm_groups) + if self.spark_service: + self.spark_service.Create() + if self.dpb_service: + self.dpb_service.Create() + if hasattr(self, 'relational_db') and self.relational_db: + self.relational_db.SetVms(self.vm_groups) + self.relational_db.Create() + if self.non_relational_db: + self.non_relational_db.Create(restore=should_restore) + if self.spanner: + self.spanner.Create(restore=should_restore) + if self.tpus: + vm_util.RunThreaded(lambda tpu: tpu.Create(), self.tpus) + if self.edw_service: + if (not self.edw_service.user_managed and + self.edw_service.SERVICE_TYPE == 'redshift'): + # The benchmark creates the Redshift cluster's subnet group in the + # already provisioned virtual private cloud (vpc). 
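+        # The loop below copies the subnet id of the provisioned AwsNetwork
+        # into the cluster subnet group before the EDW service's Create()
+        # is called.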
+ for network in networks: + if network.__class__.__name__ == 'AwsNetwork': + self.edw_service.cluster_subnet_group.subnet_id = network.subnet.id + self.edw_service.Create() + if self.vpn_service: + self.vpn_service.Create() + if hasattr(self, 'messaging_service') and self.messaging_service: + self.messaging_service.Create() + if self.data_discovery_service: + self.data_discovery_service.Create() + + def Delete(self): + if self.deleted: + return + + should_freeze = hasattr(self, 'freeze_path') and self.freeze_path + if should_freeze: + self.Freeze() + + if self.container_registry: + self.container_registry.Delete() + if self.spark_service: + self.spark_service.Delete() + if self.dpb_service: + self.dpb_service.Delete() + if hasattr(self, 'relational_db') and self.relational_db: + self.relational_db.Delete() + if hasattr(self, 'non_relational_db') and self.non_relational_db: + self.non_relational_db.Delete(freeze=should_freeze) + if hasattr(self, 'spanner') and self.spanner: + self.spanner.Delete(freeze=should_freeze) + if self.tpus: + vm_util.RunThreaded(lambda tpu: tpu.Delete(), self.tpus) + if self.edw_service: + self.edw_service.Delete() + if self.nfs_service: + self.nfs_service.Delete() + if self.smb_service: + self.smb_service.Delete() + if hasattr(self, 'messaging_service') and self.messaging_service: + self.messaging_service.Delete() + if hasattr(self, 'data_discovery_service') and self.data_discovery_service: + self.data_discovery_service.Delete() + + # Note: It is ok to delete capacity reservations before deleting the VMs, + # and will actually save money (mere seconds of usage). + if self.capacity_reservations: + try: + vm_util.RunThreaded(lambda reservation: reservation.Delete(), + self.capacity_reservations) + except Exception: # pylint: disable=broad-except + logging.exception('Got an exception deleting CapacityReservations. ' + 'Attempting to continue tearing down.') + + if self.vms: + try: + vm_util.RunThreaded(self.DeleteVm, self.vms) + except Exception: + logging.exception('Got an exception deleting VMs. ' + 'Attempting to continue tearing down.') + if hasattr(self, 'placement_groups'): + for placement_group_object in self.placement_groups.values(): + placement_group_object.Delete() + + for firewall in six.itervalues(self.firewalls): + try: + firewall.DisallowAllPorts() + except Exception: + logging.exception('Got an exception disabling firewalls. ' + 'Attempting to continue tearing down.') + + if self.container_cluster: + self.container_cluster.DeleteServices() + self.container_cluster.DeleteContainers() + self.container_cluster.Delete() + + for net in six.itervalues(self.networks): + try: + net.Delete() + except Exception: + logging.exception('Got an exception deleting networks. 
' + 'Attempting to continue tearing down.') + + if hasattr(self, 'vpn_service') and self.vpn_service: + self.vpn_service.Delete() + + self.deleted = True + + def GetSamples(self): + """Returns samples created from benchmark resources.""" + samples = [] + if self.container_cluster: + samples.extend(self.container_cluster.GetSamples()) + if self.container_registry: + samples.extend(self.container_registry.GetSamples()) + return samples + + def StartBackgroundWorkload(self): + targets = [(vm.StartBackgroundWorkload, (), {}) for vm in self.vms] + vm_util.RunParallelThreads(targets, len(targets)) + + def StopBackgroundWorkload(self): + targets = [(vm.StopBackgroundWorkload, (), {}) for vm in self.vms] + vm_util.RunParallelThreads(targets, len(targets)) + + def _IsSafeKeyOrValueCharacter(self, char): + return char.isalpha() or char.isnumeric() or char == '_' + + def _SafeLabelKeyOrValue(self, key): + result = ''.join(c if self._IsSafeKeyOrValueCharacter(c) else '_' + for c in key.lower()) + + # max length contraints on keys and values + # https://cloud.google.com/resource-manager/docs/creating-managing-labels + max_safe_length = 63 + return result[:max_safe_length] + + def _GetResourceDict(self, time_format, timeout_minutes=None): + """Gets a list of tags to be used to tag resources.""" + now_utc = datetime.datetime.utcnow() + + if not timeout_minutes: + timeout_minutes = FLAGS.timeout_minutes + + timeout_utc = ( + now_utc + + datetime.timedelta(minutes=timeout_minutes)) + + tags = { + 'timeout_utc': timeout_utc.strftime(time_format), + 'create_time_utc': now_utc.strftime(time_format), + 'benchmark': self.name, + 'perfkit_uuid': self.uuid, + 'owner': FLAGS.owner, + 'benchmark_uid': self.uid, + } + + # add metadata key value pairs + metadata_dict = (flag_util.ParseKeyValuePairs(FLAGS.metadata) + if hasattr(FLAGS, 'metadata') else dict()) + for key, value in metadata_dict.items(): + tags[self._SafeLabelKeyOrValue(key)] = self._SafeLabelKeyOrValue(value) + + return tags + + def GetResourceTags(self, timeout_minutes=None): + """Gets a list of tags to be used to tag resources.""" + return self._GetResourceDict(METADATA_TIME_FORMAT, timeout_minutes) + + def _CreatePlacementGroup(self, placement_group_spec, cloud): + """Create a placement group in zone. + + Args: + placement_group_spec: A placement_group.BasePlacementGroupSpec object. + cloud: The cloud for the placement group. + See the flag of the same name for more information. + Returns: + A placement_group.BasePlacementGroup object. + """ + + placement_group_class = placement_group.GetPlacementGroupClass(cloud) + if placement_group_class: + return placement_group_class(placement_group_spec) + else: + return None + + def GetVmGroupForVm(self, vm): + """Look up and return vm group name for the vm argument.""" + for group_name in self.vm_groups.keys(): + if vm in self.vm_groups[group_name]: + return group_name + return None + + def _CreateVirtualMachine(self, vm_spec, os_type, cloud): + """Create a vm in zone. + + Args: + vm_spec: A virtual_machine.BaseVmSpec object. + os_type: The type of operating system for the VM. See the flag of the + same name for more information. + cloud: The cloud for the VM. See the flag of the same name for more + information. + Returns: + A virtual_machine.BaseVirtualMachine object. 
+ """ + vm = static_vm.StaticVirtualMachine.GetStaticVirtualMachine() + if vm: + return vm + + vm_class = virtual_machine.GetVmClass(cloud, os_type) + if vm_class is None: + raise errors.Error( + 'VMs of type %s" are not currently supported on cloud "%s".' % + (os_type, cloud)) + + return vm_class(vm_spec) + + def CreateAndBootVm(self, vm): + """Creates a single VM and waits for boot to complete. + + Args: + vm: The BaseVirtualMachine object representing the VM. + """ + vm.Create() + logging.info('VM: %s', vm.ip_address) + logging.info('Waiting for boot completion.') + vm.AllowRemoteAccessPorts() + vm.WaitForBootCompletion() + + def PrepareVmAfterBoot(self, vm): + """Prepares a VM after it has booted. + + This function will prepare a scratch disk if required. + + Args: + vm: The BaseVirtualMachine object representing the VM. + + Raises: + Exception: If --vm_metadata is malformed. + """ + vm.AddMetadata() + vm.OnStartup() + # Prepare vm scratch disks: + if any((spec.disk_type == disk.LOCAL for spec in vm.disk_specs)): + vm.SetupLocalDisks() + for disk_spec in vm.disk_specs: + if disk_spec.disk_type == disk.RAM: + vm.CreateRamDisk(disk_spec) + else: + vm.CreateScratchDisk(disk_spec) + # This must come after Scratch Disk creation to support the + # Containerized VM case + vm.PrepareVMEnvironment() + vm.LowerSecurityScannerPriority() + + def DeleteVm(self, vm): + """Deletes a single vm and scratch disk if required. + + Args: + vm: The BaseVirtualMachine object representing the VM. + """ + if vm.is_static and vm.install_packages: + vm.PackageCleanup() + vm.ProxyCleanup() + vm.Delete() + vm.DeleteScratchDisks() + + @staticmethod + def _GetPickleFilename(uid): + """Returns the filename for the pickled BenchmarkSpec.""" + return os.path.join(vm_util.GetTempDir(), uid) + + def Pickle(self, filename=None): + """Pickles the spec so that it can be unpickled on a subsequent run.""" + with open(filename or self._GetPickleFilename(self.uid), + 'wb') as pickle_file: + pickle.dump(self, pickle_file, 2) + + def Freeze(self): + """Pickles the spec to a destination, defaulting to tempdir if not found.""" + if not self.freeze_path: + return + logging.info('Freezing benchmark_spec to %s', self.freeze_path) + try: + self.Pickle(self.freeze_path) + except FileNotFoundError: + default_path = f'{vm_util.GetTempDir()}/restore_spec.pickle' + logging.exception('Could not find file path %s, defaulting freeze to %s.', + self.freeze_path, default_path) + self.Pickle(default_path) + + @classmethod + def GetBenchmarkSpec(cls, benchmark_module, config, uid): + """Unpickles or creates a BenchmarkSpec and returns it. + + Args: + benchmark_module: The benchmark module object. + config: BenchmarkConfigSpec. The configuration for the benchmark. + uid: An identifier unique to this run of the benchmark even if the same + benchmark is run multiple times with different configs. + + Returns: + A BenchmarkSpec object. + """ + if stages.PROVISION in FLAGS.run_stage: + return cls(benchmark_module, config, uid) + + try: + with open(cls._GetPickleFilename(uid), 'rb') as pickle_file: + bm_spec = pickle.load(pickle_file) + except Exception as e: # pylint: disable=broad-except + logging.error('Unable to unpickle spec file for benchmark %s.', + benchmark_module.BENCHMARK_NAME) + raise e + # Always let the spec be deleted after being unpickled so that + # it's possible to run cleanup even if cleanup has already run. 
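+    # Clearing `deleted` lets Delete() run again on the unpickled spec, the
+    # status starts over as SKIPPED, and the spec is registered as the
+    # benchmark spec for the current thread.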
+ bm_spec.deleted = False + bm_spec.status = benchmark_status.SKIPPED + context.SetThreadBenchmarkSpec(bm_spec) + return bm_spec diff --git a/script/cumulus/pkb/perfkitbenchmarker/benchmark_status.py b/script/cumulus/pkb/perfkitbenchmarker/benchmark_status.py new file mode 100644 index 0000000..f402d9c --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/benchmark_status.py @@ -0,0 +1,142 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Constants and helpers for reporting the success status of each benchmark.""" + +import os +from six.moves import zip + +SUCCEEDED = 'SUCCEEDED' +FAILED = 'FAILED' +SKIPPED = 'SKIPPED' + +ALL = SUCCEEDED, FAILED, SKIPPED + +_COL_SEPARATOR = ' ' + + +class FailedSubstatus(object): + """Failure modes for benchmarks.""" + # Failure due to insufficient quota, user preventable + QUOTA = 'QUOTA_EXCEEDED' + + # Failure due to insufficient capacity in the cloud provider, user + # non-preventable. + INSUFFICIENT_CAPACITY = 'INSUFFICIENT_CAPACITY' + + # Failure during the execution of the benchmark. These are non-retryable, + # known failure modes of the benchmark. It is recommended that the benchmark + # be completely re-run. + KNOWN_INTERMITTENT = 'KNOWN_INTERMITTENT' + + # Failure due to an interruptible vm being interrupted before the benchmark + # completes. User non-preventable. + INTERRUPTED = 'INTERRUPTED' + + # Failure due to an unsupported configuration running. Ex. Machine type not + # supported in a zone. For retries, this will pick a new region to maximize + # the chance of success. + UNSUPPORTED = 'UNSUPPORTED' + + # General failure that don't fit in the above categories. + UNCATEGORIZED = 'UNCATEGORIZED' + + # Failure when restoring resource. + RESTORE_FAILED = 'RESTORE_FAILED' + + # Failure when freezing resource. + FREEZE_FAILED = 'FREEZE_FAILED' + + # List of valid substatuses for use with --retries. + # UNCATEGORIZED failures are not retryable. To make a specific UNCATEGORIZED + # failure retryable, please raise an errors.Benchmarks.KnownIntermittentError. + # RESTORE_FAILED/FREEZE_FAILED failures are not retryable since generally + # logic for freeze/restore is already retried in the BaseResource + # Create()/Delete(). + RETRYABLE_SUBSTATUSES = [ + QUOTA, INSUFFICIENT_CAPACITY, KNOWN_INTERMITTENT, INTERRUPTED, UNSUPPORTED + ] + + +def _CreateSummaryTable(benchmark_specs): + """Converts statuses of benchmark runs into a formatted string table. + + Args: + benchmark_specs: List of BenchmarkSpecs. + + Returns: + string. Multi-line string summarizing benchmark success statuses. 
Example: + -------------------------------------------------------- + Name UID Status Failed Substatus + -------------------------------------------------------- + iperf iperf0 SUCCEEDED + iperf iperf1 FAILED + iperf iperf2 FAILED QUOTA_EXCEEDED + cluster_boot cluster_boot0 SKIPPED + -------------------------------------------------------- + """ + run_status_tuples = [(spec.name, spec.uid, spec.status, + spec.failed_substatus if spec.failed_substatus else '') + for spec in benchmark_specs] + assert run_status_tuples, ('run_status_tuples must contain at least one ' + 'element.') + col_headers = 'Name', 'UID', 'Status', 'Failed Substatus' + col_lengths = [] + for col_header, col_entries in zip(col_headers, + list(zip(*run_status_tuples))): + max_col_content_length = max(len(entry) for entry in col_entries) + col_lengths.append(max(len(col_header), max_col_content_length)) + line_length = (len(col_headers) - 1) * len(_COL_SEPARATOR) + sum(col_lengths) + dash_line = '-' * line_length + line_format = _COL_SEPARATOR.join( + '{{{0}:<{1}s}}'.format(col_index, col_length) + for col_index, col_length in enumerate(col_lengths)) + msg = [dash_line, line_format.format(*col_headers), dash_line] + msg.extend(line_format.format(*row_entries) + for row_entries in run_status_tuples) + msg.append(dash_line) + return os.linesep.join(msg) + + +def CreateSummary(benchmark_specs): + """Logs a summary of benchmark run statuses. + + Args: + benchmark_specs: List of BenchmarkSpecs. + + Returns: + string. Multi-line string summarizing benchmark success statuses. Example: + Benchmark run statuses: + -------------------------------------------------------- + Name UID Status Failed Substatus + -------------------------------------------------------- + iperf iperf0 SUCCEEDED + iperf iperf1 FAILED + iperf iperf2 FAILED QUOTA_EXCEEDED + cluster_boot cluster_boot0 SKIPPED + -------------------------------------------------------- + Success rate: 25.00% (1/4) + """ + run_status_tuples = [(spec.name, spec.uid, spec.status) + for spec in benchmark_specs] + assert run_status_tuples, ('run_status_tuples must contain at least one ' + 'element.') + benchmark_count = len(run_status_tuples) + successful_benchmark_count = sum(1 for _, _, status in run_status_tuples + if status == SUCCEEDED) + return os.linesep.join(( + 'Benchmark run statuses:', + _CreateSummaryTable(benchmark_specs), + 'Success rate: {0:.2f}% ({1}/{2})'.format( + 100. * successful_benchmark_count / benchmark_count, + successful_benchmark_count, benchmark_count))) diff --git a/script/cumulus/pkb/perfkitbenchmarker/capacity_reservation.py b/script/cumulus/pkb/perfkitbenchmarker/capacity_reservation.py new file mode 100644 index 0000000..dec04b5 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/capacity_reservation.py @@ -0,0 +1,46 @@ +# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
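For reference, the summary table produced by `_CreateSummaryTable` above is sized per column from the wider of the header and the longest cell, then rendered with a left-aligned format string. A minimal sketch of that sizing logic, using hypothetical rows rather than output from a real run:

```python
# Minimal sketch of the column sizing used by _CreateSummaryTable, with
# hypothetical rows; the real function reads (name, uid, status, substatus)
# tuples from BenchmarkSpecs.
col_headers = ('Name', 'UID', 'Status', 'Failed Substatus')
rows = [('iperf', 'iperf0', 'SUCCEEDED', ''),
        ('cluster_boot', 'cluster_boot0', 'SKIPPED', '')]

# Each column is as wide as its header or its longest cell, whichever is wider.
col_lengths = [max(len(header), max(len(row[i]) for row in rows))
               for i, header in enumerate(col_headers)]
# One left-aligned field per column, joined by a two-space separator.
line_format = '  '.join('{{{0}:<{1}s}}'.format(i, width)
                        for i, width in enumerate(col_lengths))
print(line_format.format(*col_headers))
for row in rows:
    print(line_format.format(*row))
```

The dashed rule in the real output is simply a run of '-' characters whose length is the sum of the column widths plus the separators between them.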
+ +"""Module containing abstract class for a capacity reservation for VMs.""" + +from absl import flags +from perfkitbenchmarker import resource + +FLAGS = flags.FLAGS + +flags.DEFINE_bool('use_capacity_reservations', False, + 'Whether to use capacity reservations for virtual ' + 'machines. Only supported on AWS.') + + +def GetResourceClass(cloud): + """Get the CapacityReservation class corresponding to 'cloud'. + + Args: + cloud: name of cloud to get the class for. + + Returns: + Cloud-specific implementation of BaseCapacityReservation. + """ + return resource.GetResourceClass(BaseCapacityReservation, CLOUD=cloud) + + +class BaseCapacityReservation(resource.BaseResource): + """An object representing a CapacityReservation.""" + + RESOURCE_TYPE = 'BaseCapacityReservation' + + def __init__(self, vm_group): + super(BaseCapacityReservation, self).__init__() + self.vm_group = vm_group diff --git a/script/cumulus/pkb/perfkitbenchmarker/cloud_harmony_util.py b/script/cumulus/pkb/perfkitbenchmarker/cloud_harmony_util.py new file mode 100644 index 0000000..d01419c --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/cloud_harmony_util.py @@ -0,0 +1,164 @@ +"""Module for Helper methods when working with Cloud Harmony Suite. + +https://github.com/cloudharmony +""" + +import io +from typing import Any, Dict, List, Optional + +from absl import flags +import pandas as pd +from perfkitbenchmarker import providers +from perfkitbenchmarker import sample +from perfkitbenchmarker import virtual_machine +from perfkitbenchmarker.providers.gcp import util as gcp_util + +FLAGS = flags.FLAGS + +flags.DEFINE_boolean('ch_store_results', False, + 'Whether to store cloudharmony benchmark reports. ' + 'Defaults to False, can be turned on for production runs. ' + 'This flag is used to produce benchmark reports.') +STORE = flags.DEFINE_string('ch_results_store', None, + 'Storage to store cloudharmony benchmark reports. ' + 'Used if ch_store_results is set to True.') +BUCKET = flags.DEFINE_string('ch_results_bucket', None, + 'Bucket to store cloudharmony benchmark reports. ' + 'Used if ch_store_results is set to True.') +KEY = flags.DEFINE_string( + 'ch_results_key', None, + 'Access key to store cloudharmony benchmark reports. ' + 'Used in conjunction with ch_results_bucket') +SECRET = flags.DEFINE_string( + 'ch_results_secret', None, + 'Access secret to store cloudharmony benchmark reports. ' + 'Used in conjunction with ch_results_bucket') + +ITERATIONS = flags.DEFINE_integer( + 'ch_iterations', 1, 'The number of times to run the test. Multiple test ' + 'iterations will be grouped and saved in the same results resport.') + + +def GetSaveCommand() -> str: + """Returns the cloudharmony command to save benchmark reports.""" + return (f' --db_and_csv --store {STORE.value} --store_key {KEY.value} ' + f' --store_secret {SECRET.value} --store_container {BUCKET.value} ') + + +def GetRegionFromZone(zone: str) -> str: + # only gcp is supported as cloudharmony metadata is exclusive to gcp runs. + if FLAGS.cloud == 'GCP': + return gcp_util.GetRegionFromZone(zone) + else: + return zone + + +def ParseCsvResultsIntoMetadata(vm: virtual_machine.BaseVirtualMachine, + path: str) -> List[Dict[str, Any]]: + """Loads the CSV created by cloud harmony at path in the VM into metadata. + + The CSV located by path inside of virtual machine VM will be loaded. For each + row of results, a set of key/value pairs is created. The keys will all be + prepended with `cloudharmony` or similar. 
+ + Args: + vm: the Virtual Machine that has run a cloud harmony benchmark + path: The path inside of VM which has the CSV file which should be loaded + Returns: + A list of metadata outputs that should be appended to the samples that are + produced by a cloud harmony benchmark. + """ + csv_string, _ = vm.RemoteCommand('cat {path}'.format(path=path)) + + return ParseCsvResultsFromString(csv_string) + + +def ParseCsvResultsFromString(csv_string: str, + prefix: str = '') -> List[Dict[str, Any]]: + """Loads the CSV created by cloud harmony in csv_string. + + The CSV will be loaded into a pandas data frame. + For every row of results - we will create a set of key/value pairs + representing that row of results. The keys will all be prepended with + prefix. + + Args: + csv_string: a string of the CSV which was produced by cloud_harmony + prefix: a string prefix to attach to the metadata. Defaults to empty + string. It can be set to a unique string if cloudharmony data is + attached to every sample instead of being its own sample. + Returns: + A list of metadata dictionaries, where each dict represents one row of + results (an iteration) in the csv string. + """ + data_frame = pd.read_csv(io.StringIO(csv_string)).fillna('') + number_of_rows = len(data_frame.index) + + results = [] + # one row = one test result + for row in range(number_of_rows): + result = {} + for column in data_frame.columns: + key = column + value = data_frame[column][row] + result_key = f'{prefix}_{key}' if prefix else key + result[result_key] = value + results.append(result) + + return results + + +def GetCommonMetadata(custom_metadata: Optional[Dict[str, Any]] = None) -> str: + """Returns pkb metadata associated with this run as cloudharmony metadata. + + Cloudharmony benchmarks take in benchmark setup configurations as inputs and + include them in the output as metadata for the run. This function creates a + string of input metadata from pkb flags to be included as run parameter for + cloudharmony benchmarks. + + Args: + custom_metadata: a dictionary of metadata key value pairs that should + override any flag chosen in the function, or should also be included. + Returns: + A string of metadata that should be appended to the cloudharmony + benchmark run. + """ + if FLAGS.cloud != providers.GCP: + # Should not be including cloudharmony metadata for non-gcp runs. + return '' + + metadata = { + 'meta_compute_service': 'Google Compute Engine', + 'meta_compute_service_id': 'google:compute', + 'meta_instance_id': FLAGS.machine_type, + 'meta_provider': 'Google Cloud Platform', + 'meta_provider_id': 'google', + 'meta_region': gcp_util.GetRegionFromZone(FLAGS.zone[0]), + 'meta_zone': FLAGS.zone[0], + 'meta_test_id': FLAGS.run_uri, + } + if custom_metadata: + metadata.update(custom_metadata) + + metadata_pair = [f'--{key} {value}' for key, value in metadata.items()] + return ' '.join(metadata_pair) + + +def GetMetadataSamples( + cloud_harmony_metadata: List[Dict[Any, Any]]) -> List[sample.Sample]: + """Returns the cloudharmony metadata as a list of samples. + + This function is commonly used across all cloudharmony benchmarks. + + Args: + cloud_harmony_metadata: list of metadata outputs to save in samples. + + Returns: + A list of sample.Sample objects of cloudharmony metadata, where one sample + represents one row of csv results (one row = one test iteration). 
+ + """ + samples = [] + for result in cloud_harmony_metadata: + samples.append(sample.Sample('cloudharmony_output', '', '', result)) + return samples diff --git a/script/cumulus/pkb/perfkitbenchmarker/cloud_tpu.py b/script/cumulus/pkb/perfkitbenchmarker/cloud_tpu.py new file mode 100644 index 0000000..2ab85f5 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/cloud_tpu.py @@ -0,0 +1,124 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for TPU.""" + +import abc + +from absl import flags +from perfkitbenchmarker import resource + + +flags.DEFINE_string('tpu_cidr_range', None, """CIDR Range for the TPU. The IP + range that the TPU will select an IP address from. Must be + in CIDR notation and a /29 range, for example + 192.168.0.0/29. Errors will occur if the CIDR range has + already been used for a currently existing TPU, the CIDR + range conflicts with any networks in the user's provided + network, or the provided network is peered with another + network that is using that CIDR range.""") +flags.DEFINE_string('tpu_accelerator_type', 'tpu-v2', + 'TPU accelerator type for the TPU.') +flags.DEFINE_string('tpu_description', None, + 'Specifies a text description of the TPU.') +flags.DEFINE_string('tpu_network', None, + 'Specifies the network that this TPU will be a part of.') +flags.DEFINE_string('tpu_tf_version', None, + 'TensorFlow version for the TPU.') +flags.DEFINE_string('tpu_zone', None, + 'The zone of the tpu to create. Zone in which TPU lives.') +flags.DEFINE_string('tpu_name', None, + 'The name of the TPU to create.') +flags.DEFINE_boolean('tpu_preemptible', False, + 'Use preemptible TPU or not.') +flags.DEFINE_integer('tpu_cores_per_donut', 8, + 'The number of cores per TPU donut. This is 8 because each' + ' TPU has 4 chips each with 2 cores.') + +FLAGS = flags.FLAGS + + +def GetTpuClass(cloud): + """Gets the TPU class corresponding to 'cloud'. + + Args: + cloud: String. name of cloud to get the class for. + + Returns: + Implementation class corresponding to the argument cloud + + Raises: + Exception: An invalid TPU was provided + """ + return resource.GetResourceClass(BaseTpu, CLOUD=cloud) + + +class BaseTpu(resource.BaseResource): + """Object representing a TPU.""" + + RESOURCE_TYPE = 'BaseTpu' + + def __init__(self, tpu_spec): + """Initialize the TPU object. + + Args: + tpu_spec: spec of the TPU. 
+ """ + super(BaseTpu, self).__init__() + self.spec = tpu_spec + + def _Create(self): + """Creates the TPU.""" + raise NotImplementedError() + + def _Delete(self): + """Deletes the TPU.""" + raise NotImplementedError() + + @abc.abstractmethod + def GetName(self): + raise NotImplementedError() + + @abc.abstractmethod + def GetMasterGrpcAddress(self): + """Gets the master grpc address of the TPU.""" + raise NotImplementedError() + + @abc.abstractmethod + def GetNumShards(self): + """Gets the number of TPU shards.""" + raise NotImplementedError() + + @abc.abstractmethod + def GetZone(self): + """Gets the TPU zone.""" + raise NotImplementedError() + + @abc.abstractmethod + def GetAcceleratorType(self): + """Gets the TPU accelerator type.""" + raise NotImplementedError() + + def GetResourceMetadata(self): + """Returns a dictionary of cluster metadata.""" + metadata = { + 'cidr_range': self.spec.tpu_cidr_range, + 'accelerator_type': self.spec.tpu_accelerator_type, + 'description': self.spec.tpu_description, + 'network': self.spec.tpu_network, + 'tf_version': self.spec.tpu_tf_version, + 'zone': self.spec.tpu_zone, + 'name': self.spec.tpu_name, + 'preemptible': self.spec.tpu_preemptible + } + return metadata diff --git a/script/cumulus/pkb/perfkitbenchmarker/configs/__init__.py b/script/cumulus/pkb/perfkitbenchmarker/configs/__init__.py new file mode 100644 index 0000000..9ad39c1 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/configs/__init__.py @@ -0,0 +1,325 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Configuration files for benchmarks. + +Each benchmark has a default configuration defined inside its module. +The configuration is written in YAML (www.yaml.org) and specifies what +resources are needed to run the benchmark. Users can write their own +config files, which will be merged with the default configuration. These +config files specify overrides to the default configuration. Users can also +specify which benchmarks to run in the same config file. + +Valid top level keys: + benchmarks: A YAML array of dictionaries mapping benchmark names to their + configs. This also determines which benchmarks to run. + flags: A YAML dictionary with overrides for default flag values. Benchmark + config specific flags override those specified here. + *any_benchmark_name*: If the 'benchmarks' key is not specified, then + specifying a benchmark name mapped to a config will override + that benchmark's default configuration in the event that that + benchmark is run. + +Valid config keys: + vm_groups: A YAML dictionary mapping the names of VM groups to the groups + themselves. These names can be any string. + description: A description of the benchmark. + flags: A YAML dictionary with overrides for default flag values. + +Valid VM group keys: + vm_spec: A YAML dictionary mapping names of clouds (e.g. AWS) to the + actual VM spec. + disk_spec: A YAML dictionary mapping names of clouds to the actual + disk spec. 
+ vm_count: The number of VMs to create in this group. If this key isn't + specified, it defaults to 1. + disk_count: The number of disks to attach to VMs of this group. If this key + isn't specified, it defaults to 1. + cloud: The name of the cloud to create the group in. This is used for + multi-cloud configurations. + os_type: The OS type of the VMs to create (see the flag of the same name for + more information). This is used if you want to run a benchmark using VMs + with different OS types (e.g. Debian and RHEL). + static_vms: A YAML array of Static VM specs. These VMs will be used before + any Cloud VMs are created. The total number of VMs will still add up to + the number specified by the 'vm_count' key. + +For valid VM spec keys, see virtual_machine.BaseVmSpec and derived classes. +For valid disk spec keys, see disk.BaseDiskSpec and derived classes. + +See configs.spec.BaseSpec for more information about adding additional keys to +VM specs, disk specs, or any component of the benchmark configuration +dictionary. +""" + +import copy +import functools +import json +import logging +import re + +from absl import flags +import contextlib2 +from perfkitbenchmarker import data +from perfkitbenchmarker import errors +import six +import yaml + + +FLAGS = flags.FLAGS +CONFIG_CONSTANTS = 'default_config_constants.yaml' +FLAGS_KEY = 'flags' +IMPORT_REGEX = re.compile('^#import (.*)') + +flags.DEFINE_string('benchmark_config_file', None, + 'The file path to the user config file which will ' + 'override benchmark defaults. This should either be ' + 'a path relative to the current working directory, ' + 'an absolute path, or just the name of a file in the ' + 'configs/ directory.') +flags.DEFINE_multi_string( + 'config_override', None, + 'This flag can be used to override any config value. It is applied after ' + 'the user config (specified via --benchmark_config_file_path), so it has ' + 'a higher priority than that config. The value of the flag should be ' + 'fully.qualified.key=value (e.g. --config_override=cluster_boot.vm_groups.' + 'default.vm_count=4).') + + +class _ConcatenatedFiles(object): + """Class that presents several files as a single object. + + The class exposes a single method (read) which is all that yaml + needs to interact with a stream. + + Attributes: + files: A list of opened file objects. + current_file_index: The index of the current file that is being read from. + """ + + def __init__(self, files): + self.files = files + self.current_file_index = 0 + + def read(self, length): + data = self.files[self.current_file_index].read(length) + while (not data) and (self.current_file_index + 1 < len(self.files)): + self.current_file_index += 1 + data = self.files[self.current_file_index].read(length) + return data + + +def _GetImportFiles(config_file, imported_set=None): + """Get a list of file names that get imported from config_file. + + Args: + config_file: The name of a config file to find imports for. + imported_set: A set of files that _GetImportFiles has already + been called on that should be ignored. + + Returns: + A list of file names that are imported by config_file + (including config_file itself). + """ + imported_set = imported_set or set() + config_path = data.ResourcePath(config_file) + # Give up on circular imports. 
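+  # A config file that has already been visited returns an empty list here,
+  # so two files that "#import" each other do not recurse forever.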
+ if config_path in imported_set: + return [] + imported_set.add(config_path) + + with open(config_path) as f: + line = f.readline() + match = IMPORT_REGEX.match(line) + import_files = [] + while match: + import_file = match.group(1) + for file_name in _GetImportFiles(import_file, imported_set): + if file_name not in import_files: + import_files.append(file_name) + line = f.readline() + match = IMPORT_REGEX.match(line) + import_files.append(config_path) + return import_files + + +def _LoadUserConfig(path): + """Loads a user config from the supplied path.""" + config_files = _GetImportFiles(path) + with contextlib2.ExitStack() as stack: + files = [stack.enter_context(open(f)) for f in config_files] + return yaml.safe_load(_ConcatenatedFiles(files)) + + +@functools.lru_cache() +def _LoadConfigConstants(): + """Reads the config constants file.""" + with open(data.ResourcePath(CONFIG_CONSTANTS, False)) as fp: + return fp.read() + + +def _GetConfigFromOverrides(overrides): + """Converts a list of overrides into a config.""" + config = {} + + for override in overrides: + if override.count('=') != 1: + raise ValueError('--config_override flag value has incorrect number of ' + '"=" characters. The value must take the form ' + 'fully.qualified.key=value.') + full_key, value = override.split('=') + keys = full_key.split('.') + new_config = {keys.pop(): yaml.safe_load(value)} + while keys: + new_config = {keys.pop(): new_config} + config = MergeConfigs(config, new_config) + + return config + + +@functools.lru_cache() +def GetConfigFlags(): + """Returns the global flags from the user config.""" + return GetUserConfig().get(FLAGS_KEY, {}) + + +def GetUserConfig(): + """Returns the user config with any overrides applied. + + This loads config from --benchmark_config_file and merges it with + any overrides specified via --config_override and returns the result. + + Returns: + dict. The result of merging the loaded config from the + --benchmark_config_file flag with the config generated from the + --config override flag. + """ + try: + if FLAGS.benchmark_config_file: + config = _LoadUserConfig(FLAGS.benchmark_config_file) + else: + config = {} + + if FLAGS.config_override: + override_config = _GetConfigFromOverrides(FLAGS.config_override) + config = MergeConfigs(config, override_config) + + except yaml.parser.ParserError as e: + raise errors.Config.ParseError( + 'Encountered a problem loading config. Please ensure that the config ' + 'is valid YAML. Error received:\n%s' % e) + except yaml.composer.ComposerError as e: + raise errors.Config.ParseError( + 'Encountered a problem loading config. Please ensure that all ' + 'references are defined. Error received:\n%s' % e) + + return config + + +def MergeConfigs(default_config, override_config, warn_new_key=False): + """Merges the override config into the default config. + + This function will recursively merge two nested dicts. + The override_config represents overrides to the default_config dict, so any + leaf key/value pairs which are present in both dicts will take their value + from the override_config. + + Args: + default_config: The dict which will have its values overridden. + override_config: The dict wich contains the overrides. + warn_new_key: Determines whether we warn the user if the override config + has a key that the default config did not have. + + Returns: + A dict containing the values from the default_config merged with those from + the override_config. 
+ """ + def _Merge(d1, d2): + """Merge two nested dicts.""" + merged_dict = copy.deepcopy(d1) + for k, v in six.iteritems(d2): + if k not in d1: + merged_dict[k] = copy.deepcopy(v) + if warn_new_key: + logging.warning('The key "%s" was not in the default config, ' + 'but was in user overrides. This may indicate ' + 'a typo.', k) + elif isinstance(d1[k], dict) and isinstance(v, dict): + merged_dict[k] = _Merge(d1[k], v) + else: + merged_dict[k] = v + return merged_dict + + if override_config: + return _Merge(default_config, override_config) + else: + return default_config + + +def LoadMinimalConfig(benchmark_config, benchmark_name): + """Loads a benchmark config without using any flags in the process. + + This function will prepend configs/default_config_constants.yaml to the + benchmark config prior to loading it. This allows the config to use + references to anchors defined in the constants file. + + Args: + benchmark_config: str. The default config in YAML format. + benchmark_name: str. The name of the benchmark. + + Returns: + dict. The loaded config. + """ + yaml_config = [] + yaml_config.append(_LoadConfigConstants()) + yaml_config.append(benchmark_config) + + try: + config = yaml.safe_load('\n'.join(yaml_config)) + except yaml.parser.ParserError as e: + raise errors.Config.ParseError( + 'Encountered a problem loading the default benchmark config. Please ' + 'ensure that the config is valid YAML. Error received:\n%s' % e) + except yaml.composer.ComposerError as e: + raise errors.Config.ParseError( + 'Encountered a problem loading the default benchmark config. Please ' + 'ensure that all references are defined. Error received:\n%s' % e) + + config = config[benchmark_name] + # yaml safe_parse parses anchor by reference and return the same + # object when the same anchor is used multiple times. + # Seralize and deserialize to make sure all objects in the dictionary are + # unique. + config = json.loads(json.dumps(config)) + return config + + +def LoadConfig(benchmark_config, user_config, benchmark_name): + """Loads a benchmark configuration. + + This function loads a benchmark's default configuration (in YAML format), + then merges it with any overrides the user provided, and returns the result. + This loaded config is then passed to the benchmark_spec.BenchmarkSpec + constructor in order to create a BenchmarkSpec. + + Args: + benchmark_config: str. The default configuration in YAML format. + user_config: dict. The loaded user config for the benchmark. + benchmark_name: str. The name of the benchmark. + + Returns: + dict. The loaded config. + """ + config = LoadMinimalConfig(benchmark_config, benchmark_name) + config = MergeConfigs(config, user_config, warn_new_key=True) + return config diff --git a/script/cumulus/pkb/perfkitbenchmarker/configs/benchmark_config_spec.py b/script/cumulus/pkb/perfkitbenchmarker/configs/benchmark_config_spec.py new file mode 100644 index 0000000..209f749 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/configs/benchmark_config_spec.py @@ -0,0 +1,2120 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Classes that verify and transform benchmark configuration input. + +See perfkitbenchmarker/configs/__init__.py for more information about +configuration files. +""" + +import contextlib +import logging +import os + +from perfkitbenchmarker import app_service +from perfkitbenchmarker import container_service +from perfkitbenchmarker import data_discovery_service +from perfkitbenchmarker import disk +from perfkitbenchmarker import dpb_service +from perfkitbenchmarker import edw_service +from perfkitbenchmarker import errors +from perfkitbenchmarker import flag_util +from perfkitbenchmarker import managed_memory_store +from perfkitbenchmarker import non_relational_db +from perfkitbenchmarker import os_types +from perfkitbenchmarker import placement_group +from perfkitbenchmarker import providers +from perfkitbenchmarker import relational_db +from perfkitbenchmarker import spark_service +from perfkitbenchmarker import sql_engine_utils +from perfkitbenchmarker import static_virtual_machine +from perfkitbenchmarker import virtual_machine +from perfkitbenchmarker.configs import option_decoders +from perfkitbenchmarker.configs import spec +from perfkitbenchmarker.dpb_service import BaseDpbService +from perfkitbenchmarker.providers.gcp import gcp_spanner +import six + +_DEFAULT_DISK_COUNT = 1 +_DEFAULT_VM_COUNT = 1 + + +class _DpbApplicationListDecoder(option_decoders.ListDecoder): + """Decodes the list of applications to be enabled on the dpb service.""" + + def __init__(self, **kwargs): + super(_DpbApplicationListDecoder, self).__init__( + default=None, + item_decoder=option_decoders.EnumDecoder( + [dpb_service.FLINK, dpb_service.HIVE]), + **kwargs) + + +class _DpbServiceDecoder(option_decoders.TypeVerifier): + """Validates the dpb service dictionary of a benchmark config object.""" + + def __init__(self, **kwargs): + super(_DpbServiceDecoder, self).__init__(valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Verifies dpb service dictionary of a benchmark config object. + + Args: + value: dict Dpb Service config dictionary + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + _DpbServiceSpec Build from the config passed in in value. + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + dpb_service_config = super(_DpbServiceDecoder, + self).Decode(value, component_full_name, + flag_values) + + if (dpb_service_config['service_type'] == dpb_service.EMR and + component_full_name == 'dpb_wordcount_benchmark'): + if flag_values.dpb_wordcount_fs != BaseDpbService.S3_FS: + raise errors.Config.InvalidValue('EMR service requires S3.') + result = _DpbServiceSpec( + self._GetOptionFullName(component_full_name), flag_values, + **dpb_service_config) + return result + + +class _DpbServiceSpec(spec.BaseSpec): + """Configurable options of an Distributed Processing Backend Service. + + We may add more options here, such as disk specs, as necessary. 
+ When there are flags for these attributes, the convention is that + the flag is prefixed with dpb. + Attributes: + service_type: string. pkb_managed or dataflow,dataproc,emr, etc. + static_dpb_service_instance: if user has pre created a container, the id + worker_group: Vm group spec for workers. + worker_count: the number of workers part of the dpb service + applications: An enumerated list of applications that need to be enabled + on the dpb service + version: string. The version of software to install inside the service. + """ + + def __init__(self, component_full_name, flag_values=None, **kwargs): + super(_DpbServiceSpec, self).__init__( + component_full_name, flag_values=flag_values, **kwargs) + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) + pair. The pair specifies a decoder class and its __init__() keyword + arguments to construct in order to decode the named option. + """ + result = super(_DpbServiceSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'static_dpb_service_instance': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'service_type': ( + option_decoders.EnumDecoder, + { + 'default': + dpb_service.DATAPROC, + 'valid_values': [ + dpb_service.DATAPROC, + dpb_service.DATAPROC_GKE, + dpb_service.DATAPROC_SERVERLESS, + dpb_service.DATAFLOW, + dpb_service.EMR, + dpb_service.UNMANAGED_DPB_SVC_YARN_CLUSTER, + dpb_service.UNMANAGED_SPARK_CLUSTER, + dpb_service.KUBERNETES_SPARK_CLUSTER, + ] + }), + 'worker_group': (_VmGroupSpecDecoder, {}), + 'worker_count': (option_decoders.IntDecoder, { + 'default': dpb_service.DEFAULT_WORKER_COUNT, + 'min': 0 + }), + 'applications': (_DpbApplicationListDecoder, {}), + 'version': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'gke_cluster_name': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'gke_cluster_nodepools': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'gke_cluster_location': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'dataproc_serverless_core_count': (option_decoders.IntDecoder, { + 'default': None, + 'none_ok': True, + }), + 'dataproc_serverless_initial_executors': (option_decoders.IntDecoder, { + 'default': None, + 'none_ok': True + }), + 'dataproc_serverless_min_executors': (option_decoders.IntDecoder, { + 'default': None, + 'none_ok': True + }), + 'dataproc_serverless_max_executors': (option_decoders.IntDecoder, { + 'default': None, + 'none_ok': True + }), + }) + return result + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Modifies config options based on runtime flag values. + + Can be overridden by derived classes to add support for specific flags. + + Args: + config_values: dict mapping config option names to provided values. May be + modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. 
+ """ + super(_DpbServiceSpec, cls)._ApplyFlags(config_values, flag_values) + if flag_values['static_dpb_service_instance'].present: + config_values['static_dpb_service_instance'] = ( + flag_values.static_dpb_service_instance) + # TODO(saksena): Update the documentation for zones assignment + if flag_values['zones'].present: + group = 'worker_group' + if group in config_values: + for cloud in config_values[group]['vm_spec']: + config_values[group]['vm_spec'][cloud]['zone'] = ( + flag_values.zones[0]) + + +class _TpuGroupSpec(spec.BaseSpec): + """Configurable options of a TPU.""" + + def __init__(self, + component_full_name, + group_name, + flag_values=None, + **kwargs): + super(_TpuGroupSpec, self).__init__( + '{0}.{1}'.format(component_full_name, group_name), + flag_values=flag_values, + **kwargs) + if not self.tpu_name: + self.tpu_name = 'pkb-tpu-{group_name}-{run_uri}'.format( + group_name=group_name, run_uri=flag_values.run_uri) + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword arguments + to construct in order to decode the named option. + """ + result = super(_TpuGroupSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'cloud': (option_decoders.EnumDecoder, { + 'valid_values': providers.VALID_CLOUDS + }), + 'tpu_cidr_range': (option_decoders.StringDecoder, { + 'default': None + }), + 'tpu_accelerator_type': (option_decoders.StringDecoder, { + 'default': None + }), + 'tpu_description': (option_decoders.StringDecoder, { + 'default': None + }), + 'tpu_network': (option_decoders.StringDecoder, { + 'default': None + }), + 'tpu_tf_version': (option_decoders.StringDecoder, { + 'default': None + }), + 'tpu_zone': (option_decoders.StringDecoder, { + 'default': None + }), + 'tpu_name': (option_decoders.StringDecoder, { + 'default': None + }), + 'tpu_preemptible': (option_decoders.BooleanDecoder, { + 'default': False + }) + }) + return result + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Modifies config options based on runtime flag values. + + Can be overridden by derived classes to add support for specific flags. + + Args: + config_values: dict mapping config option names to provided values. May be + modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. 
+ """ + super(_TpuGroupSpec, cls)._ApplyFlags(config_values, flag_values) + if flag_values['cloud'].present: + config_values['cloud'] = flag_values.cloud + if flag_values['tpu_cidr_range'].present: + config_values['tpu_cidr_range'] = flag_values.tpu_cidr_range + if flag_values['tpu_accelerator_type'].present: + config_values['tpu_accelerator_type'] = flag_values.tpu_accelerator_type + if flag_values['tpu_description'].present: + config_values['tpu_description'] = flag_values.tpu_description + if flag_values['tpu_network'].present: + config_values['tpu_network'] = flag_values.tpu_network + if flag_values['tpu_tf_version'].present: + config_values['tpu_tf_version'] = flag_values.tpu_tf_version + if flag_values['tpu_zone'].present: + config_values['tpu_zone'] = flag_values.tpu_zone + if flag_values['tpu_name'].present: + config_values['tpu_name'] = flag_values.tpu_name + if flag_values['tpu_preemptible'].present: + config_values['tpu_preemptible'] = flag_values.tpu_preemptible + + +class _EdwServiceDecoder(option_decoders.TypeVerifier): + """Validates the edw service dictionary of a benchmark config object.""" + + def __init__(self, **kwargs): + super(_EdwServiceDecoder, self).__init__(valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Verifies edw service dictionary of a benchmark config object. + + Args: + value: dict edw service config dictionary + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + _EdwServiceSpec Built from the config passed in in value. + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + edw_service_config = super(_EdwServiceDecoder, + self).Decode(value, component_full_name, + flag_values) + result = _EdwServiceSpec( + self._GetOptionFullName(component_full_name), flag_values, + **edw_service_config) + return result + + +class _EdwServiceSpec(spec.BaseSpec): + """Configurable options of an EDW service. + + When there are flags for these attributes, the convention is that + the flag is prefixed with edw_service. + + Attributes: + cluster_name : string. If set, the name of the cluster + type: string. The type of EDW service (redshift) + node_type: string, type of node comprising the cluster + node_count: integer, number of nodes in the cluster + """ + + def __init__(self, component_full_name, flag_values=None, **kwargs): + super(_EdwServiceSpec, self).__init__( + component_full_name, flag_values=flag_values, **kwargs) + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword arguments to + construct in order to decode the named option. 
+ """ + result = super(_EdwServiceSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'type': (option_decoders.StringDecoder, { + 'default': 'redshift', + 'none_ok': False + }), + 'cluster_identifier': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'endpoint': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'concurrency': (option_decoders.IntDecoder, { + 'default': 5, + 'none_ok': True + }), + 'db': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'user': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'password': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'node_type': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'node_count': (option_decoders.IntDecoder, { + 'default': edw_service.DEFAULT_NUMBER_OF_NODES, + 'min': edw_service.DEFAULT_NUMBER_OF_NODES + }), + 'snapshot': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'cluster_subnet_group': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'cluster_parameter_group': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'resource_group': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'server_name': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'iam_role': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }) + }) + return result + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Modifies config options based on runtime flag values. + + Can be overridden by derived classes to add support for specific flags. + + Args: + config_values: dict mapping config option names to provided values. May be + modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. + """ + super(_EdwServiceSpec, cls)._ApplyFlags(config_values, flag_values) + # TODO(saksena): Add cluster_subnet_group and cluster_parameter_group flags + # Restoring from a snapshot, so defer to the user supplied cluster details + if flag_values['edw_service_cluster_snapshot'].present: + config_values['snapshot'] = flag_values.edw_service_cluster_snapshot + if flag_values['edw_service_cluster_identifier'].present: + config_values['cluster_identifier'] = ( + flag_values.edw_service_cluster_identifier) + if flag_values['edw_service_endpoint'].present: + config_values['endpoint'] = flag_values.edw_service_endpoint + if flag_values['edw_service_cluster_concurrency'].present: + config_values['concurrency'] = flag_values.edw_service_cluster_concurrency + if flag_values['edw_service_cluster_db'].present: + config_values['db'] = flag_values.edw_service_cluster_db + if flag_values['edw_service_cluster_user'].present: + config_values['user'] = flag_values.edw_service_cluster_user + if flag_values['edw_service_cluster_password'].present: + config_values['password'] = flag_values.edw_service_cluster_password + + +class _StaticVmDecoder(option_decoders.TypeVerifier): + """Decodes an item of the static_vms list of a VM group config object.""" + + def __init__(self, **kwargs): + super(_StaticVmDecoder, self).__init__(valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Decodes an item of the static_vms list of a VM group config object. 
+ + Args: + value: dict mapping static VM config option name string to corresponding + option value. + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + StaticVmSpec decoded from the input dict. + + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + input_dict = super(_StaticVmDecoder, + self).Decode(value, component_full_name, flag_values) + return static_virtual_machine.StaticVmSpec( + self._GetOptionFullName(component_full_name), + flag_values=flag_values, + **input_dict) + + +class _StaticVmListDecoder(option_decoders.ListDecoder): + """Decodes the static_vms list of a VM group config object.""" + + def __init__(self, **kwargs): + super(_StaticVmListDecoder, self).__init__( + default=list, item_decoder=_StaticVmDecoder(), **kwargs) + + +class _RelationalDbSpec(spec.BaseSpec): + """Configurable options of a database service.""" + + def __init__(self, component_full_name, flag_values=None, **kwargs): + super(_RelationalDbSpec, self).__init__( + component_full_name, flag_values=flag_values, **kwargs) + # TODO(ferneyhough): This is a lot of boilerplate, and is repeated + # below in VmGroupSpec. See if some can be consolidated. Maybe we can + # specify a VmGroupSpec instead of both vm_spec and disk_spec. + ignore_package_requirements = ( + getattr(flag_values, 'ignore_package_requirements', True) + if flag_values else True) + providers.LoadProvider(self.cloud, ignore_package_requirements) + + if self.db_disk_spec: + disk_config = getattr(self.db_disk_spec, self.cloud, None) + if disk_config is None: + raise errors.Config.MissingOption( + '{0}.cloud is "{1}", but {0}.db_disk_spec does not contain a ' + 'configuration for "{1}".'.format(component_full_name, self.cloud)) + disk_spec_class = disk.GetDiskSpecClass(self.cloud) + self.db_disk_spec = disk_spec_class( + '{0}.db_disk_spec.{1}'.format(component_full_name, self.cloud), + flag_values=flag_values, + **disk_config) + + db_vm_config = getattr(self.db_spec, self.cloud, None) + if db_vm_config is None: + raise errors.Config.MissingOption( + '{0}.cloud is "{1}", but {0}.db_spec does not contain a ' + 'configuration for "{1}".'.format(component_full_name, self.cloud)) + db_vm_spec_class = virtual_machine.GetVmSpecClass(self.cloud) + self.db_spec = db_vm_spec_class( + '{0}.db_spec.{1}'.format(component_full_name, self.cloud), + flag_values=flag_values, + **db_vm_config) + + # Set defaults that were not able to be set in + # GetOptionDecoderConstructions() + if not self.engine_version: + db_class = relational_db.GetRelationalDbClass(self.cloud, + self.is_managed_db, + self.engine) + self.engine_version = db_class.GetDefaultEngineVersion(self.engine) + if not self.database_name: + self.database_name = 'pkb-db-%s' % flag_values.run_uri + if not self.database_username: + self.database_username = 'pkb%s' % flag_values.run_uri + if not self.database_password: + self.database_password = relational_db.GenerateRandomDbPassword() + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword arguments + to construct in order to decode the named option. 
+ """ + result = super(_RelationalDbSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'cloud': (option_decoders.EnumDecoder, { + 'valid_values': providers.VALID_CLOUDS + }), + 'engine': (option_decoders.EnumDecoder, { + 'valid_values': sql_engine_utils.ALL_ENGINES, + }), + 'zones': (option_decoders.ListDecoder, { + 'item_decoder': option_decoders.StringDecoder(), + 'default': None + }), + 'engine_version': (option_decoders.StringDecoder, { + 'default': None + }), + 'database_name': (option_decoders.StringDecoder, { + 'default': None + }), + 'database_password': (option_decoders.StringDecoder, { + 'default': None + }), + 'database_username': (option_decoders.StringDecoder, { + 'default': None + }), + 'high_availability': (option_decoders.BooleanDecoder, { + 'default': False + }), + 'backup_enabled': (option_decoders.BooleanDecoder, { + 'default': True + }), + 'backup_start_time': (option_decoders.StringDecoder, { + 'default': '07:00' + }), + 'is_managed_db': (option_decoders.BooleanDecoder, {'default': True}), + 'db_spec': (option_decoders.PerCloudConfigDecoder, {}), + 'db_disk_spec': (option_decoders.PerCloudConfigDecoder, {}), + 'vm_groups': (_VmGroupsDecoder, { + 'default': {} + }), + 'db_flags': (option_decoders.ListDecoder, { + 'item_decoder': option_decoders.StringDecoder(), + 'default': None + }), + }) + return result + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Modifies config options based on runtime flag values. + + Can be overridden by derived classes to add support for specific flags. + + Args: + config_values: dict mapping config option names to provided values. May be + modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. + """ + # TODO(ferneyhough): Add flags for db_disk_spec. + # Currently the only way to modify the disk spec of the + # db is to change the benchmark spec in the benchmark source code + # itself. + super(_RelationalDbSpec, cls)._ApplyFlags(config_values, flag_values) + + # TODO(jerlawson): Rename flags 'managed_db_' -> 'db_'. + has_db_machine_type = flag_values['managed_db_machine_type'].present + has_db_cpus = flag_values['managed_db_cpus'].present + has_db_memory = flag_values['managed_db_memory'].present + has_custom_machine_type = has_db_cpus and has_db_memory + has_client_machine_type = flag_values['client_vm_machine_type'].present + has_client_vm_cpus = flag_values['client_vm_cpus'].present + has_client_vm_memory = flag_values['client_vm_memory'].present + has_client_custom_machine_type = has_client_vm_cpus and has_client_vm_memory + + if has_custom_machine_type and has_db_machine_type: + raise errors.Config.UnrecognizedOption( + 'db_cpus/db_memory can not be specified with ' + 'db_machine_type. Either specify a custom machine ' + 'with cpus and memory or specify a predefined machine type.') + + if (not has_custom_machine_type and (has_db_cpus or has_db_memory)): + raise errors.Config.MissingOption( + 'To specify a custom database machine instance, both managed_db_cpus ' + 'and managed_db_memory must be specified.') + + if has_client_custom_machine_type and has_client_machine_type: + raise errors.Config.UnrecognizedOption( + 'client_vm_cpus/client_vm_memory can not be specified with ' + 'client_vm_machine_type. 
Either specify a custom machine ' + 'with cpus and memory or specify a predefined machine type.') + + if (not has_client_custom_machine_type and + (has_client_vm_cpus or has_client_vm_memory)): + raise errors.Config.MissingOption( + 'To specify a custom client VM, both client_vm_cpus ' + 'and client_vm_memory must be specified.') + + if flag_values['use_managed_db'].present: + config_values['is_managed_db'] = flag_values.use_managed_db + + if flag_values['cloud'].present or 'cloud' not in config_values: + config_values['cloud'] = flag_values.cloud + if flag_values['managed_db_engine'].present: + config_values['engine'] = flag_values.managed_db_engine + if flag_values['managed_db_engine_version'].present: + config_values['engine_version'] = flag_values.managed_db_engine_version + if flag_values['managed_db_database_name'].present: + config_values['database_name'] = flag_values.managed_db_database_name + if flag_values['managed_db_database_username'].present: + config_values['database_username'] = ( + flag_values.managed_db_database_username) + if flag_values['managed_db_database_password'].present: + config_values['database_password'] = ( + flag_values.managed_db_database_password) + if flag_values['managed_db_high_availability'].present: + config_values['high_availability'] = ( + flag_values.managed_db_high_availability) + if flag_values['managed_db_backup_enabled'].present: + config_values['backup_enabled'] = (flag_values.managed_db_backup_enabled) + if flag_values['managed_db_backup_start_time'].present: + config_values['backup_start_time'] = ( + flag_values.managed_db_backup_start_time) + if flag_values['db_flags'].present: + config_values['db_flags'] = flag_values.db_flags + cloud = config_values['cloud'] + has_unmanaged_dbs = ('vm_groups' in config_values and + 'servers' in config_values['vm_groups']) + + if flag_values['managed_db_zone'].present: + config_values['db_spec'][cloud]['zone'] = flag_values.managed_db_zone[0] + config_values['zones'] = flag_values.managed_db_zone + if has_unmanaged_dbs: + config_values['vm_groups']['servers']['vm_spec'][cloud]['zone'] = ( + flag_values.managed_db_zone[0]) + if flag_values['client_vm_zone'].present: + config_values['vm_groups']['clients']['vm_spec'][cloud]['zone'] = ( + flag_values.client_vm_zone) + if has_db_machine_type: + config_values['db_spec'][cloud]['machine_type'] = ( + flag_values.managed_db_machine_type) + if has_unmanaged_dbs: + config_values['vm_groups']['servers']['vm_spec'][cloud][ + 'machine_type'] = ( + flag_values.managed_db_machine_type) + if has_custom_machine_type: + config_values['db_spec'][cloud]['machine_type'] = { + 'cpus': flag_values.managed_db_cpus, + 'memory': flag_values.managed_db_memory + } + # tox and pylint have contradictory closing brace rules, so avoid having + # opening and closing brackets on different lines. 
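The flag handling above and below repeatedly indexes into `db_spec`, `db_disk_spec`, and `vm_groups` by cloud name, and writes twice whenever an unmanaged `servers` group is present. A minimal sketch of that nesting, assuming a GCP-only config with invented values:

```python
# Illustrative only: the per-cloud nesting that _RelationalDbSpec._ApplyFlags
# mutates when flags such as --managed_db_zone or --managed_db_machine_type
# are present. Keys mirror the decoder table; all concrete values are invented.
config_values = {
    'cloud': 'GCP',
    'engine': 'mysql',
    'db_spec': {'GCP': {'machine_type': 'n1-standard-4',
                        'zone': 'us-central1-a'}},
    'db_disk_spec': {'GCP': {'disk_type': 'pd-ssd', 'disk_size': 500}},
    'vm_groups': {  # only present for unmanaged (self-hosted) databases
        'servers': {'vm_spec': {'GCP': {}}, 'disk_spec': {'GCP': {}}},
        'clients': {'vm_spec': {'GCP': {}}, 'disk_spec': {'GCP': {}}},
    },
}

cloud = config_values['cloud']
has_unmanaged_dbs = ('vm_groups' in config_values and
                     'servers' in config_values['vm_groups'])

# The same double-write pattern as the surrounding code: the managed db_spec
# is updated, and so is the unmanaged 'servers' group when it exists.
zone_flag = 'us-central1-b'  # stand-in for flag_values.managed_db_zone[0]
config_values['db_spec'][cloud]['zone'] = zone_flag
if has_unmanaged_dbs:
  config_values['vm_groups']['servers']['vm_spec'][cloud]['zone'] = zone_flag
print(config_values['db_spec'][cloud]['zone'])
```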
+    config_values_vm_groups = config_values['vm_groups']
+    if has_unmanaged_dbs:
+      config_values_vm_groups['servers']['vm_spec'][cloud]['machine_type'] = {
+          'cpus': flag_values.managed_db_cpus,
+          'memory': flag_values.managed_db_memory
+      }
+    if flag_values['managed_db_azure_compute_units'].present:
+      config_values['db_spec'][cloud]['machine_type']['compute_units'] = (
+          flag_values.managed_db_azure_compute_units)
+    if flag_values['managed_db_tier'].present:
+      config_values['db_spec'][cloud]['machine_type']['tier'] = (
+          flag_values.managed_db_tier)
+    if has_client_machine_type:
+      config_values['vm_groups']['clients']['vm_spec'][cloud][
+          'machine_type'] = (
+              flag_values.client_vm_machine_type)
+    if has_client_custom_machine_type:
+      config_values_vm_groups = config_values['vm_groups']
+      config_values_vm_groups['clients']['vm_spec'][cloud]['machine_type'] = {
+          'cpus': flag_values.client_vm_cpus,
+          'memory': flag_values.client_vm_memory
+      }
+    if flag_values[
+        'db_num_striped_disks'].present and has_unmanaged_dbs:
+      config_values['vm_groups']['servers']['disk_spec'][cloud][
+          'num_striped_disks'] = flag_values.db_num_striped_disks
+    if flag_values['managed_db_disk_size'].present:
+      config_values['db_disk_spec'][cloud]['disk_size'] = (
+          flag_values.managed_db_disk_size)
+      if has_unmanaged_dbs:
+        config_values['vm_groups']['servers']['disk_spec'][cloud][
+            'disk_size'] = flag_values.managed_db_disk_size
+    if flag_values['managed_db_disk_type'].present:
+      config_values['db_disk_spec'][cloud]['disk_type'] = (
+          flag_values.managed_db_disk_type)
+      if has_unmanaged_dbs:
+        config_values['vm_groups']['servers']['disk_spec'][cloud][
+            'disk_type'] = flag_values.managed_db_disk_type
+    if flag_values['managed_db_disk_iops'].present:
+      # This value will be used in aws_relation_db.py during db creation
+      config_values['db_disk_spec'][cloud]['iops'] = (
+          flag_values.managed_db_disk_iops)
+      if has_unmanaged_dbs:
+        config_values['vm_groups']['servers']['disk_spec'][cloud][
+            'iops'] = flag_values.managed_db_disk_iops
+
+    if flag_values['client_vm_os_type'].present:
+      config_values['vm_groups']['clients'][
+          'os_type'] = flag_values.client_vm_os_type
+    if flag_values['server_vm_os_type'].present:
+      config_values['vm_groups']['servers'][
+          'os_type'] = flag_values.server_vm_os_type
+
+    if flag_values['client_gcp_min_cpu_platform'].present:
+      config_values['vm_groups']['clients']['vm_spec'][cloud][
+          'min_cpu_platform'] = flag_values.client_gcp_min_cpu_platform
+    if flag_values['server_gcp_min_cpu_platform'].present:
+      config_values['vm_groups']['servers']['vm_spec'][cloud][
+          'min_cpu_platform'] = flag_values.server_gcp_min_cpu_platform
+
+    if flag_values['client_vm_disk_size'].present:
+      config_values['vm_groups']['clients']['disk_spec'][cloud]['disk_size'] = (
+          flag_values.client_vm_disk_size)
+    if flag_values['client_vm_disk_type'].present:
+      config_values['vm_groups']['clients']['disk_spec'][cloud]['disk_type'] = (
+          flag_values.client_vm_disk_type)
+    if flag_values['client_vm_disk_iops'].present:
+      config_values['vm_groups']['clients']['disk_spec'][cloud]['disk_iops'] = (
+          flag_values.client_vm_disk_iops)
+    logging.warning('Relational db config values: %s', config_values)
+
+
+class _SparkServiceSpec(spec.BaseSpec):
+  """Configurable options of an Apache Spark Service.
+
+  We may add more options here, such as disk specs, as necessary.
+  When there are flags for these attributes, the convention is that
+  the flag is prefixed with spark.
For example, the static_cluster_id + is overridden by the flag spark_static_cluster_id + + Attributes: + service_type: string. pkb_managed or managed_service + static_cluster_id: if user has created a cluster, the id of the cluster. + worker_group: Vm group spec for workers. + master_group: Vm group spec for master + """ + + def __init__(self, component_full_name, flag_values=None, **kwargs): + super(_SparkServiceSpec, self).__init__( + component_full_name, flag_values=flag_values, **kwargs) + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword arguments + to construct in order to decode the named option. + """ + result = super(_SparkServiceSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'static_cluster_id': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'service_type': (option_decoders.EnumDecoder, { + 'default': + spark_service.PROVIDER_MANAGED, + 'valid_values': [ + spark_service.PROVIDER_MANAGED, spark_service.PKB_MANAGED + ] + }), + 'worker_group': (_VmGroupSpecDecoder, {}), + 'master_group': (_VmGroupSpecDecoder, { + 'default': None, + 'none_ok': True + }) + }) + return result + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Modifies config options based on runtime flag values. + + Can be overridden by derived classes to add support for specific flags. + + Args: + config_values: dict mapping config option names to provided values. May be + modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. + """ + super(_SparkServiceSpec, cls)._ApplyFlags(config_values, flag_values) + if flag_values['spark_static_cluster_id'].present: + config_values['static_cluster_id'] = (flag_values.spark_static_cluster_id) + if flag_values['zones'].present: + for group in ('master_group', 'worker_group'): + if group in config_values: + for cloud in config_values[group]['vm_spec']: + config_values[group]['vm_spec'][cloud]['zone'] = ( + flag_values.zones[0]) + + +class _VmGroupSpec(spec.BaseSpec): + """Configurable options of a VM group. + + Attributes: + cloud: string. Cloud provider of the VMs in this group. + disk_count: int. Number of data disks to attach to each VM in this group. + disk_spec: BaseDiskSpec. Configuration for all data disks to be attached to + VMs in this group. + os_type: string. OS type of the VMs in this group. + static_vms: None or list of StaticVmSpecs. Configuration for all static VMs + in this group. + vm_count: int. Number of VMs in this group, including static VMs and + provisioned VMs. + vm_spec: BaseVmSpec. Configuration for provisioned VMs in this group. + placement_group_name: string. Name of placement group that VM group belongs + to. 
+ cidr: subnet each vm in this group belongs to + """ + + def __init__(self, component_full_name, flag_values=None, **kwargs): + super(_VmGroupSpec, self).__init__( + component_full_name, flag_values=flag_values, **kwargs) + ignore_package_requirements = ( + getattr(flag_values, 'ignore_package_requirements', True) + if flag_values else True) + providers.LoadProvider(self.cloud, ignore_package_requirements) + if self.disk_spec: + disk_config = getattr(self.disk_spec, self.cloud, None) + if disk_config is None: + raise errors.Config.MissingOption( + '{0}.cloud is "{1}", but {0}.disk_spec does not contain a ' + 'configuration for "{1}".'.format(component_full_name, self.cloud)) + disk_spec_class = disk.GetDiskSpecClass(self.cloud) + self.disk_spec = disk_spec_class( + '{0}.disk_spec.{1}'.format(component_full_name, self.cloud), + flag_values=flag_values, + **disk_config) + vm_config = getattr(self.vm_spec, self.cloud, None) + if vm_config is None: + raise errors.Config.MissingOption( + '{0}.cloud is "{1}", but {0}.vm_spec does not contain a ' + 'configuration for "{1}".'.format(component_full_name, self.cloud)) + vm_spec_class = virtual_machine.GetVmSpecClass(self.cloud) + self.vm_spec = vm_spec_class( + '{0}.vm_spec.{1}'.format(component_full_name, self.cloud), + flag_values=flag_values, + **vm_config) + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword arguments + to construct in order to decode the named option. + """ + result = super(_VmGroupSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'cloud': (option_decoders.EnumDecoder, { + 'valid_values': providers.VALID_CLOUDS + }), + 'disk_count': (option_decoders.IntDecoder, { + 'default': _DEFAULT_DISK_COUNT, + 'min': 0, + 'none_ok': True + }), + 'disk_spec': (option_decoders.PerCloudConfigDecoder, { + 'default': None, + 'none_ok': True + }), + 'os_type': (option_decoders.EnumDecoder, { + 'valid_values': os_types.ALL + }), + 'static_vms': (_StaticVmListDecoder, {}), + 'vm_count': (option_decoders.IntDecoder, { + 'default': _DEFAULT_VM_COUNT, + 'min': 0 + }), + 'cidr': (option_decoders.StringDecoder, { + 'default': None + }), + 'vm_spec': (option_decoders.PerCloudConfigDecoder, {}), + 'placement_group_name': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + }) + return result + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Modifies config options based on runtime flag values. + + Can be overridden by derived classes to add support for specific flags. + + Args: + config_values: dict mapping config option names to provided values. May be + modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. 
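For reference while reading `_ApplyFlags` below, here is a hedged sketch of a single `vm_groups` entry and the three flag overrides applied to it; the booleans stand in for `flag_values[...].present` and all concrete values are invented:

```python
# A hedged example of a single vm_groups entry, written as the dict that
# _VmGroupSpec decodes. Machine types, sizes, and OS names are illustrative.
vm_group_config = {
    'cloud': 'AWS',
    'os_type': 'ubuntu2004',
    'vm_count': None,  # None defers to --num_vms, see _ApplyFlags below
    'disk_count': 1,
    'vm_spec': {'AWS': {'machine_type': 'm5.large', 'zone': 'us-east-1a'}},
    'disk_spec': {'AWS': {'disk_type': 'gp2', 'disk_size': 50,
                          'mount_point': '/scratch'}},
}

# Mirror of the three overrides in _VmGroupSpec._ApplyFlags; the booleans
# stand in for flag_values['cloud'].present and flag_values['os_type'].present.
cloud_flag_present, os_type_flag_present, num_vms_flag = False, True, 2
if cloud_flag_present or 'cloud' not in vm_group_config:
  vm_group_config['cloud'] = 'GCP'           # would be flag_values.cloud
if os_type_flag_present or 'os_type' not in vm_group_config:
  vm_group_config['os_type'] = 'ubuntu2204'  # would be flag_values.os_type
if 'vm_count' in vm_group_config and vm_group_config['vm_count'] is None:
  vm_group_config['vm_count'] = num_vms_flag
print(vm_group_config['cloud'], vm_group_config['os_type'],
      vm_group_config['vm_count'])
```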
+ """ + super(_VmGroupSpec, cls)._ApplyFlags(config_values, flag_values) + if flag_values['cloud'].present or 'cloud' not in config_values: + config_values['cloud'] = flag_values.cloud + if flag_values['os_type'].present or 'os_type' not in config_values: + config_values['os_type'] = flag_values.os_type + if 'vm_count' in config_values and config_values['vm_count'] is None: + config_values['vm_count'] = flag_values.num_vms + + +class _VmGroupsDecoder(option_decoders.TypeVerifier): + """Validates the vm_groups dictionary of a benchmark config object.""" + + def __init__(self, **kwargs): + super(_VmGroupsDecoder, self).__init__(valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Verifies vm_groups dictionary of a benchmark config object. + + Args: + value: dict mapping VM group name string to the corresponding VM group + config dict. + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + dict mapping VM group name string to _VmGroupSpec. + + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + vm_group_configs = super(_VmGroupsDecoder, + self).Decode(value, component_full_name, + flag_values) + result = {} + for vm_group_name, vm_group_config in six.iteritems(vm_group_configs): + result[vm_group_name] = _VmGroupSpec( + '{0}.{1}'.format( + self._GetOptionFullName(component_full_name), vm_group_name), + flag_values=flag_values, + **vm_group_config) + return result + + +class _VmGroupSpecDecoder(option_decoders.TypeVerifier): + """Validates a single VmGroupSpec dictionary.""" + + def __init__(self, **kwargs): + super(_VmGroupSpecDecoder, self).__init__(valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Verifies vm_groups dictionary of a benchmark config object. + + Args: + value: dict corresonding to a VM group config. + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + dict a _VmGroupSpec. + + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + vm_group_config = super(_VmGroupSpecDecoder, + self).Decode(value, component_full_name, + flag_values) + return _VmGroupSpec( + self._GetOptionFullName(component_full_name), + flag_values=flag_values, + **vm_group_config) + + +class _PlacementGroupSpecsDecoder(option_decoders.TypeVerifier): + """Validates the placement_group_specs dictionary of a benchmark config object.""" + + def __init__(self, **kwargs): + super(_PlacementGroupSpecsDecoder, self).__init__( + valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Verifies placement_group_specs dictionary of a benchmark config object. + + Args: + value: dict mapping Placement Group Spec name string to the corresponding + placement group spec config dict. + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + dict mapping Placement Group Spec name string + to placement_group.BasePlacementGroupSpec. + + Raises: + errors.Config.InvalidValue upon invalid input value. 
+ """ + placement_group_spec_configs = ( + super(_PlacementGroupSpecsDecoder, + self).Decode(value, component_full_name, flag_values)) + result = {} + for placement_group_name, placement_group_spec_config in six.iteritems( + placement_group_spec_configs): + placement_group_spec_class = placement_group.GetPlacementGroupSpecClass( + self.cloud) + result[placement_group_name] = placement_group_spec_class( + '{0}.{1}'.format( + self._GetOptionFullName(component_full_name), + placement_group_name), + flag_values=flag_values, + **placement_group_spec_config) + return result + + +class _ContainerRegistryDecoder(option_decoders.TypeVerifier): + """Validates the container_registry dictionary of a benchmark config.""" + + def __init__(self, **kwargs): + super(_ContainerRegistryDecoder, self).__init__( + valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Verifies container_registry dictionary of a benchmark config object. + + Args: + value: dict mapping VM group name string to the corresponding container + spec config dict. + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + dict mapping container spec name string to ContainerSpec. + + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + vm_group_config = super(_ContainerRegistryDecoder, + self).Decode(value, component_full_name, + flag_values) + return container_service.ContainerRegistrySpec( + self._GetOptionFullName(component_full_name), + flag_values=flag_values, + **vm_group_config) + + +class _ContainerSpecsDecoder(option_decoders.TypeVerifier): + """Validates the container_specs dictionary of a benchmark config object.""" + + def __init__(self, **kwargs): + super(_ContainerSpecsDecoder, self).__init__(valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Verifies container_specs dictionary of a benchmark config object. + + Args: + value: dict mapping VM group name string to the corresponding container + spec config dict. + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + dict mapping container spec name string to ContainerSpec. + + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + container_spec_configs = super(_ContainerSpecsDecoder, + self).Decode(value, component_full_name, + flag_values) + result = {} + for spec_name, spec_config in six.iteritems(container_spec_configs): + result[spec_name] = container_service.ContainerSpec( + '{0}.{1}'.format( + self._GetOptionFullName(component_full_name), spec_name), + flag_values=flag_values, + **spec_config) + return result + + +class _NodepoolSpec(spec.BaseSpec): + """Configurable options of a Nodepool.""" + + def __init__(self, + component_full_name, + group_name, + flag_values=None, + **kwargs): + super(_NodepoolSpec, self).__init__( + '{0}.{1}'.format(component_full_name, group_name), + flag_values=flag_values, + **kwargs) + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. 
+ The pair specifies a decoder class and its __init__() keyword arguments + to construct in order to decode the named option. + """ + result = super(_NodepoolSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'vm_count': (option_decoders.IntDecoder, { + 'default': _DEFAULT_VM_COUNT, + 'min': 0 + }), + 'vm_spec': (option_decoders.PerCloudConfigDecoder, {}) + }) + return result + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Modifies config options based on runtime flag values. + + Can be overridden by derived classes to add support for specific flags. + + Args: + config_values: dict mapping config option names to provided values. May be + modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. + """ + super(_NodepoolSpec, cls)._ApplyFlags(config_values, flag_values) + if flag_values['container_cluster_num_vms'].present: + config_values['vm_count'] = flag_values.container_cluster_num_vms + + # Need to apply the first zone in the zones flag, if specified, + # to the spec. _NodepoolSpec does not currently support + # running in multiple zones in a single PKB invocation. + if flag_values['zones'].present: + for cloud in config_values['vm_spec']: + config_values['vm_spec'][cloud]['zone'] = (flag_values.zones[0]) + + +class _NodepoolsDecoder(option_decoders.TypeVerifier): + """Validate the nodepool dictionary of a nodepools config object.""" + + def __init__(self, **kwargs): + super(_NodepoolsDecoder, self).__init__(valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Verify Nodepool dict of a benchmark config object. + + Args: + value: dict. Config dictionary + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + _NodepoolsDecoder built from the config passed in value. + + Raises: + errors.Config.InvalidValue upon invalid input value. 
+ """ + nodepools_configs = super(_NodepoolsDecoder, self).Decode( + value, component_full_name, flag_values) + result = {} + for nodepool_name, nodepool_config in six.iteritems(nodepools_configs): + result[nodepool_name] = _NodepoolSpec( + self._GetOptionFullName(component_full_name), nodepool_name, + flag_values, **nodepool_config) + return result + + +class _ContainerClusterSpec(spec.BaseSpec): + """Spec containing info needed to create a container cluster.""" + + def __init__(self, component_full_name, flag_values=None, **kwargs): + super(_ContainerClusterSpec, self).__init__( + component_full_name, flag_values=flag_values, **kwargs) + ignore_package_requirements = ( + getattr(flag_values, 'ignore_package_requirements', True) + if flag_values else True) + providers.LoadProvider(self.cloud, ignore_package_requirements) + vm_config = getattr(self.vm_spec, self.cloud, None) + if vm_config is None: + raise errors.Config.MissingOption( + '{0}.cloud is "{1}", but {0}.vm_spec does not contain a ' + 'configuration for "{1}".'.format(component_full_name, self.cloud)) + vm_spec_class = virtual_machine.GetVmSpecClass(self.cloud) + self.vm_spec = vm_spec_class( + '{0}.vm_spec.{1}'.format(component_full_name, self.cloud), + flag_values=flag_values, + **vm_config) + nodepools = {} + for nodepool_name, nodepool_spec in sorted(six.iteritems(self.nodepools)): + if nodepool_name == container_service.DEFAULT_NODEPOOL: + raise errors.Config.InvalidValue( + 'Nodepool name {0} is reserved for use during cluster creation. ' + 'Please rename nodepool'.format(nodepool_name)) + nodepool_config = getattr(nodepool_spec.vm_spec, self.cloud, None) + if nodepool_config is None: + raise errors.Config.MissingOption( + '{0}.cloud is "{1}", but {0}.vm_spec does not contain a ' + 'configuration for "{1}".'.format(component_full_name, self.cloud)) + vm_spec_class = virtual_machine.GetVmSpecClass(self.cloud) + nodepool_spec.vm_spec = vm_spec_class( + '{0}.vm_spec.{1}'.format(component_full_name, self.cloud), + flag_values=flag_values, + **nodepool_config) + nodepools[nodepool_name] = nodepool_spec + + self.nodepools = nodepools + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword arguments + to construct in order to decode the named option. + """ + result = super(_ContainerClusterSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'static_cluster': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'cloud': (option_decoders.EnumDecoder, { + 'valid_values': providers.VALID_CLOUDS + }), + 'type': (option_decoders.StringDecoder, { + 'default': container_service.KUBERNETES, + }), + 'vm_count': (option_decoders.IntDecoder, { + 'default': _DEFAULT_VM_COUNT, + 'min': 0 + }), + 'min_vm_count': (option_decoders.IntDecoder, { + 'default': None, + 'none_ok': True, + 'min': 0 + }), + 'max_vm_count': (option_decoders.IntDecoder, { + 'default': None, + 'none_ok': True, + 'min': 0 + }), + # vm_spec is used to define the machine type for the default nodepool + 'vm_spec': (option_decoders.PerCloudConfigDecoder, {}), + # nodepools specifies a list of additional nodepools to create alongside + # the default nodepool (nodepool created on cluster creation). 
+ 'nodepools': (_NodepoolsDecoder, { + 'default': {}, + 'none_ok': True + }), + }) + return result + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + super(_ContainerClusterSpec, cls)._ApplyFlags(config_values, flag_values) + if flag_values['cloud'].present or 'cloud' not in config_values: + config_values['cloud'] = flag_values.cloud + if flag_values['container_cluster_cloud'].present: + config_values['cloud'] = flag_values.container_cluster_cloud + if flag_values['container_cluster_type'].present: + config_values['type'] = flag_values.container_cluster_type + if flag_values['container_cluster_num_vms'].present: + config_values['vm_count'] = flag_values.container_cluster_num_vms + + # Need to apply the first zone in the zones flag, if specified, + # to the spec. ContainerClusters do not currently support + # running in multiple zones in a single PKB invocation. + if flag_values['zones'].present: + for cloud in config_values['vm_spec']: + config_values['vm_spec'][cloud]['zone'] = (flag_values.zones[0]) + + +class _ContainerClusterSpecDecoder(option_decoders.TypeVerifier): + """Validates a ContainerClusterSpec dictionairy.""" + + def __init__(self, **kwargs): + super(_ContainerClusterSpecDecoder, self).__init__( + valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Verifies container_cluster dictionairy of a benchmark config object.""" + cluster_config = super(_ContainerClusterSpecDecoder, + self).Decode(value, component_full_name, flag_values) + + return _ContainerClusterSpec( + self._GetOptionFullName(component_full_name), + flag_values=flag_values, + **cluster_config) + + +class _SparkServiceDecoder(option_decoders.TypeVerifier): + """Validates the spark_service dictionary of a benchmark config object.""" + + def __init__(self, **kwargs): + super(_SparkServiceDecoder, self).__init__(valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Verifies spark_service dictionary of a benchmark config object. + + Args: + value: dict Spark Service config dictionary + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + _SparkServiceSpec Build from the config passed in value. + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + spark_service_config = super(_SparkServiceDecoder, + self).Decode(value, component_full_name, + flag_values) + result = _SparkServiceSpec( + self._GetOptionFullName(component_full_name), flag_values, + **spark_service_config) + return result + + +class _RelationalDbDecoder(option_decoders.TypeVerifier): + """Validate the relational_db dictionary of a benchmark config object.""" + + def __init__(self, **kwargs): + super(_RelationalDbDecoder, self).__init__(valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Verify relational_db dict of a benchmark config object. + + Args: + value: dict. Config dictionary + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + _RelationalDbService built from the config passed in in value. + + Raises: + errors.Config.InvalidValue upon invalid input value. 
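Taken together, `_ContainerClusterSpec` and `_NodepoolsDecoder` accept a default-nodepool `vm_spec` plus any number of named extra nodepools, and the constructor rejects a nodepool that reuses the reserved default-nodepool name. A hedged sketch, where the cluster type and reserved name are stand-ins for the real `container_service` constants and all machine types are invented:

```python
# Illustrative container_cluster config as _ContainerClusterSpecDecoder would
# receive it. 'Kubernetes' and 'default' are stand-ins for the real
# container_service.KUBERNETES and container_service.DEFAULT_NODEPOOL
# constants; machine types and nodepool names are invented.
RESERVED_DEFAULT_NODEPOOL = 'default'

container_cluster_config = {
    'cloud': 'GCP',
    'type': 'Kubernetes',
    'vm_count': 3,  # size of the default nodepool
    'vm_spec': {'GCP': {'machine_type': 'e2-standard-4',
                        'zone': 'us-central1-a'}},
    'nodepools': {
        'highmem': {  # an extra nodepool created alongside the default one
            'vm_count': 2,
            'vm_spec': {'GCP': {'machine_type': 'e2-highmem-8',
                                'zone': 'us-central1-a'}},
        },
    },
}

# Same guard as _ContainerClusterSpec.__init__: a user-defined nodepool may
# not reuse the name reserved for the cluster's default nodepool.
for name in container_cluster_config['nodepools']:
  if name == RESERVED_DEFAULT_NODEPOOL:
    raise ValueError('Nodepool name %s is reserved' % name)
print(sorted(container_cluster_config['nodepools']))
```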
+ """ + relational_db_config = super(_RelationalDbDecoder, + self).Decode(value, component_full_name, + flag_values) + result = _RelationalDbSpec( + self._GetOptionFullName(component_full_name), flag_values, + **relational_db_config) + return result + + +class _NonRelationalDbDecoder(option_decoders.TypeVerifier): + """Validate the non_relational_db dictionary of a benchmark config object.""" + + def __init__(self, **kwargs): + super(_NonRelationalDbDecoder, self).__init__(valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Verify non_relational_db dict of a benchmark config object. + + Args: + value: dict. Config dictionary + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + _NonRelationalDbService built from the config passed in value. + + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + non_relational_db_config = super().Decode(value, component_full_name, + flag_values) + if 'service_type' in non_relational_db_config: + db_spec_class = non_relational_db.GetNonRelationalDbSpecClass( + non_relational_db_config['service_type']) + else: + raise errors.Config.InvalidValue( + 'Required attribute `service_type` missing from non_relational_db ' + 'config.') + return db_spec_class( + self._GetOptionFullName(component_full_name), flag_values, + **non_relational_db_config) + + +class _SpannerDecoder(option_decoders.TypeVerifier): + """Validate the spanner dictionary of a benchmark config object.""" + + def __init__(self, **kwargs): + super(_SpannerDecoder, self).__init__(valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Verify spanner dict of a benchmark config object. + + Args: + value: dict. Config dictionary + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + _SpannerSpec built from the config passed in value. + + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + spanner_config = super().Decode(value, component_full_name, flag_values) + # Allow for subclass-specific specs. + if 'service_type' in spanner_config: + spanner_spec_class = gcp_spanner.GetSpannerSpecClass( + spanner_config['service_type']) + else: + raise errors.Config.InvalidValue( + 'Required attribute `service_type` missing from spanner config.') + return spanner_spec_class( + self._GetOptionFullName(component_full_name), flag_values, + **spanner_config) + + +class _TpuGroupsDecoder(option_decoders.TypeVerifier): + """Validate the tpu dictionary of a benchmark config object.""" + + def __init__(self, **kwargs): + super(_TpuGroupsDecoder, self).__init__(valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Verify tpu dict of a benchmark config object. + + Args: + value: dict. Config dictionary + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + _Tpu built from the config passed in in value. + + Raises: + errors.Config.InvalidValue upon invalid input value. 
+ """ + tpu_group_configs = super(_TpuGroupsDecoder, + self).Decode(value, component_full_name, + flag_values) + result = {} + for tpu_group_name, tpu_group_config in six.iteritems(tpu_group_configs): + result[tpu_group_name] = _TpuGroupSpec( + self._GetOptionFullName(component_full_name), tpu_group_name, + flag_values, **tpu_group_config) + return result + + +class _CloudRedisSpec(spec.BaseSpec): + """Specs needed to configure a cloud redis instance.""" + + def __init__(self, component_full_name, flag_values=None, **kwargs): + super(_CloudRedisSpec, self).__init__( + component_full_name, flag_values=flag_values, **kwargs) + if not self.redis_name: + self.redis_name = 'pkb-cloudredis-{0}'.format(flag_values.run_uri) + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword arguments to + construct in order to decode the named option. + """ + result = super(_CloudRedisSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'cloud': (option_decoders.EnumDecoder, { + 'valid_values': providers.VALID_CLOUDS + }), + 'redis_name': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': False + }), + 'redis_version': (option_decoders.EnumDecoder, { + 'default': managed_memory_store.REDIS_3_2, + 'valid_values': managed_memory_store.REDIS_VERSIONS + }), + }) + return result + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Modifies config options based on runtime flag values. + + Args: + config_values: dict mapping config option names to provided values. May be + modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. + """ + super(_CloudRedisSpec, cls)._ApplyFlags(config_values, flag_values) + if flag_values['cloud'].present or 'cloud' not in config_values: + config_values['cloud'] = flag_values.cloud + + +class _CloudRedisDecoder(option_decoders.TypeVerifier): + """Validate the cloud_redis dictionary of a benchmark config object.""" + + def __init__(self, **kwargs): + super(_CloudRedisDecoder, self).__init__(valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Verify cloud_redis dict of a benchmark config object. + + Args: + value: dict. Config dictionary + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + _CloudRedis built from the config passed in in value. + + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + cloud_redis_config = super(_CloudRedisDecoder, + self).Decode(value, component_full_name, + flag_values) + result = _CloudRedisSpec( + self._GetOptionFullName(component_full_name), flag_values, + **cloud_redis_config) + return result + + +class _VPNServiceSpec(spec.BaseSpec): + """Spec needed to configure a vpn tunnel between two vm_groups. 
+ + Since vpn_gateway may be across cloud providers we only create tunnel when + vpn_gateway's are up and known + """ + + def __init__(self, component_full_name, flag_values=None, **kwargs): + super(_VPNServiceSpec, self).__init__( + component_full_name, flag_values=flag_values, **kwargs) + if not self.name: + self.name = 'pkb-vpn-svc-{0}'.format(flag_values.run_uri) + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword arguments to + construct in order to decode the named option. + """ + result = super(_VPNServiceSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'shared_key': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'name': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'tunnel_count': (option_decoders.IntDecoder, { + 'default': 1, + 'none_ok': True + }), + 'gateway_count': (option_decoders.IntDecoder, { + 'default': 1, + 'none_ok': True + }), + 'routing_type': (option_decoders.StringDecoder, { + 'default': 'static', + 'none_ok': True + }), + 'ike_version': (option_decoders.IntDecoder, { + 'default': 1, + 'none_ok': True + }), + }) + return result + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Modifies config options based on runtime flag values. + + Args: + config_values: dict mapping config option names to provided values. May be + modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. + """ + super(_VPNServiceSpec, cls)._ApplyFlags(config_values, flag_values) + if flag_values['vpn_service_tunnel_count'].present: + config_values['tunnel_count'] = flag_values.vpn_service_tunnel_count + if flag_values['vpn_service_gateway_count'].present: + config_values['gateway_count'] = flag_values.vpn_service_gateway_count + if flag_values['vpn_service_name'].present: + config_values['name'] = flag_values.vpn_service_name + if flag_values['vpn_service_shared_key'].present: + config_values['shared_key'] = flag_values.vpn_service_shared_key + if flag_values['vpn_service_routing_type'].present: + config_values['routing_type'] = flag_values.vpn_service_routing_type + if flag_values['vpn_service_ike_version'].present: + config_values['ike_version'] = flag_values.vpn_service_ike_version + + +class _VPNServiceDecoder(option_decoders.TypeVerifier): + """Validate the vpn_service dictionary of a benchmark config object.""" + + def __init__(self, **kwargs): + super(_VPNServiceDecoder, self).__init__(valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Verify vpn_service dict of a benchmark config object. + + Args: + value: dict. Config dictionary + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + _VPNService built from the config passed in in value. + + Raises: + errors.Config.InvalidValue upon invalid input value. 
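The VPN spec is mostly tunable knobs with defaults. A hedged example of a fully spelled-out `vpn_service` block, where the concrete values are illustrative and the defaults noted in comments are the ones declared in the decoder table above:

```python
# Illustrative vpn_service block; keys mirror
# _VPNServiceSpec._GetOptionDecoderConstructions() and the noted defaults are
# the ones declared there. All concrete values are examples.
vpn_service_config = {
    'name': 'pkb-vpn-svc-example',  # __init__ derives a run_uri-based default
    'shared_key': None,             # decoder default: None
    'tunnel_count': 2,              # decoder default: 1
    'gateway_count': 2,             # decoder default: 1
    'routing_type': 'static',       # decoder default: 'static'
    'ike_version': 2,               # decoder default: 1
}

# --vpn_service_* flags simply replace these entries in _ApplyFlags before
# the _VPNServiceSpec object is constructed.
print({k: v for k, v in vpn_service_config.items() if v is not None})
```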
+ """ + vpn_service_config = super(_VPNServiceDecoder, + self).Decode(value, component_full_name, + flag_values) + result = _VPNServiceSpec( + self._GetOptionFullName(component_full_name), flag_values, + **vpn_service_config) + return result + + +class _AppGroupSpec(spec.BaseSpec): + """Configurable options of a AppService group.""" + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword arguments + to construct in order to decode the named option. + """ + result = super(_AppGroupSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'app_runtime': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'app_type': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'appservice_count': (option_decoders.IntDecoder, { + 'default': 1 + }), + 'appservice_spec': (_AppServiceDecoder, {}) + }) + return result + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + super(_AppGroupSpec, cls)._ApplyFlags(config_values, flag_values) + if flag_values['appservice_count'].present: + config_values['appservice_count'] = flag_values.appservice_count + if flag_values['app_runtime'].present: + config_values['app_runtime'] = flag_values.app_runtime + if flag_values['app_type'].present: + config_values['app_type'] = flag_values.app_type + + +class _AppGroupsDecoder(option_decoders.TypeVerifier): + """Verify app_groups dictionary of a benchmark config object.""" + + def __init__(self, **kwargs): + super(_AppGroupsDecoder, self).__init__(valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Verifys app_groups dictionary of a benchmark config object. + + Args: + value: dict. Config dictionary. + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + dict mapping app group name string to _AppGroupSpec. + + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + app_group_configs = super(_AppGroupsDecoder, + self).Decode(value, component_full_name, + flag_values) + result = {} + for app_group_name, app_group_config in six.iteritems(app_group_configs): + result[app_group_name] = _AppGroupSpec( + '{0}.{1}'.format( + self._GetOptionFullName(component_full_name), app_group_name), + flag_values=flag_values, + **app_group_config) + return result + + +class _AppServiceDecoder(option_decoders.TypeVerifier): + """Verify app_service dict of a benchmark config object.""" + + def __init__(self, **kwargs): + super(_AppServiceDecoder, self).__init__(valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Verify app_service dict of a benchmark config object. + + Args: + value: dict. Config dictionary. + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + AppService object built from config. + + Raises: + errors.Config.InvalidValue upon invalid input value. 
+ """ + config = super(_AppServiceDecoder, self).Decode(value, component_full_name, + flag_values) + spec_cls = app_service.GetAppServiceSpecClass( + flag_values.appservice or config.get('appservice')) + return spec_cls( + self._GetOptionFullName(component_full_name), + flag_values=flag_values, + **config) + + +class _MessagingServiceSpec(spec.BaseSpec): + """Specs needed to configure messaging service. + """ + + def __init__(self, component_full_name, flag_values=None, **kwargs): + super().__init__(component_full_name, flag_values=flag_values, **kwargs) + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword arguments to + construct in order to decode the named option. + """ + result = super()._GetOptionDecoderConstructions() + result.update({ + 'cloud': (option_decoders.EnumDecoder, { + 'valid_values': providers.VALID_CLOUDS}), + # TODO(odiego): Add support for push delivery mechanism + 'delivery': (option_decoders.EnumDecoder, { + 'valid_values': ('pull',)}), + }) + return result + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Modifies config options based on runtime flag values. + + Args: + config_values: dict mapping config option names to provided values. May + be modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. + """ + super()._ApplyFlags(config_values, flag_values) + if flag_values['cloud'].present or 'cloud' not in config_values: + config_values['cloud'] = flag_values.cloud + # TODO(odiego): Handle delivery when adding more delivery mechanisms + + +class _MessagingServiceDecoder(option_decoders.TypeVerifier): + """Validate the messaging_service dictionary of a benchmark config object.""" + + def __init__(self, **kwargs): + super().__init__(valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Verify messaging_service dict of a benchmark config object. + + Args: + value: dict. Config dictionary + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + _MessagingServiceSpec object built from the config passed in in value. + + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + messaging_service_config = super().Decode(value, component_full_name, + flag_values) + result = _MessagingServiceSpec( + self._GetOptionFullName(component_full_name), flag_values, + **messaging_service_config) + return result + + +class _DataDiscoveryServiceSpec(spec.BaseSpec): + """Specs needed to configure data discovery service. + """ + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword arguments to + construct in order to decode the named option. 
+ """ + result = super()._GetOptionDecoderConstructions() + result.update({ + 'cloud': (option_decoders.EnumDecoder, { + 'valid_values': providers.VALID_CLOUDS + }), + 'service_type': ( + option_decoders.EnumDecoder, + { + 'default': + data_discovery_service.GLUE, + 'valid_values': [ + data_discovery_service.GLUE, + ] + }), + }) + return result + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Modifies config options based on runtime flag values. + + Args: + config_values: dict mapping config option names to provided values. May + be modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. + """ + super()._ApplyFlags(config_values, flag_values) + if flag_values['cloud'].present or 'cloud' not in config_values: + config_values['cloud'] = flag_values.cloud + + +class _DataDiscoveryServiceDecoder(option_decoders.TypeVerifier): + """Validate the data_discovery_service dict of a benchmark config object.""" + + def __init__(self, **kwargs): + super().__init__(valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Verify data_discovery_service dict of a benchmark config object. + + Args: + value: dict. Config dictionary + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + _DataDiscoveryServiceSpec object built from the config passed in value. + + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + if value is None: + value = {} + data_discovery_service_config = super().Decode(value, component_full_name, + flag_values) + result = _DataDiscoveryServiceSpec( + self._GetOptionFullName(component_full_name), flag_values, + **data_discovery_service_config) + return result + + +class BenchmarkConfigSpec(spec.BaseSpec): + """Configurable options of a benchmark run. + + Attributes: + description: None or string. Description of the benchmark to run. + name: Optional. The name of the benchmark + flags: dict. Values to use for each flag while executing the benchmark. + vm_groups: dict mapping VM group name string to _VmGroupSpec. Configurable + options for each VM group used by the benchmark. + """ + + def __init__(self, component_full_name, expected_os_types=None, **kwargs): + """Initializes a BenchmarkConfigSpec. + + Args: + component_full_name: string. Fully qualified name of the benchmark config + dict within the config file. + expected_os_types: Optional series of strings from os_types.ALL. + **kwargs: Keyword arguments for the BaseSpec constructor. + + Raises: + errors.Config.InvalidValue: If expected_os_types is provided and any of + the VM groups are configured with an OS type that is not included. + """ + super(BenchmarkConfigSpec, self).__init__(component_full_name, **kwargs) + if expected_os_types is not None: + mismatched_os_types = [] + for group_name, group_spec in sorted(six.iteritems(self.vm_groups)): + if group_spec.os_type not in expected_os_types: + mismatched_os_types.append('{0}.vm_groups[{1}].os_type: {2}'.format( + component_full_name, repr(group_name), repr(group_spec.os_type))) + if mismatched_os_types: + raise errors.Config.InvalidValue( + 'VM groups in {0} may only have the following OS types: {1}. 
The ' + 'following VM group options are invalid:{2}{3}'.format( + component_full_name, + ', '.join(repr(os_type) for os_type in expected_os_types), + os.linesep, os.linesep.join(mismatched_os_types))) + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Can be overridden by derived classes to add options or impose additional + requirements on existing options. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword arguments + to construct in order to decode the named option. + """ + result = super(BenchmarkConfigSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'description': (option_decoders.StringDecoder, { + 'default': None + }), + 'name': (option_decoders.StringDecoder, { + 'default': None + }), + 'flags': (option_decoders.TypeVerifier, { + 'default': None, + 'none_ok': True, + 'valid_types': (dict,) + }), + 'vm_groups': (_VmGroupsDecoder, { + 'default': {} + }), + 'placement_group_specs': (_PlacementGroupSpecsDecoder, { + 'default': {} + }), + 'spark_service': (_SparkServiceDecoder, { + 'default': None + }), + 'container_cluster': (_ContainerClusterSpecDecoder, { + 'default': None + }), + 'container_registry': (_ContainerRegistryDecoder, { + 'default': None + }), + 'container_specs': (_ContainerSpecsDecoder, { + 'default': None + }), + 'dpb_service': (_DpbServiceDecoder, { + 'default': None + }), + 'relational_db': (_RelationalDbDecoder, { + 'default': None + }), + 'tpu_groups': (_TpuGroupsDecoder, { + 'default': {} + }), + 'edw_service': (_EdwServiceDecoder, { + 'default': None + }), + 'cloud_redis': (_CloudRedisDecoder, { + 'default': None + }), + 'vpn_service': (_VPNServiceDecoder, { + 'default': None + }), + 'app_groups': (_AppGroupsDecoder, { + 'default': {} + }), + 'vpc_peering': (option_decoders.BooleanDecoder, { + 'default': False, + 'none_ok': True, + }), + 'non_relational_db': (_NonRelationalDbDecoder, { + 'default': None, + 'none_ok': True, + }), + 'spanner': (_SpannerDecoder, { + 'default': None, + 'none_ok': True, + }), + 'messaging_service': (_MessagingServiceDecoder, { + 'default': None, + }), + 'data_discovery_service': (_DataDiscoveryServiceDecoder, { + 'default': None, + 'none_ok': True, + }) + }) + return result + + def _DecodeAndInit(self, component_full_name, config, decoders, flag_values): + """Initializes spec attributes from provided config option values. + + Args: + component_full_name: string. Fully qualified name of the configurable + component containing the config options. + config: dict mapping option name string to option value. + decoders: OrderedDict mapping option name string to ConfigOptionDecoder. + flag_values: flags.FlagValues. Runtime flags that may override provided + config option values. These flags have already been applied to the + current config, but they may be passed to the decoders for propagation + to deeper spec constructors. + """ + # Decode benchmark-specific flags first and use them while decoding the + # rest of the BenchmarkConfigSpec's options. + decoders = decoders.copy() + self.flags = config.get('flags') + with self.RedirectFlags(flag_values): + super(BenchmarkConfigSpec, + self)._DecodeAndInit(component_full_name, config, decoders, + flag_values) + + @contextlib.contextmanager + def RedirectFlags(self, flag_values): + """Redirects flag reads and writes to the benchmark-specific flags object. 
+ + Args: + flag_values: flags.FlagValues object. Within the enclosed code block, + reads and writes to this object are redirected to self.flags. + + Yields: + context manager that redirects flag reads and writes. + """ + with flag_util.OverrideFlags(flag_values, self.flags): + yield diff --git a/script/cumulus/pkb/perfkitbenchmarker/configs/default_config_constants.yaml b/script/cumulus/pkb/perfkitbenchmarker/configs/default_config_constants.yaml new file mode 100644 index 0000000..75a4b29 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/configs/default_config_constants.yaml @@ -0,0 +1,192 @@ +# All anchors defined in this file should be compatible +# with *all* clouds. That means any vm_specs or disk_specs +# defined here should have keys for every cloud. +default_single_core: &default_single_core + GCP: + machine_type: n1-standard-1 + zone: us-central1-a + image: null + Azure: + machine_type: Standard_A1 + zone: eastus2 + image: null + AWS: + machine_type: t2.small + zone: us-east-1 + image: null + AliCloud: + machine_type: ecs.g5.large + zone: cn-beijing-g + image: null + DigitalOcean: + machine_type: 2gb + zone: sfo1 + image: null + OpenStack: + machine_type: m1.small + zone: nova + image: null + CloudStack: + machine_type: 1vCPU.1GB + zone: QC-1 + image: null + Rackspace: + machine_type: general1-1 + zone: IAD + image: null + Kubernetes: + image: null + Mesos: + image: null + ProfitBricks: + machine_type: Small + zone: ZONE_1 + image: null + Docker: + image: null + machine_type: + cpus: 1 + memory: 2.0GiB + IBMCloud: + machine_type: cx2-2x4 + zone: us-south-1 + image: null + +# TODO: update the two core machines for more providers +default_dual_core: &default_dual_core + GCP: + machine_type: n1-standard-2 + zone: us-central1-a + image: null + Azure: + machine_type: Standard_D2_v3 + zone: eastus2 + image: null + AWS: + machine_type: m5.large + zone: us-east-1 + image: null + Docker: + image: null + machine_type: + cpus: 2 + memory: 4.0GiB + AliCloud: + machine_type: ecs.g5.xlarge + zone: cn-beijing-g + image: null + IBMCloud: + machine_type: cx2-4x8 + zone: us-south-1 + image: null + +# TODO(user): update the disk types below as more providers are +# updated for the disk types refactor. +default_500_gb: &default_500_gb + GCP: + disk_type: pd-standard + disk_size: 500 + mount_point: /scratch + Azure: + disk_type: Standard_LRS + disk_size: 500 + mount_point: /scratch + AWS: + disk_type: standard + disk_size: 500 + mount_point: /scratch + AliCloud: + disk_type: standard + disk_size: 500 + mount_point: /scratch + DigitalOcean: + disk_type: standard + disk_size: 500 + mount_point: /scratch + OpenStack: + disk_type: standard + disk_size: 500 + mount_point: /scratch + CloudStack: + disk_size: 500 + mount_point: /scratch + Rackspace: + disk_type: standard + disk_size: 500 + mount_point: /scratch + Kubernetes: + disk_type: emptyDir + disk_size: 500 + mount_point: /scratch + Mesos: + disk_type: local + disk_size: 500 + mount_point: /scratch + ProfitBricks: + disk_type: standard + disk_size: 500 + mount_point: /scratch + Docker: + disk_type: local + disk_size: 500 + mount_point: /scratch + IBMCloud: + disk_type: standard + disk_size: 500 + mount_point: /scratch + + +# TODO(user): update the disk types below as more providers are +# updated for the disk types refactor. 
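+# A benchmark's default config would typically pick up one of these disk
+# anchors per VM group, for example (illustrative only):
+#   vm_groups:
+#     default:
+#       disk_spec: *default_500_gb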
+default_50_gb: &default_50_gb + GCP: + disk_type: pd-standard + disk_size: 50 + mount_point: /scratch + Azure: + disk_type: Standard_LRS + disk_size: 50 + mount_point: /scratch + AWS: + disk_type: standard + disk_size: 50 + mount_point: /scratch + AliCloud: + disk_type: standard + disk_size: 50 + mount_point: /scratch + DigitalOcean: + disk_type: standard + disk_size: 50 + mount_point: /scratch + OpenStack: + disk_type: standard + disk_size: 50 + mount_point: /scratch + CloudStack: + disk_size: 50 + mount_point: /scratch + Rackspace: + disk_type: standard + disk_size: 50 + mount_point: /scratch + Kubernetes: + disk_type: emptyDir + disk_size: 50 + mount_point: /scratch + Mesos: + disk_type: local + disk_size: 50 + mount_point: /scratch + ProfitBricks: + disk_type: standard + disk_size: 50 + mount_point: /scratch + Docker: + disk_type: local + disk_size: 50 + mount_point: /scratch + IBMCloud: + disk_type: standard + disk_size: 50 + mount_point: /scratch diff --git a/script/cumulus/pkb/perfkitbenchmarker/configs/example_docker_to_cloud.yaml b/script/cumulus/pkb/perfkitbenchmarker/configs/example_docker_to_cloud.yaml new file mode 100644 index 0000000..1de08f4 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/configs/example_docker_to_cloud.yaml @@ -0,0 +1,19 @@ +# This config file is an example of performing +# a network benchmark between a Docker container +# running locally and an n1-standard-1 instance +# on GCP +# +#./pkb.py --benchmarks=netperf --benchmark_config_file=example_docker_to_cloud.yaml + +netperf: + flags: + ip_addresses: EXTERNAL + vm_groups: + vm_1: + cloud: Docker + vm_2: + cloud: GCP + vm_spec: + GCP: + machine_type: n1-standard-1 + zone: us-central1-a diff --git a/script/cumulus/pkb/perfkitbenchmarker/configs/example_user_config.yaml b/script/cumulus/pkb/perfkitbenchmarker/configs/example_user_config.yaml new file mode 100644 index 0000000..dd44618 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/configs/example_user_config.yaml @@ -0,0 +1,55 @@ +# Declare any anchors you may want to use later. +four_core: &four_core + GCP: + machine_type: n1-standard-4 + AWS: + machine_type: c4.xlarge + +# You may also want to declare static VMs. +# For a complete list of valid keys, see +# static_virtual_machine.StaticVmSpec. +static_vms: + - &vm1 + user_name: perfkit + ssh_private_key: /absolute/path/to/key + ip_address: 1.1.1.1 + - &vm2 + user_name: perfkit + ssh_private_key: /absolute/path/to/key + ip_address: 2.2.2.2 + # Declare the OS type of the VM if necessary. + os_type: rhel + # If you want to run any benchmarks that use disks you + # should declare them. + disk_specs: + # For most benchmarks, just declaring the mount point + # is sufficient. + - mount_point: /scratch + +# Multi cloud iperf config. +iperf: &iperf_multicloud + vm_groups: + vm_1: + cloud: GCP + vm_spec: *four_core + vm_2: + cloud: AWS + vm_spec: *four_core + + +# If you've already declared your static VMs, +# here's how to use them. +# Don't run fio using this config because the +# static VM example values are completely bogus. +fio: + vm_groups: + default: + static_vms: + - *vm2 + +# If you choose to, you can specify which benchmarks should be +# run in your config file. This will even let you run the same +# benchmark multiple times with different configs. 
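+# For example, passing this file via --benchmark_config_file (see
+# example_docker_to_cloud.yaml for the command-line pattern) makes the list
+# below run iperf twice: once with its default config and once with the
+# multi-cloud config defined above.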
+benchmarks: + - iperf: null # This means use the default config + - iperf: *iperf_multicloud diff --git a/script/cumulus/pkb/perfkitbenchmarker/configs/freeze_restore_spec.py b/script/cumulus/pkb/perfkitbenchmarker/configs/freeze_restore_spec.py new file mode 100644 index 0000000..34c6d39 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/configs/freeze_restore_spec.py @@ -0,0 +1,73 @@ +# Copyright 2021 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Benchmark-configurable options for freezing and restoring resources.""" + +from typing import Dict, Optional + +from absl import flags +from perfkitbenchmarker.configs import option_decoders +from perfkitbenchmarker.configs import spec + +FLAGS = flags.FLAGS + + +class FreezeRestoreSpec(spec.BaseSpec): + """Configurable freeze/restore options for resources. + + Attributes: + enable_freeze_restore: Designates the current resource to use + freeze/restore functionality if --freeze/--restore is specified on the + command line. This is a no-op if the resource does not have + _Freeze/_Restore implemented. + delete_on_freeze_error: If true, the resource deletes itself if there are + issues during a freeze. + create_on_restore_error: If true, the resource creates itself if there are + issues during a restore. + """ + + enable_freeze_restore: bool + delete_on_freeze_error: bool + create_on_restore_error: bool + + def __init__(self, + component_full_name: str, + flag_values: Optional[Dict[str, flags.FlagValues]] = None, + **kwargs): + super().__init__(component_full_name, flag_values=flag_values, **kwargs) + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword arguments + to construct in order to decode the named option. 
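+      For example, 'enable_freeze_restore' maps to
+      (option_decoders.BooleanDecoder, {'default': False, 'none_ok': True}).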
+ """ + result = super()._GetOptionDecoderConstructions() + result.update({ + 'enable_freeze_restore': (option_decoders.BooleanDecoder, { + 'default': False, + 'none_ok': True + }), + 'delete_on_freeze_error': (option_decoders.BooleanDecoder, { + 'default': False, + 'none_ok': True + }), + 'create_on_restore_error': (option_decoders.BooleanDecoder, { + 'default': False, + 'none_ok': True + }), + }) + return result diff --git a/script/cumulus/pkb/perfkitbenchmarker/configs/import_three.yml b/script/cumulus/pkb/perfkitbenchmarker/configs/import_three.yml new file mode 100644 index 0000000..6af7de2 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/configs/import_three.yml @@ -0,0 +1,2 @@ +#import test_import.yml +three: &three 3 diff --git a/script/cumulus/pkb/perfkitbenchmarker/configs/import_three2.yml b/script/cumulus/pkb/perfkitbenchmarker/configs/import_three2.yml new file mode 100644 index 0000000..e4bfe86 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/configs/import_three2.yml @@ -0,0 +1 @@ +#import import_three.yml diff --git a/script/cumulus/pkb/perfkitbenchmarker/configs/option_decoders.py b/script/cumulus/pkb/perfkitbenchmarker/configs/option_decoders.py new file mode 100644 index 0000000..420f22c --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/configs/option_decoders.py @@ -0,0 +1,376 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Classes for verifying and decoding config option values.""" + + +import abc + +from perfkitbenchmarker import errors +from perfkitbenchmarker import providers +from perfkitbenchmarker.configs import spec +import six + + +class ConfigOptionDecoder(six.with_metaclass(abc.ABCMeta, object)): + """Verifies and decodes a config option value. + + Attributes: + option: None or string. Name of the config option. + required: boolean. True if the config option is required. False if not. + """ + + def __init__(self, option=None, **kwargs): + """Initializes a ConfigOptionDecoder. + + Args: + option: None or string. Name of the config option. + **kwargs: May optionally contain a 'default' key mapping to a value or + callable object. If a value is provided, the config option is + optional, and the provided value is the default if the user does not + set a value for the config option. If a callable object is provided, + the config option is optional, and the provided object is called to + determine the value if the user does not set a value for the config + option. If not provided, the config option is required. + """ + self.option = option + self.required = 'default' not in kwargs + if not self.required: + self._default = kwargs.pop('default') + assert not kwargs, ('__init__() received unexpected keyword arguments: ' + '{0}'.format(kwargs)) + + def _GetOptionFullName(self, component_full_name): + """Returns the fully qualified name of a config option. + + Args: + component_full_name: string. Fully qualified name of a configurable object + to which the option belongs. 
+ """ + return (component_full_name if self.option is None + else '{0}.{1}'.format(component_full_name, self.option)) + + @property + def default(self): + """Gets the config option's default value. + + Returns: + Default value of an optional config option. + """ + assert not self.required, ( + 'Attempted to get the default value of required config option ' + '"{0}".'.format(self.option)) + if hasattr(self._default, '__call__'): + return self._default() + return self._default + + @abc.abstractmethod + def Decode(self, value, component_full_name, flag_values): + """Verifies and decodes a config option value. + + Args: + value: The value specified in the config. + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + The decoded value. + + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + raise NotImplementedError() + + +class EnumDecoder(ConfigOptionDecoder): + """Verifies that the config options value is in the allowed set. + + Passes through the value unmodified + """ + + def __init__(self, valid_values, **kwargs): + """Initializes the EnumVerifier. + + Args: + valid_values: list of the allowed values + **kwargs: Keyword arguments to pass to the base class. + """ + super(EnumDecoder, self).__init__(**kwargs) + self.valid_values = valid_values + + def Decode(self, value, component_full_name, flag_values): + """Verifies that the provided value is in the allowed set. + + Args: + value: The value specified in the config. + component_full_name: string. Fully qualified name of the + configurable component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be + propagated to the BaseSpec constructors. + + Returns: + The valid value. + + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + if value in self.valid_values: + return value + else: + raise errors.Config.InvalidValue( + 'Invalid {0} value: "{1}". Value must be one of the following: ' + '{2}.'.format(self._GetOptionFullName(component_full_name), value, + ', '.join(str(t) for t in self.valid_values))) + + +class TypeVerifier(ConfigOptionDecoder): + """Verifies that a config option value's type belongs to an allowed set. + + Passes value through unmodified. + """ + + def __init__(self, valid_types, none_ok=False, **kwargs): + """Initializes a TypeVerifier. + + Args: + valid_types: tuple of allowed types. + none_ok: boolean. If True, None is also an allowed option value. + **kwargs: Keyword arguments to pass to the base class. + """ + super(TypeVerifier, self).__init__(**kwargs) + if none_ok: + self._valid_types = (type(None),) + valid_types + else: + self._valid_types = valid_types + + def Decode(self, value, component_full_name, flag_values): + """Verifies that the provided value is of an allowed type. + + Args: + value: The value specified in the config. + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + The valid value. + + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + if not isinstance(value, self._valid_types): + raise errors.Config.InvalidValue( + 'Invalid {0} value: "{1}" (of type "{2}"). 
Value must be one of the ' + 'following types: {3}.'.format( + self._GetOptionFullName(component_full_name), value, + value.__class__.__name__, + ', '.join(t.__name__ for t in self._valid_types))) + return value + + +class BooleanDecoder(TypeVerifier): + """Verifies and decodes a config option value when a boolean is expected.""" + + def __init__(self, **kwargs): + super(BooleanDecoder, self).__init__((bool,), **kwargs) + + +class IntDecoder(TypeVerifier): + """Verifies and decodes a config option value when an integer is expected. + + Attributes: + max: None or int. If provided, it specifies the maximum accepted value. + min: None or int. If provided, it specifies the minimum accepted value. + """ + + def __init__(self, max=None, min=None, **kwargs): + super(IntDecoder, self).__init__((int,), **kwargs) + self.max = max + self.min = min + + def Decode(self, value, component_full_name, flag_values): + """Verifies that the provided value is an int. + + Args: + value: The value specified in the config. + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + int. The valid value. + + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + value = super(IntDecoder, self).Decode(value, component_full_name, + flag_values) + if value is not None: + if self.max is not None and value > self.max: + raise errors.Config.InvalidValue( + 'Invalid {0} value: "{1}". Value must be at most {2}.'.format( + self._GetOptionFullName(component_full_name), value, self.max)) + if self.min is not None and value < self.min: + raise errors.Config.InvalidValue( + 'Invalid {0} value: "{1}". Value must be at least {2}.'.format( + self._GetOptionFullName(component_full_name), value, self.min)) + return value + + +class FloatDecoder(TypeVerifier): + """Verifies and decodes a config option value when a float is expected. + + Attributes: + max: None or float. If provided, it specifies the maximum accepted value. + min: None or float. If provided, it specifies the minimum accepted value. + """ + + def __init__(self, max=None, min=None, **kwargs): + super(FloatDecoder, self).__init__((float, int), **kwargs) + self.max = max + self.min = min + + def Decode(self, value, component_full_name, flag_values): + """Verifies that the provided value is a float. + + Args: + value: The value specified in the config. + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + float. The valid value. + + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + value = super(FloatDecoder, self).Decode(value, component_full_name, + flag_values) + if value is not None: + if self.max is not None and value > self.max: + raise errors.Config.InvalidValue( + 'Invalid {0} value: "{1}". Value must be at most {2}.'.format( + self._GetOptionFullName(component_full_name), value, self.max)) + if self.min is not None and value < self.min: + raise errors.Config.InvalidValue( + 'Invalid {0} value: "{1}". 
Value must be at least {2}.'.format( + self._GetOptionFullName(component_full_name), value, self.min)) + return value + + +class StringDecoder(TypeVerifier): + """Verifies and decodes a config option value when a string is expected.""" + + def __init__(self, **kwargs): + super(StringDecoder, self).__init__(six.string_types, **kwargs) + + +class ListDecoder(TypeVerifier): + """Verifies and decodes a config option value when a list is expected.""" + + def __init__(self, item_decoder, **kwargs): + """Initializes a ListDecoder. + + Args: + item_decoder: ConfigOptionDecoder. Used to decode the items of an input + list. + **kwargs: Keyword arguments to pass to the base class. + """ + super(ListDecoder, self).__init__((list,), **kwargs) + self._item_decoder = item_decoder + + def Decode(self, value, component_full_name, flag_values): + """Verifies that the provided value is a list with appropriate items. + + Args: + value: The value specified in the config. + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + None if the input value was None. Otherwise, a list containing the decoded + value of each item in the input list. + + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + input_list = super(ListDecoder, self).Decode(value, component_full_name, + flag_values) + if input_list is None: + return None + list_full_name = self._GetOptionFullName(component_full_name) + result = [] + for index, input_item in enumerate(input_list): + item_full_name = '{0}[{1}]'.format(list_full_name, index) + result.append(self._item_decoder.Decode(input_item, item_full_name, + flag_values)) + return result + + +class _PerCloudConfigSpec(spec.BaseSpec): + """Contains one config dict attribute per cloud provider. + + The name of each attribute is the name of the cloud provider. + """ + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword arguments + to construct in order to decode the named option. + """ + result = super(_PerCloudConfigSpec, cls)._GetOptionDecoderConstructions() + for cloud in providers.VALID_CLOUDS: + result[cloud] = TypeVerifier, { + 'default': None, + 'valid_types': (dict,) + } + return result + + +class PerCloudConfigDecoder(TypeVerifier): + """Decodes the disk_spec or vm_spec option of a VM group config object.""" + + def __init__(self, **kwargs): + super(PerCloudConfigDecoder, self).__init__(valid_types=(dict,), **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Decodes the disk_spec or vm_spec option of a VM group config object. + + Args: + value: None or dict mapping cloud provider name string to a dict. + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + _PerCloudConfigSpec decoded from the input dict. 
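+      None is passed through unchanged when no per-cloud config was given.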
+ """ + input_dict = super(PerCloudConfigDecoder, self).Decode( + value, component_full_name, flag_values) + return None if input_dict is None else _PerCloudConfigSpec( + self._GetOptionFullName(component_full_name), + flag_values=flag_values, + **input_dict) diff --git a/script/cumulus/pkb/perfkitbenchmarker/configs/spec.py b/script/cumulus/pkb/perfkitbenchmarker/configs/spec.py new file mode 100644 index 0000000..9914954 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/configs/spec.py @@ -0,0 +1,180 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Base class for objects decoded from a YAML config.""" + + +import collections +import threading + +from perfkitbenchmarker import errors +import six + +_SPEC_REGISTRY = {} + + +def GetSpecClass(base_class, **kwargs): + """Returns the subclass with the corresponding attributes. + + Args: + base_class: The base class of the resource to return + (e.g. BaseVmSpec). + **kwargs: Every attribute/value of the subclass's ATTRS that were + used to register the subclass. + Raises: + Exception: If no class could be found with matching attributes. + """ + key = [base_class.__name__] + key += sorted(kwargs.items()) + return _SPEC_REGISTRY.get(tuple(key), base_class) + + +class BaseSpecMetaClass(type): + """Metaclass that allows each BaseSpec derived class to have its own decoders. + """ + + def __init__(cls, name, bases, dct): + super(BaseSpecMetaClass, cls).__init__(name, bases, dct) + cls._init_decoders_lock = threading.Lock() + cls._decoders = collections.OrderedDict() + cls._required_options = set() + if (all(hasattr(cls, attr) for attr in cls.SPEC_ATTRS) and + cls.SPEC_TYPE): + key = [cls.SPEC_TYPE] + key += sorted([(attr, getattr(cls, attr)) for attr in cls.SPEC_ATTRS]) + if tuple(key) in _SPEC_REGISTRY: + raise Exception('Subclasses of %s must define unique values for the ' + 'attrs: %s.' % (cls.SPEC_TYPE, cls.SPEC_ATTRS)) + _SPEC_REGISTRY[tuple(key)] = cls + + +class BaseSpec(six.with_metaclass(BaseSpecMetaClass, object)): + """Object decoded from a YAML config.""" + # The name of the spec class that will be extended with auto-registered + # subclasses. + SPEC_TYPE = None + # A list of the attributes that are used to register the subclasses. + SPEC_ATTRS = ['CLOUD'] + + # Each derived class has its own copy of the following three variables. They + # are initialized by BaseSpecMetaClass.__init__ and later populated by + # _InitDecoders when the first instance of the derived class is created. + _init_decoders_lock = None # threading.Lock that protects the next two vars. + _decoders = None # dict mapping config option name to ConfigOptionDecoder. + _required_options = None # set of strings. Required config options. + + def __init__(self, component_full_name, flag_values=None, **kwargs): + """Initializes a BaseSpec. + + Translates keyword arguments via the class's decoders and assigns the + corresponding instance attribute. 
Derived classes can register decoders + for additional attributes by overriding _GetOptionDecoderConstructions + and can add support for additional flags by overriding _ApplyFlags. + + Args: + component_full_name: string. Fully qualified name of the configurable + component containing the config options. + flag_values: None or flags.FlagValues. Runtime flags that may override + the provided config option values in kwargs. + **kwargs: dict mapping config option names to provided values. + + Raises: + errors.Config.MissingOption: If a config option is required, but a value + was not provided in kwargs. + errors.Config.UnrecognizedOption: If an unrecognized config option is + provided with a value in kwargs. + """ + if not self._decoders: + self._InitDecoders() + if flag_values: + self._ApplyFlags(kwargs, flag_values) + missing_options = self._required_options.difference(kwargs) + if missing_options: + raise errors.Config.MissingOption( + 'Required options were missing from {0}: {1}.'.format( + component_full_name, ', '.join(sorted(missing_options)))) + unrecognized_options = frozenset(kwargs).difference(self._decoders) + if unrecognized_options: + raise errors.Config.UnrecognizedOption( + 'Unrecognized options were found in {0}: {1}.'.format( + component_full_name, ', '.join(sorted(unrecognized_options)))) + self._DecodeAndInit(component_full_name, kwargs, self._decoders, + flag_values) + + @classmethod + def _InitDecoders(cls): + """Creates a ConfigOptionDecoder for each config option. + + Populates cls._decoders and cls._required_options. + """ + with cls._init_decoders_lock: + if not cls._decoders: + constructions = cls._GetOptionDecoderConstructions() + for option, decoder_construction in sorted( + six.iteritems(constructions)): + decoder_class, init_args = decoder_construction + decoder = decoder_class(option=option, **init_args) + cls._decoders[option] = decoder + if decoder.required: + cls._required_options.add(option) + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Modifies config options based on runtime flag values. + + Can be overridden by derived classes to add support for specific flags. + + Args: + config_values: dict mapping config option names to provided values. May + be modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. + """ + pass + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Can be overridden by derived classes to add options or impose additional + requirements on existing options. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword + arguments to construct in order to decode the named option. + """ + return {} + + def _DecodeAndInit(self, component_full_name, config, decoders, flag_values): + """Initializes spec attributes from provided config option values. + + Args: + component_full_name: string. Fully qualified name of the configurable + component containing the config options. + config: dict mapping option name string to option value. + decoders: OrderedDict mapping option name string to ConfigOptionDecoder. + flag_values: flags.FlagValues. Runtime flags that may override provided + config option values. These flags have already been applied to the + current config, but they may be passed to the decoders for propagation + to deeper spec constructors. 
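+
+    Options missing from config fall back to their decoder's default value.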
+ """ + assert isinstance(decoders, collections.OrderedDict), ( + 'decoders must be an OrderedDict. The order in which options are ' + 'decoded must be guaranteed.') + for option, decoder in six.iteritems(decoders): + if option in config: + value = decoder.Decode(config[option], component_full_name, flag_values) + else: + value = decoder.default + setattr(self, option, value) diff --git a/script/cumulus/pkb/perfkitbenchmarker/configs/test_import.yml b/script/cumulus/pkb/perfkitbenchmarker/configs/test_import.yml new file mode 100644 index 0000000..f0e1360 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/configs/test_import.yml @@ -0,0 +1,11 @@ +#import import_three2.yml +#import import_three.yml +#import import_three.yml + +# This is an example of how to import in config files. +# You can even use anchors from other yaml files. +# Importing a file multiple times either directly or indirectly won't break +# things. +# Imports must all be at the top of the file with no spaces between them. +flags: + num_vms: *three diff --git a/script/cumulus/pkb/perfkitbenchmarker/container_service.py b/script/cumulus/pkb/perfkitbenchmarker/container_service.py new file mode 100644 index 0000000..5de7a11 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/container_service.py @@ -0,0 +1,968 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains classes related to managed container services. + +For now this just consists of a base cluster class that other container +services will be derived from and a Kubernetes specific variant. This enables +users to run PKB VM based benchmarks on container providers (e.g. Kubernetes) +without pre-provisioning container clusters. In the future, this may be +expanded to support first-class container benchmarks. +""" + +import collections +import functools +import ipaddress +import itertools +import os +import time +from typing import Any, Dict, List, Optional + +from absl import flags +import jinja2 +from perfkitbenchmarker import context +from perfkitbenchmarker import custom_virtual_machine_spec +from perfkitbenchmarker import data +from perfkitbenchmarker import errors +from perfkitbenchmarker import events +from perfkitbenchmarker import kubernetes_helper +from perfkitbenchmarker import os_types +from perfkitbenchmarker import resource +from perfkitbenchmarker import sample +from perfkitbenchmarker import units +from perfkitbenchmarker import virtual_machine +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.configs import option_decoders +from perfkitbenchmarker.configs import spec +import requests +import six +import yaml + +KUBERNETES = 'Kubernetes' +DEFAULT_NODEPOOL = 'default' + +FLAGS = flags.FLAGS + +flags.DEFINE_string( + 'kubeconfig', None, 'Path to kubeconfig to be used by kubectl. 
' + 'If unspecified, it will be set to a file in this run\'s ' + 'temporary directory.') + +flags.DEFINE_string('kubectl', 'kubectl', 'Path to kubectl tool') + +flags.DEFINE_boolean( + 'local_container_build', False, + 'Force container images to be built locally rather than ' + 'just as a fallback if there is no remote image builder ' + 'associated with the registry.') + +flags.DEFINE_boolean( + 'static_container_image', True, + 'Whether container images are static (i.e. are not ' + 'managed by PKB). If this is set, PKB will accept the ' + 'image as fully qualified (including repository) and will ' + 'not attempt to build it.') + +flags.DEFINE_boolean( + 'force_container_build', False, + 'Whether to force PKB to build container images even ' + 'if they already exist in the registry.') + +flags.DEFINE_string( + 'container_cluster_cloud', None, + 'Sets the cloud to use for the container cluster. ' + 'This will override both the value set in the config and ' + 'the value set using the generic "cloud" flag.') + +flags.DEFINE_integer( + 'container_cluster_num_vms', None, + 'Number of nodes in the cluster. Defaults to ' + 'container_cluster.vm_count') + +flags.DEFINE_string('container_cluster_type', KUBERNETES, + 'The type of container cluster.') + +flags.DEFINE_string( + 'container_cluster_version', None, + 'Optional version flag to pass to the cluster create ' + 'command. If not specified, the cloud-specific container ' + 'implementation will chose an appropriate default.') + +_CONTAINER_CLUSTER_ARCHITECTURE = flags.DEFINE_list( + 'container_cluster_architecture', ['linux/amd64'], + 'The architecture(s) that the container cluster uses. ' + 'Defaults to linux/amd64') + +_K8S_INGRESS = """ +apiVersion: extensions/v1beta1 +kind: Ingress +metadata: + name: {service_name}-ingress +spec: + backend: + serviceName: {service_name} + servicePort: 8080 +""" + + +class ContainerException(errors.Error): + """Exception during the creation or execution of a container.""" + + +class FatalContainerException(errors.Resource.CreationError, + ContainerException): + """Fatal Exception during the creation or execution of a container.""" + pass + + +class RetriableContainerException(errors.Resource.RetryableCreationError, + ContainerException): + """Retriable Exception during the creation or execution of a container.""" + pass + + +def RunKubectlCommand(command: List[str], **kwargs): + """Run a kubectl command.""" + cmd = [FLAGS.kubectl, '--kubeconfig', FLAGS.kubeconfig] + command + return vm_util.IssueCommand(cmd, **kwargs) + + +class ContainerSpec(spec.BaseSpec): + """Class containing options for creating containers.""" + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Apply flag settings to the container spec.""" + super(ContainerSpec, cls)._ApplyFlags(config_values, flag_values) + if flag_values['image'].present: + config_values['image'] = flag_values.image + if flag_values['static_container_image'].present: + config_values['static_image'] = flag_values.static_container_image + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Can be overridden by derived classes to add options or impose additional + requirements on existing options. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword + arguments to construct in order to decode the named option. 
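+      For example, 'container_port' maps to
+      (option_decoders.IntDecoder, {'default': 8080}).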
+ """ + result = super(ContainerSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'image': (option_decoders.StringDecoder, { + 'default': None + }), + 'static_image': (option_decoders.BooleanDecoder, { + 'default': False + }), + 'cpus': (option_decoders.FloatDecoder, { + 'default': None + }), + 'memory': (custom_virtual_machine_spec.MemoryDecoder, { + 'default': None + }), + 'command': (_CommandDecoder, {}), + 'container_port': (option_decoders.IntDecoder, { + 'default': 8080 + }), + }) + return result + + +class _CommandDecoder(option_decoders.ListDecoder): + """Decodes the command/arg list for containers.""" + + def __init__(self, **kwargs): + super(_CommandDecoder, self).__init__( + default=None, + none_ok=True, + item_decoder=option_decoders.StringDecoder(), + **kwargs) + + +class BaseContainer(resource.BaseResource): + """Class representing a single container.""" + + def __init__(self, container_spec=None, **_): + # Hack to make container_spec a kwarg + assert container_spec + super(BaseContainer, self).__init__() + self.cpus = container_spec.cpus + self.memory = container_spec.memory + self.command = container_spec.command + self.image = container_spec.image + self.ip_address = None + + def WaitForExit(self, timeout: int = 1200) -> Dict[str, Any]: + """Gets the successfully finished container. + + Args: + timeout: The timeout to wait in seconds + + Raises: + FatalContainerException: If the container fails + RetriableContainerException: If the container times out wihout succeeding. + """ + raise NotImplementedError() + + def GetLogs(self): + """Returns the logs from the container.""" + raise NotImplementedError() + + +class BaseContainerService(resource.BaseResource): + """Class representing a service backed by containers.""" + + def __init__(self, container_spec): + super(BaseContainerService, self).__init__() + self.cpus = container_spec.cpus + self.memory = container_spec.memory + self.command = container_spec.command + self.image = container_spec.image + self.container_port = container_spec.container_port + self.ip_address = None + self.port = None + self.host_header = None + + +class _ContainerImage(object): + """Simple class for tracking container image names and source locations.""" + + def __init__(self, name): + self.name = name + self.directory = os.path.dirname( + data.ResourcePath(os.path.join('docker', self.name, 'Dockerfile'))) + + +class ContainerRegistrySpec(spec.BaseSpec): + """Spec containing options for creating a Container Registry.""" + + def __init__(self, component_full_name, flag_values=None, **kwargs): + super(ContainerRegistrySpec, self).__init__( + component_full_name, flag_values=flag_values, **kwargs) + registry_spec = getattr(self.spec, self.cloud, {}) + self.project = registry_spec.get('project') + self.zone = registry_spec.get('zone') + self.name = registry_spec.get('name') + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Apply flag values to the spec.""" + super(ContainerRegistrySpec, cls)._ApplyFlags(config_values, flag_values) + if flag_values['cloud'].present or 'cloud' not in config_values: + config_values['cloud'] = flag_values.cloud + if flag_values['container_cluster_cloud'].present: + config_values['cloud'] = flag_values.container_cluster_cloud + updated_spec = {} + if flag_values['project'].present: + updated_spec['project'] = flag_values.project + if flag_values['zones'].present: + updated_spec['zone'] = flag_values.zones[0] + cloud = config_values['cloud'] + cloud_spec = config_values.get('spec', 
{}).get(cloud, {}) + cloud_spec.update(updated_spec) + config_values['spec'] = {cloud: cloud_spec} + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Can be overridden by derived classes to add options or impose additional + requirements on existing options. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword + arguments to construct in order to decode the named option. + """ + result = super(ContainerRegistrySpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'cloud': (option_decoders.StringDecoder, {}), + 'spec': (option_decoders.PerCloudConfigDecoder, { + 'default': {} + }) + }) + return result + + +def GetContainerRegistryClass(cloud): + return resource.GetResourceClass(BaseContainerRegistry, CLOUD=cloud) + + +class BaseContainerRegistry(resource.BaseResource): + """Base class for container image registries.""" + + RESOURCE_TYPE = 'BaseContainerRegistry' + + def __init__(self, registry_spec): + super(BaseContainerRegistry, self).__init__() + benchmark_spec = context.GetThreadBenchmarkSpec() + container_cluster = getattr(benchmark_spec, 'container_cluster', None) + zone = getattr(container_cluster, 'zone', None) + project = getattr(container_cluster, 'project', None) + self.zone = registry_spec.zone or zone + self.project = registry_spec.project or project + self.name = registry_spec.name or 'pkb%s' % FLAGS.run_uri + self.local_build_times = {} + self.remote_build_times = {} + self.metadata.update({'cloud': self.CLOUD}) + + def _Create(self): + """Creates the image registry.""" + pass + + def _Delete(self): + """Deletes the image registry.""" + pass + + def GetSamples(self): + """Returns image build related samples.""" + samples = [] + metadata = self.GetResourceMetadata() + for image_name, build_time in self.local_build_times.items(): + metadata.update({ + 'build_type': 'local', + 'image': image_name, + }) + samples.append( + sample.Sample('Image Build Time', build_time, 'seconds', metadata)) + for image_name, build_time in self.remote_build_times.items(): + metadata.update({ + 'build_type': 'remote', + 'image': image_name, + }) + samples.append( + sample.Sample('Image Build Time', build_time, 'seconds', metadata)) + return samples + + def GetFullRegistryTag(self, image): + """Returns the full name of the image for the registry. + + Args: + image: The PKB name of the image (string). + """ + raise NotImplementedError() + + def PrePush(self, image): + """Prepares registry to push a given image.""" + pass + + def RemoteBuild(self, image): + """Build the image remotely. + + Args: + image: Instance of _ContainerImage representing the image to build. + """ + raise NotImplementedError() + + def Login(self): + """Log in to the registry (in order to push to it).""" + raise NotImplementedError() + + def LocalBuildAndPush(self, image): + """Build the image locally and push to registry. + + Assumes we are already authenticated with the registry from self.Login. + Building and pushing done in one command to support multiarch images + https://github.com/docker/buildx/issues/59 + + Args: + image: Instance of _ContainerImage representing the image to build. 
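+
+    The image is tagged with GetFullRegistryTag(image.name) and pushed as
+    part of a single 'docker buildx build --push' invocation.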
+ """ + full_tag = self.GetFullRegistryTag(image.name) + # Multiarch images require buildx create + # https://github.com/docker/build-push-action/issues/302 + vm_util.IssueCommand(['docker', 'buildx', 'create', '--use']) + cmd = ['docker', 'buildx', 'build'] + if _CONTAINER_CLUSTER_ARCHITECTURE.value: + cmd += ['--platform', ','.join(_CONTAINER_CLUSTER_ARCHITECTURE.value)] + cmd += ['--no-cache', '--push', '-t', full_tag, image.directory] + vm_util.IssueCommand(cmd) + vm_util.IssueCommand(['docker', 'buildx', 'stop']) + + def GetOrBuild(self, image): + """Finds the image in the registry or builds it. + + TODO(pclay): Add support for build ARGs. + + Args: + image: The PKB name for the image (string). + + Returns: + The full image name (including the registry). + """ + full_image = self.GetFullRegistryTag(image) + # Log in to the registry to see if image exists + self.Login() + if not FLAGS.force_container_build: + # manifest inspect inpspects the registry's copy + inspect_cmd = ['docker', 'manifest', 'inspect', full_image] + _, _, retcode = vm_util.IssueCommand( + inspect_cmd, suppress_warning=True, raise_on_failure=False) + if retcode == 0: + return full_image + self._Build(image) + return full_image + + def _Build(self, image): + """Builds the image and pushes it to the registry if necessary. + + Args: + image: The PKB name for the image (string). + """ + image = _ContainerImage(image) + build_start = time.time() + if not FLAGS.local_container_build: + try: + # Build the image remotely using an image building service. + self.RemoteBuild(image) + self.remote_build_times[image.name] = time.time() - build_start + return + except NotImplementedError: + pass + + self.PrePush(image) + # Build the image locally using docker. + build_start = time.time() + self.LocalBuildAndPush(image) + self.local_build_times[image.name] = time.time() - build_start + + +@events.benchmark_start.connect +def _SetKubeConfig(unused_sender, benchmark_spec): + """Sets the value for the kubeconfig flag if it's unspecified.""" + if not FLAGS.kubeconfig: + FLAGS.kubeconfig = vm_util.PrependTempDir( + 'kubeconfig' + str(benchmark_spec.sequence_number)) + # Store the value for subsequent run stages. + benchmark_spec.config.flags['kubeconfig'] = FLAGS.kubeconfig + + +def NodePoolName(name: str) -> str: + """Clean node pool names to be usable by all providers.""" + # GKE (or k8s?) requires nodepools use alphanumerics and hyphens + # AKS requires full alphanumeric + # PKB likes to use underscores strip them out. + return name.replace('_', '') + + +def GetContainerClusterClass(cloud, cluster_type): + return resource.GetResourceClass( + BaseContainerCluster, CLOUD=cloud, CLUSTER_TYPE=cluster_type) + + +class BaseContainerCluster(resource.BaseResource): + """A cluster that can be used to schedule containers.""" + + RESOURCE_TYPE = 'BaseContainerCluster' + REQUIRED_ATTRS = ['CLOUD', 'CLUSTER_TYPE'] + + def __init__(self, cluster_spec): + super().__init__(user_managed=bool(cluster_spec.static_cluster)) + self.name = cluster_spec.static_cluster or 'pkb-' + FLAGS.run_uri + self.vm_config = virtual_machine.GetVmClass(self.CLOUD, os_types.DEFAULT)( + cluster_spec.vm_spec) + self.zone = self.vm_config.zone + # Use Virtual Machine class to resolve VM Spec. This lets subclasses parse + # Provider specific information like disks out of the spec. + for name, nodepool in cluster_spec.nodepools.copy().items(): + nodepool_zone = nodepool.vm_spec.zone + # VM Classes can require zones. But nodepools have optional zones. 
+ if not nodepool_zone: + nodepool.vm_spec.zone = self.zone + nodepool.vm_config = virtual_machine.GetVmClass( + self.CLOUD, os_types.DEFAULT)(nodepool.vm_spec) + nodepool.vm_config.zone = nodepool_zone + nodepool.num_nodes = nodepool.vm_count + # Fix name + del cluster_spec.nodepools[name] + cluster_spec.nodepools[NodePoolName(name)] = nodepool + self.nodepools = cluster_spec.nodepools + self.num_nodes = cluster_spec.vm_count + self.min_nodes = cluster_spec.min_vm_count or self.num_nodes + self.max_nodes = cluster_spec.max_vm_count or self.num_nodes + self.containers = collections.defaultdict(list) + self.services = {} + + def DeleteContainers(self): + """Delete containers belonging to the cluster.""" + for container in itertools.chain(*list(self.containers.values())): + container.Delete() + + def DeleteServices(self): + """Delete services belonging to the cluster.""" + for service in self.services.values(): + service.Delete() + + def GetResourceMetadata(self): + """Returns a dictionary of cluster metadata.""" + nodepools = {} + for name, nodepool in six.iteritems(self.nodepools): + nodepool_metadata = { + 'size': nodepool.num_nodes, + 'machine_type': nodepool.vm_config.machine_type, + 'name': name + } + nodepools[name] = nodepool_metadata + + metadata = { + 'cloud': self.CLOUD, + 'cluster_type': self.CLUSTER_TYPE, + 'zone': self.zone, + 'size': self.num_nodes, + 'machine_type': self.vm_config.machine_type, + 'nodepools': nodepools + } + + if self.min_nodes != self.num_nodes or self.max_nodes != self.num_nodes: + metadata.update({ + 'max_size': self.max_nodes, + 'min_size': self.min_nodes, + }) + + return metadata + + def DeployContainer(self, name, container_spec): + """Deploys Containers according to the ContainerSpec.""" + raise NotImplementedError() + + def DeployContainerService(self, name, container_spec, num_containers): + """Deploys a ContainerSerivice according to the ContainerSpec.""" + raise NotImplementedError() + + def GetSamples(self): + """Return samples with information about deployment times.""" + samples = [] + if self.resource_ready_time and self.create_start_time: + samples.append( + sample.Sample('Cluster Creation Time', + self.resource_ready_time - self.create_start_time, + 'seconds')) + for container in itertools.chain(*list(self.containers.values())): + metadata = {'image': container.image.split('/')[-1]} + if container.resource_ready_time and container.create_start_time: + samples.append( + sample.Sample( + 'Container Deployment Time', + container.resource_ready_time - container.create_start_time, + 'seconds', metadata)) + if container.delete_end_time and container.delete_start_time: + samples.append( + sample.Sample( + 'Container Delete Time', + container.delete_end_time - container.delete_start_time, + 'seconds', metadata)) + for service in self.services.values(): + metadata = {'image': service.image.split('/')[-1]} + if service.resource_ready_time and service.create_start_time: + samples.append( + sample.Sample( + 'Service Deployment Time', + service.resource_ready_time - service.create_start_time, + 'seconds', metadata)) + if service.delete_end_time and service.delete_start_time: + samples.append( + sample.Sample('Service Delete Time', + service.delete_end_time - service.delete_start_time, + 'seconds', metadata)) + + return samples + + +class KubernetesPod: + """Representation of a Kubernetes pod. + + It can be created as a PKB managed resource using KubernetesContainer, + or created with ApplyManifest and directly constructed. 
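+
+  A name is required; _GetPod() fetches the pod via
+  'kubectl get pod <name> -o yaml' and records its podIP on ip_address.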
+ """ + + def __init__(self, name=None, **_): + assert name + self.name = name + + def _GetPod(self) -> Dict[str, Any]: + """Gets a representation of the POD and returns it.""" + stdout, _, _ = RunKubectlCommand(['get', 'pod', self.name, '-o', 'yaml']) + pod = yaml.safe_load(stdout) + self.ip_address = pod.get('status', {}).get('podIP') + return pod + + def WaitForExit(self, timeout: int = None) -> Dict[str, Any]: + """Gets the finished running container.""" + + @vm_util.Retry( + timeout=timeout, retryable_exceptions=(RetriableContainerException,)) + def _WaitForExit(): + # Inspect the pod's status to determine if it succeeded, has failed, or is + # doomed to fail. + # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/ + pod = self._GetPod() + status = pod['status'] + phase = status['phase'] + if phase == 'Succeeded': + return pod + elif phase == 'Failed': + raise FatalContainerException( + f"Pod {self.name} failed:\n{yaml.dump(pod['status'])}") + else: + for condition in status.get('conditions', []): + if (condition['type'] == 'PodScheduled' and + condition['status'] == 'False' and + condition['reason'] == 'Unschedulable'): + # TODO(pclay): Revisit this when we scale clusters. + raise FatalContainerException( + f"Pod {self.name} failed to schedule:\n{condition['message']}") + for container_status in status.get('containerStatuses', []): + waiting_status = container_status['state'].get('waiting', {}) + if waiting_status.get('reason') in [ + 'ErrImagePull', 'ImagePullBackOff' + ]: + raise FatalContainerException( + f'Failed to find container image for {status.name}:\n' + + yaml.dump(waiting_status.get('message'))) + raise RetriableContainerException( + f'Pod phase ({phase}) not in finished phases.') + + return _WaitForExit() + + def GetLogs(self): + """Returns the logs from the container.""" + stdout, _, _ = RunKubectlCommand(['logs', self.name]) + return stdout + + +class KubernetesContainer(BaseContainer, KubernetesPod): + """A KubernetesPod based flavor of Container.""" + + def _Create(self): + """Creates the container.""" + run_cmd = [ + 'run', + self.name, + '--image=%s' % self.image, + '--restart=Never', + ] + + limits = [] + if self.cpus: + limits.append(f'cpu={int(1000 * self.cpus)}m') + if self.memory: + limits.append(f'memory={self.memory}Mi') + if limits: + run_cmd.append('--limits=' + ','.join(limits)) + + if self.command: + run_cmd.extend(['--command', '--']) + run_cmd.extend(self.command) + RunKubectlCommand(run_cmd) + + def _Delete(self): + """Deletes the container.""" + pass + + def _IsReady(self): + """Returns true if the container has stopped pending.""" + return self._GetPod()['status']['phase'] != 'Pending' + + +class KubernetesContainerService(BaseContainerService): + """A Kubernetes flavor of Container Service.""" + + def __init__(self, container_spec, name): + super(KubernetesContainerService, self).__init__(container_spec) + self.name = name + self.port = 8080 + + def _Create(self): + run_cmd = [ + 'run', self.name, + '--image=%s' % self.image, '--port', + str(self.port) + ] + + limits = [] + if self.cpus: + limits.append(f'cpu={int(1000 * self.cpus)}m') + if self.memory: + limits.append(f'memory={self.memory}Mi') + if limits: + run_cmd.append('--limits=' + ','.join(limits)) + + if self.command: + run_cmd.extend(['--command', '--']) + run_cmd.extend(self.command) + RunKubectlCommand(run_cmd) + + expose_cmd = [ + 'expose', 'deployment', self.name, '--type', 'NodePort', + '--target-port', + str(self.port) + ] + RunKubectlCommand(expose_cmd) + with 
vm_util.NamedTemporaryFile() as tf: + tf.write(_K8S_INGRESS.format(service_name=self.name)) + tf.close() + kubernetes_helper.CreateFromFile(tf.name) + + def _GetIpAddress(self): + """Attempts to set the Service's ip address.""" + ingress_name = '%s-ingress' % self.name + get_cmd = [ + 'get', 'ing', ingress_name, '-o', + 'jsonpath={.status.loadBalancer.ingress[*].ip}' + ] + stdout, _, _ = RunKubectlCommand(get_cmd) + ip_address = stdout + if ip_address: + self.ip_address = ip_address + + def _IsReady(self): + """Returns True if the Service is ready.""" + if self.ip_address is None: + self._GetIpAddress() + if self.ip_address is not None: + url = 'http://%s' % (self.ip_address) + r = requests.get(url) + if r.status_code == 200: + return True + return False + + def _Delete(self): + """Deletes the service.""" + with vm_util.NamedTemporaryFile() as tf: + tf.write(_K8S_INGRESS.format(service_name=self.name)) + tf.close() + kubernetes_helper.DeleteFromFile(tf.name) + + delete_cmd = ['delete', 'deployment', self.name] + RunKubectlCommand(delete_cmd, raise_on_failure=False) + + +class KubernetesCluster(BaseContainerCluster): + """A Kubernetes flavor of Container Cluster.""" + + CLUSTER_TYPE = KUBERNETES + + def _DeleteAllFromDefaultNamespace(self): + """Deletes all resources from a namespace. + + Since StatefulSets do not reclaim PVCs upon deletion, they are explicitly + deleted here to prevent dynamically provisioned PDs from leaking once the + cluster has been deleted. + """ + run_cmd = [ + 'delete', 'all', '--all', '-n', 'default' + ] + RunKubectlCommand(run_cmd) + + run_cmd = [ + 'delete', 'pvc', '--all', '-n', 'default' + ] + RunKubectlCommand(run_cmd) + + def _Delete(self): + self._DeleteAllFromDefaultNamespace() + + def GetResourceMetadata(self): + """Returns a dict containing metadata about the cluster.""" + result = super().GetResourceMetadata() + result['container_cluster_version'] = self.k8s_version + return result + + def DeployContainer(self, base_name, container_spec): + """Deploys Containers according to the ContainerSpec.""" + name = base_name + str(len(self.containers[base_name])) + container = KubernetesContainer(container_spec=container_spec, name=name) + self.containers[base_name].append(container) + container.Create() + + def DeployContainerService(self, name, container_spec): + """Deploys a ContainerSerivice according to the ContainerSpec.""" + service = KubernetesContainerService(container_spec, name) + self.services[name] = service + service.Create() + + # TODO(pclay): Revisit instance methods that don't rely on instance data. + def ApplyManifest(self, manifest_file, **kwargs): + """Applies a declarative Kubernetes manifest; possibly with jinja. + + Args: + manifest_file: The name of the YAML file or YAML template. + **kwargs: Arguments to the jinja template. 
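+
+    Only manifests ending in '.j2' are rendered with jinja2 (StrictUndefined);
+    plain YAML files are applied as-is and must not be given kwargs.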
+ """ + filename = data.ResourcePath(manifest_file) + if not filename.endswith('.j2'): + assert not kwargs + RunKubectlCommand(['apply', '-f', filename]) + return + + environment = jinja2.Environment(undefined=jinja2.StrictUndefined) + with open(filename) as template_file, vm_util.NamedTemporaryFile( + mode='w', suffix='.yaml') as rendered_template: + manifest = environment.from_string(template_file.read()).render(kwargs) + rendered_template.write(manifest) + rendered_template.close() + RunKubectlCommand(['apply', '-f', rendered_template.name]) + + def WaitForResource(self, resource_name, condition_name, namespace=None): + """Waits for a condition on a Kubernetes resource (eg: deployment, pod).""" + run_cmd = [ + 'wait', f'--for=condition={condition_name}', + f'--timeout={vm_util.DEFAULT_TIMEOUT}s', resource_name + ] + if namespace: + run_cmd.append(f'--namespace={namespace}') + RunKubectlCommand(run_cmd) + + def WaitForRollout(self, resource_name): + """Blocks until a Kubernetes rollout is completed.""" + run_cmd = [ + 'rollout', + 'status', + '--timeout=%ds' % vm_util.DEFAULT_TIMEOUT, + resource_name + ] + + RunKubectlCommand(run_cmd) + + @vm_util.Retry(retryable_exceptions=(errors.Resource.RetryableCreationError,)) + def GetLoadBalancerIP(self, service_name): + """Returns the IP address of a LoadBalancer service when ready.""" + get_cmd = [ + 'get', 'service', service_name, '-o', + 'jsonpath={.status.loadBalancer.ingress[0].ip}' + ] + + stdout, _, _ = RunKubectlCommand(get_cmd) + + try: + # Ensure the load balancer is ready by parsing the output IP + ip_address = ipaddress.ip_address(stdout) + except ValueError: + raise errors.Resource.RetryableCreationError( + "Load Balancer IP for service '%s' is not ready." % service_name) + + return format(ip_address) + + @vm_util.Retry(retryable_exceptions=(errors.Resource.RetryableCreationError,)) + def GetClusterIP(self, service_name) -> str: + """Returns the IP address of a ClusterIP service when ready.""" + get_cmd = [ + 'get', 'service', service_name, '-o', 'jsonpath={.spec.clusterIP}' + ] + + stdout, _, _ = RunKubectlCommand(get_cmd) + + if not stdout: + raise errors.Resource.RetryableCreationError( + "ClusterIP for service '%s' is not ready." % service_name) + + return stdout + + def CreateConfigMap(self, name, from_file_dir): + """Creates a Kubernetes ConfigMap. + + Args: + name: The name of the ConfigMap to create + from_file_dir: The directory name containing files that will be key/values + in the ConfigMap + """ + RunKubectlCommand( + ['create', 'configmap', name, '--from-file', from_file_dir]) + + def CreateServiceAccount(self, + name: str, + clusterrole: Optional[str] = None, + namespace='default'): + """Create a k8s service account and cluster-role-binding.""" + RunKubectlCommand( + ['create', 'serviceaccount', name, '--namespace', namespace]) + if clusterrole: + # TODO(pclay): Support customer cluster roles? + RunKubectlCommand([ + 'create', + 'clusterrolebinding', + f'{name}-role', + f'--clusterrole={clusterrole}', + f'--serviceaccount={namespace}:{name}', + '--namespace', + namespace, + ]) + + # TODO(pclay): Move to cached property in Python 3.9 + @property + @functools.lru_cache(maxsize=1) + def node_memory_allocatable(self) -> units.Quantity: + """Usable memory of each node in cluster in KiB.""" + stdout, _, _ = RunKubectlCommand( + # TODO(pclay): Take a minimum of all nodes? 
+ [ + 'get', 'nodes', '-o', + 'jsonpath={.items[0].status.allocatable.memory}' + ]) + return units.ParseExpression(stdout) + + @property + @functools.lru_cache(maxsize=1) + def node_num_cpu(self) -> int: + """vCPU of each node in cluster.""" + stdout, _, _ = RunKubectlCommand( + ['get', 'nodes', '-o', 'jsonpath={.items[0].status.capacity.cpu}']) + return int(stdout) + + @property + @functools.lru_cache(maxsize=1) + def k8s_version(self) -> str: + """Actual Kubernetes version reported by server.""" + stdout, _, _ = RunKubectlCommand(['version', '-o', 'yaml']) + return yaml.safe_load(stdout)['serverVersion']['gitVersion'] + + def GetPodLabel(self, resource_name): + run_cmd = [ + 'get', resource_name, + '-o', 'jsonpath="{.spec.selector.matchLabels.app}"' + ] + + stdout, _, _ = RunKubectlCommand(run_cmd) + return yaml.safe_load(stdout) + + def GetPodIps(self, resource_name): + """Returns a list of internal IPs for a pod name. + + Args: + resource_name: The pod resource name + """ + pod_label = self.GetPodLabel(resource_name) + + get_cmd = [ + 'get', 'pods', '-l', 'app=%s' % pod_label, + '-o', 'jsonpath="{.items[*].status.podIP}"' + ] + + stdout, _, _ = RunKubectlCommand(get_cmd) + return yaml.safe_load(stdout).split() + + def RunKubectlExec(self, pod_name, cmd): + run_cmd = [ + 'exec', '-it', pod_name, '--' + ] + cmd + RunKubectlCommand(run_cmd) + + # TODO(pclay): integrate with kubernetes_disk. + def GetDefaultStorageClass(self) -> str: + """Get the default storage class for the provider.""" + raise NotImplementedError diff --git a/script/cumulus/pkb/perfkitbenchmarker/context.py b/script/cumulus/pkb/perfkitbenchmarker/context.py new file mode 100644 index 0000000..c9dcfb8 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/context.py @@ -0,0 +1,39 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module for working with the current thread context.""" + +import threading + + +class _ThreadData(threading.local): + def __init__(self): + self.benchmark_spec = None + + +_thread_local = _ThreadData() + + +def SetThreadBenchmarkSpec(benchmark_spec): + """Sets the current thread's BenchmarkSpec object.""" + _thread_local.benchmark_spec = benchmark_spec + + +def GetThreadBenchmarkSpec(): + """Gets the current thread's BenchmarkSpec object. + + If SetThreadBenchmarkSpec() has not been called in either the current thread + or in an ancestor, then this method will return None by default. + """ + return _thread_local.benchmark_spec diff --git a/script/cumulus/pkb/perfkitbenchmarker/custom_virtual_machine_spec.py b/script/cumulus/pkb/perfkitbenchmarker/custom_virtual_machine_spec.py new file mode 100644 index 0000000..c278f76 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/custom_virtual_machine_spec.py @@ -0,0 +1,212 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Classes relating to decoding a custom machine type. +""" + + +import re + +from perfkitbenchmarker import errors +from perfkitbenchmarker.configs import option_decoders +from perfkitbenchmarker.configs import spec +from perfkitbenchmarker.providers.azure import flags as azure_flags +import six + + +class MemoryDecoder(option_decoders.StringDecoder): + """Verifies and decodes a config option value specifying a memory size.""" + + _CONFIG_MEMORY_PATTERN = re.compile(r'([0-9.]+)([GM]iB)') + + def Decode(self, value, component_full_name, flag_values): + """Decodes memory size in MiB from a string. + + The value specified in the config must be a string representation of the + memory size expressed in MiB or GiB. It must be an integer number of MiB + Examples: "1280MiB", "7.5GiB". + + Args: + value: The value specified in the config. + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + int. Memory size in MiB. + + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + string = super(MemoryDecoder, self).Decode(value, component_full_name, + flag_values) + match = self._CONFIG_MEMORY_PATTERN.match(string) + if not match: + raise errors.Config.InvalidValue( + 'Invalid {0} value: "{1}". Examples of valid values: "1280MiB", ' + '"7.5GiB".'.format(self._GetOptionFullName(component_full_name), + string)) + try: + memory_value = float(match.group(1)) + except ValueError: + raise errors.Config.InvalidValue( + 'Invalid {0} value: "{1}". "{2}" is not a valid float.'.format( + self._GetOptionFullName(component_full_name), string, + match.group(1))) + memory_units = match.group(2) + if memory_units == 'GiB': + memory_value *= 1024 + memory_mib_int = int(memory_value) + if memory_value != memory_mib_int: + raise errors.Config.InvalidValue( + 'Invalid {0} value: "{1}". The specified size must be an integer ' + 'number of MiB.'.format(self._GetOptionFullName(component_full_name), + string)) + return memory_mib_int + + +class CustomMachineTypeSpec(spec.BaseSpec): + """Properties of a custom machine type. + + Attributes: + cpus: int. Number of vCPUs. + memory: string. Representation of the size of memory, expressed in MiB or + GiB. Must be an integer number of MiB (e.g. "1280MiB", "7.5GiB"). + """ + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword + arguments to construct in order to decode the named option. 
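+      For this spec the decoded options are 'cpus' (an integer of at least 1)
+      and 'memory' (a MiB/GiB size string handled by MemoryDecoder).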
+ """ + result = super(CustomMachineTypeSpec, cls)._GetOptionDecoderConstructions() + result.update({'cpus': (option_decoders.IntDecoder, {'min': 1}), + 'memory': (MemoryDecoder, {})}) + return result + + +class MachineTypeDecoder(option_decoders.TypeVerifier): + """Decodes the machine_type option of a VM config.""" + + def __init__(self, **kwargs): + super(MachineTypeDecoder, self).__init__((six.string_types + (dict,)), + **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Decodes the machine_type option of a VM config. + + Args: + value: Either a string name of a machine type or a dict containing + 'cpu' and 'memory' keys describing a custom VM. + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + If value is a string, returns it unmodified. Otherwise, returns the + decoded CustomMachineTypeSpec. + + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + super(MachineTypeDecoder, self).Decode(value, component_full_name, + flag_values) + if isinstance(value, six.string_types): + return value + return CustomMachineTypeSpec(self._GetOptionFullName(component_full_name), + flag_values=flag_values, **value) + + +class AzureMachineTypeDecoder(option_decoders.TypeVerifier): + """Decodes the machine_type option of a VM config.""" + + def __init__(self, **kwargs): + super(AzureMachineTypeDecoder, self).__init__(six.string_types + (dict,), + **kwargs) + + def Decode(self, value, component_full_name, flag_values): + """Decodes the machine_type option of a VM config. + + Args: + value: Either a string name of a machine type or a dict containing + 'compute_units' and 'tier' keys describing a machine type. + component_full_name: string. Fully qualified name of the configurable + component containing the config option. + flag_values: flags.FlagValues. Runtime flag values to be propagated to + BaseSpec constructors. + + Returns: + If value is a string, returns it unmodified. Otherwise, returns the + decoded CustomMachineTypeSpec. + + Raises: + errors.Config.InvalidValue upon invalid input value. + """ + super(AzureMachineTypeDecoder, self).Decode(value, component_full_name, + flag_values) + if isinstance(value, six.string_types): + return value + return AzurePerformanceTierDecoder( + self._GetOptionFullName(component_full_name), + flag_values=flag_values, **value) + + +class AzurePerformanceTierDecoder(spec.BaseSpec): + """Properties of a An Azure custom machine type. + + Attributes: + compute_units: int. Number of compute units. + tier: Basic, Standard or Premium + """ + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword + arguments to construct in order to decode the named option. 
+ """ + result = super( + AzurePerformanceTierDecoder, cls)._GetOptionDecoderConstructions() + # https://docs.microsoft.com/en-us/azure/virtual-machines/windows/acu + # https://docs.microsoft.com/en-us/azure/sql-database/sql-database-service-tiers + result.update({'compute_units': (option_decoders.IntDecoder, {'min': 50}), + 'tier': (option_decoders.EnumDecoder, { + 'valid_values': azure_flags.VALID_TIERS})}) + return result + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Modifies config options based on runtime flag values. + + Can be overridden by derived classes to add support for specific flags. + + Args: + config_values: dict mapping config option names to provided values. + May be modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. + """ + if flag_values['azure_tier'].present: + config_values['tier'] = flag_values.azure_tier + + if flag_values['azure_compute_units'].present: + config_values['compute_units'] = flag_values.azure_compute_units diff --git a/script/cumulus/pkb/perfkitbenchmarker/data/__init__.py b/script/cumulus/pkb/perfkitbenchmarker/data/__init__.py new file mode 100644 index 0000000..5aee938 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/data/__init__.py @@ -0,0 +1,238 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""This module defines an interface for finding named resources. + +Due to license restrictions, not all software dependences can be shipped with +PerfKitBenchmarker. +Those that can be included in perfkitbenchmarker/data, or + perfkitbenchmarker/scripts and are loaded via a PackageResourceLoader. + +Users can specify additional paths to search for required data files using the +`--data_search_paths` flag. +""" + +import abc +import logging +import os +import shutil +from absl import flags +import perfkitbenchmarker +from perfkitbenchmarker import temp_dir +import pkg_resources +import six + +FLAGS = flags.FLAGS + +flags.DEFINE_multi_string('data_search_paths', ['.'], + 'Additional paths to search for data files. ' + 'These paths will be searched prior to using files ' + 'bundled with PerfKitBenchmarker.') + +_RESOURCES = 'resources' + + +class ResourceNotFound(ValueError): + """Error raised when a resource could not be found on the search path.""" + pass + + +class ResourceLoader(six.with_metaclass(abc.ABCMeta, object)): + """An interface for loading named resources.""" + + @abc.abstractmethod + def ResourceExists(self, name): + """Checks for existence of the resource 'name'. + + Args: + name: string. Name of the resource. Typically a file name. + + Returns: + A boolean indicating whether the resource 'name' can be loaded by this + object. + """ + pass + + @abc.abstractmethod + def ResourcePath(self, name): + """Gets the path to the resource 'name'. + + Args: + name: string. Name of the resource. Typically a file name. + + Returns: + A full path to 'name' on the filesystem. 
+ + Raises: + ResourceNotFound: If 'name' was not found. + """ + pass + + +class FileResourceLoader(ResourceLoader): + """Loads resources from a directory in the filesystem. + + Attributes: + path: string. Root path to load resources from. + """ + + def __init__(self, path): + super().__init__() + self.path = path + + if not os.path.isdir(path): + logging.warn('File resource loader root %s is not a directory.', path) + + def __repr__(self): + return '<{0} path="{1}">'.format(type(self).__name__, self.path) + + def _Join(self, *args): + return os.path.join(self.path, *args) + + def ResourceExists(self, name): + return os.path.exists(self._Join(name)) + + def ResourcePath(self, name): + if not self.ResourceExists(name): + raise ResourceNotFound(name) + return self._Join(name) + + +class PackageResourceLoader(ResourceLoader): + """Loads resources from a Python package. + + Attributes: + package: string. Name of the package containing resources. + """ + + def __init__(self, package): + super().__init__() + self.package = package + + def __repr__(self): + return '<{0} package="{1}">'.format(type(self).__name__, self.package) + + def ResourceExists(self, name): + return pkg_resources.resource_exists(self.package, name) + + def ResourcePath(self, name): + if not self.ResourceExists(name): + raise ResourceNotFound(name) + try: + path = pkg_resources.resource_filename(self.package, name) + except NotImplementedError: + # This can happen if PerfKit Benchmarker is executed from a zip file. + # Extract the resource to the version-specific temporary directory. + path = os.path.join(temp_dir.GetVersionDirPath(), _RESOURCES, name) + if not os.path.exists(path): + dir_path = os.path.dirname(path) + try: + os.makedirs(dir_path) + except OSError: + if not os.path.isdir(dir_path): + raise + with open(path, 'wb') as extracted_file: + shutil.copyfileobj(pkg_resources.resource_stream(self.package, name), + extracted_file) + return path + + +DATA_PACKAGE_NAME = 'perfkitbenchmarker.data' +#YCSB_WORKLOAD_DIR_NAME = os.path.join( +# os.path.dirname(perfkitbenchmarker.__file__), 'data/ycsb') +#EDW_SCRIPT_DIR_NAME = os.path.join( +# os.path.dirname(perfkitbenchmarker.__file__), 'data/edw') +SCRIPT_PACKAGE_NAME = 'perfkitbenchmarker.scripts' +CONFIG_PACKAGE_NAME = 'perfkitbenchmarker.configs' +DEFAULT_RESOURCE_LOADERS = [PackageResourceLoader(DATA_PACKAGE_NAME), +# FileResourceLoader(YCSB_WORKLOAD_DIR_NAME), +# FileResourceLoader(EDW_SCRIPT_DIR_NAME), + PackageResourceLoader(SCRIPT_PACKAGE_NAME), + PackageResourceLoader(CONFIG_PACKAGE_NAME)] + + +def _GetResourceLoaders(): + """Gets a list of registered ResourceLoaders. + + Returns: + List of ResourceLoader instances. FileResourceLoaders for paths in + FLAGS.data_search_paths will be listed first, followed by + DEFAULT_RESOURCE_LOADERS. + """ + loaders = [] + + # Add all paths to list if they are specified on the command line (will warn + # if any are invalid). + # Otherwise add members of the default list iff they exist. + if FLAGS['data_search_paths'].present: + for path in FLAGS.data_search_paths: + loaders.append(FileResourceLoader(path)) + else: + for path in FLAGS.data_search_paths: + if os.path.isdir(path): + loaders.append(FileResourceLoader(path)) + loaders.extend(DEFAULT_RESOURCE_LOADERS) + return loaders + + +def ResourcePath(resource_name, search_user_paths=True): + """Gets the filename of a resource. + + Loaders are searched in order until the resource is found. + If no loader provides 'resource_name', an exception is thrown. 
+ + If 'search_user_paths' is true, the directories specified by + "--data_search_paths" are consulted before the default paths. + + Args: + resource_name: string. Name of a resource. + search_user_paths: boolean. Whether paths from "--data_search_paths" should + be searched before the default paths. + Returns: + A path to the resource on the filesystem. + Raises: + ResourceNotFound: When resource was not found. + """ + if search_user_paths: + loaders = _GetResourceLoaders() + else: + loaders = DEFAULT_RESOURCE_LOADERS + for loader in loaders: + if loader.ResourceExists(resource_name): + return loader.ResourcePath(resource_name) + + raise ResourceNotFound( + '{0} (Searched: {1})'.format(resource_name, loaders)) + + +def ResourceExists(resource_name, search_user_paths=True): + """Returns True if a resource exists. + + Loaders are searched in order until the resource is found. + If no loader provides 'resource_name', returns False. + + If 'search_user_paths' is true, the directories specified by + "--data_search_paths" are consulted before the default paths. + + Args: + resource_name: string. Name of a resource. + search_user_paths: boolean. Whether paths from "--data_search_paths" should + be searched before the default paths. + Returns: + Whether the resource exists. + """ + try: + ResourcePath(resource_name, search_user_paths) + return True + except ResourceNotFound: + return False diff --git a/script/cumulus/pkb/perfkitbenchmarker/data/build_collectd.sh.j2 b/script/cumulus/pkb/perfkitbenchmarker/data/build_collectd.sh.j2 new file mode 100755 index 0000000..250936d --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/data/build_collectd.sh.j2 @@ -0,0 +1,82 @@ +#!/bin/bash + +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Builds collectd from source on a VM. +# Supports Debian and RHEL-based systems. + +set -o errexit +set -o nounset +set -o pipefail +set -o xtrace + +readonly PACKAGE='{{ collectd_package }}' +readonly BUILD_DIR='{{ build_dir }}' +readonly PREFIX='{{ root_dir }}' +readonly PARENT_DIR='{{ parent_dir }}' +readonly CONFIG_DEPD_FILE='{{ config_depd_file }}' +readonly CONFIG_FILE='{{ config_file }}' +readonly PLUGIN_DIR='{{ plugin_dir }}' +readonly PATCHES_DIR='{{ patches_dir }}' +readonly PYTHON_CONFIG='{{ python_config }}' + +function build_collectd() { + mkdir $BUILD_DIR + pushd $BUILD_DIR + tar --strip-components 1 -xjf $PACKAGE + cp $PATCHES_DIR/*.patch . + for el in *.patch + do + patch -p1 < $el + done + autoreconf + export PYTHON_CONFIG + CFLAGS='-w -Werror' ./configure --prefix "$PREFIX" \ + --enable-python \ + --disable-perl \ + --without-perl-bindings \ + --disable-java \ + --with-librdkafka=/usr/local \ + --disable-rrdtool \ + --disable-werror + CFLAGS='-w -Werror' make -j `cat /proc/cpuinfo | grep processor | wc -l` + make install + popd + rm -rf $BUILD_DIR +} +function configure_collectd() { + # Add a collectd configuration script. 
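+  # The rendered CONFIG_FILE is copied verbatim into $PREFIX/etc/collectd.conf,
+  # which collectd reads at startup.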
+ # See: + # https://github.com/collectd/collectd/blob/collectd-5.5.0/src/collectd.conf.in + # for a more verbose description. + cd $PARENT_DIR + cat $CONFIG_FILE > $PREFIX/etc/collectd.conf +} + +function configure_extra_plugins() { + cd $PLUGIN_DIR + chmod +x *.sh + for filename in *_depend.sh; do + sudo ./$filename + done +} + +# Build collectd if it doesn't already exist. +if [ ! -f $PREFIX/sbin/collectd ] + then + build_collectd + configure_extra_plugins + configure_collectd $CONFIG_FILE +fi diff --git a/script/cumulus/pkb/perfkitbenchmarker/data/cAdvisor_metrics/perf-default.json b/script/cumulus/pkb/perfkitbenchmarker/data/cAdvisor_metrics/perf-default.json new file mode 100644 index 0000000..a158e3b --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/data/cAdvisor_metrics/perf-default.json @@ -0,0 +1,941 @@ +{ + "core": { + "events": [ + [ + "instructions_retired" + ], + [ + "l1d.replacement" + ], + [ + "L2_RQSTS.ALL_CODE_RD" + ], + [ + "MEM_LOAD_RETIRED.L1_HIT" + ], + [ + "MEM_LOAD_RETIRED.L2_HIT" + ], + [ + "L2_LINES_IN.ALL" + ], + [ + "MEM_LOAD_RETIRED.L2_MISS" + ], + [ + "L2_RQSTS.CODE_RD_MISS" + ], + [ + "ITLB_MISSES.WALK_COMPLETED" + ], + [ + "ref-cycles" + ], + [ + "instructions" + ], + [ + "DTLB_MISSES.WALK_COMPLETED" + ], + [ + "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M" + ], + [ + "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD" + ], + [ + "LSD.UOPS" + ], + [ + "CORE_POWER.LVL0_TURBO_LICENSE" + ], + [ + "CORE_POWER.LVL1_TURBO_LICENSE" + ], + [ + "CORE_POWER.LVL2_TURBO_LICENSE" + ], + [ + "CPU_CLK_UNHALTED.THREAD_ANY" + ], + [ + "IDQ_UOPS_NOT_DELIVERED.CORE" + ], + [ + "UOPS_RETIRED.RETIRE_SLOTS" + ], + [ + "INT_MISC.RECOVERY_CYCLES_ANY" + ], + [ + "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD" + ], + [ + "OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD" + ], + [ + "UOPS_ISSUED.ANY" + ], + [ + "CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE" + ], + [ + "CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY" + ], + [ + "OCR.ALL_READS.L3_MISS.REMOTE_HITM" + ], + [ + "cpu-cycles" + ], + [ + "power/energy-pkg" + ], + [ + "power/energy-ram" + ], + [ + "ITLB_MISSES.WALK_COMPLETED_2M_4M" + ], + [ + "ITLB_MISSES.WALK_ACTIVE" + ], + [ + "DTLB_LOAD_MISSES.WALK_COMPLETED" + ], + [ + "DTLB_LOAD_MISSES.WALK_COMPLETED_4K" + ], + [ + "DTLB_LOAD_MISSES.WALK_COMPLETED_1G" + ], + [ + "DTLB_STORE_MISSES.WALK_COMPLETED" + ], + [ + "DTLB_LOAD_MISSES.WALK_ACTIVE" + ], + [ + "DTLB_STORE_MISSES.WALK_ACTIVE" + ], + [ + "OCR.ALL_READS.L3_MISS.REMOTE_HIT_FORWARD" + ], + [ + "IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE" + ], + [ + "ICACHE_16B.IFDATA_STALL" + ], + [ + "ICACHE_64B.IFTAG_STALL" + ], + [ + "INT_MISC.CLEAR_RESTEER_CYCLES" + ], + [ + "INT_MISC.RECOVERY_CYCLES_ANY" + ], + [ + "BACLEARS.ANY" + ], + [ + "BR_MISP_RETIRED.ALL_BRANCHES" + ], + [ + "MACHINE_CLEARS.COUNT" + ], + [ + "IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE" + ], + [ + "CYCLE_ACTIVITY.STALLS_MEM_ANY" + ], + [ + "CYCLE_ACTIVITY.STALLS_L1D_MISS" + ], + [ + "CYCLE_ACTIVITY.STALLS_L2_MISS" + ], + [ + "CYCLE_ACTIVITY.STALLS_L3_MISS" + ], + [ + "DTLB_LOAD_MISSES.STLB_HIT" + ], + [ + "DTLB_STORE_MISSES.STLB_HIT" + ], + [ + "LD_BLOCKS.STORE_FORWARD" + ], + [ + "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_L3_MISS_DEMAND_DATA_RD" + ], + [ + "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD_GE_6" + ], + [ + "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD_GE_6" + ], + [ + "OFFCORE_REQUESTS_BUFFER.SQ_FULL" + ], + [ + "EXE_ACTIVITY.BOUND_ON_STORES" + ], + [ + "EXE_ACTIVITY.EXE_BOUND_0_PORTS" + ], + [ + "EXE_ACTIVITY.1_PORTS_UTIL" + ], + [ + 
"EXE_ACTIVITY.2_PORTS_UTIL" + ], + [ + "ARITH.DIVIDER_ACTIVE" + ], + [ + "UOPS_EXECUTED.CORE_CYCLES_NONE" + ], + [ + "UOPS_EXECUTED.CORE_CYCLES_GE_1" + ], + [ + "UOPS_EXECUTED.CORE_CYCLES_GE_2" + ], + [ + "UOPS_EXECUTED.CORE_CYCLES_GE_3" + ], + [ + "IDQ.MS_UOPS" + ] + ], + "custom_events": [ + { + "config": [ + "0x5300c0" + ], + "name": "instructions_retired", + "type": 4 + }, + { + "config": [ + "0x151" + ], + "name": "l1d.replacement", + "type": 4 + }, + { + "config": [ + "0xe424" + ], + "name": "L2_RQSTS.ALL_CODE_RD", + "type": 4 + }, + { + "config": [ + "0x1d1" + ], + "name": "MEM_LOAD_RETIRED.L1_HIT", + "type": 4 + }, + { + "config": [ + "0x2d1" + ], + "name": "MEM_LOAD_RETIRED.L2_HIT", + "type": 4 + }, + { + "config": [ + "0x1ff1" + ], + "name": "L2_LINES_IN.ALL", + "type": 4 + }, + { + "config": [ + "0x10d1" + ], + "name": "MEM_LOAD_RETIRED.L2_MISS", + "type": 4 + }, + { + "config": [ + "0x2424" + ], + "name": "ML2_RQSTS.CODE_RD_MISS", + "type": 4 + }, + { + "config": [ + "0xe85" + ], + "name": "ITLB_MISSES.WALK_COMPLETED", + "type": 4 + }, + { + "config": [ + "0x9" + ], + "name": "ref-cycles" + }, + { + "config": [ + "0x1" + ], + "name": "instructions" + }, + { + "config": [ + "0xe08" + ], + "name": "DTLB_MISSES.WALK_COMPLETED", + "type": 4 + }, + { + "config": [ + "0x408" + ], + "name": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M", + "type": 4 + }, + { + "config": [ + "0x1000160" + ], + "name": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", + "type": 4 + }, + { + "config": [ + "0x1a8" + ], + "name": "LSD.UOPS", + "type": 4 + }, + { + "config": [ + "0x728" + ], + "name": "CORE_POWER.LVL0_TURBO_LICENSE", + "type": 4 + }, + { + "config": [ + "0x1828" + ], + "name": "CORE_POWER.LVL1_TURBO_LICENSE", + "type": 4 + }, + { + "config": [ + "0x2028" + ], + "name": "CORE_POWER.LVL2_TURBO_LICENSE", + "type": 4 + }, + { + "config": [ + "0x20003c" + ], + "name": "CPU_CLK_UNHALTED.THREAD_ANY", + "type": 4 + }, + { + "config": [ + "0x19c" + ], + "name": "IDQ_UOPS_NOT_DELIVERED.CORE", + "type": 4 + }, + { + "config": [ + "0x2c2" + ], + "name": "UOPS_RETIRED.RETIRE_SLOTS", + "type": 4 + }, + { + "config": [ + "0x20010d" + ], + "name": "INT_MISC.RECOVERY_CYCLES_ANY", + "type": 4 + }, + { + "config": [ + "0x1060" + ], + "name": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD", + "type": 4 + }, + { + "config": [ + "0x10b0" + ], + "name": "OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", + "type": 4 + }, + { + "config": [ + "0x10e" + ], + "name": "OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", + "type": 4 + }, + { + "config": [ + "0x23c" + ], + "name": "CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE", + "type": 4 + }, + { + "config": [ + "0x20013c" + ], + "name": "CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY", + "type": 4 + }, + { + "config": [ + "0x1b7", + "0x103FC007F7" + ], + "name": "OCR.ALL_READS.L3_MISS.REMOTE_HITM", + "type": 4 + }, + { + "config": [ + "0x00" + ], + "name": "cpu-cycles" + }, + { + "config": [ + "0x2" + ], + "name": "power/energy-pkg" + }, + { + "config": [ + "0x3" + ], + "name": "power/energy-ram" + }, + { + "config": [ + "0x485" + ], + "name": "ITLB_MISSES.WALK_COMPLETED_2M_4M", + "type": 4 + }, + { + "config": [ + "0x1001085" + ], + "name": "ITLB_MISSES.WALK_ACTIVE", + "type": 4 + }, + { + "config": [ + "0xe08" + ], + "name": "DTLB_LOAD_MISSES.WALK_COMPLETED", + "type": 4 + }, + { + "config": [ + "0x208" + ], + "name": "DTLB_LOAD_MISSES.WALK_COMPLETED_4K", + "type": 4 + }, + { + "config": [ + "0x808" + ], + "name": "DTLB_LOAD_MISSES.WALK_COMPLETED_1G", + "type": 4 + }, + { + "config": [ + 
"0xe49" + ], + "name": "DTLB_STORE_MISSES.WALK_COMPLETED", + "type": 4 + }, + { + "config": [ + "0x1001008" + ], + "name": "DTLB_LOAD_MISSES.WALK_ACTIVE", + "type": 4 + }, + { + "config": [ + "0x1001049" + ], + "name": "DTLB_STORE_MISSES.WALK_ACTIVE", + "type": 4 + }, + { + "config": [ + "0x1b7", + "0x083FC007F7" + ], + "name": "OCR.ALL_READS.L3_MISS.REMOTE_HIT_FORWARD", + "type": 4 + }, + { + "config": [ + "0x400019c" + ], + "name": "IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", + "type": 4 + }, + { + "config": [ + "0x480" + ], + "name": "ICACHE_16B.IFDATA_STALL", + "type": 4 + }, + { + "config": [ + "0x483" + ], + "name": "ICACHE_64B.IFTAG_STALL", + "type": 4 + }, + { + "config": [ + "0x800d" + ], + "name": "INT_MISC.CLEAR_RESTEER_CYCLES", + "type": 4 + }, + { + "config": [ + "0x20010d" + ], + "name": "INT_MISC.RECOVERY_CYCLES_ANY", + "type": 4 + }, + { + "config": [ + "0x1e6" + ], + "name": "BACLEARS.ANY", + "type": 4 + }, + { + "config": [ + "0xc5" + ], + "name": "BR_MISP_RETIRED.ALL_BRANCHES", + "type": 4 + }, + { + "config": [ + "0x10401c3" + ], + "name": "MACHINE_CLEARS.COUNT", + "type": 4 + }, + { + "config": [ + "0x400019c" + ], + "name": "IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", + "type": 4 + }, + { + "config": [ + "0x140014a3" + ], + "name": "CYCLE_ACTIVITY.STALLS_MEM_ANY", + "type": 4 + }, + { + "config": [ + "0xc000ca3" + ], + "name": "CYCLE_ACTIVITY.STALLS_L1D_MISS", + "type": 4 + }, + { + "config": [ + "0x50005a3" + ], + "name": "CYCLE_ACTIVITY.STALLS_L2_MISS", + "type": 4 + }, + { + "config": [ + "0x60006a3" + ], + "name": "CYCLE_ACTIVITY.STALLS_L3_MISS", + "type": 4 + }, + { + "config": [ + "0x2008" + ], + "name": "DTLB_LOAD_MISSES.STLB_HIT", + "type": 4 + }, + { + "config": [ + "0x2049" + ], + "name": "DTLB_STORE_MISSES.STLB_HIT", + "type": 4 + }, + { + "config": [ + "0x203" + ], + "name": "LD_BLOCKS.STORE_FORWARD", + "type": 4 + }, + { + "config": [ + "0x1001060" + ], + "name": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_L3_MISS_DEMAND_DATA_RD", + "type": 4 + }, + { + "config": [ + "0x6000160" + ], + "name": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD_GE_6", + "type": 4 + }, + { + "config": [ + "0x6001060" + ], + "name": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD_GE_6", + "type": 4 + }, + { + "config": [ + "0x1b2" + ], + "name": "OFFCORE_REQUESTS_BUFFER.SQ_FULL", + "type": 4 + }, + { + "config": [ + "0x40a6" + ], + "name": "EXE_ACTIVITY.BOUND_ON_STORES", + "type": 4 + }, + { + "config": [ + "0x1a6" + ], + "name": "EXE_ACTIVITY.EXE_BOUND_0_PORTS", + "type": 4 + }, + { + "config": [ + "0x2a6" + ], + "name": "EXE_ACTIVITY.1_PORTS_UTIL", + "type": 4 + }, + { + "config": [ + "0x4a6" + ], + "name": "EXE_ACTIVITY.2_PORTS_UTIL", + "type": 4 + }, + { + "config": [ + "0x1000114" + ], + "name": "ARITH.DIVIDER_ACTIVE", + "type": 4 + }, + { + "config": [ + "0x18002b1" + ], + "name": "UOPS_EXECUTED.CORE_CYCLES_NONE", + "type": 4 + }, + { + "config": [ + "0x10002b1" + ], + "name": "UOPS_EXECUTED.CORE_CYCLES_GE_1", + "type": 4 + }, + { + "config": [ + "0x20002b1" + ], + "name": "UOPS_EXECUTED.CORE_CYCLES_GE_2", + "type": 4 + }, + { + "config": [ + "0x30002b1" + ], + "name": "UOPS_EXECUTED.CORE_CYCLES_GE_3", + "type": 4 + }, + { + "config": [ + "0x3079" + ], + "name": "IDQ.MS_UOPS", + "type": 4 + } + ] + }, + "uncore": { + "events": [ + "uncore_cha/unc_cha_tor_inserts.ia_miss.0x40432", + "uncore_cha/unc_cha_tor_inserts.ia_occupancy.0x40432", + "uncore_cha/unc_cha_tor_inserts.ia_miss.0x40431", + "uncore_imc/cas_count_read", + "uncore_imc/cas_count_write", + 
"uncore_cha/unc_cha_tor_occupancy.ia_miss.0x40431", + "uncore_cha/unc_cha_clockticks", + "uncore_cha/unc_cha_tor_inserts.ia_hit.0x40433", + "uncore_cha/unc_cha_tor_occupancy.ia_hit.0x40433", + "uncore_cha/unc_cha_tor_inserts.ia_miss.0x40433", + "uncore_cha/unc_cha_tor_occupancy.ia_miss.0x40433", + "uncore_cha/unc_cha_tor_inserts.ia_miss.0x12D40433", + "uncore_cha/unc_cha_tor_occupancy.ia_miss.0x12D40433", + "uncore_cha/unc_cha_tor_inserts.ia_miss.0x12CC0233", + "uncore_cha/unc_cha_tor_occupancy.ia_miss.0x12CC0233", + "uncore_cha/unc_cha_tor_inserts.ia_miss.0x12C40033", + "uncore_cha/unc_cha_tor_occupancy.ia_miss.0x12C40033", + "uncore_iio/UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0", + "uncore_iio/UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1", + "uncore_iio/UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2", + "uncore_iio/UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3", + "uncore_iio/UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0", + "uncore_iio/UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1", + "uncore_iio/UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2", + "uncore_iio/UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3", + "uncore_upi/UNC_UPI_TxL_FLITS.ALL_DATA", + "uncore_upi/UNC_UPI_TxL_FLITS.NON_DATA", + "uncore_upi/UNC_UPI_CLOCKTICKS", + "uncore_upi/UNC_UPI_L1_POWER_CYCLES", + "uncore_imc/UNC_M_RPQ_INSERTS", + "uncore_imc/UNC_M_RPQ_OCCUPANCY" + ], + "custom_events": [ + { + "config": [ + "0x2135", + "0x4043200000000" + ], + "name": "uncore_cha/unc_cha_tor_inserts.ia_miss.0x40432" + }, + { + "config": [ + "0x2136", + "0x4043200000000" + ], + "name": "uncore_cha/unc_cha_tor_inserts.ia_occupancy.0x40432" + }, + { + "config": [ + "0x2135", + "0x4043100000000" + ], + "name": "uncore_cha/unc_cha_tor_inserts.ia_miss.0x40431" + }, + { + "config": [ + "0x304" + ], + "name": "uncore_imc/cas_count_read" + }, + { + "config": [ + "0xc04" + ], + "name": "uncore_imc/cas_count_write" + }, + { + "config": [ + "0x2136" + ], + "name": "uncore_cha/unc_cha_tor_occupancy.ia_miss.0x40431" + }, + { + "config": [ + "0x00" + ], + "name": "uncore_cha/unc_cha_clockticks" + }, + { + "config": [ + "0x1135", + "0x4043300000000" + ], + "name": "uncore_cha/unc_cha_tor_inserts.ia_hit.0x40433" + }, + { + "config": [ + "0x1136", + "0x4043300000000" + ], + "name": "uncore_cha/unc_cha_tor_occupancy.ia_hit.0x40433" + }, + { + "config": [ + "0x2135", + "0x4043300000000" + ], + "name": "uncore_cha/unc_cha_tor_inserts.ia_miss.0x40433" + }, + { + "config": [ + "0x2136", + "0x4043300000000" + ], + "name": "uncore_cha/unc_cha_tor_occupancy.ia_miss.0x40433" + }, + { + "config": [ + "0x2135", + "0x12D4043300000000" + ], + "name": "uncore_cha/unc_cha_tor_inserts.ia_miss.0x12D40433" + }, + { + "config": [ + "0x2136", + "0x12D4043300000000" + ], + "name": "uncore_cha/unc_cha_tor_occupancy.ia_miss.0x12D40433" + }, + { + "config": [ + "0x2135", + "0x12CC023300000000" + ], + "name": "uncore_cha/unc_cha_tor_inserts.ia_miss.0x12CC0233" + }, + { + "config": [ + "0x2136", + "0x12CC023300000000" + ], + "name": "uncore_cha/unc_cha_tor_occupancy.ia_miss.0x12CC0233" + }, + { + "config": [ + "0x2135", + "0x12C4003300000000" + ], + "name": "uncore_cha/unc_cha_tor_inserts.ia_miss.0x12C40033" + }, + { + "config": [ + "0x2136", + "0x12C4003300000000" + ], + "name": "uncore_cha/unc_cha_tor_occupancy.ia_miss.0x12C40033" + }, + { + "config": [ + "0x700000000483" + ], + "name": "uncore_iio/UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0" + }, + { + "config": [ + "0x702000000483" + ], + "name": "uncore_iio/UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1" + }, + { + "config": [ + "0x704000000483" + ], + "name": 
"uncore_iio/UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2" + }, + { + "config": [ + "0x708000000483" + ], + "name": "uncore_iio/UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3" + }, + { + "config": [ + "0x700000000183" + ], + "name": "uncore_iio/UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0" + }, + { + "config": [ + "0x702000000183" + ], + "name": "uncore_iio/UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1" + }, + { + "config": [ + "0x704000000183" + ], + "name": "uncore_iio/UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2" + }, + { + "config": [ + "0x708000000183" + ], + "name": "uncore_iio/UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3" + }, + { + "config": [ + "0xf02" + ], + "name": "uncore_upi/UNC_UPI_TxL_FLITS.ALL_DATA" + }, + { + "config": [ + "0x9702" + ], + "name": "uncore_upi/UNC_UPI_TxL_FLITS.NON_DATA" + }, + { + "config": [ + "0x1" + ], + "name": "uncore_upi/UNC_UPI_CLOCKTICKS" + }, + { + "config": [ + "0x21" + ], + "name": "uncore_upi/UNC_UPI_L1_POWER_CYCLES" + }, + { + "config": [ + "0x10" + ], + "name": "uncore_imc/UNC_M_RPQ_INSERTS" + }, + { + "config": [ + "0x80" + ], + "name": "uncore_imc/UNC_M_RPQ_OCCUPANCY" + } + ] + } +} diff --git a/script/cumulus/pkb/perfkitbenchmarker/data/cAdvisor_metrics/prometheus-config.yaml.j2 b/script/cumulus/pkb/perfkitbenchmarker/data/cAdvisor_metrics/prometheus-config.yaml.j2 new file mode 100644 index 0000000..6bcb5ca --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/data/cAdvisor_metrics/prometheus-config.yaml.j2 @@ -0,0 +1,43 @@ +scrape_configs: + - job_name: 'cAdvisors' + scrape_interval: {{ scrape_interval }}s + static_configs: + - targets: {{ targets_list }} + metric_relabel_configs: + - source_labels: [__name__] + regex: "go.*" + action: drop + - source_labels: [__name__] + regex: "process.*" + action: drop + - source_labels: [__name__] + regex: "machine.*" + action: drop + - source_labels: [__name__] + regex: "container_cpu.*" + action: drop + - source_labels: [__name__] + regex: "container_memory.*" + action: drop + - source_labels: [__name__] + regex: "container_network.*" + action: drop + - source_labels: [__name__] + regex: "container_spec.*" + action: drop + - source_labels: [__name__] + regex: "container_start.*" + action: drop + - source_labels: [__name__] + regex: "container_tasks.*" + action: drop + - source_labels: [__name__] + regex: "container_last.*" + action: drop + - source_labels: [__name__] + regex: "container_scrape.*" + action: drop + - source_labels: [__name__] + regex: '(.*)' + replacement: 'cadvisor_${1}' + target_label: __name__ diff --git a/script/cumulus/pkb/perfkitbenchmarker/data/collectd.conf b/script/cumulus/pkb/perfkitbenchmarker/data/collectd.conf new file mode 100755 index 0000000..d24a649 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/data/collectd.conf @@ -0,0 +1,118 @@ +Interval 10 + +LoadPlugin syslog +LoadPlugin cpu +LoadPlugin csv +LoadPlugin df +LoadPlugin disk +LoadPlugin entropy +LoadPlugin ethstat +LoadPlugin interface +LoadPlugin ipc +LoadPlugin irq +LoadPlugin load +LoadPlugin memory +LoadPlugin swap +LoadPlugin cpufreq +LoadPlugin "aggregation" + + + + ReportByCpu true + ValuesPercentage false + + + # ignore rootfs; else, the root file-system would appear twice, causing + # one of the updates to fail and spam the log + FSType rootfs + # ignore the usual virtual / temporary file-systems + FSType sysfs + FSType proc + FSType devtmpfs + FSType devpts + FSType tmpfs + FSType fusectl + FSType cgroup + IgnoreSelected true + + + DataDir "/opt/collectd/collectd-csv" + StoreRates true + + + Disk "/^[hs]d[a-z][0-9]+?$/" + 
Disk "/^xvd[a-z][a-z0-9]+?$/" + Disk "/^vd[a-z]?$/" + Disk "/^nvme[0-9c]+n[0-9]+$/" + IgnoreSelected false + UseBSDName false + UdevNameAttr "DEVNAME" + + + + # AWS: ens5, Azure: eth0, GCP: ens4, other: eno1 + Interface "/^eth[0-9]?$/" + Interface "/^ens[0-9]?$/" + Interface "/^eno[0-9]?$/" + Interface "/^enp[0-5]s[0-9]?$/" + Interface "/^bond[0-9]?$/" + Interface "/^br[0-9]?$/" + Map "rx_csum_offload_errors" "if_rx_errors" "checksum_offload" + Map "multicast" "if_multicast" + MappedOnly false + + + + Interface "/^eth/" + Interface "/^ens/" + Interface "/^enp/" + Interface "/^eno/" + Interface "/^bond/" + Interface "/^br/" + IgnoreSelected false + + + + Irq 7 + Irq 8 + Irq 9 + IgnoreSelected true + + + + ReportRelative true + + + + Globals true + + + ModulePath "/opt/collectd/collectd_plugins" + Import "sysstat_memstat" + Import "network_irq_affinity" + + # SamplingRate is the interval (in seconds) at which the sysstat sampling is called. Needs to be a positive integer value + # Metrics represents the types of data that this plugin can output. + # Metrics options: "pgpgin/s,pgpgout/s,fault/s,majflt/s,pgfree/s,pgscank/s,pgscand/s,pgsteal/s,vmeff,kbmemfree,kbmemused,memused,kbbuffers,kbcached,kbcommit,commit,kbactive,kbinact,kbdirty,kbswpfree,kbswpused,swpused,kbswpcad,swpcad" + # --- any subset of the input above will output the metrics in the list. Must be specified as a single line string, in quotations, with the elements separated by commas, + # "all" --- equivalent to giving the all the metrics mentioned above + # if not specified this will default to the "kbmemused" consumption. + # Docs: http://sebastien.godard.pagesperso-orange.fr/man_sar.html + SamplingRate 10 #seconds. + Metrics "all" + + + # No config options + + + + + + Plugin "cpu" + Type "cpu" + GroupBy "Host" + GroupBy "TypeInstance" + CalculateAverage true + + + diff --git a/script/cumulus/pkb/perfkitbenchmarker/data/collectdDepend.txt b/script/cumulus/pkb/perfkitbenchmarker/data/collectdDepend.txt new file mode 100644 index 0000000..e69de29 diff --git a/script/cumulus/pkb/perfkitbenchmarker/data/collectd_patches/ethstat_interface.patch b/script/cumulus/pkb/perfkitbenchmarker/data/collectd_patches/ethstat_interface.patch new file mode 100644 index 0000000..312485d --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/data/collectd_patches/ethstat_interface.patch @@ -0,0 +1,146 @@ +diff --git a/Makefile.am b/Makefile.am +index 00947da..5e0395f 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -1061,6 +1061,7 @@ if BUILD_PLUGIN_ETHSTAT + pkglib_LTLIBRARIES += ethstat.la + ethstat_la_SOURCES = src/ethstat.c + ethstat_la_LDFLAGS = $(PLUGIN_LDFLAGS) ++ethstat_la_LIBADD = libignorelist.la + endif + + if BUILD_PLUGIN_FHCOUNT +diff --git a/src/ethstat.c b/src/ethstat.c +index f8bc5b5..6fab71d 100644 +--- a/src/ethstat.c ++++ b/src/ethstat.c +@@ -27,6 +27,7 @@ + #include "plugin.h" + #include "utils/avltree/avltree.h" + #include "utils/common/common.h" ++#include "utils/ignorelist/ignorelist.h" + #include "utils_complain.h" + + #if HAVE_SYS_IOCTL_H +@@ -41,6 +42,9 @@ + #if HAVE_LINUX_ETHTOOL_H + #include + #endif ++#if HAVE_IFADDRS_H ++#include ++#endif + + struct value_map_s { + char type[DATA_MAX_NAME_LEN]; +@@ -48,31 +52,91 @@ struct value_map_s { + }; + typedef struct value_map_s value_map_t; + +-static char **interfaces; +-static size_t interfaces_num; ++static char **regexed_interfaces = NULL; ++static size_t regexed_interfaces_num = 0; ++ ++static char **valid_interfaces = NULL; ++static size_t valid_interfaces_num = 0; + + 
static c_avl_tree_t *value_map; + + static bool collect_mapped_only; + ++static int ethstat_check_if_existing_interface(char **existing_interfaces, char *entry) { ++ char *aux; ++ int idx; ++ ++ for(idx = 0; idx < valid_interfaces_num; ++idx) { ++ aux = strdup(existing_interfaces[idx]); ++ if (strcmp(aux, entry) == 0) ++ return 0; ++ } ++ ++ return 1; ++} ++ ++static int ethstat_get_matching_interface(const char *entry) { ++ size_t len; ++ struct ifaddrs *if_list = NULL; ++ char **v_tmp; ++ ignorelist_t *ethstat_ignorelist = NULL; ++ ++ if (ethstat_ignorelist == NULL) ++ ethstat_ignorelist = ignorelist_create(/* invert = */ 1); ++ if (ethstat_ignorelist == NULL) ++ return 1; ++ ++ len = strlen(entry); ++ if (len == 0) { ++ DEBUG("no new interface: empty entry"); ++ return 1; ++ } ++ ++ #if HAVE_GETIFADDRS ++ ignorelist_add(ethstat_ignorelist, entry); ++ ++ if (getifaddrs(&if_list) != 0) { ++ DEBUG("no new interface: no local interfaces were found"); ++ return -1; ++ } ++ for (struct ifaddrs *if_ptr = if_list; if_ptr != NULL; ++ if_ptr = if_ptr->ifa_next) { ++ if (ignorelist_match(ethstat_ignorelist, if_ptr->ifa_name) == 0 && ++ ethstat_check_if_existing_interface(valid_interfaces, if_ptr->ifa_name) != 0) { ++ v_tmp = realloc(valid_interfaces, sizeof(*valid_interfaces) * (valid_interfaces_num + 1)); ++ if (v_tmp == NULL) ++ return -1; ++ ++ valid_interfaces = v_tmp; ++ valid_interfaces[valid_interfaces_num] = strdup(if_ptr->ifa_name); ++ valid_interfaces_num++; ++ INFO("ethstat plugin: Registered interface %s", if_ptr->ifa_name); ++ } ++ } ++ #endif ++ ++ return 0; ++} ++ + static int ethstat_add_interface(const oconfig_item_t *ci) /* {{{ */ + { + char **tmp; + int status; + +- tmp = realloc(interfaces, sizeof(*interfaces) * (interfaces_num + 1)); ++ tmp = realloc(regexed_interfaces, sizeof(*regexed_interfaces) * (regexed_interfaces_num + 1)); + if (tmp == NULL) + return -1; +- interfaces = tmp; +- interfaces[interfaces_num] = NULL; ++ regexed_interfaces = tmp; ++ regexed_interfaces[regexed_interfaces_num] = NULL; + +- status = cf_util_get_string(ci, interfaces + interfaces_num); ++ status = cf_util_get_string(ci, regexed_interfaces + regexed_interfaces_num); + if (status != 0) + return status; + +- interfaces_num++; +- INFO("ethstat plugin: Registered interface %s", +- interfaces[interfaces_num - 1]); ++ status = ethstat_get_matching_interface(regexed_interfaces[regexed_interfaces_num]); ++ if (status != 0) ++ return status; ++ regexed_interfaces_num++; + + return 0; + } /* }}} int ethstat_add_interface */ +@@ -290,8 +354,8 @@ static int ethstat_read_interface(char *device) { + } /* }}} ethstat_read_interface */ + + static int ethstat_read(void) { +- for (size_t i = 0; i < interfaces_num; i++) +- ethstat_read_interface(interfaces[i]); ++ for (size_t i = 0; i < valid_interfaces_num; i++) ++ ethstat_read_interface(valid_interfaces[i]); + + return 0; + } diff --git a/script/cumulus/pkb/perfkitbenchmarker/data/collectd_plugins/network_irq_affinity.py b/script/cumulus/pkb/perfkitbenchmarker/data/collectd_plugins/network_irq_affinity.py new file mode 100755 index 0000000..3967cec --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/data/collectd_plugins/network_irq_affinity.py @@ -0,0 +1,92 @@ +import collectd +import time +import os +import datetime + +PLUGIN_NAME = "network_irq_affinity" +UID = "0" +IRQS = [] + + +def set_uid(): + global UID + UID = datetime.datetime.today().strftime("%Y%m%d%H%M%S%f") + + +def logger(message): + collectd.info("%s: %s" % (PLUGIN_NAME, message)) + + +def 
_GetIrqAffinity(irq): + global UID + os.system('cat /proc/irq/{irq}/smp_affinity > /tmp/nirqa{UID}{irq}.out'.format(irq=irq, UID=UID)) + with open("/tmp/nirqa{UID}{irq}.out".format(irq=irq, UID=UID)) as f: + lines = f.readlines() + return lines[0].strip() + + +def _GetIpInterfaceName(): + global UID + os.system("ip link | awk -F: '$0 !~ \"lo|vir|wl|^[^0-9]\"{print $2a;getline}' > /tmp/nirqa%snm.out" % (UID)) + with open("/tmp/nirqa{UID}nm.out".format(UID=UID)) as f: + lines = f.readlines() + return lines[0].strip() + + +def _GetIpInterfaceIrqs(): + global UID + os.system("cat /proc/interrupts | grep {name} | awk '{{print substr($1,1,length($1)-1)}}' > /tmp/nirqa{UID}irqs.out" + .format(name=_GetIpInterfaceName(), UID=UID)) + with open("/tmp/nirqa{UID}irqs.out".format(UID=UID)) as f: + lines = f.readlines() + irqs = [] + for line in lines: + irqs.append(line.strip()) + return irqs + + +def _GetCpusFromIrqAffinity(affinity): + """returns a list of ints representing a single irq's associated CPUs""" + """e.g. 00000000,10000000,00000000""" + cpus = [] + position = 0 + for c in reversed(affinity): + if c == ',': + continue + n = int(c, 16) + for i in range(4): + if n & 2 ** i: + cpus.append((position * 4) + i) + position += 1 + return cpus + + +def config_func(config): + pass + + +def read_func(): + global IRQS + for irq in IRQS: + cpus = _GetCpusFromIrqAffinity(_GetIrqAffinity(irq)) + # logger(str(irq) + ":" + str(cpus)) + metric = collectd.Values() + metric.plugin = PLUGIN_NAME + metric.type = "count" + metric.type_instance = irq + metric.values = cpus + metric.dispatch() + collectd.flush() + + +def init_func(): + set_uid() + + global IRQS + IRQS = _GetIpInterfaceIrqs() + + +# Hook Callbacks, Order is important! +collectd.register_config(config_func) +collectd.register_init(init_func) +collectd.register_read(read_func) diff --git a/script/cumulus/pkb/perfkitbenchmarker/data/collectd_plugins/plugin_utils.sh b/script/cumulus/pkb/perfkitbenchmarker/data/collectd_plugins/plugin_utils.sh new file mode 100755 index 0000000..02abd08 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/data/collectd_plugins/plugin_utils.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +LINUX_DISTRIBUTION=`python -c 'import platform; print (platform.dist()[0].lower())'` +KERNEL=`uname -r` + +function is_ubuntu() { + # 0 = true ; 1 = false + if [ $LINUX_DISTRIBUTION = "ubuntu" ]; then + return 0 + fi + return 1 +} + +function is_centos() { + if [ $LINUX_DISTRIBUTION = "centos" ]; then + return 0 + fi + if [ $LINUX_DISTRIBUTION = "redhat" ]; then + return 0 + fi + return 1 +} + +function get_kernel_version() { + return $KERNEL +} diff --git a/script/cumulus/pkb/perfkitbenchmarker/data/collectd_plugins/sysstat_depend.sh b/script/cumulus/pkb/perfkitbenchmarker/data/collectd_plugins/sysstat_depend.sh new file mode 100755 index 0000000..9b2fdd5 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/data/collectd_plugins/sysstat_depend.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +. 
plugin_utils.sh + +if is_ubuntu; then + apt-get install -y sysstat +fi + +if is_centos; then + yum install -y sysstat +fi diff --git a/script/cumulus/pkb/perfkitbenchmarker/data/collectd_plugins/sysstat_memstat.py b/script/cumulus/pkb/perfkitbenchmarker/data/collectd_plugins/sysstat_memstat.py new file mode 100644 index 0000000..13ffba6 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/data/collectd_plugins/sysstat_memstat.py @@ -0,0 +1,171 @@ +import collectd +import threading +import time +import os +import datetime + +mapping = { + "pgpgin/s": [3, "gauge"], + "pgpgout/s": [4, "gauge"], + "fault/s": [5, "gauge"], + "majflt/s": [6, "gauge"], + "pgfree/s": [7, "gauge"], + "pgscank/s": [8, "gauge"], + "pgscand/s": [9, "gauge"], + "pgsteal/s": [10, "gauge"], + "vmeff": [11, "percent"], + "kbmemfree": [12, "memory"], + "kbmemused": [13, "memory"], + "memused": [14, "percent"], + "kbbuffers": [15, "memory"], + "kbcached": [16, "memory"], + "kbcommit": [17, "counter"], + "commit": [18, "percent"], + "kbactive": [19, "memory"], + "kbinact": [20, "memory"], + "kbdirty": [21, "memory"], + "kbswpfree": [22, "memory"], + "kbswpused": [23, "memory"], + "swpused": [24, "percent"], + "kbswpcad": [25, "memory"], + "swpcad": [26, "percent"], +} + + +class Queue: + def __init__(self): + self.content = [] + self.lock = threading.Lock() + + def put(self, element): + self.lock.acquire() + self.content.insert(0, element) + self.lock.release() + + def get(self): + self.lock.acquire() + element = self.content.pop() + self.lock.release() + return element + + def empty(self): + self.lock.acquire() + result = len(self.content) == 0 + self.lock.release() + return result + + +PLUGIN_NAME = "sysstat_memstat" +UID = "0" +SAMPLING_RATE = 5 +METRICS = ["kbmemused"] +q = Queue() + + +def set_uid(): + global UID + UID = datetime.datetime.today().strftime("%Y%m%d%H%M%S%f") + + +def logger(message): + collectd.info("%s: %s" % (PLUGIN_NAME, message)) + + +def thread_runner_func(): + global q + global SAMPLING_RATE + global UID + while True: + os.system("rm -rf /tmp/mysar{UID} ; rm -rf /tmp/myout{UID}".format(UID=UID)) + os.system("sar -A -o /tmp/mysar{UID} 1 1 > /dev/null 2>&1".format(UID=UID)) + os.system("sadf /tmp/mysar{UID} -d -U -h -- -rSB | tr -s ';' ',' > /tmp/myout{UID}".format(UID=UID)) # memory stats + with open("/tmp/myout{UID}".format(UID=UID)) as f: + lines = f.readlines() + del lines[0] + # del lines[0] + data = { + "lines": lines + } + q.put(data) + time.sleep(SAMPLING_RATE - 1) + + +def config_func(config): + interval_set = False + metrics_set = False + + for node in config.children: + key = node.key.lower() + val = node.values[0] + logger(key) + logger(val) + if key == "samplingrate": + global SAMPLING_RATE + SAMPLING_RATE = int(val) + interval_set = True + elif key == "metrics": + global METRICS + if str(val) == "all": + METRICS = mapping.keys() + metrics_set = True + else: + intermediary_str = str(val).split(",") + new_metrics = [] + for metric in intermediary_str: + if metric in mapping.keys(): + new_metrics.append(metric) + METRICS = new_metrics + metrics_set = True + else: + logger('Unknown config key "%s"' % key) + + if interval_set: + logger("Using overridden interval: %s" % str(SAMPLING_RATE)) + else: + logger("Using default interval: %s " % str(SAMPLING_RATE)) + + if metrics_set: + logger("Using overridden metrics %s" % str(METRICS)) + else: + logger("Using default metrics: %s " % str(METRICS)) + + +def read_func(): + # logger("In read_func. 
Queue empty: %s" % str(q.empty()) ) + global METRICS + while not q.empty(): + data = q.get() + for line in data["lines"]: + for keyword in METRICS: + fields = line.split(",") + field_value = fields[mapping[keyword][0]] + + metric = collectd.Values() + metric.plugin = PLUGIN_NAME + + # DS_TYPE_COUNTER, DS_TYPE_GAUGE, DS_TYPE_DERIVE or DS_TYPE_ABSOLUTE. + # https://collectd.org/documentation/manpages/collectd-python.5.shtml + # gauge expects data to be float. Needs 1 single value + # counter expects data to be int + # more: cat ./collectd/share/collectd/types.db + # http://giovannitorres.me/using-collectd-python-and-graphite-to-graph-slurm-partitions.html + metric.type = mapping[keyword][1] + + metric.type_instance = keyword + metric.values = [field_value] + # metric.host = 'OverwritenHostname' + metric.dispatch() + collectd.flush() + + +def init_func(): + set_uid() + worker_thread = threading.Thread(target=thread_runner_func, args=()) + worker_thread.start() + logger("Monitoring thread started") + + +# Hook Callbacks, Order is important! +collectd.register_config(config_func) +collectd.register_init(init_func) +collectd.register_read(read_func) diff --git a/script/cumulus/pkb/perfkitbenchmarker/data/edw/README b/script/cumulus/pkb/perfkitbenchmarker/data/edw/README new file mode 100644 index 0000000..9fb53f6 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/data/edw/README @@ -0,0 +1 @@ +Leave it empty but the directory is required diff --git a/script/cumulus/pkb/perfkitbenchmarker/data/emon/README.md b/script/cumulus/pkb/perfkitbenchmarker/data/emon/README.md new file mode 100644 index 0000000..f5138ce --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/data/emon/README.md @@ -0,0 +1,43 @@ +### EMON + +This telemetry data collection tool can be used to collect EMON data on the SUT(s) while the workload is running on the same SUT(s). This is one of the traces based tool set in the Perfkitbenchmark (PKB) framework. By default, the telemetry tool will be fired at the same time the benchmark is launched, and terminated when the benchmark run is completed. The result will be pulled back to the PKB host in the PKB output folder. + +"--emon" is the only required command line flag for EMON collection. All other EMON related flags are optional. + +⚠️ **Please note that currently EMON runs only on bare metal instances.** + +``` + +Note :- By default the latest emon is available for you to run. If you want to use a custom version, +use --emon_tarball= + +For AMD runs, download and use the emon AMD version from emon website with --emon_tarball= + +``` + +#### Use Case 1: Collect EMON data and conduct EDP post-processing on-prem. 
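+The command below runs the `sysbench_cpu` benchmark on the static machines described in `sysbench_cpu_config.yaml`; EMON is started alongside the benchmark, stopped when the run completes, and the collected data is pulled back into the PKB output folder for EDP post-processing.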
+ +``` +python3 ./pkb.py --emon --benchmarks=sysbench_cpu --benchmark_config_file=sysbench_cpu_config.yaml +``` + +##### Here is an example of sysbench_cpu_config.yaml: +```bash +static_vms: + - &worker + ip_address: 10.165.57.29 + user_name: pkb + ssh_private_key: ~/.ssh/id_rsa + internal_ip: 10.165.57.29 + tag: server +sysbench_cpu: + vm_groups: + vm_1: + static_vms: + - *worker + +flags: + sysbench_cpu_time: 60 # run for 60 seconds + sysbench_cpu_events: 0 # don't limit runtime by event count + sysbench_cpu_thread_counts: 1,0 # zero sets threadcount to number of VCPUs +``` diff --git a/script/cumulus/pkb/perfkitbenchmarker/data/proxy_ip_list/proxy_ip_list.txt b/script/cumulus/pkb/perfkitbenchmarker/data/proxy_ip_list/proxy_ip_list.txt new file mode 100644 index 0000000..5774985 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/data/proxy_ip_list/proxy_ip_list.txt @@ -0,0 +1 @@ +# Update this file if you need limit cloud VMs access source IP addresses to certain CIDRs only such as 3.140.220.0/24 diff --git a/script/cumulus/pkb/perfkitbenchmarker/data/ssh_config.j2 b/script/cumulus/pkb/perfkitbenchmarker/data/ssh_config.j2 new file mode 100644 index 0000000..bf71210 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/data/ssh_config.j2 @@ -0,0 +1,45 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +Host * + Protocol=2 + UserKnownHostsFile=/dev/null + StrictHostKeyChecking=no + IdentitiesOnly=yes + PreferredAuthentications=publickey + PasswordAuthentication=no + ConnectTimeout=5 + GSSAPIAuthentication=no + ServerAliveInterval=30 + ServerAliveCountMax=10 + BatchMode=yes + +{# Numeric / name-based access -#} +{% for vm in vms %} +Host {{ vm.name }} vm{{ loop.index0 }} + HostName={{ vm.ip_address }} + User={{ vm.user_name }} + Port={{ vm.ssh_port }} + IdentityFile={{ vm.ssh_private_key }} +{% endfor %} + +{# Group-based access -#} +{% for group, group_vms in vm_groups.items() %} +{% for vm in group_vms %} +Host {{group}}-{{ loop.index0 }} + HostName={{ vm.ip_address }} + User={{ vm.user_name }} + Port={{ vm.ssh_port }} + IdentityFile={{ vm.ssh_private_key }} +{% endfor -%} +{% endfor -%} diff --git a/script/cumulus/pkb/perfkitbenchmarker/data_discovery_service.py b/script/cumulus/pkb/perfkitbenchmarker/data_discovery_service.py new file mode 100644 index 0000000..925519e --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/data_discovery_service.py @@ -0,0 +1,65 @@ +"""Benchmarking support for Data Discovery Services for data lakes. + +In this module, the base class for Data Discovery Services is defined, which +outlines the common interface that must be implemented for each concrete Data +Discovery Service PKB supports (e.g. AWS Glue Crawler). 
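+Concrete services are looked up via GetDataDiscoveryServiceClass(cloud,
+service_type) and must implement DiscoverData(), which returns the elapsed
+discovery time in seconds.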
+""" + +import abc +from typing import Any, Dict + +from absl import flags +from perfkitbenchmarker import resource + +_DATA_DISCOVERY_OBJECT_STORE_PATH = flags.DEFINE_string( + 'data_discovery_object_store_path', None, + 'Object store path which will be analyzed by the Data Discovery Service. ' + 'Must be a fully qualified object store URL (e.g. s3://bucket/dir for S3, ' + 'or gs://bucket/dir for GCS).' +) +_DATA_DISCOVERY_REGION = flags.DEFINE_string( + 'data_discovery_region', None, + 'Region on which the data discovery service will be deployed.' +) + +# Available service types +GLUE = 'glue' + + +def GetDataDiscoveryServiceClass(cloud, service_type): + """Gets the underlying Data Discovery Service class.""" + return resource.GetResourceClass( + BaseDataDiscoveryService, CLOUD=cloud, SERVICE_TYPE=service_type) + + +class BaseDataDiscoveryService(resource.BaseResource): + """Common interface of a data discovery service resource. + + Attributes: + region: The region the service was set up to use. + data_discovery_path: The object store path containing the files to be + discovered. + """ + + REQUIRED_ATTRS = ['CLOUD', 'SERVICE_TYPE'] + RESOURCE_TYPE = 'BaseDataDiscoveryService' + CLOUD = 'abstract' + SERVICE_TYPE = 'abstract' + + def __init__(self): + super().__init__() + self.region = _DATA_DISCOVERY_REGION.value + self.data_discovery_path = _DATA_DISCOVERY_OBJECT_STORE_PATH.value + + @classmethod + def FromSpec(cls, data_discovery_service_spec): + return cls() + + @abc.abstractmethod + def DiscoverData(self) -> float: + """Runs data discovery. Returns the time elapsed in secs.""" + raise NotImplementedError('Must implement in subclasses.') + + def GetMetadata(self) -> Dict[str, Any]: + """Return a dictionary of the metadata for this service.""" + return {'cloud': self.CLOUD, 'data_discovery_region': self.region} diff --git a/script/cumulus/pkb/perfkitbenchmarker/disk.py b/script/cumulus/pkb/perfkitbenchmarker/disk.py new file mode 100644 index 0000000..8ababf9 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/disk.py @@ -0,0 +1,637 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing abstract classes related to disks. + +Disks can be created, deleted, attached to VMs, and detached from VMs. 
+""" + +import abc +import logging + +from absl import flags +from perfkitbenchmarker import resource +from perfkitbenchmarker.configs import option_decoders +from perfkitbenchmarker.configs import spec +import six + +flags.DEFINE_boolean('nfs_timeout_hard', True, + 'Whether to use hard or soft for NFS mount.') +flags.DEFINE_integer('nfs_rsize', 1048576, 'NFS read size.') +flags.DEFINE_integer('nfs_wsize', 1048576, 'NFS write size.') +flags.DEFINE_integer('nfs_timeout', 60, 'NFS timeout.') +flags.DEFINE_integer('nfs_retries', 2, 'NFS Retries.') +flags.DEFINE_integer( + 'nfs_nconnect', None, 'Number of connections that each NFS client should ' + 'establish to the server.') +flags.DEFINE_boolean( + 'nfs_noresvport', False, + 'Whether the NFS client should use a non-privileged ' + 'source port. Suggested to use with EFS') +flags.DEFINE_boolean( + 'nfs_managed', True, + 'Use a managed NFS service if using NFS disks. Otherwise ' + 'start an NFS server on the first VM.') +flags.DEFINE_string( + 'nfs_ip_address', None, + 'If specified, PKB will target this ip address when ' + 'mounting NFS "disks" rather than provisioning an NFS ' + 'Service for the corresponding cloud.') +flags.DEFINE_string( + 'nfs_directory', None, + 'Directory to mount if using a StaticNfsService. This ' + 'corresponds to the "VOLUME_NAME" of other NfsService ' + 'classes.') +flags.DEFINE_string('smb_version', '3.0', 'SMB version.') +flags.DEFINE_list('mount_options', [], + 'Additional arguments to supply when mounting.') +flags.DEFINE_list('fstab_options', [], + 'Additional arguments to supply to fstab.') + +FLAGS = flags.FLAGS + +# These are the (deprecated) old disk type names +STANDARD = 'standard' +REMOTE_SSD = 'remote_ssd' +PIOPS = 'piops' # Provisioned IOPS (SSD) in AWS and Alicloud +REMOTE_ESSD = 'remote_essd' # Enhanced Cloud SSD in Alicloud + +# 'local' refers to disks that come attached to VMs. It is the only +# "universal" disk type that is not associated with a provider. It +# exists because we can provision a local disk without creating a disk +# spec. The Aerospike benchmarks use this fact in their config +# methods, and they need to be able to tell when a disk is local. So +# until that changes, 'local' is a special disk type. +LOCAL = 'local' + +RAM = 'ram' + +# refers to disks that come from a cloud/unmanaged NFS or SMB service +NFS = 'nfs' +SMB = 'smb' + +# FUSE mounted object storage bucket +OBJECT_STORAGE = 'object_storage' + +# Map old disk type names to new disk type names +DISK_TYPE_MAPS = dict() + +# Standard metadata keys relating to disks +MEDIA = 'media' +REPLICATION = 'replication' +# And some possible values +HDD = 'hdd' +SSD = 'ssd' +NONE = 'none' +ZONE = 'zone' +REGION = 'region' + +DEFAULT_MOUNT_OPTIONS = 'discard' +DEFAULT_FSTAB_OPTIONS = 'defaults' + + +# TODO(user): remove this function when we remove the deprecated +# flags and disk type names. +def RegisterDiskTypeMap(provider_name, type_map): + """Register a map from legacy disk type names to modern ones. + + The translation machinery looks here to find the map corresponding + to the chosen provider and translates the user's flags and configs + to the new naming system. This function should be removed once the + (deprecated) legacy flags are removed. + + Args: + provider_name: a string. The name of the provider. Must match the names we + give to providers in benchmark_spec.py. + type_map: a dict. Maps generic disk type names (STANDARD, REMOTE_SSD, PIOPS) + to provider-specific names. 
+ """ + + DISK_TYPE_MAPS[provider_name] = type_map + + +def GetDiskSpecClass(cloud): + """Get the DiskSpec class corresponding to 'cloud'.""" + return spec.GetSpecClass(BaseDiskSpec, CLOUD=cloud) + + +def WarnAndTranslateDiskTypes(name, cloud): + """Translate old disk types to new disk types, printing warnings if needed. + + Args: + name: a string specifying a disk type, either new or old. + cloud: the cloud we're running on. + + Returns: + The new-style disk type name (i.e. the provider's name for the type). + """ + + if cloud in DISK_TYPE_MAPS: + disk_type_map = DISK_TYPE_MAPS[cloud] + if name in disk_type_map and disk_type_map[name] != name: + new_name = disk_type_map[name] + logging.warning( + 'Disk type name %s is deprecated and will be removed. ' + 'Translating to %s for now.', name, new_name) + return new_name + else: + return name + else: + logging.info('No legacy->new disk type map for provider %s', cloud) + # The provider has not been updated to use new-style names. We + # need to keep benchmarks working, so we pass through the name. + return name + + +def WarnAndCopyFlag(old_name, new_name): + """Copy a value from an old flag to a new one, warning the user. + + Args: + old_name: old name of flag. + new_name: new name of flag. + """ + + if FLAGS[old_name].present: + logging.warning( + 'Flag --%s is deprecated and will be removed. Please ' + 'switch to --%s.', old_name, new_name) + if not FLAGS[new_name].present: + FLAGS[new_name].value = FLAGS[old_name].value + + # Mark the new flag as present so we'll print it out in our list + # of flag values. + FLAGS[new_name].present = True + else: + logging.warning('Ignoring legacy flag %s because new flag %s is present.', + old_name, new_name) + # We keep the old flag around so that providers that haven't been + # updated yet will continue to work. + + +DISK_FLAGS_TO_TRANSLATE = { + 'scratch_disk_type': 'data_disk_type', + 'scratch_disk_iops': 'aws_provisioned_iops', + 'scratch_disk_throughput': 'aws_provisioned_throughput', + 'scratch_disk_size': 'data_disk_size' +} + + +def WarnAndTranslateDiskFlags(): + """Translate old disk-related flags to new disk-related flags.""" + + for old, new in six.iteritems(DISK_FLAGS_TO_TRANSLATE): + WarnAndCopyFlag(old, new) + + +class BaseDiskSpec(spec.BaseSpec): + """Stores the information needed to create a disk. + + Attributes: + device_path: None or string. Path on the machine where the disk is located. + disk_number: None or int. Optional disk identifier unique within the current + machine. + disk_size: None or int. Size of the disk in GB. + disk_type: None or string. See cloud specific disk classes for more + information about acceptable values. + mount_point: None or string. Directory of mount point. + num_striped_disks: int. The number of disks to stripe together. If this is + 1, it means no striping will occur. This must be >= 1. + """ + + SPEC_TYPE = 'BaseDiskSpec' + CLOUD = None + + def __init__(self, *args, **kwargs): + self.device_path: str = None + self.mount_point: str = None + super(BaseDiskSpec, self).__init__(*args, **kwargs) + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Overrides config values with flag values. + + Can be overridden by derived classes to add support for specific flags. + + Args: + config_values: dict mapping config option names to provided values. Is + modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. 
+ + Returns: + dict mapping config option names to values derived from the config + values or flag values. + """ + super(BaseDiskSpec, cls)._ApplyFlags(config_values, flag_values) + if flag_values['data_disk_size'].present: + config_values['disk_size'] = flag_values.data_disk_size + if flag_values['data_disk_type'].present: + config_values['disk_type'] = flag_values.data_disk_type + if flag_values['num_striped_disks'].present: + config_values['num_striped_disks'] = flag_values.num_striped_disks + if flag_values['scratch_dir'].present: + config_values['mount_point'] = flag_values.scratch_dir + if flag_values['nfs_version'].present: + config_values['nfs_version'] = flag_values.nfs_version + if flag_values['nfs_timeout_hard'].present: + config_values['nfs_timeout_hard'] = flag_values.nfs_timeout_hard + if flag_values['nfs_rsize'].present: + config_values['nfs_rsize'] = flag_values.nfs_rsize + if flag_values['nfs_wsize'].present: + config_values['nfs_wsize'] = flag_values.nfs_wsize + if flag_values['nfs_timeout'].present: + config_values['nfs_timeout'] = flag_values.nfs_timeout + if flag_values['nfs_retries'].present: + config_values['nfs_retries'] = flag_values.nfs_retries + if flag_values['nfs_nconnect'].present: + config_values['nfs_nconnect'] = flag_values.nfs_nconnect + if flag_values['nfs_ip_address'].present: + config_values['nfs_ip_address'] = flag_values.nfs_ip_address + if flag_values['nfs_managed'].present: + config_values['nfs_managed'] = flag_values.nfs_managed + if flag_values['nfs_directory'].present: + config_values['nfs_directory'] = flag_values.nfs_directory + if flag_values['smb_version'].present: + config_values['smb_version'] = flag_values.smb_version + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Can be overridden by derived classes to add options or impose additional + requirements on existing options. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword + arguments to construct in order to decode the named option. 
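+
+    A derived spec might extend this as follows (illustrative option name):
+
+      result = super(ExampleDiskSpec, cls)._GetOptionDecoderConstructions()
+      result.update({'provisioned_iops': (option_decoders.IntDecoder,
+                                          {'default': None, 'none_ok': True})})
+      return result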
+ """ + result = super(BaseDiskSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'device_path': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'disk_number': (option_decoders.IntDecoder, { + 'default': None, + 'none_ok': True + }), + 'disk_size': (option_decoders.IntDecoder, { + 'default': None, + 'none_ok': True + }), + 'disk_type': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'mount_point': (option_decoders.StringDecoder, { + 'default': None, + 'none_ok': True + }), + 'num_striped_disks': (option_decoders.IntDecoder, { + 'default': 1, + 'min': 1 + }), + 'nfs_version': (option_decoders.StringDecoder, { + 'default': None + }), + 'nfs_ip_address': (option_decoders.StringDecoder, { + 'default': None + }), + 'nfs_managed': (option_decoders.BooleanDecoder, { + 'default': True + }), + 'nfs_directory': (option_decoders.StringDecoder, { + 'default': None + }), + 'nfs_rsize': (option_decoders.IntDecoder, { + 'default': 1048576 + }), + 'nfs_wsize': (option_decoders.IntDecoder, { + 'default': 1048576 + }), + 'nfs_timeout': (option_decoders.IntDecoder, { + 'default': 60 + }), + 'nfs_timeout_hard': (option_decoders.BooleanDecoder, { + 'default': True + }), + 'nfs_retries': (option_decoders.IntDecoder, { + 'default': 2 + }), + 'nfs_nconnect': (option_decoders.IntDecoder, { + 'default': None + }), + 'smb_version': (option_decoders.StringDecoder, { + 'default': '3.0' + }), + }) + return result + + +class BaseDisk(resource.BaseResource): + """Object representing a Base Disk.""" + + is_striped = False + + def __init__(self, disk_spec): + super(BaseDisk, self).__init__() + self.disk_size = disk_spec.disk_size + self.disk_type = disk_spec.disk_type + self.mount_point = disk_spec.mount_point + self.num_striped_disks = disk_spec.num_striped_disks + self.metadata.update({ + 'type': self.disk_type, + 'size': self.disk_size, + 'num_stripes': self.num_striped_disks, + }) + + # Set in derived classes by Attach() + self.vm = None + + # Linux related attributes. + self.device_path = disk_spec.device_path + + # Windows related attributes. + + # The disk number corresponds to the order in which disks were attached to + # the instance. The System Disk has a disk number of 0. Any local disks + # have disk numbers ranging from 1 to the number of local disks on the + # system. Any additional disks that were attached after boot will have + # disk numbers starting at the number of local disks + 1. These disk + # numbers are used in diskpart scripts in order to identify the disks + # that we want to operate on. + self.disk_number = disk_spec.disk_number + + @property + def mount_options(self): + """Returns options to mount a disk. + + The default value 'discard' is from the linux VM's MountDisk method. + + See `man 8 mount` for usage. For example, returning "ro" will cause the + mount command to be "mount ... -o ro ..." mounting the disk as read only. + """ + opts = DEFAULT_MOUNT_OPTIONS + if FLAGS.mount_options: + opts = ','.join(FLAGS.mount_options) + self.metadata.update({'mount_options': opts}) + return opts + + @property + def fstab_options(self): + """Returns options to use in the /etc/fstab entry for this drive. + + The default value 'defaults' is from the linux VM's MountDisk method. + + See `man fstab` for usage. 
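+
+    For example (illustrative flag value), running with
+    --fstab_options=noatime,nodiratime makes this property return
+    'noatime,nodiratime' instead of 'defaults'.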
+ """ + opts = DEFAULT_FSTAB_OPTIONS + if FLAGS.fstab_options: + opts = ','.join(FLAGS.fstab_options) + self.metadata.update({'fstab_options': opts}) + return opts + + @abc.abstractmethod + def Attach(self, vm): + """Attaches the disk to a VM. + + Args: + vm: The BaseVirtualMachine instance to which the disk will be attached. + """ + pass + + def Detach(self): + """Detaches the disk from a VM.""" + # This is currently never called. + # TODO(pclay): Figure out if static VMs should call this. + raise NotImplementedError + + def GetDevicePath(self): + """Returns the path to the device inside a Linux VM.""" + if self.device_path is None: + raise ValueError('device_path is None.') + return self.device_path + + def GetDeviceId(self): + """Return the Windows DeviceId of this disk.""" + if self.disk_number is None: + raise ValueError('disk_number is None.') + return r'\\.\PHYSICALDRIVE%s' % self.disk_number + + +class StripedDisk(BaseDisk): + """Object representing several disks striped together.""" + + is_striped = True + + def __init__(self, disk_spec, disks): + """Initializes a StripedDisk object. + + Args: + disk_spec: A BaseDiskSpec containing the desired mount point. + disks: A list of BaseDisk objects that constitute the StripedDisk. + """ + super(StripedDisk, self).__init__(disk_spec) + self.disks = disks + self.metadata = disks[0].metadata.copy() + if self.disk_size: + self.metadata['size'] = self.disk_size * self.num_striped_disks + + def _Create(self): + for disk in self.disks: + disk.Create() + + def _Delete(self): + for disk in self.disks: + disk.Delete() + + def Attach(self, vm): + for disk in self.disks: + disk.Attach(vm) + + def Detach(self): + for disk in self.disks: + disk.Detach() + + +class MountableDisk(BaseDisk): + """Object representing a disk that produces a mounted directory. + + Examples are RamDisks or FUSE file systems. + """ + + def Attach(self, vm): + """Attaches the disk to a VM. + + Args: + vm: The BaseVirtualMachine instance to which the disk will be attached. + """ + pass + + def GetDevicePath(self): + """Returns the path to the device inside a Linux VM.""" + return None + + def GetDeviceId(self): + """Return the Windows DeviceId of this disk.""" + return None + + def _Create(self): + """Creates the disk.""" + pass + + def _Delete(self): + """Deletes the disk.""" + pass + + @abc.abstractmethod + def Mount(self, vm): + """Mount disk at specified mount point.""" + raise NotImplementedError() + + +class NetworkDisk(BaseDisk): + """Object representing a Network Disk.""" + + @abc.abstractmethod + def _GetNetworkDiskMountOptionsDict(self): + """Default mount options as a dict.""" + pass + + @property + def mount_options(self): + opts = [] + for key, value in sorted( + six.iteritems(self._GetNetworkDiskMountOptionsDict())): + opts.append('%s' % key if value is None else '%s=%s' % (key, value)) + return ','.join(opts) + + @property + def fstab_options(self): + return self.mount_options + + @abc.abstractmethod + def Attach(self): + """Attached NetworkDisk to a VM. Must set self.vm.""" + raise NotImplementedError() + + def _Create(self): + # handled by the Network Disk service + pass + + def _Delete(self): + # handled by the Network Disk service + pass + + +# TODO(chriswilkes): adds to the disk.GetDiskSpecClass registry +# that only has the cloud as the key. Look into using (cloud, disk_type) +# if causes problems +class NfsDisk(NetworkDisk): + """Provides options for mounting NFS drives. + + NFS disk should be ready to mount at the time of creation of this disk. 
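+
+  With the default NFS flags the generated options resolve to roughly
+  'hard,retrans=2,rsize=1048576,timeo=600,wsize=1048576' (illustrative; see
+  _GetNetworkDiskMountOptionsDict below).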
+ + Args: + disk_spec: The disk spec. + remote_mount_address: The host_address:/volume path to the NFS drive. + nfs_tier: The NFS tier / performance level of the server. + """ + + def __init__(self, + disk_spec, + remote_mount_address, + default_nfs_version=None, + nfs_tier=None): + super(NfsDisk, self).__init__(disk_spec) + self.nfs_version = disk_spec.nfs_version or default_nfs_version + self.nfs_timeout_hard = disk_spec.nfs_timeout_hard + self.nfs_rsize = disk_spec.nfs_rsize + self.nfs_wsize = disk_spec.nfs_wsize + self.nfs_timeout = disk_spec.nfs_timeout + self.nfs_retries = disk_spec.nfs_retries + self.nfs_nconnect = disk_spec.nfs_nconnect + self.device_path = remote_mount_address + for key, value in six.iteritems(self._GetNetworkDiskMountOptionsDict()): + self.metadata['nfs_{}'.format(key)] = value + if nfs_tier: + self.nfs_tier = nfs_tier + self.metadata['nfs_tier'] = nfs_tier + super(NfsDisk, self).GetResourceMetadata() + + def _GetNetworkDiskMountOptionsDict(self): + """Default NFS mount options as a dict.""" + options = { + 'hard' if self.nfs_timeout_hard else 'soft': None, + 'rsize': self.nfs_rsize, + 'wsize': self.nfs_wsize, + 'timeo': self.nfs_timeout * 10, # in decaseconds + 'retrans': self.nfs_retries, + } + # the client doesn't have to specify an NFS version to use (but should) + if self.nfs_version: + options['nfsvers'] = self.nfs_version + if self.nfs_nconnect: + options['nconnect'] = self.nfs_nconnect + if FLAGS.nfs_noresvport: + options['noresvport'] = None + return options + + def Attach(self, vm): + self.vm = vm + self.vm.Install('nfs_utils') + + +class SmbDisk(NetworkDisk): + """Provides options for mounting SMB drives. + + SMB disk should be ready to mount at the time of creation of this disk. + + Args: + disk_spec: The disk spec. + remote_mount_address: The host_address:/volume path to the SMB drive. + smb_tier: The SMB tier / performance level of the server. + """ + + def __init__(self, + disk_spec, + remote_mount_address, + storage_account_and_key, + default_smb_version=None, + smb_tier=None): + super(SmbDisk, self).__init__(disk_spec) + self.smb_version = disk_spec.smb_version + self.device_path = remote_mount_address + self.storage_account_and_key = storage_account_and_key + if smb_tier: + self.metadata['smb_tier'] = smb_tier + + def _GetNetworkDiskMountOptionsDict(self): + """Default SMB mount options as a dict.""" + options = { + 'vers': self.smb_version, + 'username': self.storage_account_and_key['user'], + 'password': self.storage_account_and_key['pw'], + 'dir_mode': '0777', + 'file_mode': '0777', + 'serverino': None, + # the following mount option is a suggestion from + # https://docs.microsoft.com/en-us/azure/storage/files/storage-troubleshooting-files-performance#throughput-on-linux-clients-is-significantly-lower-when-compared-to-windows-clients + 'nostrictsync': None, + } + return options + + def Attach(self, vm): + self.vm = vm + self.vm.InstallPackages('cifs-utils') diff --git a/script/cumulus/pkb/perfkitbenchmarker/disk_iops_to_capacity.py b/script/cumulus/pkb/perfkitbenchmarker/disk_iops_to_capacity.py new file mode 100644 index 0000000..1b414c8 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/disk_iops_to_capacity.py @@ -0,0 +1,270 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""IOPS Storage Utility class. + +This class is used to translate an {IOPS, Cloud Provider} requirement +to {core, number of disks, storage size} machine requirements necessary +to meet the IOPS level using the Cloud Provider declared. + + - On AWS, we will use "ebs-gp2" storage type. + - On GCP, we will use PD-SSD storage type. + +In future versions, we will support Azure as well. + +The data used to make these Conversions was acquired in May 2017 from the +following resources: + +GCP Storage Ratings: https://cloud.google.com/compute/docs/disks/. Storage can + go as high as 64TB per disk but IOPS maxes out at 30,000 iops/disk + or (1000GB). +AWS Storage Ratings: http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ + EBSVolumeTypes.html#EBSVolumeTypes_gp2. Storage can go as + high as 16TiB per disk but IOPS maxes out at 10000 IOPS/volume size. + Reference gives volume size in GiB so converted to GB below. +GCP CPU Ratings: https://cloud.google.com/compute/docs/disks/ + performance#ssd-pd-performance +AWS CPU Ratings: https://aws.amazon.com/ebs/details/ +GCP Disk Number Ratings: https://cloud.google.com/compute/docs/disks/performance +AWS Disk Number Ratings: https://aws.amazon.com/ebs/details/ + + +Note: These conversions will require updating as performance and resources +change. + +To utilize this class, initialize an instance of the DiskIOPSToCapacity class +with the IOPS level desired and the provider you wish to use. The machine +requirement attributes will be immediately populated. +""" +import math +import numpy + + +class InvalidProviderError(Exception): + pass + + +class InvalidIOPSError(Exception): + pass + + +class InvalidStorageTypeError(Exception): + pass + + +# GLOBAL STRINGS +AWS = 'AWS' +GCP = 'GCP' +MAX_IOPS = 'MAX_IOPS' +DEFAULT_STORAGE_TYPE = 'DEFAULT_STORAGE_TYPE' +VALID_STORAGE_TYPES = 'VALID_STORAGE_TYPES' +# Cloud providers info dictionary. Will be updated when support more cloud +# providers. VALID_STORAGE_TYPES are storage types that are currently +# supported by disk_iops_to_capacity converter. +# MAX IOPS Sources: +# GCP: Increasing vCPUs increases IOPS performance. Top IOPS performance is +# per instance is averaged to 30000. +# https://cloud.google.com/compute/docs/disks/performance +# AWS: Can add additional volumes to increase IOPS performance. Top IOPS +# performance per CPU is 75000. +# http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html +CLOUD_PROVIDERS_INFO = { + AWS: { + MAX_IOPS: 75000, + DEFAULT_STORAGE_TYPE: 'ebs-gp2', + VALID_STORAGE_TYPES: ['ebs-gp2'] + }, + GCP: { + MAX_IOPS: 30000, + DEFAULT_STORAGE_TYPE: 'pd-ssd', + VALID_STORAGE_TYPES: ['pd-ssd'], + } +} + + +class DiskIOPSToCapacity(object): + """Given iops and service provider requirements, return disk configurations. + + This class is used to translate an {IOPS, Cloud Provider} requirement + to {core, number of disks, storage size} machine requirements necessary + to meet the IOPS level using the Cloud MySQL Provider declared. + + Currently assumes SSD persistent disks. + TODO: + - Implement Azure calculations. Add other cloud providers as applicable. 
+ - Support other storage types such as HDD and/or EBS-piops. + Add a further parameter of Disk Type (default SSD PD) and update + calculations to include HDD disk iops levels. + + Attributes: + _iops: Number of IOPS required. + _provider: 'AWS' or 'GCP'. + _size: Minimum size (GB) required for _iops level with _provider. + _number_disks: Disk number required to meet _iops level with _provider. + _cpu_count: vCPUs per instance required to meet _iops level with _provider. + """ + + def __init__(self, iops, provider=GCP, storage_type=None): + self._size = None + self._cpu_count = None + self._number_disks = None + self._iops = iops + self._provider = provider.upper() + self._storage_type = storage_type + self._ValidateProvider() + self._ValidateIOPS() + self._ValidateStorageType() + self._PopulateConfigs() + + def _ValidateStorageType(self): + """Validate storage type for given _provider, set to default if not given. + + Raises: + InvalidStorageTypeError: Incorrect storage type given. + + TODO: When support other types of storage types (i.e. when this class + supports ebs-piops for AWS or pd-hhd for gcp), will need to update + VALID_STORAGE_TYPES in CLOUD_PROVIDERS_INFO dictionary. + + """ + if self._storage_type: + self._storage_type = self._storage_type.lower() + if (self._storage_type is + not CLOUD_PROVIDERS_INFO[self._provider][DEFAULT_STORAGE_TYPE]): + raise InvalidStorageTypeError() + else: + self._storage_type = CLOUD_PROVIDERS_INFO[self._provider][ + DEFAULT_STORAGE_TYPE] + + def _ValidateProvider(self): + """Validate provider to be GCP or AWS, throw exception if invalid. + + Raises: + InvalidProviderError: Incorrect provider type given. + """ + if self._provider not in list(CLOUD_PROVIDERS_INFO.keys()): + raise InvalidProviderError('Provider given is not supported by ' + 'storage_utility.') + + def _ValidateIOPS(self): + """Validate IOPS to be within valid limits, throw exception if invalid. + + If IOPS parameter is less than 1 or greater than provider maximum IOPS + throw InvalidIOPSError. + + Raises: + InvalidIOPSError: Invalid IOPS parameter given. + """ + if (self._iops < 1 or + self._iops > CLOUD_PROVIDERS_INFO[self._provider][MAX_IOPS]): + raise InvalidIOPSError( + 'Invalid IOPS parameter, must be positive number less than ' + 'the maximum achievable for given cloud provider. ' + 'The maximum for {} is {}.'.format( + self._provider, CLOUD_PROVIDERS_INFO[self._provider][MAX_IOPS])) + + def _PopulateConfigs(self): + """Populate Storage Configurations.""" + self._SetSize() + self._SetCPUCount() + self._SetNumberDisks() + + def PrintConfigs(self): + """Print out necessary configs.""" + vm_config = ('For {} IOPS using {}, the following is required:\n\tStorage ' + 'Size (GB): {}\n\tCPU Count: {}\n\tNumber of ' + 'Disks: {}').format(self._iops, + self._provider.upper(), self._size, + self._cpu_count, self._number_disks) + print(vm_config) + + def _SetSize(self): + """Set minimum size (GB) necessary to achieve _iops level. + + Rating performance levels as of May 2017, sources found below. + GCP: ratings from https://cloud.google.com/compute/docs/disks/. Storage can + go as high as 64TB per disk but IOPS maxes out at 30,000 iops/disk + or (1000GB). + AWS: ratings from + http://docs.aws.amazon.com/AWSEC2/latest/ + UserGuide/EBSVolumeTypes.html#EBSVolumeTypes_gp2. Storage can go as + high as 16TiB per disk but IOPS maxes out at 10000 IOPS/volume size. + Reference gives volume size in GiB so converted to GB below. 
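+
+    Worked example (GCP, illustrative numbers): a 3000 IOPS requirement maps
+    to ceil(3000 / 30.0) = 100 GB, with a cap of 1000 GB at 30000 IOPS.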
+ """ + if self._provider == GCP: + self._size = min(int(math.ceil(self._iops / 30.0)), 30000 / 30) + elif self._provider == AWS: + value = self._iops + value = numpy.array(value) + self._size = int( + numpy.piecewise([value], [[value <= 100], [(value > 100) & ( + value <= 9999)], [value > 9999]], [ + lambda x: int(math.ceil(1.07374)), + lambda x: int(math.ceil(3 * value)), + lambda x: int(math.ceil(3579.855))])) + + def GetSize(self): + """Return storage size. + + Returns: + __size: Storage size (GB). + """ + return self._size + + def _SetCPUCount(self): + """Set cpu count. + + GCP: ratings from + https://cloud.google.com/compute/docs/disks/performance#ssd-pd-performance + AWS: to achieve good performance on EBS, one needs to use an + EBS-optimized VM instance, and the smallest VM instance that can be EBS + optimized is *.large VM types (e.g., c4.large), those comes with 2 cores. + ratings from + http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSOptimized.html + + """ + if self._provider == GCP: + value = self._iops + self._cpu_count = int( + numpy.piecewise([value], [[value <= 15000], [ + (value > 15000) & (value <= 25000) + ], [value > 25000]], [lambda x: 1, lambda x: 16, lambda x: 32])) + elif self._provider == AWS: + self._cpu_count = 2 + + def GetCPUCount(self): + """Return CPU count. + + Returns: + _cpu_count: CPU count. + """ + return self._cpu_count + + def _SetNumberDisks(self): + """Set number of disks. + + GCP: Adding disks does not increase IOPS for GCP. + AWS: ratings from https://aws.amazon.com/ebs/details/ + """ + if self._provider == GCP: + self._number_disks = 1 + elif self._provider == AWS: + self._number_disks = max(int(math.ceil(self._iops / 10000.0)), 1) + + def GetNumberDisks(self): + """Return Number of Disks. + + Returns: + _number_disks: Number of disks. + """ + return self._number_disks diff --git a/script/cumulus/pkb/perfkitbenchmarker/dpb_service.py b/script/cumulus/pkb/perfkitbenchmarker/dpb_service.py new file mode 100644 index 0000000..6da220f --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/dpb_service.py @@ -0,0 +1,872 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Benchmarking support for Data Processing Backend Services. + +In order to benchmark Data Processing Backend services such as Google +Cloud Platform's Dataproc and Dataflow or Amazon's EMR, we create a +BaseDpbService class. Classes to wrap specific backend services are in +the corresponding provider directory as a subclass of BaseDpbService. 
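+
+Illustrative usage sketch (the spec variable and job arguments are
+placeholders, not tied to a particular provider):
+
+  service_cls = GetDpbServiceClass('GCP', DATAPROC)
+  service = service_cls(dpb_service_spec)
+  result = service.SubmitJob(jarfile='gs://bucket/wordcount.jar',
+                             classname='WordCount',
+                             job_type=BaseDpbService.SPARK_JOB_TYPE)
+  print(result.wall_time)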
+""" + +import abc +import dataclasses +import datetime +import logging +from typing import Dict, List, Optional, Type + +from absl import flags +from perfkitbenchmarker import container_service +from perfkitbenchmarker import context +from perfkitbenchmarker import errors +from perfkitbenchmarker import resource +from perfkitbenchmarker import units +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.linux_packages import hadoop +from perfkitbenchmarker.linux_packages import spark +from perfkitbenchmarker.providers.aws import s3 +from perfkitbenchmarker.providers.aws import util as aws_util +from perfkitbenchmarker.providers.gcp import gcs +from perfkitbenchmarker.providers.gcp import util as gcp_util + +flags.DEFINE_string( + 'static_dpb_service_instance', None, + 'If set, the name of the pre created dpb implementation,' + 'assumed to be ready.') +flags.DEFINE_string('dpb_log_level', 'INFO', 'Manipulate service log level') +flags.DEFINE_string('dpb_job_jarfile', None, + 'Executable Jarfile containing workload implementation') +flags.DEFINE_string('dpb_job_classname', None, 'Classname of the job ' + 'implementation in the jar file') +flags.DEFINE_string( + 'dpb_service_bucket', None, 'A bucket to use with the DPB ' + 'service. If none is provided one will be created by the ' + 'benchmark and cleaned up afterwards unless you are using ' + 'a static instance.') +flags.DEFINE_string('dpb_service_zone', None, 'The zone for provisioning the ' + 'dpb_service instance.') +flags.DEFINE_list( + 'dpb_job_properties', [], 'A list of strings of the form ' + '"key=value" to be passed into DBP jobs.') +flags.DEFINE_list( + 'dpb_cluster_properties', [], 'A list of strings of the form ' + '"type:key=value" to be passed into DBP clusters. See ' + 'https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/cluster-properties.' +) + +FLAGS = flags.FLAGS + +# List of supported data processing backend services +DATAPROC = 'dataproc' +DATAPROC_GKE = 'dataproc_gke' +DATAPROC_SERVERLESS = 'dataproc_serverless' +DATAFLOW = 'dataflow' +EMR = 'emr' +UNMANAGED_DPB_SVC_YARN_CLUSTER = 'unmanaged_dpb_svc_yarn_cluster' +UNMANAGED_SPARK_CLUSTER = 'unmanaged_spark_cluster' +KUBERNETES_SPARK_CLUSTER = 'kubernetes_spark_cluster' +UNMANAGED_SERVICES = [ + UNMANAGED_DPB_SVC_YARN_CLUSTER, + UNMANAGED_SPARK_CLUSTER, +] + +# Default number of workers to be used in the dpb service implementation +DEFAULT_WORKER_COUNT = 2 + +# List of supported applications that can be enabled on the dpb service +FLINK = 'flink' +HIVE = 'hive' + +# Metrics and Status related metadata +# TODO(pclay): Remove these after migrating all callers to SubmitJob +SUCCESS = 'success' +RUNTIME = 'running_time' +WAITING = 'pending_time' + + +class JobNotCompletedError(Exception): + """Used to signal a job is still running.""" + pass + + +class JobSubmissionError(errors.Benchmarks.RunError): + """Thrown by all implementations if SubmitJob fails.""" + pass + + +@dataclasses.dataclass +class JobResult: + """Data class for the timing of a successful DPB job.""" + # Service reported execution time + run_time: float + # Service reported pending time (0 if service does not report). 
+ pending_time: float = 0 + + @property + def wall_time(self) -> float: + """The total time the service reported it took to execute.""" + return self.run_time + self.pending_time + + +class BaseDpbService(resource.BaseResource): + """Object representing a Data Processing Backend Service.""" + + REQUIRED_ATTRS = ['CLOUD', 'SERVICE_TYPE'] + RESOURCE_TYPE = 'BaseDpbService' + CLOUD = 'abstract' + SERVICE_TYPE = 'abstract' + HDFS_FS = 'hdfs' + GCS_FS = 'gs' + S3_FS = 's3' + + # Job types that are supported on the dpb service backends + PYSPARK_JOB_TYPE = 'pyspark' + SPARKSQL_JOB_TYPE = 'spark-sql' + SPARK_JOB_TYPE = 'spark' + HADOOP_JOB_TYPE = 'hadoop' + DATAFLOW_JOB_TYPE = 'dataflow' + BEAM_JOB_TYPE = 'beam' + + def _JobJars(self) -> Dict[str, Dict[str, str]]: + """Known mappings of jars in the cluster used by GetExecutionJar.""" + return { + self.SPARK_JOB_TYPE: { + # Default for Dataproc and EMR + 'examples': 'file:///usr/lib/spark/examples/jars/spark-examples.jar' + } + } + + def __init__(self, dpb_service_spec): + """Initialize the Dpb service object. + + Args: + dpb_service_spec: spec of the dpb service. + """ + is_user_managed = dpb_service_spec.static_dpb_service_instance is not None + # Hand over the actual creation to the resource module which treats the + # user_managed resources in a special manner and skips creation attempt + super(BaseDpbService, self).__init__(user_managed=is_user_managed) + self.spec = dpb_service_spec + self.dpb_hdfs_type = None + if dpb_service_spec.static_dpb_service_instance: + self.cluster_id = dpb_service_spec.static_dpb_service_instance + else: + self.cluster_id = 'pkb-' + FLAGS.run_uri + if FLAGS.dpb_service_bucket: + self.bucket = FLAGS.dpb_service_bucket + self.manage_bucket = False + else: + self.bucket = 'pkb-' + FLAGS.run_uri + self.manage_bucket = True + self.dpb_service_zone = FLAGS.dpb_service_zone + self.dpb_version = dpb_service_spec.version + self.dpb_service_type = 'unknown' + self.storage_service = None + + @property + def base_dir(self): + return self.persistent_fs_prefix + self.bucket # pytype: disable=attribute-error # bind-properties + + @abc.abstractmethod + def SubmitJob(self, + jarfile: Optional[str] = None, + classname: Optional[str] = None, + pyspark_file: Optional[str] = None, + query_file: Optional[str] = None, + job_poll_interval: Optional[float] = None, + job_stdout_file: Optional[str] = None, + job_arguments: Optional[List[str]] = None, + job_files: Optional[List[str]] = None, + job_jars: Optional[List[str]] = None, + job_type: Optional[str] = None, + properties: Optional[Dict[str, str]] = None) -> JobResult: + """Submit a data processing job to the backend. + + Args: + jarfile: Jar file to execute. + classname: Name of the main class. + pyspark_file: Comma separated list of Python files to be provided to the + job. Must be one of the following file formats ".py, .zip, or .egg". + query_file: HCFS URI of file containing Spark SQL script to execute as the + job. + job_poll_interval: integer saying how often to poll for job completion. + Not used by providers for which submit job is a synchronous operation. + job_stdout_file: String giving the location of the file in which to put + the standard out of the job. + job_arguments: List of string arguments to pass to driver application. + These are not the arguments passed to the wrapper that submits the job. + job_files: Files passed to a Spark Application to be distributed to + executors. 
+ job_jars: Jars to pass to the application + job_type: Spark or Hadoop job + properties: Dict of properties to pass with the job. + + Returns: + A JobResult with the timing of the successful job. + + Raises: + JobSubmissionError if job fails. + """ + pass + + def _WaitForJob(self, job_id, timeout, poll_interval): + + @vm_util.Retry( + timeout=timeout, + poll_interval=poll_interval, + fuzz=0, + retryable_exceptions=(JobNotCompletedError,)) + def Poll(): + result = self._GetCompletedJob(job_id) + if result is None: + raise JobNotCompletedError('Job {} not complete.'.format(job_id)) + return result + + return Poll() + + def _GetCompletedJob(self, job_id: str) -> Optional[JobResult]: + """Get the job result if it has finished. + + Args: + job_id: The step id to query. + + Returns: + A dictionary describing the job if the step the step is complete, + None otherwise. + + Raises: + JobSubmissionError if job fails. + """ + raise NotImplementedError('You need to implement _GetCompletedJob if you ' + 'use _WaitForJob') + + def GetSparkSubmitCommand( + self, + jarfile: Optional[str] = None, + classname: Optional[str] = None, + pyspark_file: Optional[str] = None, + query_file: Optional[str] = None, + job_arguments: Optional[List[str]] = None, + job_files: Optional[List[str]] = None, + job_jars: Optional[List[str]] = None, + job_type: Optional[str] = None, + properties: Optional[Dict[str, str]] = None, + spark_submit_cmd: str = spark.SPARK_SUBMIT) -> List[str]: + """Builds the command to run spark-submit on cluster.""" + # TODO(pclay): support BaseDpbService.SPARKSQL_JOB_TYPE + if job_type not in [ + BaseDpbService.PYSPARK_JOB_TYPE, + BaseDpbService.SPARK_JOB_TYPE, + ]: + raise NotImplementedError + cmd = [spark_submit_cmd] + # Order is important + if classname: + cmd += ['--class', classname] + all_properties = self.GetJobProperties() + all_properties.update(properties or {}) + for k, v in all_properties.items(): + cmd += ['--conf', '{}={}'.format(k, v)] + if job_files: + cmd = ['--files', ','.join(job_files)] + # Main jar/script goes last before args. + if job_type == BaseDpbService.SPARK_JOB_TYPE: + assert jarfile + cmd.append(jarfile) + elif job_type == BaseDpbService.PYSPARK_JOB_TYPE: + assert pyspark_file + cmd.append(pyspark_file) + if job_arguments: + cmd += job_arguments + return cmd + + def DistributedCopy(self, + source: str, + destination: str, + properties: Optional[Dict[str, str]] = None) -> JobResult: + """Method to copy data using a distributed job on the cluster. + + Args: + source: HCFS directory to copy data from. + destination: name of new HCFS directory to copy data into. + properties: properties to add to the job. Not supported on EMR. + + Returns: + A JobResult with the timing of the successful job. + + Raises: + JobSubmissionError if job fails. 
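+
+    Example (illustrative object store URIs):
+      service.DistributedCopy('gs://bucket/input', 'gs://bucket/input-copy')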
+ """ + return self.SubmitJob( + classname='org.apache.hadoop.tools.DistCp', + job_arguments=[source, destination], + job_type=BaseDpbService.HADOOP_JOB_TYPE, + properties=properties) + + def GetMetadata(self): + """Return a dictionary of the metadata for this cluster.""" + pretty_version = self.dpb_version or 'default' + basic_data = { + 'dpb_service': + self.dpb_service_type, + 'dpb_version': + pretty_version, + 'dpb_service_version': + '{}_{}'.format(self.dpb_service_type, pretty_version), + 'dpb_cluster_id': + self.cluster_id, + 'dpb_cluster_shape': + self.spec.worker_group.vm_spec.machine_type, + 'dpb_cluster_size': + self.spec.worker_count, + 'dpb_hdfs_type': + self.dpb_hdfs_type, + 'dpb_disk_size': + self.spec.worker_group.disk_spec.disk_size, + 'dpb_service_zone': + self.dpb_service_zone, + 'dpb_job_properties': + ','.join('{}={}'.format(k, v) + for k, v in self.GetJobProperties().items()), + 'dpb_cluster_properties': + ','.join(FLAGS.dpb_cluster_properties), + } + return basic_data + + def _CreateDependencies(self): + """Creates a bucket to use with the cluster.""" + if self.manage_bucket: + self.storage_service.MakeBucket(self.bucket) + + def _Create(self): + """Creates the underlying resource.""" + raise NotImplementedError() + + def _DeleteDependencies(self): + """Deletes the bucket used with the cluster.""" + if self.manage_bucket: + self.storage_service.DeleteBucket(self.bucket) + + def _Delete(self): + """Deletes the underlying resource. + + Implementations of this method should be idempotent since it may + be called multiple times, even if the resource has already been + deleted. + """ + raise NotImplementedError() + + def _ProcessWallTime(self, start_time, end_time): + """Compute the wall time from the given start and end processing time. + + Args: + start_time: Datetime value when the processing was started. + end_time: Datetime value when the processing completed. + + Returns: + Wall time in seconds. + + Raises: + ValueError: Exception raised when invalid input is provided. + """ + if start_time > end_time: + raise ValueError('start_time cannot be later than the end_time') + return (end_time - start_time).total_seconds() + + def GetJobProperties(self) -> Dict[str, str]: + """Parse the dpb_job_properties_flag.""" + return dict(pair.split('=') for pair in FLAGS.dpb_job_properties) + + def GetExecutionJar(self, job_category: str, job_type: str) -> str: + """Retrieve execution jar corresponding to the job_category and job_type. + + Args: + job_category: String category of the job for eg. hadoop, spark, hive, etc. + job_type: String name of the type of workload to executed on the cluster, + for eg. word_count, terasort, etc. + + Returns: + The path to the execusion jar on the cluster + + Raises: + NotImplementedError: An unsupported combination of job_category + and job_type was provided for execution on the cluster. + """ + jar = self._JobJars().get(job_category, {}).get(job_type) + if jar: + return jar + raise NotImplementedError( + f'No jar found for category {job_category} and type {job_type}.') + + def GetClusterCreateTime(self) -> Optional[float]: + """Returns the cluster creation time. + + This default implementation computes it by substracting the + resource_ready_time and create_start_time attributes. + + Returns: + A float representing the creation time in seconds or None. 
+ """ + if self.resource_ready_time is None or self.create_start_time is None: + return None + return self.resource_ready_time - self.create_start_time + + +class UnmanagedDpbService(BaseDpbService): + """Object representing an un-managed dpb service.""" + + def __init__(self, dpb_service_spec): + super(UnmanagedDpbService, self).__init__(dpb_service_spec) + # Dictionary to hold the cluster vms. + self.vms = {} + self.cloud = dpb_service_spec.worker_group.cloud + if not self.dpb_service_zone: + raise errors.Setup.InvalidSetupError( + 'dpb_service_zone must be provided, for provisioning.') + if self.cloud == 'GCP': + self.region = gcp_util.GetRegionFromZone(FLAGS.dpb_service_zone) + self.storage_service = gcs.GoogleCloudStorageService() + self.persistent_fs_prefix = 'gs://' + elif self.cloud == 'AWS': + self.region = aws_util.GetRegionFromZone(FLAGS.dpb_service_zone) + self.storage_service = s3.S3Service() + self.persistent_fs_prefix = 's3://' + else: + self.region = None + self.storage_service = None + self.persistent_fs_prefix = None + self.manage_bucket = False + logging.warning( + 'Cloud provider %s does not support object storage. ' + 'Some benchmarks will not work.', + self.cloud) + + if self.storage_service: + self.storage_service.PrepareService(location=self.region) + + # set in _Create of derived classes + self.leader = None + + def GetClusterCreateTime(self) -> Optional[float]: + """Returns the cluster creation time. + + UnmanagedDpbService Create phase doesn't consider actual VM creation, just + further provisioning. Thus, we need to add the VMs create time to the + default implementation. + + Returns: + A float representing the creation time in seconds or None. + """ + + my_create_time = super().GetClusterCreateTime() + if my_create_time is None: + return None + vms = [] + for vm_group in self.vms.values(): + for vm in vm_group: + vms.append(vm) + first_vm_create_start_time = min( + (vm.create_start_time + for vm in vms + if vm.create_start_time is not None), + default=None, + ) + last_vm_ready_start_time = max( + (vm.resource_ready_time + for vm in vms + if vm.resource_ready_time is not None), + default=None, + ) + if first_vm_create_start_time is None or last_vm_ready_start_time is None: + return None + return (my_create_time + last_vm_ready_start_time - + first_vm_create_start_time) + + +class UnmanagedDpbServiceYarnCluster(UnmanagedDpbService): + """Object representing an un-managed dpb service yarn cluster.""" + + CLOUD = 'Unmanaged' + SERVICE_TYPE = UNMANAGED_DPB_SVC_YARN_CLUSTER + + def __init__(self, dpb_service_spec): + super(UnmanagedDpbServiceYarnCluster, self).__init__(dpb_service_spec) + # Dictionary to hold the cluster vms. 
+ self.dpb_service_type = UNMANAGED_DPB_SVC_YARN_CLUSTER + # Set DPB version as Hadoop version for metadata + self.cloud = dpb_service_spec.worker_group.cloud + + def _Create(self): + """Create an un-managed yarn cluster.""" + logging.info('Should have created vms by now.') + logging.info(str(self.vms)) + + def InstallHadoop(vm): + vm.Install('hadoop') + if self.cloud == 'GCP': + hadoop.InstallGcsConnector(vm) + + if 'worker_group' not in self.vms: + raise errors.Resource.CreationError( + 'UnmanagedDpbServiceYarnCluster requires VMs in a worker_group.') + vm_util.RunThreaded(InstallHadoop, + self.vms['worker_group'] + self.vms['master_group']) + self.leader = self.vms['master_group'][0] + hadoop.ConfigureAndStart( + self.leader, self.vms['worker_group'], configure_s3=self.cloud == 'AWS') + + def SubmitJob(self, + jarfile=None, + classname=None, + pyspark_file=None, + query_file=None, + job_poll_interval=None, + job_stdout_file=None, + job_arguments=None, + job_files=None, + job_jars=None, + job_type=None, + properties=None): + """Submit a data processing job to the backend.""" + if job_type != self.HADOOP_JOB_TYPE: + raise NotImplementedError + cmd_list = [hadoop.HADOOP_CMD] + # Order is important + if jarfile: + cmd_list += ['jar', jarfile] + # Specifying classname only works if jarfile is omitted or if it has no + # main class. + if classname: + cmd_list += [classname] + all_properties = self.GetJobProperties() + all_properties.update(properties or {}) + cmd_list += ['-D{}={}'.format(k, v) for k, v in all_properties.items()] + if job_arguments: + cmd_list += job_arguments + cmd_string = ' '.join(cmd_list) + + start_time = datetime.datetime.now() + try: + stdout, _ = self.leader.RobustRemoteCommand(cmd_string, should_log=True) + except errors.VirtualMachine.RemoteCommandError as e: + raise JobSubmissionError() from e + end_time = datetime.datetime.now() + + if job_stdout_file: + with open(job_stdout_file, 'w') as f: + f.write(stdout) + return JobResult(run_time=(end_time - start_time).total_seconds()) + + def _Delete(self): + pass + + def _GetCompletedJob(self, job_id: str) -> Optional[JobResult]: + """Submitting Job via SSH is blocking so this is not meaningful.""" + raise NotImplementedError('Submitting Job via SSH is a blocking command.') + + +class UnmanagedDpbSparkCluster(UnmanagedDpbService): + """Object representing an un-managed dpb service spark cluster.""" + + CLOUD = 'Unmanaged' + SERVICE_TYPE = UNMANAGED_SPARK_CLUSTER + + def _JobJars(self) -> Dict[str, Dict[str, str]]: + """Known mappings of jars in the cluster used by GetExecutionJar.""" + return {self.SPARK_JOB_TYPE: {'examples': spark.SparkExamplesJarPath()}} + + def __init__(self, dpb_service_spec): + super(UnmanagedDpbSparkCluster, self).__init__(dpb_service_spec) + # Dictionary to hold the cluster vms. 
+ self.vms = {} + self.dpb_service_type = UNMANAGED_SPARK_CLUSTER + # Set DPB version as Spark version for metadata + self.dpb_version = 'spark_' + FLAGS.spark_version + self.cloud = dpb_service_spec.worker_group.cloud + + def _Create(self): + """Create an un-managed yarn cluster.""" + logging.info('Should have created vms by now.') + logging.info(str(self.vms)) + + def InstallSpark(vm): + vm.Install('spark') + if self.cloud == 'GCP': + hadoop.InstallGcsConnector(vm) + + if 'worker_group' not in self.vms: + raise errors.Resource.CreationError( + 'UnmanagedDpbSparkCluster requires VMs in a worker_group.') + + vm_util.RunThreaded(InstallSpark, + self.vms['worker_group'] + self.vms['master_group']) + self.leader = self.vms['master_group'][0] + spark.ConfigureAndStart( + self.leader, self.vms['worker_group'], configure_s3=self.cloud == 'AWS') + + def SubmitJob(self, + jarfile=None, + classname=None, + pyspark_file=None, + query_file=None, + job_poll_interval=None, + job_stdout_file=None, + job_arguments=None, + job_files=None, + job_jars=None, + job_type=None, + properties=None): + """Submit a data processing job to the backend.""" + cmd = self.GetSparkSubmitCommand( + jarfile=jarfile, + classname=classname, + pyspark_file=pyspark_file, + job_arguments=job_arguments, + job_files=job_files, + job_jars=job_jars, + job_type=job_type, + properties=properties) + start_time = datetime.datetime.now() + try: + stdout, _ = self.leader.RobustRemoteCommand( + ' '.join(cmd), should_log=True) + except errors.VirtualMachine.RemoteCommandError as e: + raise JobSubmissionError() from e + end_time = datetime.datetime.now() + + if job_stdout_file: + with open(job_stdout_file, 'w') as f: + f.write(stdout) + return JobResult(run_time=(end_time - start_time).total_seconds()) + + def _Delete(self): + pass + + def _GetCompletedJob(self, job_id: str) -> Optional[JobResult]: + """Submitting Job via SSH is blocking so this is not meaningful.""" + raise NotImplementedError('Submitting Job via SSH is a blocking command.') + + +class KubernetesSparkCluster(BaseDpbService): + """Object representing a Kubernetes dpb service spark cluster.""" + + CLOUD = container_service.KUBERNETES + SERVICE_TYPE = KUBERNETES_SPARK_CLUSTER + + # Constants to sychronize between YAML and Spark configuration + # TODO(pclay): Consider setting in YAML + SPARK_DRIVER_SERVICE = 'spark-driver' + SPARK_DRIVER_PORT = 4042 + SPARK_K8S_SERVICE_ACCOUNT = 'spark' + + def _JobJars(self) -> Dict[str, Dict[str, str]]: + """Known mappings of jars in the cluster used by GetExecutionJar.""" + return {self.SPARK_JOB_TYPE: {'examples': spark.SparkExamplesJarPath()}} + + # TODO(odiego): Implement GetClusterCreateTime adding K8s cluster create time + + def __init__(self, dpb_service_spec): + super().__init__(dpb_service_spec) + self.dpb_service_type = self.SERVICE_TYPE + # Set DPB version as Spark version for metadata + self.dpb_version = 'spark_' + FLAGS.spark_version + + benchmark_spec = context.GetThreadBenchmarkSpec() + self.k8s_cluster = benchmark_spec.container_cluster + assert self.k8s_cluster + assert self.k8s_cluster.CLUSTER_TYPE == container_service.KUBERNETES + self.cloud = self.k8s_cluster.CLOUD + self.container_registry = benchmark_spec.container_registry + assert self.container_registry + + self.spark_drivers = [] + + # TODO(pclay): Support overriding image? 
+ # Corresponds with data/docker/spark directory + self.image = 'spark' + + if self.cloud == 'GCP': + self.region = gcp_util.GetRegionFromZone(self.k8s_cluster.zone) + self.storage_service = gcs.GoogleCloudStorageService() + self.persistent_fs_prefix = 'gs://' + elif self.cloud == 'AWS': + self.region = self.k8s_cluster.region + self.storage_service = s3.S3Service() + self.persistent_fs_prefix = 's3://' + else: + raise errors.Config.InvalidValue( + f'Unsupported Cloud provider {self.cloud}') + + self.storage_service.PrepareService(location=self.region) + + # TODO(pclay): support + assert not FLAGS.dpb_cluster_properties + + if self.k8s_cluster.num_nodes < 2: + raise errors.Config.InvalidValue( + f'Cluster type {KUBERNETES_SPARK_CLUSTER} requires at least 2 nodes.' + f'Found {self.k8s_cluster.num_nodes}.') + + def _Create(self): + """Create docker image for cluster.""" + logging.info('Should have created k8s cluster by now.') + # TODO(pclay): Should resources publicly declare they have been created? + assert self.k8s_cluster.resource_ready_time + assert self.container_registry.resource_ready_time + logging.info(self.k8s_cluster) + logging.info(self.container_registry) + + logging.info('Building Spark image.') + self.image = self.container_registry.GetOrBuild(self.image) + + # https://spark.apache.org/docs/latest/running-on-kubernetes.html#rbac + # TODO(pclay): Consider moving into manifest + self.k8s_cluster.CreateServiceAccount( + self.SPARK_K8S_SERVICE_ACCOUNT, clusterrole='edit') + + def _GetDriverName(self): + return f'spark-driver-{len(self.spark_drivers)}' + + # Kubernetes unlike other Spark Shedulers reserves 40% of memory for PySpark + # instead of the normal 10%. We don't need much memory for PySpark, because + # our PySpark is 100% SQL and thus on the JVM (in dpb_sparksql_benchmark). + # Force Spark to reserve the normal 10% overhead in all cases. + # https://spark.apache.org/docs/latest/running-on-kubernetes.html#spark-properties + MEMORY_OVERHEAD_FACTOR = 0.1 + + def GetJobProperties(self) -> Dict[str, str]: + node_cpu = self.k8s_cluster.node_num_cpu + # TODO(pclay): Validate that we don't have too little memory? 
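+    # Illustrative arithmetic (hypothetical node with 16384 MiB allocatable):
+    # 16384 - 512 = 15872 MiB after the daemon reserve, then 15872 / 1.1
+    # ~= 14429 MiB once the 10% memory overhead factor is removed.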
+ node_memory_mb = self.k8s_cluster.node_memory_allocatable.m_as( + units.mebibyte) + # Reserve 512 MB for system daemons + node_memory_mb -= 512 + # Remove overhead + node_memory_mb /= (1 + self.MEMORY_OVERHEAD_FACTOR) + node_memory_mb = int(node_memory_mb) + + # Common PKB Spark cluster properties + properties = spark.GetConfiguration( + driver_memory_mb=node_memory_mb, + worker_memory_mb=node_memory_mb, + # Schedule one thread per vCPU + worker_cores=node_cpu, + # Reserve one node for driver + num_workers=self.k8s_cluster.num_nodes - 1, + configure_s3=self.cloud == 'AWS') + # k8s specific properties + properties.update({ + 'spark.driver.host': + self.SPARK_DRIVER_SERVICE, + 'spark.driver.port': + str(self.SPARK_DRIVER_PORT), + 'spark.kubernetes.driver.pod.name': + self._GetDriverName(), + # Tell Spark to under-report cores by 1 to fit next to k8s services + 'spark.kubernetes.executor.request.cores': + str(node_cpu - 1), + 'spark.kubernetes.container.image': + self.image, + # No HDFS available + 'spark.hadoop.fs.defaultFS': + self.base_dir, + 'spark.kubernetes.memoryOverheadFactor': + str(self.MEMORY_OVERHEAD_FACTOR), + }) + # User specified properties + properties.update(super().GetJobProperties()) + return properties + + def SubmitJob(self, + jarfile=None, + classname=None, + pyspark_file=None, + query_file=None, + job_poll_interval=None, + job_stdout_file=None, + job_arguments=None, + job_files=None, + job_jars=None, + job_type=None, + properties=None): + """Submit a data processing job to the backend.""" + # Specs can't be copied or created by hand. So we override the command of + # the spec for each job. + command = self.GetSparkSubmitCommand( + jarfile=jarfile, + classname=classname, + pyspark_file=pyspark_file, + job_arguments=job_arguments, + job_files=job_files, + job_jars=job_jars, + job_type=job_type, + properties=properties) + driver_name = self._GetDriverName() + # Request memory for driver. This should guarantee that driver does not get + # scheduled on same VM as exectutor and OOM. + driver_memory_mb = int( + self.GetJobProperties()[spark.SPARK_DRIVER_MEMORY].strip('m')) + start_time = datetime.datetime.now() + self.k8s_cluster.ApplyManifest( + 'container/spark/spark-driver.yaml.j2', + name=driver_name, + command=command, + driver_memory_mb=driver_memory_mb, + driver_port=self.SPARK_DRIVER_PORT, + driver_service=self.SPARK_DRIVER_SERVICE, + image=self.image, + service_account=self.SPARK_K8S_SERVICE_ACCOUNT) + container = container_service.KubernetesPod(driver_name) + # increments driver_name for next job + self.spark_drivers.append(container) + try: + container.WaitForExit() + except container_service.ContainerException as e: + raise JobSubmissionError() from e + end_time = datetime.datetime.now() + + if job_stdout_file: + with open(job_stdout_file, 'w') as f: + f.write(container.GetLogs()) + + # TODO(pclay): use k8s output for timing? + return JobResult(run_time=(end_time - start_time).total_seconds()) + + def _Delete(self): + pass + + def _GetCompletedJob(self, job_id: str) -> Optional[JobResult]: + """container.WaitForExit is blocking so this is not meaningful.""" + raise NotImplementedError('container.WaitForExit is a blocking command.') + + +def GetDpbServiceClass(cloud: str, + dpb_service_type: str) -> Optional[Type[BaseDpbService]]: + """Gets the Data Processing Backend class corresponding to 'service_type'. 
+ + Args: + cloud: String name of cloud of the service + dpb_service_type: String service type as specified in configuration + + Returns: + Implementation class corresponding to the argument dpb_service_type + + Raises: + Exception: An invalid data processing backend service type was provided + """ + if dpb_service_type in UNMANAGED_SERVICES: + cloud = 'Unmanaged' + elif dpb_service_type == KUBERNETES_SPARK_CLUSTER: + cloud = container_service.KUBERNETES + return resource.GetResourceClass( + BaseDpbService, CLOUD=cloud, SERVICE_TYPE=dpb_service_type) diff --git a/script/cumulus/pkb/perfkitbenchmarker/edw_benchmark_results_aggregator.py b/script/cumulus/pkb/perfkitbenchmarker/edw_benchmark_results_aggregator.py new file mode 100644 index 0000000..215d04e --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/edw_benchmark_results_aggregator.py @@ -0,0 +1,1099 @@ +"""Aggregates the performance results from a edw benchmark. + +An edw benchmark, runs multiple iterations of a suite of queries. +Independent raw query performance is aggregated during the benchmark, and used +for generating: +a. Raw query performance samples +b. Aggregated query performance samples +c. Raw wall time for each stream in each iteration +d. Raw wall time for each iteration +e. Aggregated (average) iteration wall time +f. Raw geo mean performance for each iteration +g. Aggregated geo mean performance using the aggregated query performances +""" +import copy +import enum +import functools +import json +import logging +from typing import Any, Dict, Iterable, List, Text + +from absl import flags +import numpy as np +from perfkitbenchmarker import sample + + +flags.DEFINE_bool('edw_generate_aggregated_metrics', True, + 'Whether the benchmark generates aggregated_metrics such as ' + 'geomean. Query performance metrics are still generated.') + +FLAGS = flags.FLAGS + + +class EdwPerformanceAggregationError(Exception): + """Error encountered during aggregation of performance results.""" + + +def geometric_mean(iterable: List[float]) -> float: + """Function to compute the geo mean for a list of numeric values. + + Args: + iterable: A List of Float performance values + + Returns: + A float value equal to the geometric mean of the input performance values. + + Raises: + EdwPerformanceAggregationError: If an invalid performance value was included + for aggregation. + """ + if (not iterable or any(perf <= 0.0 for perf in iterable)): + raise EdwPerformanceAggregationError('Invalid values cannot be aggregated.') + a = np.array(iterable) + return a.prod() ** (1.0 / len(a)) + + +class EdwQueryExecutionStatus(enum.Enum): + """Enum class for potential status of query execution. + + Potential values: + FAILED: Indicates that the query execution failed. + SUCCESSFUL: Indicates that the query execution succeeded. + """ + FAILED = 'query_execution_failed' + SUCCESSFUL = 'query_execution_successful' + + +class EdwQueryPerformance(object): + """Class that represents the performance of an executed edw query. + + Attributes: + name: A string name of the query that was executed + performance: A Float variable set to the query's completion time in secs. + -1.0 is used as a sentinel value implying the query failed. For a successful + query the value is expected to be positive. + execution_status: An EdwQueryExecutionStatus enum indicating success/failure + metadata: A dictionary of query execution attributes (job_id, etc.) 
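The geomean helper rejects non-positive values so the -1.0 failure sentinel can never silently leak into an aggregate. A tiny worked example of the same formula, re-implemented standalone with a plain ValueError standing in for EdwPerformanceAggregationError:

```
# Same formula as geometric_mean above, standalone sketch.
import numpy as np


def geometric_mean(values):
  if not values or any(v <= 0.0 for v in values):
    raise ValueError('Invalid values cannot be aggregated.')
  a = np.array(values)
  return a.prod() ** (1.0 / len(a))


print(geometric_mean([2.0, 8.0]))      # 4.0
try:
  geometric_mean([2.0, -1.0])          # -1.0 marks a failed query
except ValueError as e:
  print('rejected:', e)
```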
+ """ + + def __init__(self, query_name: Text, performance: float, + metadata: Dict[str, str]): + # TODO(user): add query start and query end as attributes. + self.name = query_name + self.performance = performance + self.execution_status = (EdwQueryExecutionStatus.FAILED + if performance == -1.0 + else EdwQueryExecutionStatus.SUCCESSFUL) + self.metadata = metadata + + @classmethod + def from_json(cls, serialized_performance: str): + """Process the serialized query performance from client jar. + + Expected Performance format: + {"query_wall_time_in_secs":1.998,"query_end":1601695222108,"query":"1", + "query_start":1601695220110, + "details":{"job_id":"b66b5a8e-633f-4ee4-8632-4e3d0856172f"}} + + Args: + serialized_performance: Stringified json performance. + + Returns: + An instance of EdwQueryPerformance + """ + results = json.loads(serialized_performance) + if 'details' in results: + metadata = results['details'] + else: + metadata = {} + if results['query_wall_time_in_secs'] == -1: + logging.warning('Query %s failed.', results['query']) + return cls(query_name=results['query'], + performance=results['query_wall_time_in_secs'], + metadata=metadata) + + def get_performance_sample(self, metadata: Dict[str, str]) -> sample.Sample: + """Method to generate a sample for the query performance. + + Args: + metadata: A dictionary of execution attributes to be merged with the query + execution attributes, for eg. tpc suite, scale of dataset, etc. + + Returns: + A sample for the edw query performance. + """ + query_metadata = copy.copy(metadata) + query_metadata['query'] = self.name + query_metadata['execution_status'] = self.execution_status + query_metadata.update(self.metadata) + return sample.Sample('edw_raw_query_time', self.performance, 'seconds', + query_metadata) + + def get_performance_value(self) -> float: + """Method to get the query's completion time in secs. + + Returns: + A float value set to the query's completion time in secs. + """ + return self.performance + + def get_performance_metadata(self) -> Dict[str, str]: + """Method to get the query's execution attributes (job_id, etc.). + + Returns: + A dictionary set to query's execution attributes (job_id, etc.) + """ + return self.metadata + + def is_successful(self) -> bool: + """Validates if the query was successful.""" + return self.execution_status == EdwQueryExecutionStatus.SUCCESSFUL + + +class EdwBaseIterationPerformance(object): + """Class that represents the performance of an iteration of edw queries.""" + + +class EdwPowerIterationPerformance(EdwBaseIterationPerformance): + """Class that represents the performance of a power iteration of edw queries. + + Attributes: + id: A unique string id for the iteration. + performance: A dictionary of query name to its execution performance which + is a EdwQueryPerformance instance. + successful_count: An integer count of the successful queries in the + iteration. + total_count: An integer count of the total number of queries in the + iteration. + """ + + def __init__(self, iteration_id: Text, total_queries: int): + self.id = iteration_id + self.performance = {} + self.total_count = total_queries + self.successful_count = 0 + + def add_query_performance(self, query_name: Text, performance: float, + metadata: Dict[str, str]): + """Creates and populates a query performance from the input results. + + Updates the iteration's performance map with the query performance. + The method also increaments the success and failure query counts for the + iteration. 
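The from_json path boils down to reading three fields out of the client jar's per-query record. A standalone sketch over the documented shape; the payload below is the docstring example, not real benchmark output:

```
# Parses the per-query record shape documented above.
import json

serialized = ('{"query_wall_time_in_secs":1.998,"query_end":1601695222108,'
              '"query":"1","query_start":1601695220110,'
              '"details":{"job_id":"b66b5a8e-633f-4ee4-8632-4e3d0856172f"}}')

record = json.loads(serialized)
performance = record['query_wall_time_in_secs']  # -1.0 would mean the query failed
metadata = record.get('details', {})
status = 'FAILED' if performance == -1.0 else 'SUCCESSFUL'
print(record['query'], performance, status, metadata.get('job_id'))
```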
+ + Args: + query_name: A string name of the query that was executed + performance: A Float variable set to the query's completion time in secs. + -1.0 is used as a sentinel value implying the query failed. For a + successful query the value is expected to be positive. + metadata: Extra metadata to add to each performance. + + Raises: + EdwPerformanceAggregationError: If the query has already been added. + """ + query_metadata = copy.copy(metadata) + query_performance = EdwQueryPerformance( + query_name=query_name, performance=performance, metadata=query_metadata) + + if query_performance.name in self.performance: + raise EdwPerformanceAggregationError('Attempting to aggregate a ' + 'duplicate query: %s.' % + query_performance.name) + self.performance[query_performance.name] = query_performance + if query_performance.is_successful(): + self.successful_count += 1 + + def has_query_performance(self, query_name: Text) -> bool: + """Returns whether the query was run at least once in the iteration. + + Args: + query_name: A String name of the query to check. + + Returns: + A boolean value indicating if the query was executed in the iteration. + """ + return query_name in self.performance + + def is_query_successful(self, query_name: Text) -> bool: + """Returns whether the query was successful in the iteration. + + Args: + query_name: A String name of the query to check. + + Returns: + A boolean value indicating if the query was successful in the iteration. + """ + return self.performance.get(query_name).is_successful() + + def get_query_performance(self, query_name: Text) -> float: + """Gets a query's execution performance generated during iteration execution. + + Args: + query_name: A String name of the query to retrieve details for + + Returns: + A float value set to the query's completion time in secs. + """ + return self.performance[query_name].get_performance_value() + + def get_query_metadata(self, query_name: Text) -> Dict[str, Any]: + """Gets the metadata of a query as executed in the current iteration. + + Args: + query_name: Name of the query whose performance is requested. + + Returns: + A dictionary set to the query's metadata. + + Raises: + EdwPerformanceAggregationError: If the query failed. + """ + if not self.is_query_successful(query_name): + raise EdwPerformanceAggregationError('Cannot aggregate invalid / failed' + ' query' + query_name) + return self.performance.get(query_name).metadata + + def get_all_queries_in_iteration(self) -> List[Text]: + """Gets a list of names of all queries in the iteration. + + Returns: + A list of all queries in the iteration. + """ + return self.performance.keys() + + def get_all_query_performance_samples( + self, metadata: Dict[str, str]) -> List[sample.Sample]: + """Gets a list of samples for all queries in the iteration. + + Args: + metadata: A dictionary of execution attributes to be merged with the query + execution attributes, for eg. tpc suite, scale of dataset, etc. 
+ + Returns: + A list of samples of each query's performance + """ + return [ + query_performance.get_performance_sample(metadata) + for query_performance in self.performance.values() + ] + + def is_successful(self, expected_queries: List[Text]) -> bool: + """Check if all the expected queries ran and all succeeded.""" + all_queries_ran = set( + self.get_all_queries_in_iteration()) == set(expected_queries) + all_queries_were_successful = self.total_count == self.successful_count + return all_queries_ran and all_queries_were_successful + + def get_queries_geomean(self) -> float: + """Gets the geometric mean of all queries in the iteration. + + Returns: + The (float) geometric mean of all the queries ran in the iteration. + + Raises: + EdwPerformanceAggregationError: If the iteration contains unsuccessful + query executions. + """ + return geometric_mean([ + query_performance.performance + for query_performance in self.performance.values() + ]) + + def get_queries_geomean_performance_sample( + self, expected_queries: List[Text], metadata: Dict[str, + str]) -> sample.Sample: + """Gets a sample for geomean of all queries in the iteration. + + Args: + expected_queries: A list of query names expected to have been executed in + an iteration. + metadata: A dictionary of execution attributes to be merged with the query + execution attributes, for eg. tpc suite, scale of dataset, etc. + + Returns: + A sample of iteration geomean performance. + + Raises: + EdwPerformanceAggregationError: If the iteration contains unsuccessful + query executions. + """ + if not self.is_successful(expected_queries): + raise EdwPerformanceAggregationError('Failed executions in iteration.') + raw_geo_mean = self.get_queries_geomean() + geo_mean_metadata = copy.copy(metadata) + return sample.Sample('edw_iteration_geomean_time', raw_geo_mean, 'seconds', + geo_mean_metadata) + + +class EdwSimultaneousIterationPerformance(EdwBaseIterationPerformance): + """Class that represents the performance of a simultaneous iteration. + + Attributes: + id: A unique string id for the iteration. + start_time: The start time of the iteration in milliseconds since epoch. + end_time: The end time of the iteration in milliseconds since epoch. + wall_time: The wall time in seconds as a double value. + performance: A dictionary of query name to its execution performance which + is an EdwQueryPerformance instance. + all_queries_succeeded: Whether all queries in the iteration were successful. + """ + + def __init__(self, iteration_id: Text, iteration_start_time: int, + iteration_end_time: int, iteration_wall_time: float, + iteration_performance: Dict[str, EdwQueryPerformance], + all_queries_succeeded: bool): + self.id = iteration_id + self.start_time = iteration_start_time + self.end_time = iteration_end_time + self.wall_time = iteration_wall_time + self.performance = iteration_performance + self.all_queries_succeeded = all_queries_succeeded + + @classmethod + def from_json(cls, iteration_id: str, serialized_performance: str): + """Process the serialized simultaneous iteration performance from client jar. 
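Driving the power-iteration class is mostly a matter of feeding it one performance per query and then asking for the aggregate. An illustrative driver, assuming this module is importable as perfkitbenchmarker.edw_benchmark_results_aggregator from the pkb tree; the timings and job ids are invented:

```
# Illustrative only; timings and metadata are made up.
from perfkitbenchmarker import edw_benchmark_results_aggregator as agg

iteration = agg.EdwPowerIterationPerformance(iteration_id='1', total_queries=2)
iteration.add_query_performance('q1', 2.1, {'job_id': 'job-1'})
iteration.add_query_performance('q2', 3.4, {'job_id': 'job-2'})

print(iteration.is_successful(['q1', 'q2']))   # True: both ran, both succeeded
print(iteration.get_queries_geomean())         # sqrt(2.1 * 3.4) ~= 2.67
# Adding 'q1' a second time would raise EdwPerformanceAggregationError.
```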
+ + Expected Performance format: + {"simultaneous_end":1601145943197,"simultaneous_start":1601145940113, + "all_queries_performance_array":[{"query_wall_time_in_secs":2.079, + "query_end":1601145942208,"job_id":"914682d9-4f64-4323-bad2-554267cbbd8d", + "query":"1","query_start":1601145940129},{"query_wall_time_in_secs":2.572, + "query_end":1601145943192,"job_id":"efbf93a1-614c-4645-a268-e3801ae994f1", + "query":"2","query_start":1601145940620}], + "simultaneous_wall_time_in_secs":3.084} + + Args: + iteration_id: String identifier of the simultaneous iteration. + serialized_performance: Stringified json performance. + + Returns: + An instance of EdwSimultaneousIterationPerformance + """ + results = json.loads(serialized_performance) + query_performance_map = {} + all_queries_succeeded = 'failure_reason' not in results + if all_queries_succeeded: + for query_perf_json in results['all_queries_performance_array']: + query_perf = EdwQueryPerformance.from_json( + serialized_performance=(json.dumps(query_perf_json))) + query_performance_map[query_perf.name] = query_perf + else: + logging.warning('Failure reported. Reason: %s', results['failure_reason']) + return cls( + iteration_id=iteration_id, + iteration_start_time=(results['simultaneous_start'] + if all_queries_succeeded else -1), + iteration_end_time=(results['simultaneous_end'] + if all_queries_succeeded else -1), + iteration_wall_time=results['simultaneous_wall_time_in_secs'], + iteration_performance=query_performance_map, + all_queries_succeeded=all_queries_succeeded) + + def get_wall_time(self) -> float: + """Gets the total wall time, in seconds, for the iteration. + + The wall time is the time from the start of the first query to the end time + of the last query to finish. + + Returns: + The wall time in seconds. + """ + return self.wall_time + + def get_wall_time_performance_sample(self, metadata: Dict[ + str, str]) -> sample.Sample: + """Gets a sample for wall time performance of the iteration. + + Args: + metadata: A dictionary of execution attributes to be merged with the query + execution attributes, for eg. tpc suite, scale of dataset, etc. + + Returns: + A sample of iteration wall time performance + """ + wall_time = self.wall_time + wall_time_metadata = copy.copy(metadata) + wall_time_metadata['iteration_start_time'] = self.start_time + wall_time_metadata['iteration_end_time'] = self.end_time + return sample.Sample('edw_iteration_wall_time', wall_time, 'seconds', + wall_time_metadata) + + def get_all_query_performance_samples( + self, metadata: Dict[str, str]) -> List[sample.Sample]: + """Gets a list of samples for all queries in the iteration. + + Args: + metadata: A dictionary of execution attributes to be merged with the query + execution attributes, for eg. tpc suite, scale of dataset, etc. + + Returns: + A list of samples of each query's performance + """ + return [ + query_performance.get_performance_sample(metadata) + for query_performance in self.performance.values() + ] + + def is_successful(self, expected_queries: List[Text]) -> bool: + """Check if all the expected queries ran and all succeeded.""" + all_queries_ran = self.performance.keys() == set(expected_queries) + return all_queries_ran and self.all_queries_succeeded + + def has_query_performance(self, query_name: Text) -> bool: + """Returns whether the query was run at least once in the iteration. + + Args: + query_name: A String name of the query to check. + + Returns: + A boolean value indicating if the query was executed in the iteration. 
+ """ + return query_name in self.performance + + def is_query_successful(self, query_name: Text) -> bool: + """Returns whether the query was successful in the iteration. + + Args: + query_name: A String name of the query to check. + + Returns: + A boolean value indicating if the query was successful in the iteration. + """ + if self.has_query_performance(query_name): + return self.performance.get(query_name).is_successful() + return False + + def get_query_performance(self, query_name: Text) -> float: + """Gets a query's execution performance in the current iteration. + + Args: + query_name: A String name of the query to retrieve details for + + Returns: + A float value set to the query's completion time in secs. + """ + return self.performance[query_name].get_performance_value() + + def get_query_metadata(self, query_name: Text) -> Dict[str, Any]: + """Gets the metadata of a query in the current iteration. + + Args: + query_name: Name of the query whose aggregated performance is requested + + Returns: + A dictionary set to the query's aggregated metadata, accumulated from the + raw query run in the current iteration. + + Raises: + EdwPerformanceAggregationError: If the query failed in the iteration. + """ + if not self.is_query_successful(query_name): + raise EdwPerformanceAggregationError('Cannot aggregate invalid / failed' + ' query' + query_name) + return self.performance.get(query_name).metadata + + def get_queries_geomean(self) -> float: + """Gets the geometric mean of all queries in the iteration. + + Returns: + The (float) geometric mean of all the queries ran in the iteration. + + Raises: + EdwPerformanceAggregationError: If the iteration contains unsuccessful + query executions. + """ + return geometric_mean([ + query_performance.performance + for query_performance in self.performance.values() + ]) + + def get_queries_geomean_performance_sample( + self, expected_queries: List[Text], metadata: Dict[str, + str]) -> sample.Sample: + """Gets a sample for geomean of all queries in the iteration. + + Args: + expected_queries: A list of query names expected to have been executed in + an iteration. + metadata: A dictionary of execution attributes to be merged with the query + execution attributes, for eg. tpc suite, scale of dataset, etc. + + Returns: + A sample of iteration geomean performance. + + Raises: + EdwPerformanceAggregationError: If the iteration contains unsuccessful + query executions. + """ + if not self.is_successful(expected_queries): + raise EdwPerformanceAggregationError('Failed executions in iteration.') + raw_geo_mean = self.get_queries_geomean() + geo_mean_metadata = copy.copy(metadata) + return sample.Sample('edw_iteration_geomean_time', raw_geo_mean, 'seconds', + geo_mean_metadata) + + +class EdwThroughputIterationPerformance(EdwBaseIterationPerformance): + """Class that represents the performance of an iteration of edw queries. + + Attributes: + id: A unique string id for the iteration. + start_time: The start time of the iteration execution. + end_time: The end time of the iteration execution. + wall_time: The wall time of the stream execution. + performance: A dict of stream_id to stream performances, each of which is a + dictionary mapping query names to their execution performances, which are + EdwQueryPerformance instances. 
+ """ + + def __init__(self, iteration_id: Text, iteration_start_time: int, + iteration_end_time: int, iteration_wall_time: float, + iteration_performance: Dict[str, Dict[str, + EdwQueryPerformance]]): + self.id = iteration_id + self.start_time = iteration_start_time + self.end_time = iteration_end_time + self.wall_time = iteration_wall_time + self.performance = iteration_performance + + @classmethod + def from_json(cls, iteration_id: str, serialized_performance: str): + """Process the serialized throughput iteration performance from client jar. + + Expected Performance format: + {"throughput_start":1601666911596,"throughput_end":1601666916139, + "throughput_wall_time_in_secs":4.543, + "all_streams_performance_array":[ + {"stream_start":1601666911597,"stream_end":1601666916139, + "stream_wall_time_in_secs":4.542, + "stream_performance_array":[ + {"query_wall_time_in_secs":2.238,"query_end":1601666913849, + "query":"1","query_start":1601666911611, + "details":{"job_id":"438170b0-b0cb-4185-b733-94dd05b46b05"}}, + {"query_wall_time_in_secs":2.285,"query_end":1601666916139, + "query":"2","query_start":1601666913854, + "details":{"job_id":"371902c7-5964-46f6-9f90-1dd00137d0c8"}} + ]}, + {"stream_start":1601666911597,"stream_end":1601666916018, + "stream_wall_time_in_secs":4.421, + "stream_performance_array":[ + {"query_wall_time_in_secs":2.552,"query_end":1601666914163, + "query":"2","query_start":1601666911611, + "details":{"job_id":"5dcba418-d1a2-4a73-be70-acc20c1f03e6"}}, + {"query_wall_time_in_secs":1.855,"query_end":1601666916018, + "query":"1","query_start":1601666914163, + "details":{"job_id":"568c4526-ae26-4e9d-842c-03459c3a216d"}} + ]} + ]} + + Args: + iteration_id: String identifier of the throughput iteration. + serialized_performance: Stringified json performance. + + Returns: + An instance of EdwThroughputIterationPerformance + """ + results = json.loads(serialized_performance) + stream_performances = {} + all_queries_succeeded = 'failure_reason' not in results + if all_queries_succeeded: + for stream_id, stream_perf_json in enumerate( + results['all_streams_performance_array']): + stream_id = str(stream_id) + stream_performance_map = {} + for query_perf_json in stream_perf_json['stream_performance_array']: + query_perf = EdwQueryPerformance.from_json( + serialized_performance=(json.dumps(query_perf_json))) + stream_performance_map[query_perf.name] = query_perf + stream_performances.update({stream_id: stream_performance_map}) + else: + logging.warning('Failure reported. Reason: %s', results['failure_reason']) + return cls( + iteration_id=iteration_id, + iteration_start_time=(results['throughput_start'] + if all_queries_succeeded else -1), + iteration_end_time=(results['throughput_end'] + if all_queries_succeeded else -1), + iteration_wall_time=results['throughput_wall_time_in_secs'], + iteration_performance=stream_performances) + + def has_query_performance(self, query_name: Text) -> bool: + """Returns whether the query was run at least once in the iteration. + + Args: + query_name: A String name of the query to check. + + Returns: + A boolean value indicating if the query was executed in the iteration. + """ + for stream in self.performance.values(): + if query_name in stream: + return True + return False + + def is_query_successful(self, query_name: Text) -> bool: + """Returns whether the query was successful in the iteration. + + Args: + query_name: A String name of the query to check. + + Returns: + A boolean value indicating if the query was successful in the iteration. 
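The throughput payload is just streams of per-query records, so from_json reduces to two nested loops. A standalone sketch that parses a trimmed-down version of the documented payload into a {stream_id: {query: wall_time}} mapping; only a subset of the fields from the docstring example is kept:

```
# Parses a trimmed throughput payload into {stream_id: {query: wall_time}}.
import json

serialized = json.dumps({
    'throughput_wall_time_in_secs': 4.543,
    'all_streams_performance_array': [
        {'stream_performance_array': [
            {'query': '1', 'query_wall_time_in_secs': 2.238},
            {'query': '2', 'query_wall_time_in_secs': 2.285}]},
        {'stream_performance_array': [
            {'query': '2', 'query_wall_time_in_secs': 2.552},
            {'query': '1', 'query_wall_time_in_secs': 1.855}]},
    ],
})

results = json.loads(serialized)
streams = {
    str(i): {q['query']: q['query_wall_time_in_secs']
             for q in stream['stream_performance_array']}
    for i, stream in enumerate(results['all_streams_performance_array'])
}
print(results['throughput_wall_time_in_secs'], streams)
```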
+ """ + for stream in self.performance.values(): + if query_name in stream: + if not stream[query_name].is_successful(): + return False + return True + + def get_query_performance(self, query_name: Text) -> float: + """Gets a query's execution performance aggregated across all streams in the current iteration. + + Args: + query_name: A String name of the query to retrieve details for + + Returns: + A float value set to the query's average completion time in secs. + """ + all_performances = [] + for stream in self.performance.values(): + if query_name in stream: + all_performances.append(stream[query_name].get_performance_value()) + if not all_performances: + return -1.0 + return sum(all_performances) / len(all_performances) + + def get_query_metadata(self, query_name: Text) -> Dict[str, Any]: + """Gets the metadata of a query aggregated across all streams in the current iteration. + + Args: + query_name: Name of the query whose aggregated performance is requested + + Returns: + A dictionary set to the query's aggregated metadata, accumulated from the + raw query runs in all streams of the current iteration. + + Raises: + EdwPerformanceAggregationError: If the query failed in one or more streams + """ + result = {} + for stream_id, stream_performance in self.performance.items(): + if query_name in stream_performance: + q_performance = stream_performance[query_name] + result[stream_id + '_runtime'] = q_performance.get_performance_value() + result.update({ + stream_id + '_' + k: v + for (k, v) in q_performance.get_performance_metadata().items() + }) + return result + + def get_all_query_performance_samples( + self, metadata: Dict[str, str]) -> List[sample.Sample]: + """Gets a list of samples for all queries in all streams of the iteration. + + Args: + metadata: A dictionary of execution attributes to be merged with the query + execution attributes, for eg. tpc suite, scale of dataset, etc. + + Returns: + A list of samples of each query's performance + """ + all_query_performances = [] + for stream_id, stream_performance in self.performance.items(): + stream_metadata = copy.copy(metadata) + stream_metadata['stream'] = stream_id + all_query_performances.extend([ + query_perf.get_performance_sample(stream_metadata) + for query_perf in stream_performance.values() + ]) + return all_query_performances + + def all_streams_ran_all_expected_queries( + self, expected_queries: List[Text]) -> bool: + """Checks that the same set of expected queries ran in all streams.""" + for stream in self.performance.values(): + if set(stream.keys()) != set(expected_queries): + return False + return True + + def no_duplicate_queries(self) -> bool: + """Checks that no streams contain any duplicate queries.""" + for stream in self.performance.values(): + if len(stream.keys()) != len(set(stream.keys())): + return False + return True + + def all_queries_succeeded(self) -> bool: + """Checks if every query in every stream was successful.""" + for stream_performance in self.performance.values(): + for query_perf in stream_performance.values(): + if query_perf.performance == -1: + return False + return True + + def is_successful(self, expected_queries: List[Text]) -> bool: + """Check if the throughput run was successful. + + A successful run meets the following conditions: + - There were more than 0 streams. 
+ - Each stream ran the same set of expected queries (regardless of order) + - Each stream ran each query only once + - Every query in every stream succeeded + + Args: + expected_queries: A list of query names expected to have been executed in + an iteration. + + Returns: + True if all success conditions were met, false otherwise. + """ + non_zero_streams = len(self.performance) >= 1 + all_streams_ran_all_queries = self.all_streams_ran_all_expected_queries( + expected_queries) + no_duplicate_queries = self.no_duplicate_queries() + all_queries_succeeded = self.all_queries_succeeded() + return (non_zero_streams and all_streams_ran_all_queries and + no_duplicate_queries and all_queries_succeeded) + + def get_queries_geomean(self) -> float: + """Gets the geometric mean of all queries in all streams of the iteration. + + Returns: + The (float) geometric mean of all the individual queries ran in all + streams of the iteration. + + Raises: + EdwPerformanceAggregationError: If the suite contains unsuccessful query + executions. + """ + query_performances = [] + for stream in self.performance.values(): + for query in stream.values(): + query_performances.append(query.get_performance_value()) + return geometric_mean(query_performances) + + def get_queries_geomean_performance_sample( + self, expected_queries: List[Text], metadata: Dict[str, + str]) -> sample.Sample: + """Gets a sample for geomean of all queries in all streams of the iteration. + + Args: + expected_queries: A list of query names expected to have been executed in + an iteration. + metadata: A dictionary of execution attributes to be merged with the query + execution attributes, for eg. tpc suite, scale of dataset, etc. + + Returns: + A sample of iteration geomean performance. + + Raises: + EdwPerformanceAggregationError: If the iteration contains unsuccessful + query executions. + """ + if not self.is_successful(expected_queries): + raise EdwPerformanceAggregationError('Failed executions in iteration.') + raw_geo_mean = self.get_queries_geomean() + geo_mean_metadata = copy.copy(metadata) + return sample.Sample('edw_iteration_geomean_time', raw_geo_mean, 'seconds', + geo_mean_metadata) + + def get_wall_time(self) -> float: + """Gets the total wall time, in seconds, for the iteration. + + The wall time is the time from the start of the first stream to the end time + of the last stream to finish. + + Returns: + The wall time in seconds. + """ + return self.wall_time + + def get_wall_time_performance_sample( + self, metadata: Dict[str, str]) -> sample.Sample: + """Gets a sample for total wall time performance of the iteration. + + Args: + metadata: A dictionary of execution attributes to be merged with the query + execution attributes, for eg. tpc suite, scale of dataset, etc. + + Returns: + A sample of iteration wall time performance + """ + wall_time_metadata = copy.copy(metadata) + wall_time_metadata['iteration_start_time'] = self.start_time + wall_time_metadata['iteration_end_time'] = self.end_time + return sample.Sample('edw_iteration_wall_time', self.wall_time, 'seconds', + wall_time_metadata) + + +class EdwBenchmarkPerformance(object): + """Class that represents the performance of an edw benchmark. + + Attributes: + total_iterations: An integer variable set to total of number of iterations. 
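The four success conditions above reduce to set and sentinel checks. A simplified standalone sketch over the {stream_id: {query: wall_time}} shape from the previous sketch; the duplicate-query check is omitted because dict keys cannot repeat, and the data is made up:

```
# Simplified success check; the real code works on EdwQueryPerformance objects.
def throughput_run_is_successful(streams, expected_queries):
  non_zero_streams = len(streams) >= 1
  all_ran_expected = all(
      set(stream) == set(expected_queries) for stream in streams.values())
  all_succeeded = all(
      t != -1.0 for stream in streams.values() for t in stream.values())
  return non_zero_streams and all_ran_expected and all_succeeded


streams = {'0': {'1': 2.238, '2': 2.285},
           '1': {'2': 2.552, '1': 1.855}}
print(throughput_run_is_successful(streams, ['1', '2']))   # True
streams['1']['1'] = -1.0                                    # mark one query failed
print(throughput_run_is_successful(streams, ['1', '2']))   # False
```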
+ expected_queries: A list of query names that are executed in an iteration of + the benchmark + iteration_performances: A dictionary of iteration id (String value) to its + execution performance (an instance of EdwBaseIterationPerformance) + """ + + def __init__(self, total_iterations: int, expected_queries: Iterable[Text]): + self.total_iterations = total_iterations + self.expected_queries = list(expected_queries) + self.iteration_performances = {} + + def add_iteration_performance(self, performance: EdwBaseIterationPerformance): + """Add an iteration's performance to the benchmark results. + + Args: + performance: An instance of EdwBaseIterationPerformance encapsulating the + iteration performance details. + + Raises: + EdwPerformanceAggregationError: If the iteration has already been added. + """ + iteration_id = performance.id + if iteration_id in self.iteration_performances: + raise EdwPerformanceAggregationError('Attempting to aggregate a duplicate' + ' iteration: %s.' % iteration_id) + self.iteration_performances[iteration_id] = performance + + def is_successful(self) -> bool: + """Check a benchmark's success, only if all the iterations succeed.""" + return functools.reduce((lambda x, y: x and y), [ + iteration_performance.is_successful(self.expected_queries) + for iteration_performance in self.iteration_performances.values() + ]) + + def aggregated_query_status(self, query_name: Text) -> bool: + """Gets the status of query aggregated across all iterations. + + A query is considered successful only if + a. Query was executed in every iteration + b. Query was successful in every iteration + + Args: + query_name: Name of the query whose aggregated success is requested + + Returns: + A boolean value indicating if the query was successful in the benchmark. + """ + for performance in self.iteration_performances.values(): + if not performance.has_query_performance(query_name): + return False + if not performance.is_query_successful(query_name): + return False + return True + + def aggregated_query_execution_time(self, query_name: Text) -> float: + """Gets the execution time of query aggregated across all iterations. + + Args: + query_name: Name of the query whose aggregated performance is requested + + Returns: + A float value set to the query's aggregated execution time + + Raises: + EdwPerformanceAggregationError: If the query failed in one or more + iterations + """ + if not self.aggregated_query_status(query_name): + raise EdwPerformanceAggregationError('Cannot aggregate invalid / failed ' + 'query ' + query_name) + query_performances = [ + iteration_performance.get_query_performance(query_name) + for iteration_performance in self.iteration_performances.values() + ] + return sum(query_performances) / self.total_iterations + + def aggregated_query_metadata(self, query_name: Text) -> Dict[str, Any]: + """Gets the metadata of a query aggregated across all iterations. + + Args: + query_name: Name of the query whose aggregated performance is requested + + Returns: + A dictionary set to the query's aggregated metadata, accumulated from the + raw query runs. 
+ + Raises: + EdwPerformanceAggregationError: If the query failed in one or more + iterations + """ + if not self.aggregated_query_status(query_name): + raise EdwPerformanceAggregationError('Cannot aggregate invalid / failed ' + 'query ' + query_name) + result = {} + for iteration_id, iteration_performance in ( + self.iteration_performances.items()): + result.update({ + iteration_id + '_' + k: v + for (k, v) in iteration_performance.get_query_metadata( + query_name).items() + }) + return result + + def get_aggregated_query_performance_sample( + self, query_name: Text, metadata: Dict[str, str]) -> sample.Sample: + """Gets the performance of query aggregated across all iterations. + + Args: + query_name: Name of the query whose aggregated performance is requested + metadata: A dictionary of execution attributes to be merged with the query + execution attributes, for eg. tpc suite, scale of dataset, etc. + + Returns: + A sample of the query's aggregated execution time + """ + query_metadata = copy.copy(metadata) + query_metadata['query'] = query_name + query_metadata['aggregation_method'] = 'mean' + perf, exec_status, agg_md = -1.0, EdwQueryExecutionStatus.FAILED, {} + if self.aggregated_query_status(query_name): + perf = self.aggregated_query_execution_time(query_name=query_name) + exec_status = EdwQueryExecutionStatus.SUCCESSFUL + agg_md = self.aggregated_query_metadata(query_name=query_name) + query_metadata['execution_status'] = exec_status + query_metadata.update(agg_md) + return sample.Sample('edw_aggregated_query_time', perf, 'seconds', + query_metadata) + + def get_all_query_performance_samples(self, metadata: Dict[str, str]) -> List[ + sample.Sample]: + """Generates samples for all query performances. + + Benchmark relies on iteration runs to generate the raw query performance + samples + Benchmark appends the aggregated query performance sample + + Args: + metadata: A dictionary of execution attributes to be merged with the query + execution attributes, for eg. tpc suite, scale of dataset, etc. + + Returns: + A list of samples (raw and aggregated) + """ + results = [] + # Raw query performance samples + for iteration, performance in self.iteration_performances.items(): + iteration_metadata = copy.copy(metadata) + iteration_metadata['iteration'] = iteration + results.extend(performance.get_all_query_performance_samples( + iteration_metadata)) + # Aggregated query performance samples + for query in self.expected_queries: + results.append(self.get_aggregated_query_performance_sample( + query_name=query, metadata=metadata)) + return results + + def get_aggregated_wall_time_performance_sample(self, + metadata: Dict[str, str] + ) -> sample.Sample: + """Gets the wall time performance aggregated across all iterations. + + Args: + metadata: A dictionary of execution attributes to be merged with the query + execution attributes, for eg. tpc suite, scale of dataset, etc. + + Returns: + A sample of aggregated (averaged) wall time. + """ + wall_times = [ + iteration.get_wall_time() + for iteration in self.iteration_performances.values() + ] + aggregated_wall_time = sum(wall_times) / self.total_iterations + wall_time_metadata = copy.copy(metadata) + wall_time_metadata['aggregation_method'] = 'mean' + return sample.Sample('edw_aggregated_wall_time', aggregated_wall_time, + 'seconds', wall_time_metadata) + + def get_wall_time_performance_samples(self, metadata: Dict[str, str]): + """Generates samples for all wall time performances. 
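At the benchmark level the aggregation is a plain mean across iterations for each query (and for wall time), with the geomean computed on top of those means. An illustrative end-to-end use of the classes above, again assuming the module is importable as perfkitbenchmarker.edw_benchmark_results_aggregator; the timings are invented:

```
# Illustrative only; two fake power iterations over the same two queries.
from perfkitbenchmarker import edw_benchmark_results_aggregator as agg

benchmark = agg.EdwBenchmarkPerformance(
    total_iterations=2, expected_queries=['q1', 'q2'])
for iteration_id, timings in [('1', {'q1': 2.0, 'q2': 4.0}),
                              ('2', {'q1': 3.0, 'q2': 5.0})]:
  iteration = agg.EdwPowerIterationPerformance(iteration_id, total_queries=2)
  for name, secs in timings.items():
    iteration.add_query_performance(name, secs, {})
  benchmark.add_iteration_performance(iteration)

print(benchmark.is_successful())                         # True
print(benchmark.aggregated_query_execution_time('q1'))   # (2.0 + 3.0) / 2 = 2.5
```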
+ + Benchmark relies on iterations to generate the raw wall time performance + samples. + Benchmark appends the aggregated wall time performance sample + + Args: + metadata: A dictionary of execution attributes to be merged with the query + execution attributes, for eg. tpc suite, scale of dataset, etc. + + Returns: + A list of samples (raw and aggregated) + """ + results = [] + + for iteration, performance in self.iteration_performances.items(): + iteration_metadata = copy.copy(metadata) + iteration_metadata['iteration'] = iteration + results.append(performance.get_wall_time_performance_sample( + iteration_metadata)) + results.append(self.get_aggregated_wall_time_performance_sample( + metadata=metadata)) + return results + + def get_aggregated_geomean_performance_sample(self, + metadata: + Dict[str, + str]) -> sample.Sample: + """Gets the geomean performance aggregated across all iterations. + + Args: + metadata: A dictionary of execution attributes to be merged with the query + execution attributes, for eg. tpc suite, scale of dataset, etc. + + Returns: + A sample of aggregated geomean + + Raises: + EdwPerformanceAggregationError: If the benchmark conatins a failed query + execution. + """ + if not self.is_successful(): + raise EdwPerformanceAggregationError('Benchmark contains a failed query.') + aggregated_geo_mean = geometric_mean([ + self.aggregated_query_execution_time(query_name=query) + for query in self.expected_queries + ]) + + geomean_metadata = copy.copy(metadata) + geomean_metadata['intra_query_aggregation_method'] = 'mean' + geomean_metadata['inter_query_aggregation_method'] = 'geomean' + return sample.Sample('edw_aggregated_geomean', aggregated_geo_mean, + 'seconds', geomean_metadata) + + def get_queries_geomean_performance_samples(self, metadata: Dict[str, str] + ) -> List[sample.Sample]: + """Generates samples for all geomean performances. + + Benchmark relies on iteration runs to generate the raw geomean performance + samples + Benchmark appends the aggregated geomean performance sample + + Args: + metadata: A dictionary of execution attributes to be merged with the query + execution attributes, for eg. tpc suite, scale of dataset, etc. + + Returns: + A list of samples (raw and aggregated) + + Raises: + EdwPerformanceAggregationError: If the benchmark conatins a failed query + execution + """ + if not self.is_successful(): + raise EdwPerformanceAggregationError('Benchmark contains a failed query.') + results = [] + + for iteration, performance in self.iteration_performances.items(): + iteration_metadata = copy.copy(metadata) + iteration_metadata['iteration'] = iteration + results.append( + performance.get_queries_geomean_performance_sample( + self.expected_queries, iteration_metadata)) + + results.append(self.get_aggregated_geomean_performance_sample( + metadata=metadata)) + return results diff --git a/script/cumulus/pkb/perfkitbenchmarker/edw_service.py b/script/cumulus/pkb/perfkitbenchmarker/edw_service.py new file mode 100644 index 0000000..7fb1e42 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/edw_service.py @@ -0,0 +1,393 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Resource encapsulating provisioned Data Warehouse in the cloud Services. + +Classes to wrap specific backend services are in the corresponding provider +directory as a subclass of BaseEdwService. +""" +import os +from typing import Dict, List, Text, Tuple + +from absl import flags +from perfkitbenchmarker import resource + + +flags.DEFINE_integer('edw_service_cluster_concurrency', 5, + 'Number of queries to run concurrently on the cluster.') +flags.DEFINE_string('edw_service_cluster_snapshot', None, + 'If set, the snapshot to restore as cluster.') +flags.DEFINE_string('edw_service_cluster_identifier', None, + 'If set, the preprovisioned edw cluster.') +flags.DEFINE_string('edw_service_endpoint', None, + 'If set, the preprovisioned edw cluster endpoint.') +flags.DEFINE_string('edw_service_cluster_db', None, + 'If set, the db on cluster to use during the benchmark (' + 'only applicable when using snapshots).') +flags.DEFINE_string('edw_service_cluster_user', None, + 'If set, the user authorized on cluster (only applicable ' + 'when using snapshots).') +flags.DEFINE_string('edw_service_cluster_password', None, + 'If set, the password authorized on cluster (only ' + 'applicable when using snapshots).') +flags.DEFINE_string('snowflake_snowsql_config_override_file', None, + 'The SnowSQL configuration to use.' + 'https://docs.snowflake.net/manuals/user-guide/snowsql-config.html#snowsql-config-file') # pylint: disable=line-too-long +flags.DEFINE_string('snowflake_connection', None, + 'Named Snowflake connection defined in SnowSQL config file.' + 'https://docs.snowflake.net/manuals/user-guide/snowsql-start.html#using-named-connections') # pylint: disable=line-too-long +flags.DEFINE_integer('edw_suite_iterations', 1, 'Number of suite iterations to perform.') +# TODO(user): Revisit flags for accepting query lists. +flags.DEFINE_string('edw_simultaneous_queries', + None, 'CSV list of simultaneous queries to benchmark.') +flags.DEFINE_integer('edw_simultaneous_query_submission_interval', '0', + 'Simultaneous query submission interval in milliseconds.') +flags.DEFINE_string('edw_power_queries', None, + 'CSV list of power queries to benchmark.') +flags.DEFINE_multi_string( + 'concurrency_streams', [], 'List of all query streams to execute. Each ' + 'stream should be passed in separately and the queries should be comma ' + 'separated, e.g. --concurrency_streams=1,2,3 --concurrency_streams=3,2,1') +flags.DEFINE_string('snowflake_warehouse', None, + 'A virtual warehouse, often referred to simply as a - ' + 'warehouse, is a cluster of compute in Snowflake. 
' + 'https://docs.snowflake.com/en/user-guide/warehouses.html') # pylint: disable=line-too-long +flags.DEFINE_string( + 'snowflake_database', None, + 'The hosted snowflake database to use during the benchmark.') +flags.DEFINE_string( + 'snowflake_schema', None, + 'The schema of the hosted snowflake database to use during the benchmark.') +flags.DEFINE_enum( + 'snowflake_client_interface', 'JDBC', ['JDBC'], + 'The Runtime Interface used when interacting with Snowflake.') + + +FLAGS = flags.FLAGS + + +TYPE_2_PROVIDER = dict([('athena', 'aws'), ('redshift', 'aws'), + ('spectrum', 'aws'), ('snowflake_aws', 'aws'), + ('snowflakeexternal_aws', 'aws'), ('bigquery', 'gcp'), + ('endor', 'gcp'), ('endorazure', 'gcp'), + ('bqfederated', 'gcp'), + ('azuresqldatawarehouse', 'azure')]) +TYPE_2_MODULE = dict([ + ('athena', 'perfkitbenchmarker.providers.aws.athena'), + ('redshift', 'perfkitbenchmarker.providers.aws.redshift'), + ('spectrum', 'perfkitbenchmarker.providers.aws.spectrum'), + ('snowflake_aws', 'perfkitbenchmarker.providers.aws.snowflake'), + ('snowflakeexternal_aws', 'perfkitbenchmarker.providers.aws.snowflake'), + ('bigquery', 'perfkitbenchmarker.providers.gcp.bigquery'), + ('endor', 'perfkitbenchmarker.providers.gcp.bigquery'), + ('endorazure', 'perfkitbenchmarker.providers.gcp.bigquery'), + ('bqfederated', 'perfkitbenchmarker.providers.gcp.bigquery'), + ('azuresqldatawarehouse', 'perfkitbenchmarker.providers.azure.' + 'azure_sql_data_warehouse') +]) +DEFAULT_NUMBER_OF_NODES = 1 +# The order of stages is important to the successful lifecycle completion. +EDW_SERVICE_LIFECYCLE_STAGES = ['create', 'load', 'query', 'delete'] +SAMPLE_QUERY_PATH = '/tmp/sample.sql' +SAMPLE_QUERY = 'select * from INFORMATION_SCHEMA.TABLES;' + + +class EdwExecutionError(Exception): + """Encapsulates errors encountered during execution of a query.""" + + +class EdwClientInterface(object): + """Defines the interface for EDW service clients. + + Attributes: + client_vm: An instance of virtual_machine.BaseVirtualMachine used to + interface with the edw service. + whitelist_ip: The IP to whitelist. + """ + + def __init__(self): + self.client_vm = None + + # set by derived classes + self.whitelist_ip = None + + def SetProvisionedAttributes(self, benchmark_spec): + """Sets any attributes that were unknown during initialization.""" + self.client_vm = benchmark_spec.vms[0] + self.client_vm.RemoteCommand('echo "\nMaxSessions 100" | ' + 'sudo tee -a /etc/ssh/sshd_config') + + def Prepare(self, package_name: Text) -> None: + """Prepares the client vm to execute query. + + The default implementation raises an Error, to ensure client specific + installation and authentication of runner utilities. + + Args: + package_name: String name of the package defining the preprovisioned data + (certificates, etc.) to extract and use during client vm preparation. + """ + raise NotImplementedError + + def ExecuteQuery(self, query_name: Text) -> Tuple[float, Dict[str, str]]: + """Executes a query and returns performance details. + + Args: + query_name: String name of the query to execute + + Returns: + A tuple of (execution_time, execution details) + execution_time: A Float variable set to the query's completion time in + secs. -1.0 is used as a sentinel value implying the query failed. For a + successful query the value is expected to be positive. + performance_details: A dictionary of query execution attributes eg. 
job_id + """ + raise NotImplementedError + + def ExecuteSimultaneous(self, submission_interval: int, + queries: List[str]) -> str: + """Executes queries simultaneously on client and return performance details. + + Simultaneous app expects queries as white space separated query file names. + Response format: + {"simultaneous_end":1601145943197,"simultaneous_start":1601145940113, + "stream_performance_array":[{"query_wall_time_in_secs":2.079, + "query_end":1601145942208,"job_id":"914682d9-4f64-4323-bad2-554267cbbd8d", + "query":"1","query_start":1601145940129},{"query_wall_time_in_secs":2.572, + "query_end":1601145943192,"job_id":"efbf93a1-614c-4645-a268-e3801ae994f1", + "query":"2","query_start":1601145940620}], + "simultaneous_wall_time_in_secs":3.084} + + Args: + submission_interval: Simultaneous query submission interval in milliseconds. + queries: List of string names of the queries to execute simultaneously. + + Returns: + performance_details: A serialized dictionary of execution details. + """ + raise NotImplementedError + + def ExecuteThroughput(self, concurrency_streams: List[List[str]]) -> str: + """Executes a throughput test and returns performance details. + + Response format: + {"throughput_start":1601666911596,"throughput_end":1601666916139, + "throughput_wall_time_in_secs":4.543, + "all_streams_performance_array":[ + {"stream_start":1601666911597,"stream_end":1601666916139, + "stream_wall_time_in_secs":4.542, + "stream_performance_array":[ + {"query_wall_time_in_secs":2.238,"query_end":1601666913849, + "query":"1","query_start":1601666911611, + "details":{"job_id":"438170b0-b0cb-4185-b733-94dd05b46b05"}}, + {"query_wall_time_in_secs":2.285,"query_end":1601666916139, + "query":"2","query_start":1601666913854, + "details":{"job_id":"371902c7-5964-46f6-9f90-1dd00137d0c8"}} + ]}, + {"stream_start":1601666911597,"stream_end":1601666916018, + "stream_wall_time_in_secs":4.421, + "stream_performance_array":[ + {"query_wall_time_in_secs":2.552,"query_end":1601666914163, + "query":"2","query_start":1601666911611, + "details":{"job_id":"5dcba418-d1a2-4a73-be70-acc20c1f03e6"}}, + {"query_wall_time_in_secs":1.855,"query_end":1601666916018, + "query":"1","query_start":1601666914163, + "details":{"job_id":"568c4526-ae26-4e9d-842c-03459c3a216d"}} + ]} + ]} + + Args: + concurrency_streams: List of streams to execute simultaneously, each of + which is a list of string names of queries. + + Returns: + A serialized dictionary of execution details. + """ + raise NotImplementedError + + def WarmUpQuery(self): + """Executes a service-agnostic query that can detect cold start issues.""" + with open(SAMPLE_QUERY_PATH, 'w+') as f: + f.write(SAMPLE_QUERY) + self.client_vm.PushFile(SAMPLE_QUERY_PATH) + query_name = os.path.basename(SAMPLE_QUERY_PATH) + self.ExecuteQuery(query_name) + + def GetMetadata(self) -> Dict[str, str]: + """Returns the client interface metadata.""" + raise NotImplementedError + + +class EdwService(resource.BaseResource): + """Object representing a EDW Service.""" + + def __init__(self, edw_service_spec): + """Initialize the edw service object. + + Args: + edw_service_spec: spec of the edw service. 
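Concrete providers plug in by subclassing EdwClientInterface and implementing at least Prepare, ExecuteQuery, and GetMetadata. A minimal toy sketch of that contract: it times a local no-op instead of talking to a real warehouse, fabricates the job id, and assumes edw_service is importable from the pkb tree.

```
# Toy client only: no real warehouse is contacted.
import time
from typing import Dict, Text, Tuple

from perfkitbenchmarker import edw_service


class SketchClientInterface(edw_service.EdwClientInterface):
  """Times a local no-op instead of a real warehouse query."""

  def Prepare(self, package_name: Text) -> None:
    pass  # nothing to install for the sketch

  def ExecuteQuery(self, query_name: Text) -> Tuple[float, Dict[str, str]]:
    start = time.time()
    # A real implementation would run query_name on the warehouse here.
    return time.time() - start, {'job_id': 'local-' + query_name}

  def GetMetadata(self) -> Dict[str, str]:
    return {'client': 'sketch'}


client = SketchClientInterface()
print(client.ExecuteQuery('q1.sql'))
```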
+ """ + # Hand over the actual creation to the resource module, which assumes the + # resource is pkb managed by default + is_user_managed = self.IsUserManaged(edw_service_spec) + # edw_service attribute + self.cluster_identifier = self.GetClusterIdentifier(edw_service_spec) + super(EdwService, self).__init__(user_managed=is_user_managed) + + # Provision related attributes + if edw_service_spec.snapshot: + self.snapshot = edw_service_spec.snapshot + else: + self.snapshot = None + + # Cluster related attributes + self.concurrency = edw_service_spec.concurrency + self.node_type = edw_service_spec.node_type + + if edw_service_spec.node_count: + self.node_count = edw_service_spec.node_count + else: + self.node_count = DEFAULT_NUMBER_OF_NODES + + # Interaction related attributes + if edw_service_spec.endpoint: + self.endpoint = edw_service_spec.endpoint + else: + self.endpoint = '' + self.db = edw_service_spec.db + self.user = edw_service_spec.user + self.password = edw_service_spec.password + # resource config attribute + self.spec = edw_service_spec + # resource workflow management + self.supports_wait_on_delete = True + self.client_interface = None + + def GetClientInterface(self) -> EdwClientInterface: + """Gets the active Client Interface.""" + return self.client_interface + + def IsUserManaged(self, edw_service_spec): + """Indicates if the edw service instance is user managed. + + Args: + edw_service_spec: spec of the edw service. + + Returns: + A boolean, set to True if the edw service instance is user managed, False + otherwise. + """ + return edw_service_spec.cluster_identifier is not None + + def GetClusterIdentifier(self, edw_service_spec): + """Returns a string name of the Cluster Identifier. + + Args: + edw_service_spec: spec of the edw service. + + Returns: + A string, set to the name of the cluster identifier. + """ + if self.IsUserManaged(edw_service_spec): + return edw_service_spec.cluster_identifier + else: + return 'pkb-' + FLAGS.run_uri + + def GetMetadata(self): + """Return a dictionary of the metadata for this edw service.""" + basic_data = {'edw_service_type': self.spec.type, + 'edw_cluster_identifier': self.cluster_identifier, + 'edw_cluster_node_type': self.node_type, + 'edw_cluster_node_count': self.node_count} + return basic_data + + def GenerateLifecycleStageScriptName(self, lifecycle_stage): + """Computes the default name for script implementing an edw lifecycle stage. + + Args: + lifecycle_stage: Stage for which the corresponding sql script is desired. + + Returns: + script name for implementing the argument lifecycle_stage. + """ + return os.path.basename( + os.path.normpath('database_%s.sql' % lifecycle_stage)) + + def Cleanup(self): + """Cleans up any temporary resources created for the service.""" + pass + + def GetDatasetLastUpdatedTime(self, dataset=None): + """Get the formatted last modified timestamp of the dataset.""" + raise NotImplementedError + + def ExtractDataset(self, + dest_bucket, + dataset=None, + tables=None, + dest_format='CSV'): + """Extract all tables in a dataset to object storage. + + Args: + dest_bucket: Name of the bucket to extract the data to. Should already + exist. + dataset: Optional name of the dataset. If none, will be determined by the + service. + tables: Optional list of table names to extract. If none, all tables in + the dataset will be extracted. + dest_format: Format to extract data in. + """ + raise NotImplementedError + + def RemoveDataset(self, dataset=None): + """Removes a dataset. 
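The user-managed switch is driven entirely by whether the spec carries a cluster identifier: if it does, the service is treated as user managed and that cluster is reused; otherwise a fresh identifier is derived from the run URI. A standalone sketch of just that decision; the spec class and run URI below are stand-ins, not real PKB objects:

```
# Stand-in spec; the real code reads edw_service_spec and FLAGS.run_uri.
class FakeEdwSpec:
  def __init__(self, cluster_identifier=None):
    self.cluster_identifier = cluster_identifier


def cluster_identifier(spec, run_uri):
  user_managed = spec.cluster_identifier is not None
  return spec.cluster_identifier if user_managed else 'pkb-' + run_uri


print(cluster_identifier(FakeEdwSpec('my-existing-cluster'), 'abc123'))
# my-existing-cluster
print(cluster_identifier(FakeEdwSpec(), 'abc123'))
# pkb-abc123
```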
+ + Args: + dataset: Optional name of the dataset. If none, will be determined by the + service. + """ + raise NotImplementedError + + def CreateDataset(self, dataset=None, description=None): + """Creates a new dataset. + + Args: + dataset: Optional name of the dataset. If none, will be determined by the + service. + description: Optional description of the dataset. + """ + raise NotImplementedError + + def LoadDataset(self, source_bucket, tables, dataset=None): + """Load all tables in a dataset to a database from object storage. + + Args: + source_bucket: Name of the bucket to load the data from. Should already + exist. Each table must have its own subfolder in the bucket named after + the table, containing one or more csv files that make up the table data. + tables: List of table names to load. + dataset: Optional name of the dataset. If none, will be determined by the + service. + """ + raise NotImplementedError + + def RequiresWarmUpSuite(self) -> bool: + """Verifies if the edw_service requires a warm up suite execution. + + Currently enabled for all service types, for parity. + + Returns: + A boolean value (True) if the warm suite is recommended. + """ + return True diff --git a/script/cumulus/pkb/perfkitbenchmarker/errors.py b/script/cumulus/pkb/perfkitbenchmarker/errors.py new file mode 100644 index 0000000..bf0fb52 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/errors.py @@ -0,0 +1,260 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""A common location for all perfkitbenchmarker-defined exceptions.""" + +import pprint + + +class Error(Exception): + pass + + +class Setup(object): + """Errors raised in setting up PKB.""" + + class PythonPackageRequirementUnfulfilled(Error): + """Error raised when a Python package requirement is unfulfilled.""" + pass + + class MissingExecutableError(Error): + """Error raised when we cannot find an executable we need.""" + pass + + class NoRunURIError(Error): + """Error raised when we were not given a run_uri and cannot infer it.""" + pass + + class BadRunURIError(Error): + """Error raised when the given run_uri is invalid.""" + pass + + class BadPreprovisionedDataError(Error): + """Error raised when the pre-provisioned data is invalid.""" + pass + + class InvalidSetupError(Error): + """Error raised when SetUpPKB was not called correctly.""" + pass + + class InvalidFlagConfigurationError(Error): + """Error raised when the set of command line flags is invalid.""" + pass + + class InvalidConfigurationError(Error): + """Error raised when configuration is invalid.""" + pass + + +class VirtualMachine(object): + """Errors raised by virtual_machine.py.""" + + class RemoteCommandError(Error): + """Error raised when a Remote Command or Remote Copy fails.""" + pass + + class RemoteExceptionError(Error): + pass + + class AuthError(Error): + """Error raised when one VM cannot access another VM.""" + pass + + class VirtualMachineError(Error): + """An error raised when VM is having an issue.""" + + @classmethod + def FromDebugInfo(cls, info, error_message): + """Create VirtualMachineError class from debug information. + + Args: + info: A dictionary containing debug information (such as traceroute + info). + error_message: the error message from the originating code. + + Returns: + a cls exception class + + Raises: + TypeError: if info is not an instance of dictionary. + """ + if isinstance(info, dict): + info = VirtualMachine.VirtualMachineError.FormatDebugInfo( + info, error_message) + return cls(info) + raise TypeError('The argument of FromDebugInfo should be an instance ' + 'of dictionary.') + + @staticmethod + def FormatDebugInfo(info, error_message): + """A function to return a string in human readable format. + + Args: + info: A dictionary containing debug information (such as traceroute + info). + error_message: the error message from the originating code. + + Returns: + A human readable string of debug information. 
+ """ + sep = '\n%s\n' % ('-' * 65) + + def AddHeader(error, header, message): + error += '{sep}{header}\n{message}\n'.format( + sep=sep, header=header, message=message) + return error + + def AddKeyIfExists(result, header, key): + if key in info: + result = AddHeader(result, header, info[key]) + del info[key] + return result + + result = AddHeader('', 'error_message:', + error_message) if error_message else '' + result = AddKeyIfExists(result, 'traceroute:', 'traceroute') + return AddHeader(result, 'Debug Info:', pprint.pformat(info)) + + class VmStateError(VirtualMachineError): + pass + + +class VmUtil(object): + """Errors raised by vm_util.py.""" + + class RestConnectionError(Error): + pass + + class IpParsingError(Error): + pass + + class UserSetupError(Error): + pass + + class ThreadException(Error): + pass + + class CalledProcessException(Error): + pass + + class IssueCommandError(Error): + pass + + class IssueCommandTimeoutError(Error): + pass + + +class Benchmarks(object): + """Errors raised by individual benchmark.""" + + class BucketCreationError(Error): + pass + + class PrepareException(Error): + pass + + class MissingObjectCredentialException(Error): + pass + + class RunError(Error): + pass + + class InsufficientCapacityCloudFailure(Error): + pass + + class QuotaFailure(Error): + """Errors that are related to insufficient quota on cloud provider.""" + + class RateLimitExceededError(Error): + pass + + class KnownIntermittentError(Error): + """Known intermittent failures of the benchmark. + + These are non-retryable, known failure modes of the benchmark. It is + recommended that the benchmark be completely re-run. + """ + + class UnsupportedConfigError(Error): + """Errors due to an unsupported configuration running.""" + pass + + +class Resource(object): + """Errors related to resource creation and deletion.""" + + class CreationError(Error): + """An error on creation which is not retryable.""" + pass + + class CleanupError(Error): + pass + + class RetryableCreationError(Error): + pass + + class RetryableDeletionError(Error): + pass + + class GetError(Error): + """An error on get which is not retryable.""" + pass + + class RetryableGetError(Error): + pass + + class UpdateError(Error): + """An error on update.""" + pass + + class SubclassNotFoundError(Error): + pass + + class RestoreError(Error): + """Errors while restoring a resource.""" + pass + + class FreezeError(Error): + """Errors while freezing a resource.""" + pass + + +class Config(object): + """Errors related to configs.""" + + class InvalidValue(Error): + """User provided an invalid value for a config option.""" + pass + + class MissingOption(Error): + """User did not provide a value for a required config option.""" + pass + + class ParseError(Error): + """Error raised when a config can't be loaded properly.""" + pass + + class UnrecognizedOption(Error): + """User provided a value for an unrecognized config option.""" + pass + + +class Juju(object): + """Errors related to the Juju OS_TYPE.""" + + class TimeoutException(Error): + pass + + class UnitErrorException(Error): + pass diff --git a/script/cumulus/pkb/perfkitbenchmarker/events.py b/script/cumulus/pkb/perfkitbenchmarker/events.py new file mode 100644 index 0000000..5d8055c --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/events.py @@ -0,0 +1,182 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Defines observable events in PerfKitBenchmarker. + +All events are passed keyword arguments, and possibly a sender. See event +definitions below. + +Event handlers are run synchronously in an unspecified order; any exceptions +raised will be propagated. +""" +import logging +import os + +from absl import flags +import blinker +from perfkitbenchmarker import data +from perfkitbenchmarker import sample +from perfkitbenchmarker import stages + + +FLAGS = flags.FLAGS +_events = blinker.Namespace() + + +initialization_complete = _events.signal('system-ready', doc=""" +Signal sent once after the system is initialized (command-line flags +parsed, temporary directory initialized, run_uri set). + +Sender: None +Payload: parsed_flags, the parsed FLAGS object.""") + +provider_imported = _events.signal('provider-imported', doc=""" +Signal sent after a cloud provider's modules have been imported. + +Sender: string. Cloud provider name chosen from providers.VALID_CLOUDS.""") + +benchmark_start = _events.signal('benchmark-start', doc=""" +Signal sent at the beginning of a benchmark before any resources are +provisioned. + +Sender: None +Payload: benchmark_spec.""") + +on_vm_startup = _events.signal('on-vm-startup', doc=""" +Signal sent on vm startup. + +Sender: None +Payload: vm (VirtualMachine object).""") + + +benchmark_end = _events.signal('benchmark-end', doc=""" +Signal sent at the end of a benchmark after any resources have been +torn down (if run_stage includes teardown). + +Sender: None +Payload: benchmark_spec.""") + +PREPARE_PHASE = 'prepare' +RUN_PHASE = 'run' +CLEANUP_PHASE = 'cleanup' + +before_phase = _events.signal('before-phase', doc=""" +Signal sent immediately before a phase runs. + +Sender: the phase. Currently only RUN_PHASE, PREPARE_PHASE. +Payload: benchmark_spec.""") + +start_trace = _events.signal('start_trace', doc=""" +Signal sent to indicate that traces should begin collecting data + +Sender: the phase. Currently only RUN_PHASE +Payload: benchmark_spec.""") + +stop_trace = _events.signal('stop_trace', doc=""" +Signal sent to indicate that traces should stop collecting data + +Sender: the phase. Currently only RUN_PHASE +Payload: benchmark_spec.""") + +after_phase = _events.signal('after-phase', doc=""" +Signal sent immediately after a phase runs, regardless of whether it was +successful. + +Sender: the phase. Currently only RUN_PHASE, PREPARE_PHASE. +Payload: benchmark_spec.""") + +samples_created = _events.signal('samples-created', doc=""" +Called with samples list and benchmark spec. + +Signal sent immediately after a sample is created. +The samples' metadata is mutable, and may be updated by the subscriber. + +Sender: the phase. Currently only stages.RUN. +Payload: benchmark_spec (BenchmarkSpec), samples (list of sample.Sample).""") + +record_event = _events.signal('record-event', doc=""" +Signal sent when an event is recorded. + +Signal sent after an event occurred. Record start, end timestamp and metadata +of the event for analysis. 
+ +Sender: None +Payload: event (string), start_timestamp (float), end_timestamp (float), +metadata (dict).""") + + +def RegisterTracingEvents(): + record_event.connect(AddEvent, weak=False) + + +class TracingEvent(object): + """Represents an event object. + + Attributes: + sender: string. Name of the sending class/object. + event: string. Name of the event. + start_timestamp: float. Represents the start timestamp of the event. + end_timestamp: float. Represents the end timestamp of the event. + metadata: dict. Additional metadata of the event. + """ + + events = [] + + def __init__(self, sender, event, start_timestamp, end_timestamp, metadata): + self.sender = sender + self.event = event + self.start_timestamp = start_timestamp + self.end_timestamp = end_timestamp + self.metadata = metadata + + +def AddEvent(sender, event, start_timestamp, end_timestamp, metadata): + """Record a TracingEvent.""" + TracingEvent.events.append( + TracingEvent(sender, event, start_timestamp, end_timestamp, metadata)) + + +@on_vm_startup.connect +def _RunStartupScript(unused_sender, vm): + """Run startup script if necessary.""" + if FLAGS.startup_script: + vm.RemoteCopy(data.ResourcePath(FLAGS.startup_script)) + vm.startup_script_output = vm.RemoteCommand( + './%s' % os.path.basename(FLAGS.startup_script)) + + +@samples_created.connect +def _AddScriptSamples(unused_sender, benchmark_spec, samples): + def _ScriptResultToMetadata(out): + return {'stdout': out[0], 'stderr': out[1]} + for vm in benchmark_spec.vms: + if FLAGS.startup_script: + samples.append(sample.Sample( + 'startup', 0, '', _ScriptResultToMetadata(vm.startup_script_output))) + if FLAGS.postrun_script: + samples.append(sample.Sample( + 'postrun', 0, '', _ScriptResultToMetadata(vm.postrun_script_output))) + + +@after_phase.connect +def _RunPostRunScript(sender, benchmark_spec): + if sender != stages.RUN: + logging.info( + 'Receive after_phase signal from :%s, not ' + 'triggering _RunPostRunScript.', sender) + if FLAGS.postrun_script: + for vm in benchmark_spec.vms: + vm.RemoteCopy(FLAGS.postrun_script) + vm.postrun_script_output = vm.RemoteCommand( + './%s' % os.path.basename(FLAGS.postrun_script)) diff --git a/script/cumulus/pkb/perfkitbenchmarker/flag_util.py b/script/cumulus/pkb/perfkitbenchmarker/flag_util.py new file mode 100644 index 0000000..b6d0515 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/flag_util.py @@ -0,0 +1,556 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Utility functions for working with user-supplied flags.""" + + +import logging +import os +import re + +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import units + +import six +from six.moves import range +import yaml + +FLAGS = flags.FLAGS + +INTEGER_GROUP_REGEXP = re.compile(r'(\d+)(-(\d+))?(-(\d+))?$') +INTEGER_GROUP_REGEXP_COLONS = re.compile(r'(-?\d+)(:(-?\d+))?(:(-?\d+))?$') + + +class IntegerList(object): + """An immutable list of nonnegative integers. + + The list contains either single integers (ex: 5) or ranges (ex: + 8-12). Additionally, the user can provide a step to the range like so: + 8-24-2. The list can include as many elements as will fit in + memory. Furthermore, the memory required to hold a range will not + grow with the size of the range. + + Make a list with + lst = IntegerList(groups) + + where groups is a list whose elements are either single integers, + 2-tuples holding the low and high bounds of a range + (inclusive), or 3-tuples holding the low and high bounds, followed + by the step size. (Ex: [5, (8,12)] represents the integer list + 5,8,9,10,11,12, and [(8-14-2)] represents the list 8,10,12,14.) + + For negative number ranges use a colon separator (ex: "-2:1" is the integer + list -2, -1, 0, 1). + """ + + def __init__(self, groups): + self.groups = groups + + length = 0 + for elt in groups: + if isinstance(elt, six.integer_types): + length += 1 + if isinstance(elt, tuple): + length += len(self._CreateXrangeFromTuple(elt)) + + self.length = length + + def __len__(self): + return self.length + + def __getitem__(self, idx): + if not isinstance(idx, int): + raise TypeError() + if idx < 0 or idx >= self.length: + raise IndexError() + + group_idx = 0 + while idx > 0: + group = self.groups[group_idx] + + if not isinstance(group, tuple): + group_idx += 1 + idx -= 1 + else: + group_len = len(self._CreateXrangeFromTuple(group)) + if idx >= group_len: + group_idx += 1 + idx -= group_len + else: + step = 1 if len(group) == 2 else group[2] + return group[0] + idx * step + + if isinstance(self.groups[group_idx], tuple): + return self.groups[group_idx][0] + else: + return self.groups[group_idx] + + def __eq__(self, other): + if other is None: + return False + return tuple(self) == tuple(other) + + def __ne__(self, other): + if other is None: + return True + return tuple(self) != tuple(other) + + def __iter__(self): + for group in self.groups: + if isinstance(group, six.integer_types): + yield group + else: + for val in self._CreateXrangeFromTuple(group): + yield val + + def __str__(self): + return IntegerListSerializer().serialize(self) + + def __repr__(self): + return 'IntegerList([%s])' % self + + def _CreateXrangeFromTuple(self, input_tuple): + start = input_tuple[0] + step = 1 if len(input_tuple) == 2 else input_tuple[2] + stop_inclusive = input_tuple[1] + (1 if step > 0 else -1) + return range(start, stop_inclusive, step) + + +def _IsNonIncreasing(result, val): + """Determines if result would be non-increasing if val is appended. + + Args: + result: list integers and/or range tuples. + val: integer or range tuple to append. + Returns: + bool indicating if the appended list is non-increasing. 
+ """ + if result: + if isinstance(result[-1], tuple): + # extract high from previous tuple + prev = result[-1][1] + else: + # previous is int + prev = result[-1] + if val <= prev: + return True + return False + + +class IntegerListParser(flags.ArgumentParser): + """Parse a string containing a comma-separated list of nonnegative integers. + + The list may contain single integers and dash-separated ranges. For + example, "1,3,5-7" parses to [1,3,5,6,7] and "1-7-3" parses to + [1,4,7]. + + Can pass the flag on_nonincreasing to the constructor to tell it + what to do if the list is nonincreasing. Options are + - None: do nothing. + - IntegerListParser.WARN: log a warning. + - IntegerListParser.EXCEPTION: raise a ValueError. + + As a special case, instead of a string, can pass a list of integers + or an IntegerList. In these cases, the return value iterates over + the same integers as were in the argument. + + For negative number ranges use a colon separator, for example "-3:4:2" parses + to [-3, -1, 1, 3]. + """ + + syntactic_help = ('A comma-separated list of integers or integer ' + 'ranges. Ex: -1,3,5:7 is read as -1,3,5,6,7.') + + WARN = 'warn' + EXCEPTION = 'exception' + + def __init__(self, on_nonincreasing=None): + super(IntegerListParser, self).__init__() + + self.on_nonincreasing = on_nonincreasing + + def parse(self, inp): + """Parse an integer list. + + Args: + inp: a string, a list, or an IntegerList. + + Returns: + An iterable of integers. + + Raises: + ValueError: if inp doesn't follow a format it recognizes. + """ + + if isinstance(inp, IntegerList): + return inp + elif isinstance(inp, list): + return IntegerList(inp) + elif isinstance(inp, int): + return IntegerList([inp]) + + def HandleNonIncreasing(): + if self.on_nonincreasing == IntegerListParser.WARN: + logging.warning('Integer list %s is not increasing', inp) + elif self.on_nonincreasing == IntegerListParser.EXCEPTION: + raise ValueError('Integer list %s is not increasing' % inp) + + groups = inp.split(',') + result = [] + + for group in groups: + match = INTEGER_GROUP_REGEXP.match( + group) or INTEGER_GROUP_REGEXP_COLONS.match(group) + if match is None: + raise ValueError('Invalid integer list %s' % inp) + elif match.group(2) is None: + val = int(match.group(1)) + + if _IsNonIncreasing(result, val): + HandleNonIncreasing() + + result.append(val) + else: + low = int(match.group(1)) + high = int(match.group(3)) + step = int(match.group(5)) if match.group(5) is not None else 1 + step = -step if step > 0 and low > high else step + + if high <= low or (_IsNonIncreasing(result, low)): + HandleNonIncreasing() + + result.append((low, high, step)) + + return IntegerList(result) + + def flag_type(self): + return 'integer list' + + +class IntegerListSerializer(flags.ArgumentSerializer): + + def _SerializeRange(self, val): + separator = ':' if any(item < 0 for item in val) else '-' + return separator.join(str(item) for item in val) + + def serialize(self, il): + return ','.join([str(val) if isinstance(val, six.integer_types) + else self._SerializeRange(val) + for val in il.groups]) + + +def DEFINE_integerlist(name, default, help, on_nonincreasing=None, + flag_values=FLAGS, **kwargs): + """Register a flag whose value must be an integer list.""" + + parser = IntegerListParser(on_nonincreasing=on_nonincreasing) + serializer = IntegerListSerializer() + + flags.DEFINE(parser, name, default, help, flag_values, serializer, **kwargs) + + +class OverrideFlags(object): + """Context manager that applies any config_dict overrides to 
flag_values.""" + + def __init__(self, flag_values, config_dict): + """Initializes an OverrideFlags context manager. + + Args: + flag_values: FlagValues that is temporarily modified so that any options + in override_dict that are not 'present' in flag_values are applied to + flag_values. + Upon exit, flag_values will be restored to its original state. + config_dict: Merged config flags from the benchmark config and benchmark + configuration yaml file. + """ + self._flag_values = flag_values + self._config_dict = config_dict + self._flags_to_reapply = {} + + def __enter__(self): + """Overrides flag_values with options in override_dict.""" + if not self._config_dict: + return + + for key, value in six.iteritems(self._config_dict): + if key not in self._flag_values: + raise errors.Config.UnrecognizedOption( + 'Unrecognized option {0}.{1}. Each option within {0} must ' + 'correspond to a valid command-line flag.'.format('flags', key)) + if not self._flag_values[key].present: + self._flags_to_reapply[key] = self._flag_values[key].value + try: + self._flag_values[key].parse(value) # Set 'present' to True. + except flags.IllegalFlagValueError as e: + raise errors.Config.InvalidValue( + 'Invalid {0}.{1} value: "{2}" (of type "{3}").{4}{5}'.format( + 'flags', key, value, + value.__class__.__name__, os.linesep, e)) + + def __exit__(self, *unused_args, **unused_kwargs): + """Restores flag_values to its original state.""" + if not self._flags_to_reapply: + return + for key, value in six.iteritems(self._flags_to_reapply): + self._flag_values[key].value = value + self._flag_values[key].present = 0 + + +class UnitsParser(flags.ArgumentParser): + """Parse a flag containing a unit expression. + + Attributes: + convertible_to: list of units.Unit instances. A parsed expression must be + convertible to at least one of the Units in this list. For example, + if the parser requires that its inputs are convertible to bits, then + values expressed in KiB and GB are valid, but values expressed in meters + are not. + """ + + syntactic_help = ('A quantity with a unit. Ex: 12.3MB.') + + def __init__(self, convertible_to): + """Initialize the UnitsParser. + + Args: + convertible_to: Either an individual unit specification or a series of + unit specifications, where each unit specification is either a string + (e.g. 'byte') or a units.Unit. The parser input must be convertible to + at least one of the specified Units, or the parse() method will raise + a ValueError. + """ + if isinstance(convertible_to, (six.string_types, units.Unit)): + self.convertible_to = [units.Unit(convertible_to)] + else: + self.convertible_to = [units.Unit(u) for u in convertible_to] + + def parse(self, inp): + """Parse the input. + + Args: + inp: a string or a units.Quantity. If a string, it has the format + "", as in "12KB", or "2.5GB". + + Returns: + A units.Quantity. + + Raises: + ValueError: If the input cannot be parsed, or if it parses to a value with + improper units. + """ + if isinstance(inp, units.Quantity): + quantity = inp + else: + try: + quantity = units.ParseExpression(inp) + except Exception as e: + raise ValueError("Couldn't parse unit expression %r: %s" % + (inp, str(e))) + if not isinstance(quantity, units.Quantity): + raise ValueError('Expression %r evaluates to a unitless value.' 
% inp) + + for unit in self.convertible_to: + try: + quantity.to(unit) + break + except units.DimensionalityError: + pass + else: + raise ValueError( + 'Expression {0!r} is not convertible to an acceptable unit ' + '({1}).'.format(inp, ', '.join(str(u) for u in self.convertible_to))) + + return quantity + + +class UnitsSerializer(flags.ArgumentSerializer): + def serialize(self, units): + return str(units) + + +def DEFINE_units(name, default, help, convertible_to, + flag_values=flags.FLAGS, **kwargs): + """Register a flag whose value is a units expression. + + Args: + name: string. The name of the flag. + default: units.Quantity. The default value. + help: string. A help message for the user. + convertible_to: Either an individual unit specification or a series of unit + specifications, where each unit specification is either a string (e.g. + 'byte') or a units.Unit. The flag value must be convertible to at least + one of the specified Units to be considered valid. + flag_values: the absl.flags.FlagValues object to define the flag in. + """ + parser = UnitsParser(convertible_to=convertible_to) + serializer = UnitsSerializer() + flags.DEFINE(parser, name, default, help, flag_values, serializer, **kwargs) + + +def StringToBytes(string): + """Convert an object size, represented as a string, to bytes. + + Args: + string: the object size, as a string with a quantity and a unit. + + Returns: + an integer. The number of bytes in the size. + + Raises: + ValueError, if either the string does not represent an object size + or if the size does not contain an integer number of bytes. + """ + + try: + quantity = units.ParseExpression(string) + except Exception: + # Catching all exceptions is ugly, but we don't know what sort of + # exception pint might throw, and we want to turn any of them into + # ValueError. + raise ValueError("Couldn't parse size %s" % string) + + try: + bytes = quantity.m_as(units.byte) + except units.DimensionalityError: + raise ValueError("Quantity %s is not a size" % string) + + if bytes != int(bytes): + raise ValueError("Size %s has a non-integer number (%s) of bytes!" % + (string, bytes)) + + if bytes < 0: + raise ValueError("Size %s has a negative number of bytes!" % string) + + return int(bytes) + + +def StringToRawPercent(string): + """Convert a string to a raw percentage value. + + Args: + string: the percentage, with '%' on the end. + + Returns: + A floating-point number, holding the percentage value. + + Raises: + ValueError, if the string can't be read as a percentage. + """ + + if len(string) <= 1: + raise ValueError("String '%s' too short to be percentage." % string) + + if string[-1] != '%': + raise ValueError("Percentage '%s' must end with '%%'" % string) + + # This will raise a ValueError if it can't convert the string to a float. + val = float(string[:-1]) + + if val < 0.0 or val > 100.0: + raise ValueError('Quantity %s is not a valid percentage' % val) + + return val + + +# The YAML flag type is necessary because flags can be read either via +# the command line or from a config file. If they come from a config +# file, they will already be parsed as YAML, but if they come from the +# command line, they will be raw strings. The point of this flag is to +# guarantee a consistent representation to the rest of the program. +class YAMLParser(flags.ArgumentParser): + """Parse a flag containing YAML.""" + + syntactic_help = 'A YAML expression.' + + def parse(self, inp): + """Parse the input. + + Args: + inp: A string or the result of yaml.safe_load. 
If a string, should be + a valid YAML document. + """ + + if isinstance(inp, six.string_types): + # This will work unless the user writes a config with a quoted + # string that, if unquoted, would be parsed as a non-string + # Python type (example: '123'). In that case, the first + # yaml.safe_load() in the config system will strip away the quotation + # marks, and this second yaml.safe_load() will parse it as the + # non-string type. However, I think this is the best we can do + # without significant changes to the config system, and the + # problem is unlikely to occur in PKB. + try: + return yaml.safe_load(inp) + except yaml.YAMLError as e: + raise ValueError("Couldn't parse YAML string '%s': %s" % + (inp, str(e))) + else: + return inp + + +class YAMLSerializer(flags.ArgumentSerializer): + + def serialize(self, val): + return yaml.dump(val) + + +def DEFINE_yaml(name, default, help, flag_values=flags.FLAGS, **kwargs): + """Register a flag whose value is a YAML expression. + + Args: + name: string. The name of the flag. + default: object. The default value of the flag. + help: string. A help message for the user. + flag_values: the absl.flags.FlagValues object to define the flag in. + kwargs: extra arguments to pass to absl.flags.DEFINE(). + """ + + parser = YAMLParser() + serializer = YAMLSerializer() + + flags.DEFINE(parser, name, default, help, flag_values, serializer, **kwargs) + + +def ParseKeyValuePairs(strings): + """Parses colon separated key value pairs from a list of strings. + + Pairs should be separated by a comma and key and value by a colon, e.g., + ['k1:v1', 'k2:v2,k3:v3']. + + Args: + strings: A list of strings. + + Returns: + A dict populated with keys and values from the flag. + """ + pairs = {} + for pair in [kv for s in strings for kv in s.split(',')]: + try: + key, value = pair.split(':', 1) + pairs[key] = value + except ValueError: + logging.error('Bad key value pair format. Skipping "%s".', pair) + continue + + return pairs + + +def GetProvidedCommandLineFlags(): + """Return flag names and values that were specified on the command line. + + Returns: + A dictionary of provided flags in the form: {flag_name: flag_value}. + """ + return {k: FLAGS[k].value for k in FLAGS if FLAGS[k].present} diff --git a/script/cumulus/pkb/perfkitbenchmarker/hpc_util.py b/script/cumulus/pkb/perfkitbenchmarker/hpc_util.py new file mode 100644 index 0000000..0dd6f04 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/hpc_util.py @@ -0,0 +1,63 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""HPC utility functions.""" + +from absl import flags +from perfkitbenchmarker import vm_util + +flags.DEFINE_boolean('mpirun_allow_run_as_root', False, + 'Whether to allow mpirun to be run by the root user.') + + +def CreateMachineFile(vms, + num_slots=lambda vm: vm.NumCpusForBenchmark(), + remote_path='MACHINEFILE', + mpi_vendor='openmpi'): + """Create a file with the IP of each machine in the cluster on its own line. 
+ + The file is then pushed to the provided path on the master vm. + + Pass in "num_slots=lambda vm: 0" to create a machine file without a defined + number of slots. + + OpenMPI's format: " slots=" + https://www.open-mpi.org/faq/?category=running#mpirun-hostfile + IntelMPI's format: ":" + https://software.intel.com/content/www/us/en/develop/articles/controlling-process-placement-with-the-intel-mpi-library.html + + Args: + vms: The list of vms which will be in the cluster. + num_slots: The function to use to calculate the number of slots + for each vm. Defaults to vm.NumCpusForBenchmark() + remote_path: remote path of the machine file. Defaults to MACHINEFILE + mpi_vendor: Implementation of MPI. Can be openmpi or intel. + """ + + def Line(vm, vm_name=None): + vm_name = vm_name or vm.internal_ip + slots = num_slots(vm) + if not slots: + return vm_name + if mpi_vendor == 'intel': + return f'{vm_name}:{slots}' + return f'{vm_name} slots={slots}' + + with vm_util.NamedTemporaryFile(mode='w') as machine_file: + master_vm = vms[0] + machine_file.write(Line(master_vm, 'localhost') + '\n') + for vm in vms[1:]: + machine_file.write(Line(vm) + '\n') + machine_file.close() + master_vm.PushFile(machine_file.name, remote_path) diff --git a/script/cumulus/pkb/perfkitbenchmarker/import_util.py b/script/cumulus/pkb/perfkitbenchmarker/import_util.py new file mode 100644 index 0000000..197462a --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/import_util.py @@ -0,0 +1,43 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for dynamically importing python files.""" + +import importlib +import pkgutil + + +def LoadModulesForPath(path, package_prefix=None): + """Load all modules on 'path', with prefix 'package_prefix'. + + Example usage: + LoadModulesForPath(__path__, __name__) + + Args: + path: Path containing python modules. + package_prefix: prefix (e.g., package name) to prefix all modules. + 'path' and 'package_prefix' will be joined with a '.'. + Yields: + Imported modules. + """ + prefix = package_prefix + '.' if package_prefix else '' + # If iter_modules is invoked within a zip file, the zipimporter adds the + # prefix to the names of archived modules, but not archived packages. Because + # the prefix is necessary to correctly import a package, this behavior is + # undesirable, so do not pass the prefix to iter_modules. Instead, apply it + # explicitly afterward. + for _, modname, _ in pkgutil.iter_modules(path): + # Skip recursively listed modules (e.g. 'subpackage.module'). + if '.' not in modname: + yield importlib.import_module(prefix + modname) diff --git a/script/cumulus/pkb/perfkitbenchmarker/kubernetes_helper.py b/script/cumulus/pkb/perfkitbenchmarker/kubernetes_helper.py new file mode 100644 index 0000000..d8711c0 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/kubernetes_helper.py @@ -0,0 +1,112 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tempfile +import time + +from absl import flags +import jinja2 +from perfkitbenchmarker import data +from perfkitbenchmarker import vm_util + +FLAGS = flags.FLAGS +flags.DEFINE_integer('k8s_get_retry_count', 18, + 'Maximum number of waits for getting LoadBalancer external IP') +flags.DEFINE_integer('k8s_get_wait_interval', 10, + 'Wait interval for getting LoadBalancer external IP') + + +def checkKubernetesFlags(): + if not FLAGS.kubectl: + raise Exception('Please provide path to kubectl tool using --kubectl ' + 'flag. Exiting.') + if not FLAGS.kubeconfig: + raise Exception('Please provide path to kubeconfig using --kubeconfig ' + 'flag. Exiting.') + + +def CreateFromFile(file_name): + checkKubernetesFlags() + create_cmd = [FLAGS.kubectl, '--kubeconfig=%s' % FLAGS.kubeconfig, 'create', + '-f', file_name] + vm_util.IssueRetryableCommand(create_cmd) + + +def DeleteFromFile(file_name): + checkKubernetesFlags() + delete_cmd = [FLAGS.kubectl, '--kubeconfig=%s' % FLAGS.kubeconfig, 'delete', + '-f', file_name, '--ignore-not-found'] + vm_util.IssueRetryableCommand(delete_cmd) + + +def DeleteAllFiles(file_list): + for file in file_list: + DeleteFromFile(file) + + +def CreateAllFiles(file_list): + for file in file_list: + CreateFromFile(file) + + +def Get(resource, resourceInstanceName, labelFilter, jsonSelector): + checkKubernetesFlags() + get_pod_cmd = [FLAGS.kubectl, '--kubeconfig=%s' % FLAGS.kubeconfig, + 'get', resource] + if len(resourceInstanceName) > 0: + get_pod_cmd.append(resourceInstanceName) + if len(labelFilter) > 0: + get_pod_cmd.append('-l ' + labelFilter) + get_pod_cmd.append('-ojsonpath={{{}}}'.format(jsonSelector)) + stdout, stderr, _ = vm_util.IssueCommand(get_pod_cmd, suppress_warning=True, + raise_on_failure=False) + if len(stderr) > 0: + raise Exception("Error received from kubectl get: " + stderr) + return stdout + + +def GetWithWaitForContents(resource, resourceInstanceName, filter, jsonFilter): + ret = Get(resource, resourceInstanceName, filter, jsonFilter) + numWaitsLeft = FLAGS.k8s_get_retry_count + while len(ret) == 0 and numWaitsLeft > 0: + time.sleep(FLAGS.k8s_get_wait_interval) + ret = Get(resource, resourceInstanceName, filter, jsonFilter) + numWaitsLeft -= 1 + return ret + + +def CreateResource(resource_body): + with vm_util.NamedTemporaryFile(mode='w') as tf: + tf.write(resource_body) + tf.close() + CreateFromFile(tf.name) + + +def DeleteResource(resource_body): + with vm_util.NamedTemporaryFile() as tf: + tf.write(resource_body) + tf.close() + DeleteFromFile(tf.name) + + +def CreateRenderedManifestFile(filename, config): + """Returns a file containing a rendered Jinja manifest (.j2) template.""" + manifest_filename = data.ResourcePath(filename) + environment = jinja2.Environment(undefined=jinja2.StrictUndefined) + with open(manifest_filename) as manifest_file: + manifest_template = environment.from_string(manifest_file.read()) + rendered_yaml = tempfile.NamedTemporaryFile(mode='w') + 
rendered_yaml.write(manifest_template.render(config)) + rendered_yaml.flush() + return rendered_yaml diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_benchmarks/__init__.py b/script/cumulus/pkb/perfkitbenchmarker/linux_benchmarks/__init__.py new file mode 100644 index 0000000..4b29b83 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_benchmarks/__init__.py @@ -0,0 +1,35 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains benchmark imports and a list of benchmarks. + +All modules within this package are considered benchmarks, and are loaded +dynamically. Add non-benchmark code to other packages. +""" + +import os +from perfkitbenchmarker import import_util + + +def _LoadBenchmarks(): + __path__.append(os.path.join(__path__[0], "intel_community")) + return list(import_util.LoadModulesForPath(__path__, __name__)) + +BENCHMARKS = _LoadBenchmarks() + +VALID_BENCHMARKS = {} +for module in BENCHMARKS: + if module.BENCHMARK_NAME in VALID_BENCHMARKS: + raise ValueError('There are multiple benchmarks with BENCHMARK_NAME "%s"' % + (module.BENCHMARK_NAME)) + VALID_BENCHMARKS[module.BENCHMARK_NAME] = module diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_benchmarks/cluster_boot_benchmark.py b/script/cumulus/pkb/perfkitbenchmarker/linux_benchmarks/cluster_boot_benchmark.py new file mode 100644 index 0000000..164e7cd --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_benchmarks/cluster_boot_benchmark.py @@ -0,0 +1,252 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Records the time required to boot a cluster of VMs.""" + +import logging +import time +from typing import List +from absl import flags +from perfkitbenchmarker import configs +from perfkitbenchmarker import sample +from perfkitbenchmarker import virtual_machine +from perfkitbenchmarker import vm_util + +BENCHMARK_NAME = 'cluster_boot' +BENCHMARK_CONFIG = """ +cluster_boot: + description: > + Create a cluster, record all times to boot. + Specify the cluster size with --num_vms. 
+ vm_groups: + default: + vm_spec: + AWS: + machine_type: m5.large + zone: us-east-1 + Azure: + machine_type: Standard_D2s_v3 + zone: eastus + boot_disk_type: StandardSSD_LRS + GCP: + machine_type: n1-standard-2 + zone: us-central1-a + boot_disk_type: pd-ssd + IBMCloud: + machine_type: cx2-2x4 + zone: us-south-1 + Kubernetes: + image: null + OpenStack: + machine_type: t1.small + zone: nova + vm_count: null + flags: + # We don't want boot time samples to be affected from retrying, so don't + # retry cluster_boot when rate limited. + retry_on_rate_limited: False +""" + +flags.DEFINE_boolean( + 'cluster_boot_time_reboot', False, + 'Whether to reboot the VMs during the cluster boot benchmark to measure ' + 'reboot performance.') +flags.DEFINE_boolean( + 'cluster_boot_test_port_listening', False, + 'Test the time it takes to successfully connect to the port that is used to run the remote command.' +) +FLAGS = flags.FLAGS + + +def GetConfig(user_config): + return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME) + + +def Prepare(unused_benchmark_spec): + pass + + +def GetTimeToBoot(vms): + """Creates Samples for the boot time of a list of VMs. + + The boot time is the time difference from before the VM is created to when + the VM is responsive to SSH commands. + + Args: + vms: List of BaseVirtualMachine subclasses. + + Returns: + List of Samples containing the boot times and an overall cluster boot time. + """ + if not vms: + return [] + + min_create_start_time = min(vm.create_start_time for vm in vms) + + max_create_delay_sec = 0 + max_boot_time_sec = 0 + max_port_listening_time_sec = 0 + max_rdp_port_listening_time_sec = 0 + samples = [] + os_types = set() + for i, vm in enumerate(vms): + assert vm.bootable_time + assert vm.create_start_time + assert vm.bootable_time >= vm.create_start_time + os_types.add(vm.OS_TYPE) + create_delay_sec = vm.create_start_time - min_create_start_time + max_create_delay_sec = max(max_create_delay_sec, create_delay_sec) + metadata = { + 'machine_instance': i, + 'num_vms': len(vms), + 'os_type': vm.OS_TYPE, + 'create_delay_sec': '%0.1f' % create_delay_sec + } + boot_time_sec = vm.bootable_time - min_create_start_time + max_boot_time_sec = max(max_boot_time_sec, boot_time_sec) + samples.append( + sample.Sample('Boot Time', boot_time_sec, 'seconds', metadata)) + if FLAGS.cluster_boot_test_port_listening: + assert vm.port_listening_time + assert vm.port_listening_time >= vm.create_start_time + port_listening_time_sec = vm.port_listening_time - min_create_start_time + max_port_listening_time_sec = max(max_port_listening_time_sec, + port_listening_time_sec) + samples.append( + sample.Sample('Port Listening Time', port_listening_time_sec, + 'seconds', metadata)) + # TODO(user): refactor so Windows specifics aren't in linux_benchmarks + if FLAGS.cluster_boot_test_rdp_port_listening: + assert vm.rdp_port_listening_time + assert vm.rdp_port_listening_time >= vm.create_start_time + rdp_port_listening_time_sec = ( + vm.rdp_port_listening_time - min_create_start_time) + max_rdp_port_listening_time_sec = max(max_rdp_port_listening_time_sec, + rdp_port_listening_time_sec) + samples.append( + sample.Sample('RDP Port Listening Time', rdp_port_listening_time_sec, + 'seconds', metadata)) + + # Add a total cluster boot sample as the maximum boot time. 
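+  # Each per-VM boot time above is measured from the earliest create_start_time
+  # in the cluster to that VM's bootable_time, so the cluster boot time is the
+  # largest of those per-VM values rather than a separately timed interval.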
+ metadata = { + 'num_vms': len(vms), + 'os_type': ','.join(sorted(os_types)), + 'max_create_delay_sec': '%0.1f' % max_create_delay_sec + } + samples.append( + sample.Sample('Cluster Boot Time', max_boot_time_sec, 'seconds', + metadata)) + if FLAGS.cluster_boot_test_port_listening: + samples.append( + sample.Sample('Cluster Port Listening Time', + max_port_listening_time_sec, 'seconds', metadata)) + if FLAGS.cluster_boot_test_rdp_port_listening: + samples.append( + sample.Sample('Cluster RDP Port Listening Time', + max_rdp_port_listening_time_sec, 'seconds', metadata)) + if max_create_delay_sec > 1: + logging.warning( + 'The maximum delay between starting VM creations is %0.1fs.', + max_create_delay_sec) + + return samples + + +def _MeasureReboot(vms): + """Measures the time to reboot the cluster of VMs. + + Args: + vms: List of BaseVirtualMachine subclasses. + + Returns: + List of Samples containing the reboot times and an overall cluster reboot + time. + """ + before_reboot_timestamp = time.time() + reboot_times = vm_util.RunThreaded(lambda vm: vm.Reboot(), vms) + cluster_reboot_time = time.time() - before_reboot_timestamp + return _GetVmOperationDataSamples(reboot_times, cluster_reboot_time, 'Reboot', + vms) + + +def MeasureDelete( + vms: List[virtual_machine.BaseVirtualMachine]) -> List[sample.Sample]: + """Measures the time to delete the cluster of VMs. + + Args: + vms: List of BaseVirtualMachine subclasses. + + Returns: + List of Samples containing the delete times and an overall cluster delete + time. + """ + before_delete_timestamp = time.time() + vm_util.RunThreaded(lambda vm: vm.Delete(), vms) + delete_times = [vm.delete_end_time - vm.delete_start_time for vm in vms] + max_delete_end_time = max([vm.delete_end_time for vm in vms]) + cluster_delete_time = max_delete_end_time - before_delete_timestamp + return _GetVmOperationDataSamples(delete_times, cluster_delete_time, 'Delete', + vms) + + +def _GetVmOperationDataSamples( + operation_times: List[int], cluster_time: int, operation: str, + vms: List[virtual_machine.BaseVirtualMachine]) -> List[sample.Sample]: + """Append samples from given data. + + Args: + operation_times: The list of times for each vms. + cluster_time: The cluster time for the benchmark. + operation: The benchmark operation being run, capitalized with no spaces. + vms: list of virtual machines. + + Returns: + List of samples constructed from data. + """ + samples = [] + metadata_list = [] + for i, vm in enumerate(vms): + metadata = { + 'machine_instance': i, + 'num_vms': len(vms), + 'os_type': vm.OS_TYPE + } + metadata_list.append(metadata) + for operation_time, metadata in zip(operation_times, metadata_list): + samples.append( + sample.Sample(f'{operation} Time', operation_time, 'seconds', metadata)) + os_types = set([vm.OS_TYPE for vm in vms]) + metadata = {'num_vms': len(vms), 'os_type': ','.join(sorted(os_types))} + samples.append( + sample.Sample(f'Cluster {operation} Time', cluster_time, 'seconds', + metadata)) + return samples + + +def Run(benchmark_spec): + """Measure the boot time for all VMs. + + Args: + benchmark_spec: The benchmark specification. + + Returns: + An empty list (all boot samples will be added later). 
+ """ + samples = [] + if FLAGS.cluster_boot_time_reboot: + samples.extend(_MeasureReboot(benchmark_spec.vms)) + return samples + + +def Cleanup(unused_benchmark_spec): + pass diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_benchmarks/docker_passthrough.py b/script/cumulus/pkb/perfkitbenchmarker/linux_benchmarks/docker_passthrough.py new file mode 100644 index 0000000..5ca5336 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_benchmarks/docker_passthrough.py @@ -0,0 +1,851 @@ +from absl import flags +from perfkitbenchmarker import configs, events, stages, vm_util, sample +from perfkitbenchmarker.linux_packages import k8s +from perfkitbenchmarker.linux_packages import skopeo +try: + from perfkitbenchmarker.linux_packages import archived_images +except: + archived_images = None +from perfkitbenchmarker.linux_packages import habana +from perfkitbenchmarker.linux_packages import runwith +from perfkitbenchmarker.linux_packages import docker_ce +from perfkitbenchmarker.linux_packages import INSTALL_DIR +from perfkitbenchmarker.linux_packages import docker_auth +from posixpath import join +from uuid import uuid4 +from yaml import safe_load_all, dump_all +import logging +import time +import os + +BENCHMARK_NAME = "docker_pt" +BENCHMARK_CONFIG = """ +docker_pt: + description: Docker Passthrough Benchmark + vm_groups: {} + flags: + dpt_docker_image: "" + dpt_docker_dataset: [] + dpt_docker_options: "" + dpt_kubernetes_yaml: "" + dpt_kubernetes_job: "" + dpt_namespace: "" + dpt_logs_dir: "" + dpt_timeout: "300" + dpt_name: "" + dpt_script_args: "" + dpt_cluster_yaml: "" + dpt_registry_map: [] + dpt_trace_mode: [] + dpt_params: "" + dpt_tunables: "" + dpt_debug: [] + dpt_vm_groups: "worker" + dpt_reuse_sut: false +""" + +run_seq = 0 +SUT_VM_CTR = "controller" +KUBERNETES_CONFIG = "kubernetes_config.yaml" +HUGEPAGE_NR = "/sys/kernel/mm/hugepages/hugepages-{}/nr_hugepages" +LOGS_TARFILE = "{}.tar" +ITERATION_DIR = "itr-{}" +KPISH = "kpi.sh" +SF_NS_LABEL = "cn-benchmarking.intel.com/sf_namespace=true" +EXPORT_LOGS_TARFILE = "~/export-logs.tar" +KUBELET_CONFIG = "kubelet-config.yaml" + +FLAGS = flags.FLAGS +flags.DEFINE_string("dpt_name", "", "Benchmark name") +flags.DEFINE_list("dpt_script_args", [], "The KPI and setup script args") +flags.DEFINE_string("dpt_docker_image", "", "Docker image name") +flags.DEFINE_list("dpt_docker_dataset", [], "Docker dataset images") +flags.DEFINE_string("dpt_docker_options", "", "Docker run options") +flags.DEFINE_string("dpt_kubernetes_yaml", "", "Kubernetes run yaml file") +flags.DEFINE_string("dpt_kubernetes_job", "benchmark", "Benchmark job name") +flags.DEFINE_string("dpt_namespace", str(uuid4()), "namespace") +flags.DEFINE_string("dpt_logs_dir", "", "The logs directory") +flags.DEFINE_string("dpt_timeout", "300", "Execution timeout") +flags.DEFINE_string("dpt_cluster_yaml", "", "The cluster configuration file") +flags.DEFINE_string("dpt_params", "", "The workload configuration parameters") +flags.DEFINE_string("dpt_tunables", "", "The workload tunable configuration parameters") +flags.DEFINE_list("dpt_registry_map", [], "Replace the registries") +flags.DEFINE_list("dpt_vm_groups", ["worker"], "Define the mapping of cluster-config groups to vm_groups") +flags.DEFINE_list("dpt_debug", [], "Set debug breakpoints") +flags.DEFINE_list("dpt_trace_mode", [], "Specify the trace mode triple") +flags.DEFINE_boolean("dpt_reuse_sut", False, "Enable when IWOS running in cloud") + + +def _GetTempDir(): + return join(INSTALL_DIR, FLAGS.dpt_namespace) + + 
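+# Invocation sketch (the values below are illustrative; the flag names match
+# the definitions above). A typical docker_pt run points PKB at the workload's
+# rendered Kubernetes manifest and a local logs directory:
+#
+#   ./pkb.py --benchmarks=docker_pt \
+#     --dpt_name="dummy workload" \
+#     --dpt_kubernetes_yaml=kubernetes-config.yaml \
+#     --dpt_kubernetes_job=benchmark \
+#     --dpt_logs_dir=/tmp/dpt-logs \
+#     --dpt_vm_groups=worker
+
+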
+def _MakeTempDir(vm): + tmp_dir = _GetTempDir() + vm.RemoteCommand("sudo mkdir -p {0} && bash -c 'sudo chown $(id -u):$(id -g) {0}'".format(tmp_dir)) + return tmp_dir + + +def _SetBreakPoint(breakpoint): + if breakpoint in FLAGS.dpt_debug: + try: + logging.info("Pause for debugging at %s", breakpoint) + while not os.path.exists(join(vm_util.GetTempDir(), "Resume" + breakpoint)): + time.sleep(5) + except: + pass + logging.info("Resume after debugging %s", breakpoint) + + +def _FormatKPI(line): + key, _, value = line.rpartition(":") + key = key.strip() + value = float(value.strip()) + if key.endswith(")"): + key, _, unit = key.rpartition("(") + unit = unit[0:-1].strip() + key = key.strip() + else: + unit = "-" + return key, value, unit + + +def _ParseKPI(metadata): + cmd = "cd {} && ./{} {}".format(join(FLAGS.dpt_logs_dir, ITERATION_DIR.format(run_seq)), KPISH, " ".join(FLAGS.dpt_script_args)) + stdout, _, retcode = vm_util.IssueCommand(["sh", "-c", cmd]) + + samples = [] + for line in stdout.split("\n"): + if line.startswith("##"): + k, _, v = line[2:].rpartition(":") + k = k.strip() + v = v.strip() + if k and v: + metadata[k]=v + else: + try: + k, v, u = _FormatKPI(line) + if k.startswith("*"): + samples.append(sample.Sample(k[1:], v, u, {"primary_sample": True})) + elif not k.startswith("#"): + samples.append(sample.Sample(k, v, u)) + except Exception: + pass + if len(samples) == 1: + samples[0].metadata["primary_sample"] = True + return samples + + +def GetConfig(user_config): + return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME) + + +def CheckPrerequisites(benchmark_config): + pass + + +def _ReplaceImage(image): + if FLAGS.dpt_registry_map: + if FLAGS.dpt_registry_map[0]: + return image.replace(FLAGS.dpt_registry_map[0], FLAGS.dpt_registry_map[1]) + return FLAGS.dpt_registry_map[1] + image + return image + + +def _WalkTo(node, name): + try: + if name in node: + return node + for item1 in node: + node1 = _WalkTo(node[item1], name) + if node1: + return node1 + except Exception: + pass + return None + + +def _GetNodes(controller0): + nodes = {} + stdout, _ = controller0.RemoteCommand( + "kubectl get nodes -o='custom-columns=name:.metadata.name,ip:.status.addresses[?(@.type==\"InternalIP\")].address' --no-headers") + for line in stdout.split("\n"): + fields = line.strip().split(" ") + if fields[-1]: + nodes[fields[-1]] = fields[0] + return nodes + + +def _WriteKubeletConfigFile(options): + kc = [{ + "apiVersion": "kubelet.config.k8s.io/v1beta1", + "kind": "KubeletConfiguration", + }] + kc[0].update(options) + + kcf = f"{FLAGS.dpt_logs_dir}/{KUBELET_CONFIG}" + with open(kcf, "w") as fd: + dump_all(kc, fd) + return kcf + + +def _ParseClusterConfigs(vm_groups, nodes={}): + vimages = [] + workers = [] + nidx = {} + options = [] + + with open(FLAGS.dpt_cluster_yaml, "rt") as fd: + for doc in safe_load_all(fd): + if "cluster" in doc: + for i, cluster1 in enumerate(doc["cluster"]): + name = FLAGS.dpt_vm_groups[i % len(FLAGS.dpt_vm_groups)] + if name not in nidx: + nidx[name] = 0 + worker1 = { + "vm": vm_groups[name][nidx[name] % len(vm_groups[name])], + "labels": cluster1["labels"], + } + worker1["labels"]["VM_GROUP_" + name.upper()] = "required" + internal_ip = worker1["vm"].internal_ip + try: + worker1["name"] = nodes[internal_ip] + except Exception: + worker1["name"] = internal_ip + nidx[name] = nidx[name] + 1 + workers.append(worker1) + if "vm" in doc: + for vim1 in doc["vm"]: + for vm10 in vm_groups[vim1["name"]]: + vimages.append({ + "name": vim1["name"], + "vm": vm10, 
+ "env": vim1["env"], + }) + if "setup" in vim1: + vimages[-1]["setup"] = vim1["setup"] + options.extend(["-e", "{}={}".format(vim1["env"], vm10.internal_ip)]) + if "kubernetes" in doc: + dock8s = doc["kubernetes"] + if "cni" in dock8s: + FLAGS.k8s_cni = dock8s["cni"] + if "cni-options" in dock8s: + FLAGS.k8s_cni_options = dock8s["cni-options"] + if "kubevirt" in dock8s: + FLAGS.k8s_kubevirt = dock8s["kubevirt"] + if "kubelet-options" in dock8s: + option = "--config=" + _WriteKubeletConfigFile(dock8s["kubelet-options"]) + if option not in FLAGS.k8s_kubeadm_options: + FLAGS.k8s_kubeadm_options.append(option) + + return workers, vimages, options + + +def _AddNodeAffinity(spec, workers): + if "affinity" not in spec: + spec["affinity"] = {} + + if "nodeAffinity" not in spec["affinity"]: + spec["affinity"]["nodeAffinity"] = {} + + if "requiredDuringSchedulingIgnoredDuringExecution" not in spec["affinity"]["nodeAffinity"]: + spec["affinity"]["nodeAffinity"]["requiredDuringSchedulingIgnoredDuringExecution"] = {} + + if "nodeSelectorTerms" not in spec["affinity"]["nodeAffinity"]["requiredDuringSchedulingIgnoredDuringExecution"]: + spec["affinity"]["nodeAffinity"]["requiredDuringSchedulingIgnoredDuringExecution"]["nodeSelectorTerms"] = [] + + spec["affinity"]["nodeAffinity"]["requiredDuringSchedulingIgnoredDuringExecution"]["nodeSelectorTerms"].append({ + "matchExpressions": [{ + "key": "kubernetes.io/hostname", + "operator": "In", + "values": [x["name"] for x in workers], + }], + }) + + +def _ModifyImageRef(spec, images, ifcloud): + for c1 in spec: + if c1["image"] in images: + c1["image"] = FLAGS.dpt_registry_map[1] + images[c1["image"]][0] + if ifcloud: + c1["imagePullPolicy"] = "IfNotPresent" + + +def _ScanImages(docs): + images = {} + for doc in docs: + for c1 in ["containers", "initContainers"]: + spec = _WalkTo(doc, c1) + if spec and spec[c1]: + for c2 in spec[c1]: + if "image" in c2: + images[c2["image"]] = 1 + + registry_url = FLAGS.dpt_registry_map[0] if FLAGS.dpt_registry_map else "" + priv_images = {} + for image in images: + attr = skopeo.InspectImage(image, registry_url) + if attr: + priv_images[image] = attr + return priv_images + + +def _ModifyEnvs(spec, vimages, workers): + if vimages: + for c1 in spec: + if "env" not in c1: + c1["env"] = [] + elif not c1["env"]: + c1["env"] = [] + for vm1 in vimages: + c1["env"].append({ + "name": vm1["env"], + "value": vm1["vm"].internal_ip, + }) + for c1 in spec: + if "env" in c1: + if isinstance(c1["env"], list): + for e1 in c1["env"]: + if "name" in e1 and "value" in e1: + if e1["name"] == "CLUSTER_WORKERS" and e1["value"] == "": + e1["value"] = ",".join([w["name"] for w in workers]) + + +def _PullImages(vm, images): + registry_url = FLAGS.dpt_registry_map[0] if FLAGS.dpt_registry_map else "" + priv_images = {} + for image in images: + attr = skopeo.InspectImage(image, registry_url) + if attr: + priv_images[image] = attr + if priv_images: + priv_images_remaining = archived_images.CopyImagesToDocker(vm, priv_images) if archived_images else priv_images + if priv_images_remaining: + skopeo.CopyImagesToDocker(vm, priv_images_remaining) + + +def _UpdateK8sConfig(controller0, workers, vimages): + with open(FLAGS.dpt_kubernetes_yaml, "rt") as fd: + docs = [d for d in safe_load_all(fd) if d] + + images = _ScanImages(docs) + if controller0.CLOUD != "Static" or FLAGS.install_packages: + nworkers = len(_UniqueVms(workers)) + worker1 = workers[0]["vm"] + if nworkers == 1 and docker_ce.IsDocker(worker1): + images_remaining = 
archived_images.CopyImagesToDocker(worker1, images) if archived_images else images + if images_remaining: + skopeo.CopyImagesToDocker(worker1, images_remaining) + elif not skopeo.IsSUTAccessible(FLAGS.dpt_registry_map[0]): + vms = _UniqueVms(workers, controller0) + registry_url = k8s.CreateRegistry(controller0, vms) + images_remaining = archived_images.CopyImagesToRegistry(controller0 if nworkers > 1 else worker1, images, registry_url, nworkers > 1) if archived_images else images + if images_remaining: + skopeo.CopyImagesToRegistry(controller0, images_remaining, registry_url) + FLAGS.dpt_registry_map[1] = f"{registry_url}/" + logging.info(f"SUT/Info: registry {FLAGS.dpt_registry_map[1]}") + + modified_docs = [] + for doc in docs: + modified_docs.append(doc) + + spec = _WalkTo(doc, "containers") + if spec and spec["containers"]: + _AddNodeAffinity(spec, workers) + _ModifyImageRef(spec["containers"], images, controller0.CLOUD != "Static" or FLAGS.dpt_reuse_sut or FLAGS.install_packages) + _ModifyEnvs(spec["containers"], vimages, workers) + + spec = _WalkTo(doc, "initContainers") + if spec and spec["initContainers"]: + _ModifyImageRef(spec["initContainers"], images, controller0.CLOUD != "Static" or FLAGS.dpt_reuse_sut or FLAGS.install_packages) + _ModifyEnvs(spec["initContainers"], vimages, workers) + + modified_filename = FLAGS.dpt_kubernetes_yaml + ".mod.yaml" + with open(modified_filename, "wt") as fd: + dump_all(modified_docs, fd) + + return modified_filename, images + + +def _ParseParams(params, metadata): + for kv in params.split(";"): + k, p, v = kv.partition(":") + try: + v = float(v.strip()) + except Exception: + pass + metadata[k.strip()] = v + + +@vm_util.Retry() +def _AddNodeLabels(controller0, workers, vm): + node = None + labels = {} + for worker in workers: + if worker["vm"] == vm: + for k in worker["labels"]: + if worker["labels"][k] == "required": + labels[k] = "yes" + node = worker["name"] + if labels and node: + cmd = ["kubectl", "label", "--overwrite", "node", node] + [k + "=" + labels[k] for k in labels] + controller0.RemoteCommand(" ".join(cmd)) + + +def _GetController(vm_groups): + if SUT_VM_CTR in vm_groups: + return vm_groups[SUT_VM_CTR][0] + return vm_groups[FLAGS.dpt_vm_groups[0]][-1] + + +def _GetWorkers(vm_groups): + workers = [] + for g1 in vm_groups: + if (g1 != SUT_VM_CTR) and (g1 in FLAGS.dpt_vm_groups): + for vm1 in vm_groups[g1]: + if vm1 not in workers: + workers.append(vm1) + return workers + + +def _SetupHugePages(workers, vm): + reqs = {} + for worker in workers: + if worker["vm"] == vm: + for k in worker["labels"]: + if k.startswith("HAS-SETUP-HUGEPAGE-") and worker["labels"][k] == "required": + req = k.split("-")[-2:] + if req[0] not in reqs: + reqs[req[0]] = 0 + if int(req[1]) > reqs[req[0]]: + reqs[req[0]] = int(req[1]) + + cmds = ["hugepagesz={} hugepages={}".format(sz.replace("B",""), reqs[sz]) for sz in reqs] + if cmds: + vm.AppendKernelCommandLine(" ".join(cmds), reboot=False) + vm._needs_reboot = True + + +def _ProbeModules(workers, vm): + modules = [] + for worker in workers: + if worker["vm"] == vm: + for k in worker["labels"]: + if k.startswith("HAS-SETUP-MODULE-") and worker["labels"][k] == "required": + modules.append(k.replace("HAS-SETUP-MODULE-", "").lower()) + if modules: + cmd = ["sudo", "modprobe"] + modules + vm.RemoteCommand(" ".join(cmd), ignore_failure=True, suppress_warning=True) + + +# We use Habana Gaudi AMI. Assume the driver is already installed. 
+# Just need to refresh docker or containerd configurations after +# a new installation (k8s). +def _SetupHabanaWorker(controller0, workers, vm): + for worker in workers: + if worker["vm"] == vm: + for k in worker["labels"]: + if k.startswith("HAS-SETUP-HABANA-") and worker["labels"][k] == "required": + vm.Install("habana") + if controller0: + habana.RegisterWithContainerD(vm) + else: + habana.RegisterWithDocker(vm) + return + + +# Use aws inferentia for ai workload, need to install neuron runtime driver in vm firstly. +def _SetupInferentiaWorker(controller0, workers, vm): + for worker in workers: + if worker["vm"] == vm: + for k in worker["labels"]: + if k.startswith("HAS-SETUP-INFERENTIA-") and worker["labels"][k] == "required": + vm.Install("inferentia") + return + + +# Use aws nvidia for ai workload, need to install cuda driver in vm firstly. +def _SetupGPUWorker(controller0, workers, vm): + for worker in workers: + if worker["vm"] == vm: + for k in worker["labels"]: + if k.startswith("HAS-SETUP-NVIDIA-") and worker["labels"][k] == "required": + vm.Install("nvidia_gpu") + return + + +def _SetupHabanaController(controller0, workers): + for worker in workers: + for k in worker["labels"]: + if k.startswith("HAS-SETUP-HABANA-") and worker["labels"][k] == "required": + habana.RegisterKubernetesPlugins(controller0) + return + + +def _SetupWorker(controller0, workers, vm): + for worker in workers: + if worker["vm"] == vm: + for k in worker["labels"]: + if k.startswith("HAS-SETUP-") and worker["labels"][k] == "required": + pass + + +def _UniqueVms(workers, controller0=None): + vms = [] + for worker in workers: + if worker["vm"] != controller0 and worker["vm"] not in vms: + vms.append(worker["vm"]) + return vms + + +def _PrepareWorker(controller0, workers, vm): + _SetupHugePages(workers, vm) + _ProbeModules(workers, vm) + _SetupHabanaWorker(controller0, workers, vm) + _SetupInferentiaWorker(controller0, workers, vm) + _SetupGPUWorker(controller0, workers, vm) + if controller0: + _AddNodeLabels(controller0, workers, vm) + vm._RebootIfNecessary() + + +def _PrepareVM(vimage): + if "setup" in vimage: + setup = vimage["setup"] + vm1 = vimage["vm"] + + while True: + try: + vm1.RemoteCommand("sudo mkdir -p /opt/pkb/vmsetup && sudo chown -R {} /opt/pkb".format(vm1.user_name)) + vm1.PushFile("{}/{}.tgz".format(FLAGS.dpt_logs_dir, setup), "/opt/pkb") + vm1.RemoteCommand("cd /opt/pkb/vmsetup && sudo tar xfz ../{}.tgz && sudo ./setup.sh {}".format(setup, " ".join(FLAGS.dpt_script_args))) + break + except Exception as e: + logging.warning("VM Setup Exception: %s", str(e)) + vm1.RemoteCommand("sleep 10s", ignore_failure=True) + vm1.WaitForBootCompletion() + + +def _SaveConfigFiles(spec): + spec.s3_reports.append((FLAGS.dpt_cluster_yaml, 'text/yaml')) + cumulus_config_file = join(FLAGS.dpt_logs_dir, "cumulus-config.yaml") + spec.s3_reports.append((cumulus_config_file, 'text/yaml')) + + if FLAGS.dpt_kubernetes_yaml: + spec.s3_reports.append((FLAGS.dpt_kubernetes_yaml, 'text/yaml')) + + test_config_file = join(FLAGS.dpt_logs_dir, "test-config.yaml") + if os.path.exists(test_config_file): + spec.s3_reports.append((test_config_file, 'text/yaml')) + + +def Prepare(benchmark_spec): + _SetBreakPoint("PrepareStage") + + benchmark_spec.name = FLAGS.dpt_name.replace(" ", "_").lower() + benchmark_spec.workload_name = FLAGS.dpt_name + benchmark_spec.sut_vm_group = FLAGS.dpt_vm_groups[0] + benchmark_spec.always_call_cleanup = True + + benchmark_spec.control_traces = True + FLAGS.dpt_trace_mode = [x.strip() for x in 
FLAGS.dpt_trace_mode] + + _SaveConfigFiles(benchmark_spec) + _ParseParams(FLAGS.dpt_params, benchmark_spec.software_config_metadata) + _ParseParams(FLAGS.dpt_tunables, benchmark_spec.tunable_parameters_metadata) + + # export SUT Instrumentation + for group1 in benchmark_spec.vm_groups: + for worker1 in benchmark_spec.vm_groups[group1]: + logging.info(f"SUT/Info: {group1} {worker1.ip_address} {worker1.internal_ip} {worker1.OS_TYPE}") + + if FLAGS.dpt_docker_image: + controller0 = _GetWorkers(benchmark_spec.vm_groups)[0] + tmp_dir = _MakeTempDir(controller0) + if controller0.CLOUD != "Static" or FLAGS.install_packages: + controller0.Install('docker_ce') + docker_auth.CopyDockerConfig(controller0) + _PullImages(controller0, [FLAGS.dpt_docker_image] + FLAGS.dpt_docker_dataset) + + workers, vimages, options = _ParseClusterConfigs(benchmark_spec.vm_groups) + FLAGS.dpt_docker_options = " ".join(FLAGS.dpt_docker_options.split(" ") + options) + _PrepareWorker(None, workers, controller0) + if controller0.CLOUD != "Static" or FLAGS.install_packages: + _SetBreakPoint("SetupVM") + vm_util.RunThreaded(lambda vim1: _PrepareVM(vim1), vimages) + + if FLAGS.dpt_kubernetes_yaml: + controller0 = _GetController(benchmark_spec.vm_groups) + tmp_dir = _MakeTempDir(controller0) + + if controller0.CLOUD != "Static" or FLAGS.install_packages: + _ParseClusterConfigs(benchmark_spec.vm_groups) + workers = [vm1 for vm1 in _GetWorkers(benchmark_spec.vm_groups) if vm1 != controller0] + taint = SUT_VM_CTR in benchmark_spec.vm_groups + k8s.CreateCluster(controller0, workers, taint) + docker_auth.CopyDockerConfig(controller0) + + nodes = _GetNodes(controller0) + workers, vimages, _ = _ParseClusterConfigs(benchmark_spec.vm_groups, nodes) + k8s_config_yaml, images = _UpdateK8sConfig(controller0, workers, vimages) + + if controller0.CLOUD != "Static" or FLAGS.install_packages: + vm_util.RunThreaded(lambda vm1: _PrepareWorker(controller0, workers, vm1), _UniqueVms(workers)) + _SetupHabanaController(controller0, workers) + _SetBreakPoint("SetupVM") + vm_util.RunThreaded(lambda vim1: _PrepareVM(vim1), vimages) + + remote_yaml_file = join(tmp_dir, KUBERNETES_CONFIG) + controller0.PushFile(k8s_config_yaml, remote_yaml_file) + + kpish = f"{FLAGS.dpt_logs_dir}/{KPISH}" + for i in range(1, FLAGS.run_stage_iterations + 1): + local_logs_dir = join(FLAGS.dpt_logs_dir, ITERATION_DIR.format(i)) + vm_util.IssueCommand(["mkdir", "-p", local_logs_dir]) + vm_util.IssueCommand(["cp", "-f", kpish, local_logs_dir]) + + with open(kpish) as fd: + test_string = f"cd {ITERATION_DIR.format(1)}" + if test_string not in fd.read(): + vm_util.IssueCommand(["sh", "-c", "sed -i '1a[ -d {} ] && cd {}' {}". 
+ format(ITERATION_DIR.format(1), ITERATION_DIR.format(1), kpish)]) + + +def _PullExtractLogs(controller0, pods, remote_logs_dir): + estr = None + for pod1 in pods: + remote_logs_tarfile = join(remote_logs_dir, LOGS_TARFILE.format(pod1)) + local_logs_dir = join(FLAGS.dpt_logs_dir, join(ITERATION_DIR.format(run_seq), f"{pod1}")) + local_logs_tarfile = join(local_logs_dir, LOGS_TARFILE.format(pod1)) + try: + vm_util.IssueCommand(["mkdir", "-p", local_logs_dir]) + controller0.PullFile(local_logs_tarfile, remote_logs_tarfile) + vm_util.IssueCommand(["tar", "xf", local_logs_tarfile, "-C", local_logs_dir]) + vm_util.IssueCommand(["rm", "-f", local_logs_tarfile]) + except Exception as e: + estr = str(e) + if estr: + raise Exception("ExtractLogs Exception: " + estr) + + +def _TraceByTime(benchmark_spec, controller0): + controller0.RemoteCommand(f"sleep {FLAGS.dpt_trace_mode[1]}s", ignore_failure=True) + events.start_trace.send(stages.RUN, benchmark_spec=benchmark_spec) + controller0.RemoteCommand(f"sleep {FLAGS.dpt_trace_mode[2]}s", ignore_failure=True) + events.stop_trace.send(stages.RUN, benchmark_spec=benchmark_spec) + + +def _TraceByROI(benchmark_spec, controller0, timeout, cmds): + _, _, status = controller0.RemoteCommandWithReturnCode("timeout {}s bash -c 'while true; do ({}) | grep -q -F \"{}\" && exit 0 || sleep 1s; done'".format(timeout, cmds, FLAGS.dpt_trace_mode[1]), ignore_failure=True) + if status == 0: + events.start_trace.send(stages.RUN, benchmark_spec=benchmark_spec) + controller0.RemoteCommand("timeout {}s bash -c 'while true; do ({}) | grep -q -F \"{}\" && exit 0 || sleep 1s; done'".format(timeout, cmds, FLAGS.dpt_trace_mode[2]), ignore_failure=True) + events.stop_trace.send(stages.RUN, benchmark_spec=benchmark_spec) + + +@vm_util.Retry() +def _RobustGetLogs(vm, pod1, container, remote_logs_tarfile): + # copy with tarball validity check + vm.RemoteCommand(f"kubectl exec --namespace={FLAGS.dpt_namespace} -c {container} {pod1} -- sh -c \"cat {EXPORT_LOGS_TARFILE}\" > {remote_logs_tarfile} && tar xf {remote_logs_tarfile} -O > /dev/null") + + +def Run(benchmark_spec): + global run_seq + run_seq = run_seq + 1 + + _SetBreakPoint("RunStage") + + for vm in benchmark_spec.vms: + thread_count = vm.num_cpus + logging.debug(f"VM thread count: {thread_count}") + + tmp_dir = _GetTempDir() + timeout = list(map(int,FLAGS.dpt_timeout.split(","))) + if len(timeout)<2: + timeout.append(timeout[0]) + if len(timeout)<3: + timeout.append(timeout[0]/2) + + pull_logs = False + if FLAGS.dpt_docker_image: + controller0 = _GetWorkers(benchmark_spec.vm_groups)[0] + + options = FLAGS.dpt_docker_options.split(' ') + if controller0.CLOUD == "Static" and not FLAGS.dpt_reuse_sut and not FLAGS.install_packages and FLAGS.dpt_registry_map[0]: + options.extend(["--pull", "always"]) + + containers = [] + options1 = "--pull always" if controller0.CLOUD == "Static" and not FLAGS.install_packages and FLAGS.dpt_registry_map[0] else "" + for image1 in FLAGS.dpt_docker_dataset: + stdout, _ = controller0.RemoteCommand("sudo -E docker create {} {} -".format(options1, _ReplaceImage(image1))) + container_id = stdout.strip() + containers.append(container_id) + options.extend(["--volumes-from", container_id]) + + _SetBreakPoint("ScheduleExec") + container_id, pid = runwith.DockerRun(controller0, options, _ReplaceImage(FLAGS.dpt_docker_image)) + + if events.start_trace.receivers: + try: + if not FLAGS.dpt_trace_mode: + events.start_trace.send(stages.RUN, benchmark_spec=benchmark_spec) + + elif FLAGS.dpt_trace_mode[0] == "roi": + 
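+            # ROI tracing: _TraceByROI polls the container logs for the start phrase (dpt_trace_mode[1]), starts the trace, then waits for the stop phrase (dpt_trace_mode[2]).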
_TraceByROI(benchmark_spec, controller0, timeout[2], + runwith.DockerLogsCmd(container_id)) + + elif FLAGS.dpt_trace_mode[0] == "time": + _TraceByTime(benchmark_spec, controller0) + except Exception as e: + logging.warning("Trace Exception: %s", str(e)) + _SetBreakPoint("TraceFailed") + + pods = [container_id] + try: + _SetBreakPoint("ExtractLogs") + runwith.DockerWaitForCompletion(controller0, container_id, timeout[0], join(tmp_dir, LOGS_TARFILE.format(container_id))) + pull_logs = True + except Exception as e: + logging.fatal("ExtractLogs Exception: %s", str(e)) + _SetBreakPoint("ExtractLogsFailed") + + if events.start_trace.receivers and (not FLAGS.dpt_trace_mode): + try: + events.stop_trace.send(stages.RUN, benchmark_spec=benchmark_spec) + except: + pass + + controller0.RemoteCommand(runwith.DockerLogsCmd(container_id), + ignore_failure=True, should_log=True) + runwith.DockerRemove(controller0, containers, container_id, pid) + + if FLAGS.dpt_kubernetes_yaml: + controller0 = _GetController(benchmark_spec.vm_groups) + remote_yaml_file = join(tmp_dir, KUBERNETES_CONFIG) + + _SetBreakPoint("ScheduleExec") + controller0.RemoteCommand(f"kubectl create namespace {FLAGS.dpt_namespace}") + controller0.RemoteCommand(f"kubectl label namespace {FLAGS.dpt_namespace} {SF_NS_LABEL}") + docker_auth.InstallImagePullSecret(controller0, FLAGS.dpt_namespace) + try: + controller0.RemoteCommand(f"kubectl create --namespace={FLAGS.dpt_namespace} -f {remote_yaml_file}") + + try: + controller0.RemoteCommand("timeout {1}s bash -c 'q=0;until kubectl --namespace={0} wait pod --all --for=condition=Ready --timeout=1s 1>/dev/null 2>&1; do if kubectl --namespace={0} get pod -o json | grep -q Unschedulable; then q=1; break; fi; done; exit $q'".format(FLAGS.dpt_namespace, timeout[1])) + + pods, _ = controller0.RemoteCommand("kubectl get --namespace=" + FLAGS.dpt_namespace + " pod --selector=" + FLAGS.dpt_kubernetes_job + " '-o=jsonpath={.items[*].metadata.name}'") + pods = pods.strip(" \t\n").split(" ") + container = FLAGS.dpt_kubernetes_job.rpartition("=")[2] + + if events.start_trace.receivers: + try: + if not FLAGS.dpt_trace_mode: + events.start_trace.send(stages.RUN, benchmark_spec=benchmark_spec) + + elif FLAGS.dpt_trace_mode[0] == "roi": + cmds = [] + for pod1 in pods: + cmds.append(f"kubectl logs --ignore-errors --prefix=false {pod1} -c {container} --namespace={FLAGS.dpt_namespace}") + _TraceByROI(benchmark_spec, controller0, timeout[2], ";".join(cmds)) + + elif FLAGS.dpt_trace_mode[0] == "time": + _TraceByTime(benchmark_spec, controller0) + except Exception as e: + logging.warning("Trace Exception: %s", str(e)) + _SetBreakPoint("TraceFailed") + + cmds = [] + for pod1 in pods: + cmds.append(f"kubectl exec --namespace={FLAGS.dpt_namespace} {pod1} -c {container} -- sh -c \"cat /export-logs > {EXPORT_LOGS_TARFILE}\";x=$?;test $x -ne 0 && r=$x") + + try: + _SetBreakPoint("ExtractLogs") + controller0.RemoteCommand("timeout {}s bash -c 'r=0;{};exit $r'".format(timeout[0], ";".join(cmds))) + + for pod1 in pods: + remote_logs_tarfile = join(tmp_dir, LOGS_TARFILE.format(pod1)) + _RobustGetLogs(controller0, pod1, container, remote_logs_tarfile) + + pull_logs = True + except Exception as e: + logging.fatal("ExtractLogs Exception: %s", str(e)) + _SetBreakPoint("ExtractLogsFailed") + + if events.start_trace.receivers and (not FLAGS.dpt_trace_mode): + try: + events.stop_trace.send(stages.RUN, benchmark_spec=benchmark_spec) + except: + pass + + except Exception as e: + logging.fatal("Schedule Exception: %s", str(e)) + 
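+        # Scheduling failed: record the breakpoint, then dump node/pod descriptions and all container logs below to aid debugging.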
_SetBreakPoint("ScheduleExecFailed") + + controller0.RemoteCommand(f"kubectl describe node --namespace={FLAGS.dpt_namespace}", ignore_failure=True, should_log=True) + controller0.RemoteCommand(f"kubectl describe pod --namespace={FLAGS.dpt_namespace}", ignore_failure=True, should_log=True) + controller0.RemoteCommand("bash -c 'for p in $(kubectl get pod -n {0} --no-headers -o custom-columns=:metadata.name);do echo \"pod $p:\";kubectl -n {0} logs --all-containers=true $p;done'".format(FLAGS.dpt_namespace), ignore_failure=True, should_log=True) + + except Exception as e: + logging.fatal("Failed to deploy test: %s", str(e)) + _SetBreakPoint("ScheduleExecFailed") + + if (controller0.CLOUD == "Static" and not FLAGS.install_packages) or (run_seq < FLAGS.run_stage_iterations): + controller0.RemoteCommand(f"kubectl delete --namespace={FLAGS.dpt_namespace} -f {remote_yaml_file} --ignore-not-found=true", ignore_failure=True) + try: + controller0.RemoteCommand(f"timeout 120s kubectl delete namespace {FLAGS.dpt_namespace} --timeout=0 --wait --ignore-not-found=true") + except: + # force namespace removal + controller0.RemoteCommand("bash -c 'kubectl replace --raw \"/api/v1/namespaces/{0}/finalize\" -f <(kubectl get ns {0} -o json | grep -v \"\\\"kubernetes\\\"\")'".format(FLAGS.dpt_namespace), ignore_failure=True) + + nodes = _GetNodes(controller0) + workers, _, _ = _ParseClusterConfigs(benchmark_spec.vm_groups, nodes) + vm_util.RunThreaded(lambda vm1: vm1.Reboot(), _UniqueVms(workers)) + controller0.RemoteCommand("kubectl wait --for=condition=Ready nodes --all") + + _SetBreakPoint("ExtractKPI") + + # pull the logs tarfile back + samples = [] + if pull_logs: + _PullExtractLogs(controller0, pods, tmp_dir) + samples = _ParseKPI(benchmark_spec.tunable_parameters_metadata) + + if not samples: + _SetBreakPoint("ExtractKPIFailed") + raise Exception("KPI Exception: No KPI data") + + return samples + + +def Cleanup(benchmark_spec): + _SetBreakPoint("CleanupStage") + tmp_dir = _GetTempDir() + + _, vimages, _ = _ParseClusterConfigs(benchmark_spec.vm_groups) + for i, vim1 in enumerate(vimages): + if "setup" in vim1: + local_dir = "{}/{}/{}".format(FLAGS.dpt_logs_dir, vim1["name"], i) + vm_util.IssueCommand(["mkdir", "-p", local_dir]) + try: + vm1 = vim1["vm"] + vm1.RemoteCommand("cd /opt/pkb/vmsetup && sudo ./cleanup.sh {}".format(" ".join(FLAGS.dpt_script_args))) + vm1.PullFile("{}/vmlogs.tgz".format(local_dir), "/opt/pkb/vmsetup/vmlogs.tgz") + except Exception as e: + logging.warning("Cleanup Exception: %s", str(e)) + + if FLAGS.dpt_docker_image: + controller0 = _GetWorkers(benchmark_spec.vm_groups)[0] + + if FLAGS.dpt_kubernetes_yaml: + controller0 = _GetController(benchmark_spec.vm_groups) + + # cleanup containers + if controller0.CLOUD == "Static" and not FLAGS.install_packages: + controller0.RemoteCommand(f"sudo rm -rf '{tmp_dir}'", ignore_failure=True) + + +def GetCmdLine(): + tcase = FLAGS.dpt_tunables[FLAGS.dpt_tunables.index(";testcase:")+10:] + tconfig = "TEST_CONFIG=$(pwd)/test-config.yaml " if os.path.exists("test-config.yaml") else "" + return f"{tconfig}ctest -R '^{tcase}$' -V" + diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/__init__.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/__init__.py new file mode 100644 index 0000000..9a53e22 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/__init__.py @@ -0,0 +1,68 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains package imports and a dictionary of package names and modules. + +All modules within this package are considered packages, and are loaded +dynamically. Add non-package code to other packages. + +Packages should, at a minimum, define install functions for each type of +package manager (e.g. YumInstall(vm) and AptInstall(vm)). +They may also define functions that return the path to a configuration file +(e.g. AptGetPathToConfig(vm)) and functions that return the linux service +name (e.g. YumGetServiceName(vm)). If the package only installs +packages through the package manager, then it does not need to define an +uninstall function. If the package manually places files in other locations +(e.g. /usr/bin, /opt/pkb), then it also needs to define uninstall functions +(e.g. YumUninstall(vm)). + +Package installation should persist across reboots. + +All functions in each package module should be prefixed with the type of package +manager, and all functions should accept a BaseVirtualMachine object as their +only arguments. + +See perfkitbenchmarker/package_managers.py for more information on how to use +packages in benchmarks. +""" + +import os +from perfkitbenchmarker import import_util + + +# Place to install stuff. Persists across reboots. +INSTALL_DIR = '/opt/pkb' + + +def _LoadPackages(): + packages = dict([(module.__name__.split('.')[-1], module) for module in + import_util.LoadModulesForPath(__path__, __name__)]) + #packages.update(packages['docker'].CreateImagePackages()) + return packages + + +PACKAGES = _LoadPackages() + + +def GetPipPackageVersion(vm, package_name): + """This function returns the version of a pip package installed on a vm. + + Args: + vm: the VM the package is installed on. + package_name: the name of the package. + + Returns: + The version string of the package. + """ + version, _ = vm.RemoteCommand('pip3 show %s |grep Version' % package_name) + return version.strip() diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/aws_credentials.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/aws_credentials.py new file mode 100644 index 0000000..42f9260 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/aws_credentials.py @@ -0,0 +1,163 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module containing AWS credential file installation and cleanup helpers. 
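+
+The --aws_credentials_local_path and --aws_credentials_remote_path flags
+control where the credential files are read on the PKB host and where they are
+written on the remote VM.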
+ +AWS credentials consist of a secret access key and its ID, stored in a single +file. Following PKB's AWS setup instructions (see +https://github.com/GoogleCloudPlatform/PerfKitBenchmarker#install-aws-cli-and-setup-authentication), +the default location of the file will be at ~/.aws/credentials + +This package copies the credentials file to the remote VM to make them available +for calls from the VM to other AWS services, such as SQS or Kinesis. +""" + +import configparser +import logging +import os +from absl import flags +from perfkitbenchmarker import data +from perfkitbenchmarker import errors +from subprocess import check_output + + +FLAGS = flags.FLAGS + +flags.DEFINE_string( + 'aws_credentials_local_path', os.path.join('~', '.aws'), + 'Path where the AWS credential files can be found on the local machine.') + +flags.DEFINE_string( + 'aws_credentials_remote_path', '.aws', + 'Path where the AWS credential files will be written on remote machines.') + +flags.DEFINE_boolean( + 'aws_credentials_overwrite', False, + 'When set, if an AWS credential file already exists at the destination ' + 'specified by --aws_credentials_remote_path, it will be overwritten during ' + 'AWS credential file installation.') +flags.DEFINE_string('aws_s3_region', None, 'Region for the S3 bucket') + + +def _GetLocalPath(): + """Gets the expanded local path of the credential files. + + Returns: + string. Path to the credential files on the local machine. + """ + return os.path.expanduser(FLAGS.aws_credentials_local_path) + + +def GetCredentials(credentials_file_name='credentials'): + """Gets the credentials from the local credential file. + + AWS credentials file is expected to be called 'credentials'. + AWS credentials file looks like this, and ends with a newline: + [default] + aws_access_key_id = {access_key} + aws_secret_access_key = {secret_access_key} + + Args: + credentials_file_name: String name of the file containing the credentials. + + Returns: + A string, string tuple of access_key and secret_access_key + """ + config = configparser.ConfigParser() + config.read(os.path.join(_GetLocalPath(), credentials_file_name)) + key_id = config['default']['aws_access_key_id'] + key = config['default']['aws_secret_access_key'] + return key_id, key + + +def CheckPrerequisites(): + """Verifies that the required resources are present. + + Raises: + perfkitbenchmarker.data.ResourceNotFound: On missing resource. + """ + local_path = _GetLocalPath() + if not os.path.exists(local_path): + raise data.ResourceNotFound( + 'AWS credential files were not found at {0}'.format(local_path)) + + +def _IsCopyNeeded(vm, local_path, remote_path): + # return False: if the username, file_path, IP (or localhost) are the same for local and remote + local_ips = check_output(['hostname', '--all-ip-addresses']) + local_ip_list_str = local_ips.decode('utf-8') + remote_ip = vm.GetConnectionIp() + if remote_path == '.aws': + full_remote_path = '/home/{}/{}'.format(vm.user_name, remote_path) + else: + full_remote_path = remote_path + local_user = os.environ.get('USER') or os.environ.get('USERNAME') or '' + # remote_ip is the same as one of the local IP or localhost + if remote_ip in local_ip_list_str or remote_ip == 'localhost': + # same user and same path, do NOT copy + if local_user == vm.user_name and local_path == full_remote_path: + return False + return True + + +def Install(vm): + """Copies credential files to the specified VM. + + Args: + vm: BaseVirtualMachine. VM that receives the credential files. 
+ + Raises: + errors.Error: If the file destination on the VM already exists, and the + overwrite behavior is not specified via --aws_credentials_overwrite. + """ + local_path = _GetLocalPath() + remote_path = FLAGS.aws_credentials_remote_path + overwrite = FLAGS.aws_credentials_overwrite + try: + vm.RemoteCommand('[[ ! -e {0} ]]'.format(remote_path)) + except errors.VirtualMachine.RemoteCommandError: + err_msg = 'File {0} already exists on VM {1}.'.format(remote_path, vm) + if overwrite: + logging.info('%s Overwriting.', err_msg) + else: + raise errors.Error(err_msg) + remote_dir = os.path.dirname(remote_path) + if remote_dir: + vm.RemoteCommand('mkdir -p {0}'.format(remote_dir)) + if _IsCopyNeeded(vm, local_path, remote_path): + if FLAGS.enable_rsync: + vm.PushFile(f"{local_path}/", remote_path) + else: + vm.PushFile(local_path, remote_path) + + +def Uninstall(vm): + """Deletes the credential files from the specified VM. + + Args: + vm: BaseVirtualMachine. VM that has the credential files. + """ + vm.RemoveFile(FLAGS.aws_credentials_remote_path) + + +def IsInstalled(vm): + """Checks whether aws credentials is installed on the VM.""" + _, _, retVal = vm.RemoteCommandWithReturnCode('test -d {0}'.format(FLAGS.aws_credentials_remote_path), + ignore_failure=True, + suppress_warning=True) + # It is not installed + if retVal != 0: + return False + + return True diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/awscliv2.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/awscliv2.py new file mode 100644 index 0000000..d47dc05 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/awscliv2.py @@ -0,0 +1,37 @@ +# Copyright 2016 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Package for installing the AWS CLI.""" + +AWSCLI_URL_FMT = "https://awscli.amazonaws.com/awscli-exe-linux-{arch}.zip" +AWSCLI_ZIP = "awscliv2.zip" + + +def Install(vm): + """Installs the awscli package on the VM.""" + uname = vm.RemoteCommand('uname -m')[0].strip() + if uname != 'x86_64' and uname != 'aarch64': + raise NotImplementedError("unsupported architecture: {}".format(uname)) + + vm.InstallPackages("unzip") + cli_url = AWSCLI_URL_FMT.format(arch=uname) + vm.RemoteCommand(f"curl {cli_url} -o {AWSCLI_ZIP} && unzip {AWSCLI_ZIP}") + vm.RemoteCommand("sudo ./aws/install") + # Clean up unused files + vm.RemoteCommand(f"rm -rf aws {AWSCLI_ZIP}") + + +def Uninstall(vm): + vm.RemoteCommand('sudo rm -rf /usr/local/aws-cli') + vm.RemoteCommand('sudo rm /usr/local/bin/aws') diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/azure_credentials.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/azure_credentials.py new file mode 100644 index 0000000..6e03012 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/azure_credentials.py @@ -0,0 +1,72 @@ +# Copyright 2016 PerfKitBenchmarker Authors. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Package for installing the Azure MSAL credentials for the azure-cli.""" + +import os + +from absl import logging +from packaging import version + +from perfkitbenchmarker import errors +from perfkitbenchmarker import object_storage_service +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers import azure + +AZURE_CREDENTIAL_DIRECTORY = os.path.join('~', '.azure') +# Necessary for user login with MSAL +TOKENS_FILE = os.path.join(AZURE_CREDENTIAL_DIRECTORY, 'msal_token_cache.json') +# Necessary for service_principal login with MSAL +SERVICE_PRINCIPAL_FILE = os.path.join(AZURE_CREDENTIAL_DIRECTORY, + 'service_principal_entries.json') +# Determines which login is active +PROFILE_FILE = os.path.join(AZURE_CREDENTIAL_DIRECTORY, 'azureProfile.json') + +# See https://docs.microsoft.com/en-us/cli/azure/msal-based-azure-cli +_REQUIRED_MSAL_CLI_VERSION = '2.30.0' + + +def Install(vm): + """Copies Azure credentials to the VM.""" + # Validate local azure-cli uses MSAL + stdout, _, _ = vm_util.IssueCommand( + [azure.AZURE_PATH, 'version', '--query', '"azure-cli"']) + az_version = version.Version(stdout.strip('"\n')) + if az_version < version.Version(_REQUIRED_MSAL_CLI_VERSION): + raise errors.Benchmarks.MissingObjectCredentialException( + f'Local Azure CLI version must be at least {_REQUIRED_MSAL_CLI_VERSION}' + f' to copy credentials into a VM. Found version {az_version}. ' + 'The recent CLI on the VM will not be able to use your credentials.') + + # Install CLI to validate credentials + vm.Install('azure_cli') + + # Copy credentials to VM + vm.RemoteCommand('mkdir -p {0}'.format(AZURE_CREDENTIAL_DIRECTORY)) + vm.PushFile( + object_storage_service.FindCredentialFile(PROFILE_FILE), PROFILE_FILE) + for file in [SERVICE_PRINCIPAL_FILE, TOKENS_FILE]: + try: + vm.PushFile(object_storage_service.FindCredentialFile(file), file) + except errors.Benchmarks.MissingObjectCredentialException: + logging.info('Optional service account file %s not found.', file) + + # Validate azure-cli is now authenticating correctly. + try: + # This token is not used, it is simply used to prove that the CLI on the VM + # is authenticated. + vm.RemoteCommand('az account get-access-token') + except errors.VirtualMachine.RemoteExceptionError as e: + raise errors.Benchmarks.MissingObjectCredentialException( + 'Failed to install azure_credentials on VM.') from e diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/build_tools.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/build_tools.py new file mode 100644 index 0000000..a515e78 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/build_tools.py @@ -0,0 +1,146 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing build tools installation and cleanup functions.""" +import logging +from absl import flags + +FLAGS = flags.FLAGS +flags.DEFINE_string( + 'gcc_version', None, 'Version of gcc to use. Benchmarks ' + 'that utilize gcc compilation should ensure reinstallation ' + 'of GCC. Default is set by the OS package manager.') +flags.DEFINE_boolean('force_build_gcc_from_source', False, 'Whether to force ' + 'building GCC from source.') +flags.DEFINE_boolean( + 'build_fortran', False, 'Whether to build fortran ' + 'alongside c and c++ when building GCC.') + +GCC_TAR = 'gcc-{version}.tar.gz' +GCC_URL = 'https://ftp.gnu.org/gnu/gcc/gcc-{version}/' + GCC_TAR +PREPROVISIONED_DATA = { + GCC_TAR.format(version='9.2.0'): + 'a931a750d6feadacbeecb321d73925cd5ebb6dfa7eff0802984af3aef63759f4' +} +PACKAGE_DATA_URL = { + GCC_TAR.format(version='9.2.0'): GCC_URL.format(version='9.2.0') +} + + +def YumInstall(vm): + """Installs build tools on the VM.""" + vm.InstallPackageGroup('Development Tools') + if FLAGS.gcc_version: + Reinstall(vm, version=FLAGS.gcc_version) + + +def AptInstall(vm): + """Installs build tools on the VM.""" + vm.InstallPackages('build-essential git libtool autoconf automake') + if FLAGS.gcc_version: + Reinstall(vm, version=FLAGS.gcc_version) + + +def BuildGccFromSource(vm, gcc_version): + """Install a specific version of gcc by compiling from source. + + Args: + vm: VirtualMachine object. + gcc_version: string. GCC version. + Taken from: https://gist.github.com/nchaigne/ad06bc867f911a3c0d32939f1e930a11 + """ + if gcc_version == '9' or gcc_version == '9.2': + gcc_version = '9.2.0' + logging.info('Compiling GCC %s', gcc_version) + + # build GCC on scratch disks for speed if possible + build_dir = vm.GetScratchDir() if vm.scratch_disks else '~/' + gcc_tar = GCC_TAR.format(version=gcc_version) + if gcc_tar in PREPROVISIONED_DATA: + vm.InstallPreprovisionedPackageData('build_tools', + PREPROVISIONED_DATA.keys(), build_dir) + else: + vm.RemoteCommand(f'cd {build_dir} && ' + f'wget {GCC_URL.format(version=gcc_version)}') + vm.RemoteCommand(f'cd {build_dir} && tar xzvf {gcc_tar}') + vm.RemoteCommand(f'cd {build_dir} && mkdir -p obj.gcc-{gcc_version}') + vm.RemoteCommand(f'cd {build_dir}/gcc-{gcc_version} && ' + './contrib/download_prerequisites') + enable_languages = 'c,c++' + (',fortran' if FLAGS.build_fortran else '') + vm.RemoteCommand(f'cd {build_dir}/obj.gcc-{gcc_version} && ' + f'../gcc-{gcc_version}/configure ' + f'--disable-multilib --enable-languages={enable_languages}') + # TODO(user): Measure GCC compilation time as a benchmark. 
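+  # Build with every CPU available to the benchmark; "time" records how long the compile takes in the run log.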
+ vm.RemoteCommand(f'cd {build_dir}/obj.gcc-{gcc_version} && ' + f'time make -j {vm.NumCpusForBenchmark()}') + vm.RemoteCommand(f'cd {build_dir}/obj.gcc-{gcc_version} && sudo make install') + vm.RemoteCommand('sudo rm -rf /usr/bin/gcc && ' + 'sudo ln -s /usr/local/bin/gcc /usr/bin/gcc') + vm.RemoteCommand('sudo rm -rf /usr/bin/g++ && ' + 'sudo ln -s /usr/local/bin/g++ /usr/bin/g++') + if FLAGS.build_fortran: + vm.RemoteCommand('sudo rm -rf /usr/bin/gfortran && ' + 'sudo ln -s /usr/local/bin/gfortran /usr/bin/gfortran') + + if '11' in gcc_version: + # https://stackoverflow.com/a/65384705 + vm.RemoteCommand( + f'sudo cp {build_dir}/obj.gcc-{gcc_version}/x86_64-pc-linux-gnu/' + 'libstdc++-v3/src/.libs/* /usr/lib/x86_64-linux-gnu/', + ignore_failure=True) + vm.RemoteCommand( + f'sudo cp {build_dir}/obj.gcc-{gcc_version}/aarch64-unknown-linux-gnu/' + 'libstdc++-v3/src/.libs/* /usr/lib/aarch64-linux-gnu/', + ignore_failure=True) + + +def GetVersion(vm, pkg): + """Get version of package using -dumpversion.""" + out, _ = vm.RemoteCommand( + '{pkg} -dumpversion'.format(pkg=pkg), ignore_failure=True) + return out.rstrip() + + +def GetVersionInfo(vm, pkg): + """Get compiler version info for package using --version.""" + out, _ = vm.RemoteCommand( + '{pkg} --version'.format(pkg=pkg), ignore_failure=True) + # return first line of pkg --version + return out.splitlines()[0] if out else None + + +def Reinstall(vm, version: str): + """Install specific version of gcc. + + Args: + vm: VirtualMachine object. + version: string. GCC version. + """ + if 'ubuntu' not in vm.OS_TYPE or FLAGS.force_build_gcc_from_source: + BuildGccFromSource(vm, version) + logging.info('GCC info: %s', GetVersion(vm, 'gcc')) + return + vm.Install('ubuntu_toolchain') + for pkg in ('gcc', 'gfortran', 'g++'): + version_string = GetVersion(vm, pkg) + if version in version_string: + logging.info('Have expected version of %s: %s', pkg, version_string) + continue + else: + new_pkg = pkg + '-' + version + vm.InstallPackages(new_pkg) + vm.RemoteCommand('sudo rm -f /usr/bin/{pkg}'.format(pkg=pkg)) + vm.RemoteCommand('sudo ln -s /usr/bin/{new_pkg} /usr/bin/{pkg}'.format( + new_pkg=new_pkg, pkg=pkg)) + logging.info('Updated version of %s: Old: %s New: %s', pkg, + version_string, GetVersion(vm, pkg)) diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/collectd.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/collectd.py new file mode 100644 index 0000000..ae166d0 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/collectd.py @@ -0,0 +1,186 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Builds collectd from source, installs to linux_packages.INSTALL_DIR. + +https://collectd.org/ +collectd is extremely configurable. We enable some basic monitoring, gathered +every 10s, saving in .csv format. See +perfkitbenchmarker/data/build_collectd.sh.j2 for configuration details. 
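+The --collectd_config, --collectd_depend, and --collectd_tar flags can override
+the default configuration file, the plugin dependency list, and the source
+tarball used for the build.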
+""" + +import posixpath +import os +import re +import logging +import csv + +from absl import flags +from perfkitbenchmarker import data + +flags.DEFINE_string('collectd_depend', None, + 'list collectd dependency files required to add plugin in configuration file.') +flags.DEFINE_string('collectd_config', None, + 'override the default collectd configuration file.') +flags.DEFINE_string('collectd_tar', None, + 'Path on PKB host from which user can upload collectd package to SUT') + +FLAGS = flags.FLAGS + +COLLECTD_DIR = '/opt/collectd' +BUILD_SCRIPT_NAME = 'build_collectd.sh.j2' +PLUGIN_DIR_NAME = 'collectd_plugins' +PATCHES_DIR_NAME = 'collectd_patches' +COLLECTD_URL = ('https://storage.googleapis.com/collectd-tarballs/collectd-5.12.0.tar.bz2') +BUILD_DIR = posixpath.join(COLLECTD_DIR, 'collectd-build') +CSV_DIR = posixpath.join(COLLECTD_DIR, 'collectd-csv') +PREFIX = posixpath.join(COLLECTD_DIR, 'collectd') +PLUGIN_DIR = posixpath.join(COLLECTD_DIR, PLUGIN_DIR_NAME) +PATCHES_DIR = posixpath.join(COLLECTD_DIR, PATCHES_DIR_NAME) +PID_FILE = posixpath.join(PREFIX, 'var', 'run', 'collectd.pid') +PYTHON_CONFIG = "/usr/bin/python3-config" + + +def _GetAbsPath(path): + absPath = os.path.abspath(os.path.expanduser(path)) + if not os.path.isfile(absPath): + raise RuntimeError('File (%s) does not exist.' % path) + return absPath + + +def _CollectdInstalled(vm): + ret, _ = vm.RemoteCommand(f"test -d {COLLECTD_DIR} && echo collectd_installed || echo collectd_not_found", + ignore_failure=True) + if "collectd_installed" in ret: + return True + return False + + +def Prepare(vm, benchmark_spec): + """Prepares collect on VM.""" + pass + + +def _Install(vm): + # if skip_install flag is true and collectd_dir exist, do nothing + if FLAGS.trace_skip_install and _CollectdInstalled(vm): + logging.info('Skip installing collectd') + return + + # first clean up the existing collectd_dir + vm.RemoteCommand(f"sudo rm -rf {COLLECTD_DIR}") + vm.RemoteCommand(f"sudo mkdir -p {COLLECTD_DIR}; sudo chown -R {vm.user_name} {COLLECTD_DIR}") + # copy config file + if FLAGS.collectd_config: + configFile_path = _GetAbsPath(FLAGS.collectd_config) + else: + configFile_path = data.ResourcePath('collectd.conf') + vm.RemoteCopy(configFile_path, COLLECTD_DIR, True) + # install user-defined package dependencies + if FLAGS.collectd_depend: + dependFile_path = _GetAbsPath(FLAGS.collectd_depend) + else: + dependFile_path = data.ResourcePath('collectdDepend.txt') + # this file may not be there or it could be empty + if posixpath.exists(dependFile_path) and posixpath.getsize(dependFile_path) > 0: + with open(dependFile_path) as fh: + for line in fh: + if re.match(r"^\w+", line): + vm.InstallPackages(line) + # upload patches + vm.RemoteCommand('bash -c "mkdir -p {0}"'.format(PATCHES_DIR)) + patches_path = data.ResourcePath(PATCHES_DIR_NAME) + for file in os.listdir(patches_path): + vm.RemoteCopy(posixpath.join(patches_path, file), posixpath.join(PATCHES_DIR, file), True) + # upload plugins + vm.RemoteCommand('bash -c "mkdir -p {0}"'.format(PLUGIN_DIR)) + plugins_path = data.ResourcePath(PLUGIN_DIR_NAME) + for file in os.listdir(plugins_path): + vm.RemoteCopy(posixpath.join(plugins_path, file), posixpath.join(PLUGIN_DIR, file), True) + + collectd_tar_package = posixpath.join(COLLECTD_DIR, 'collectd.tar.bz2') + if FLAGS.collectd_tar: + vm.RemoteCopy(data.ResourcePath(FLAGS.collectd_tar), '{0}'.format(collectd_tar_package)) + else: + vm.RemoteCommand("curl -L {0} --output {1}".format(COLLECTD_URL, collectd_tar_package)) + + if vm.OS_TYPE == 
"ubuntu2004": + python_config = PYTHON_CONFIG + " --embed" + else: + python_config = PYTHON_CONFIG + + # build collectd + context = { + 'collectd_package': collectd_tar_package, + 'build_dir': BUILD_DIR, + 'root_dir': PREFIX, + 'parent_dir': COLLECTD_DIR, + 'plugin_dir': PLUGIN_DIR, + 'patches_dir': PATCHES_DIR, + 'config_depd_file': os.path.basename(dependFile_path), + 'config_file': os.path.basename(configFile_path), + 'python_config': python_config} + remote_path = posixpath.join( + COLLECTD_DIR, + posixpath.splitext(posixpath.basename(BUILD_SCRIPT_NAME))[0]) + vm.RenderTemplate(data.ResourcePath(BUILD_SCRIPT_NAME), + remote_path, context=context) + vm.RemoteCommand('bash ' + remote_path) + + +def _Uninstall(vm): + pass + + +def Start(vm): + vm.RemoteCommand("%s/sbin/collectd -t" % PREFIX) # exception will be thrown if this fails + vm.RemoteCommand("%s/sbin/collectd" % PREFIX) + + +def Stop(vm): + vm.RemoteCommand('kill -9 $(cat {0})'.format(PID_FILE), ignore_failure=True) + + +def YumInstall(vm): + """Installs collectd on 'vm'.""" + vm.InstallPackages('libtool-ltdl-devel python3-devel python3-pip curl') + vm.InstallPackageGroup('Development Tools') + _Install(vm) + + +def AptInstall(vm): + """Installs collectd on 'vm'.""" + pkg_list = "autoconf bison flex libtool pkg-config build-essential python3 python3-dev curl" + vm.InstallPackages(pkg_list) + _Install(vm) + + +def AptUninstall(vm): + """Stops collectd on 'vm'.""" + _Uninstall(vm) + + +def YumUninstall(vm): + """Stops collectd on 'vm'.""" + _Uninstall(vm) + + +def SwupdInstall(vm): + """Installs collectd on 'vm'.""" + vm.InstallPackages('os-testsuite-phoronix-server') + _Install(vm) + + +def SwupdUninstall(vm): + """Stops collectd on 'vm'.""" + _Uninstall(vm) diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/compiler.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/compiler.py new file mode 100644 index 0000000..c96b90a --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/compiler.py @@ -0,0 +1,30 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +"""Module containing compiler installation and cleanup functions.""" + +import logging +from absl import flags + +FLAGS = flags.FLAGS + + +def Install(vm): + try: + vm.Install('{}'.format(FLAGS.compiler)) + except: + """ If we can't figure it out, use gcc """ + logging.warn('Unable to determine desired compiler, using GCC instead!') + vm.Install('gcc') diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/containerd.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/containerd.py new file mode 100644 index 0000000..d37bf50 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/containerd.py @@ -0,0 +1,33 @@ +from perfkitbenchmarker.linux_packages import proxy +from absl import flags + +FLAGS = flags.FLAGS +flags.DEFINE_string("containerd_version", "1.5", + "Specify the containerd version") + +CONFIG_FILE = "/etc/containerd/config.toml" + + +def YumInstall(vm): + raise Exception("Not Implemented") + + +def AptInstall(vm): + vm.AptUpdate() + version, _ = vm.RemoteCommand(f"sudo apt-cache madison containerd | grep {FLAGS.containerd_version} | cut -f2 -d'|' | tr -d ' ' | sort -V -r | head -n 1") + version = version.strip() + vm.InstallPackages(f'containerd={version}') + _ConfigureContainerd(vm) + + +def _ConfigureContainerd(vm): + vm.RemoteCommand(f"sudo mkdir -p $(dirname {CONFIG_FILE})") + vm.RemoteCommand(f"containerd config default | sudo tee {CONFIG_FILE}") + vm.RemoteCommand(f"sudo sed -i 's/SystemdCgroup = .*/SystemdCgroup = true/' {CONFIG_FILE}") + proxy.AddProxy(vm, "containerd") + vm.RemoteCommand(f"sudo systemctl daemon-reload") + vm.RemoteCommand(f"sudo systemctl restart containerd") + + +def Uninstall(vm): + pass diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/docker_auth.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/docker_auth.py new file mode 100644 index 0000000..898da2e --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/docker_auth.py @@ -0,0 +1,38 @@ +from absl import flags +import posixpath +import tempfile +import logging +import json +import os + +FLAGS = flags.FLAGS +flags.DEFINE_boolean('docker_auth_reuse', False, 'SUT reuses the same docker auth info.') +flags.DEFINE_string('docker_auth_local_path', '~/.docker/config.json', 'The docker config file local path') +flags.DEFINE_string('docker_auth_remote_path', '.docker/config.json', 'The docker config file remote path') + +SECRET_NAME = "my-registry-secret" + +def CopyDockerConfig(vm): + if FLAGS.docker_auth_reuse: + try: + with open(os.path.expanduser(FLAGS.docker_auth_local_path),'r') as fdr: + auths = json.load(fdr)["auths"] + except Exception as e: + logging.warning("Exception: %s", str(e)) + return + + if auths: + handle, local_path = tempfile.mkstemp() + with os.fdopen(handle, "w") as fdw: + fdw.write(json.dumps({"auths": auths}) + "\n") + remote_path = f"/home/{vm.user_name}/{FLAGS.docker_auth_remote_path}" + vm.RemoteCommand("mkdir -p {}".format(posixpath.dirname(remote_path))) + vm.PushFile(local_path, remote_path) + os.unlink(local_path) + + +def InstallImagePullSecret(vm, namespace): + if FLAGS.docker_auth_reuse: + remote_path = f"/home/{vm.user_name}/{FLAGS.docker_auth_remote_path}" + vm.RemoteCommand(f"kubectl create secret docker-registry {SECRET_NAME} --from-file=.dockerconfigjson={remote_path} -n {namespace}", ignore_failure=True) + vm.RemoteCommand(f"kubectl patch serviceaccount default -p '{{\"imagePullSecrets\": [{{\"name\": \"{SECRET_NAME}\"}}]}}' -n {namespace}", ignore_failure=True) diff --git 
a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/docker_ce.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/docker_ce.py new file mode 100644 index 0000000..9043d32 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/docker_ce.py @@ -0,0 +1,89 @@ + +from perfkitbenchmarker.linux_packages import proxy +from absl import flags +import json + +FLAGS = flags.FLAGS +flags.DEFINE_string('docker_dist_repo', None, + 'Path to the dockerce repository.') +flags.DEFINE_string('docker_version', '20.10', + 'Specify the docker version.') +flags.DEFINE_list('docker_registry_mirrors', [], + 'Specify the docker mirrors.') + + +def YumInstall(vm): + repo = FLAGS.docker_dist_repo if FLAGS.docker_dist_repo else "https://download.docker.com/linux/centos" + vm.InstallPackages("yum-utils device-mapper-persistent-data lvm2") + # Package for RHEL8 containerd.io does not yet exist - this is a workaround + if vm.OS_TYPE == "centos8" or vm.OS_TYPE == "rhel8": + cmd = "sudo yum install -y " + repo + "/7/x86_64/stable/Packages/containerd.io-1.2.6-3.3.el7.x86_64.rpm" + cmd += " && sudo yum-config-manager --add-repo " + repo + "/docker-ce.repo" + else: + cmd = 'sudo yum-config-manager --add-repo ' + repo + '/docker-ce.repo' + vm.RemoteCommand(cmd) + vm.InstallPackages('docker-ce') + + proxy.AddProxy(vm, "docker") + AddConfig(vm) + _AddUserToDockerGroup(vm) + + +def AptInstall(vm): + repo = FLAGS.docker_dist_repo if FLAGS.docker_dist_repo else "https://download.docker.com/linux/ubuntu" + vm.InstallPackages("apt-transport-https ca-certificates curl gnupg-agent software-properties-common") + vm.RemoteCommand(f'curl -fsSL {repo}/gpg | sudo apt-key add -') + vm.RemoteCommand(f"bash -c 'sudo -E add-apt-repository \"deb [arch=$(dpkg --print-architecture)] {repo} $(grep CODENAME /etc/lsb-release | cut -f2 -d=) stable\"'") + vm.AptUpdate() + version, _ = vm.RemoteCommand(f"sudo apt-cache madison docker-ce | grep {FLAGS.docker_version} | cut -f2 -d'|' | tr -d ' ' | sort -V -r | head -n 1") + vm.InstallPackages(f'docker-ce={version.strip()} --allow-change-held-packages') + + proxy.AddProxy(vm, "docker") + AddConfig(vm) + _AddUserToDockerGroup(vm) + + +def SwupdInstall(vm): + vm.RemoteCommand("sudo swupd update") + vm.InstallPackages("containers-basic") + + proxy.AddProxy(vm, "docker") + AddConfig(vm) + _AddUserToDockerGroup(vm) + + +def AddConfig(vm, config={}): + config["exec-opts"] = ["native.cgroupdriver=systemd"] + if FLAGS.docker_registry_mirrors: + config["registry-mirrors"] = FLAGS.docker_registry_mirrors + + vm.RemoteCommand(f"echo '{json.dumps(config)}' | sudo tee /etc/docker/daemon.json") + vm.RemoteCommand(f"sudo systemctl daemon-reload") + vm.RemoteCommand(f"sudo systemctl restart docker") + + +def _AddUserToDockerGroup(vm): + """ + Add user to the docker group so docker commands can be executed without sudo + """ + vm.RemoteCommand("sudo usermod --append --groups docker {}".format(vm.user_name)) + vm.RemoteCommand("sudo systemctl restart docker") + + # SSH uses multiplexing to reuse connections without going through the SSH handshake + # for a remote host. Typically we need to logout / login after adding the user to + # the docker group as group memberships are evaluated at login. + # See: https://docs.docker.com/engine/install/linux-postinstall/ + # This requirement along with the multiplexing causes subsequent docker commands run in the + # reused session to fail with "permission denied" errors. 
+ # This command will cause the ssh multiplexing for this particular VM to stop causing the next + # SSH command to the VM to restart a multiplex session with ControlMaster=auto. This new session + # will start with docker group membership and will be able to execute docker commands without root. + vm.RemoteCommand('', ssh_args = ['-O', 'stop']) + + +def IsDocker(vm): + return "docker_ce" in vm._installed_packages + + +def Uninstall(vm): + pass diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/emon.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/emon.py new file mode 100644 index 0000000..1a85d01 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/emon.py @@ -0,0 +1,413 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Builds and install emon from source. +""" + +import posixpath +import os +import logging + +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import data +from perfkitbenchmarker import os_types +from perfkitbenchmarker import vm_util +try: + from perfkitbenchmarker.linux_packages import intel_s3_transfer +except: + intel_s3_transfer = None +from perfkitbenchmarker.linux_packages import INSTALL_DIR + +flags.DEFINE_string('emon_tarball', None, + 'Optional, path to emon package. eg --emon_tarball=/tmp/sep_private_5_19_linux_07062101c5153a9.tar.bz2') +flags.DEFINE_string('edp_events_file', None, + 'Optional, path to edp event list. present in config/edp') +flags.DEFINE_boolean('emon_post_process_skip', False, + 'Optional, no post processing will be done if supplied') +flags.DEFINE_enum('edp_script_type', 'python3', + ['ruby', 'python3'], 'Optional, default is python3. eg --edp_script_type=ruby if need to use ruby script') +flags.DEFINE_string('edp_config_file', None, + 'Optional, path to EDP config. eg --edp_config_file=/tmp/edp_config.txt') +flags.DEFINE_boolean('edp_publish', False, + 'Optional, EDP csv files will be published to zip file if provided and --intel-publish is applied') +flags.DEFINE_boolean('emon_debug', False, + 'Optional, for debugging EMON driver, build, collection, and post processing. 
eg --emon_debug') +flags.DEFINE_enum('emon_package_version', '5_34_linux_050122015feb2b5', + ['5_29_linux_09162200a7108a4', '5_33_linux_0316081130eb678', '5_34_linux_050122015feb2b5'], + 'Specify the internal emon version') +FLAGS = flags.FLAGS + + +UBUNTU_PKGS = ["linux-headers-`uname -r`", "build-essential"] +RHEL_PKGS = ["kernel-devel"] +EMON_SOURCE_TARBALL_DEFAULT = 'sep_private_linux_pkb.tar.bz2' +EMON_SOURCE_TARBALL_DEFAULT_LOCATION_S3_BUCKET = 'emon' +EMON_MAIN_DIR = '/opt/emon' +EMON_INSTALL_DIR = '/opt/emon/emon_files' +EMON_RESULT_TARBALL = 'emon_result.tar.gz' +EMON_EDP_TARBALL = 'emon_edp.tar.gz' +PKB_RUBY_FILE = '{0}/pkb_ruby_file'.format(EMON_MAIN_DIR) +PKB_POSTPROCESS_FILE = '{0}/pkb_postprocess_packages_file'.format(EMON_MAIN_DIR) + + +def _GetEmonTarball(): + if FLAGS.emon_package_version: + return "sep_private_" + FLAGS.emon_package_version + ".tar.bz2" + else: + return EMON_SOURCE_TARBALL_DEFAULT + + +def _GetAbsPath(path): + absPath = os.path.abspath(os.path.expanduser(path)) + if not os.path.isfile(absPath): + raise RuntimeError('File (%s) does not exist.' % path) + + return absPath + + +def _TransferEMONTarball(vm): + # get emon_tarball file name + emon_tarball = _GetEmonTarball() + + if FLAGS.emon_tarball: + logging.info("Copying local emon tarball ({}) to remote SUT location ({})" + .format(FLAGS.emon_tarball, INSTALL_DIR)) + tarFile_path = _GetAbsPath(FLAGS.emon_tarball) + _, emon_tarball = os.path.split(FLAGS.emon_tarball) + vm.RemoteCopy(tarFile_path, INSTALL_DIR, True) + else: + download_success = False + s3_image_path = posixpath.join(EMON_SOURCE_TARBALL_DEFAULT_LOCATION_S3_BUCKET, emon_tarball) + target = posixpath.join(INSTALL_DIR, emon_tarball) + download_success = intel_s3_transfer.GetFileFromS3(vm, s3_image_path, target) if intel_s3_transfer else None + + if not download_success: + raise RuntimeError(f'Failed to download EMON tarball ({emon_tarball}). 
Quit!') + + return emon_tarball + + +def _DoSanityCheck(vm): + """do a sanity check first""" + logging.info("Doing EMON sanity check") + sub_dir = CheckIfExternalVersion(vm) + cmds = ['source {0}/{1}/sep_vars.sh'.format(EMON_INSTALL_DIR, sub_dir), + 'emon -v > {0}/emon-v.dat'.format(INSTALL_DIR), + 'emon -M > {0}/emon-M.dat'.format(INSTALL_DIR)] + cmd = ' ; '.join(cmds) + vm.RemoteCommand("bash -c '{}'".format(cmd)) + emon_v_dat = posixpath.join(INSTALL_DIR, 'emon-v.dat') + emon_M_dat = posixpath.join(INSTALL_DIR, 'emon-M.dat') + logging.info("checking the contents in sanity checking output files ({}) and ({})".format(emon_M_dat, emon_v_dat)) + wc_M, stderr_M, ret_M = vm.RemoteCommandWithReturnCode('wc -c {}'.format(emon_M_dat), ignore_failure=False) + wc_v, stderr_v, ret_v = vm.RemoteCommandWithReturnCode('wc -c {}'.format(emon_v_dat), ignore_failure=False) + if ret_M != 0 or ret_v != 0: + logging.info("Failed to collect emon sanity checking data ({}) and data ({}) with stderr ({}) and ({})" + .format(emon_M_dat, emon_v_dat, stderr_M, stderr_v)) + raise RuntimeError('EMON sanity check failed, quit!') + else: + # check the str return with num > 0 from wc_M and wc_v + # "wc -c /opt/pkb/emon-M.dat" ==> "4255 /opt/pkb/emon-M.dat" + # sample output: wc_M = '4255 /opt/pkb/emon-M.dat' + # split into an array with multiple strings separated by space, + # and get the first string, which is 4255 before converting it to int + # if that int is zero, we didn't get any output + if int(wc_M.split()[0]) <= 0 or int(wc_v.split()[0]) <= 0: + err_str = ('EMON sanity check failed with invalid output ' + ' in ({}) and/or ({}), quit!').format(emon_M_dat, emon_v_dat) + raise RuntimeError(err_str) + + +def GetEmonVersion(vm): + sub_dir = 'sep' + if FLAGS.emon_tarball: + _, emon_version = os.path.split(FLAGS.emon_tarball) + else: + tar_file = vm.RemoteCommand("cd {0} && ls -ld {1}_* | head -n 1 | awk -F' ' '{{print $9}}'" + .format(EMON_MAIN_DIR, sub_dir))[0].rstrip("\n") + emon_version = tar_file.split('.tar')[0] + return emon_version + + +def GetEDPVersion(vm): + sub_dir = CheckIfExternalVersion(vm) + edp_version = vm.RemoteCommand("grep 'EDP_VERSION' {0}/{1}/config/edp/edp.rb | head -n 1" + " | awk -F' ' '{{print $3}}'" + .format(EMON_INSTALL_DIR, sub_dir))[0].rstrip("\n") + return edp_version.strip('"') + + +def CheckIfExternalVersion(vm): + use_dir = 'sep' + if FLAGS.emon_tarball and 'emon_nda' in FLAGS.emon_tarball: + use_dir = 'emon' + return use_dir + + +def _GetGroup(vm): + group = 'sudo' + if "centos" in vm.OS_TYPE: + group = 'wheel' + return group + + +def _AddUserToGroup(vm, group): + vm.RemoteCommand('sudo usermod -g {0} $USER'.format(group)) + # When we add pkb to the wheel group in CentOS, we need to exit and log in for new shells + # to load this new environment. 
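+  # Stopping the SSH ControlMaster here forces the next remote command to open a fresh session that picks up the new group membership.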
+ if "centos" in vm.OS_TYPE: + if FLAGS.ssh_reuse_connections: + vm.RemoteCommand('', ssh_args=['-O', 'stop']) + + +def _InstallCentosKernelDev(vm): + mirror_base = 'https://mirrors.portworx.com/mirrors/http/mirror.centos.org/centos/' + if os_types.CENTOS7 == vm.OS_TYPE: + os_repo = '7' + elif os_types.CENTOS8 == vm.OS_TYPE: + os_repo = '8' + elif os_types.CENTOS_STREAM8 == vm.OS_TYPE: + os_repo = '8-stream' + else: + return False + base_arq_pkg = '/BaseOS/x86_64/os/Packages/' + kernel_devel = 'kernel-devel-$(uname -r).rpm' + vm.InstallPackages('wget') + pkg_url = mirror_base + os_repo + base_arq_pkg + kernel_devel + _, _, wget_rc = vm.RemoteCommandWithReturnCode('wget {} -O /tmp/{}'.format(pkg_url, kernel_devel), + ignore_failure=True) + if wget_rc: + return False + + _, _, yum_rc = vm.RemoteCommandWithReturnCode('sudo yum -y install /tmp/{}'.format(kernel_devel), + ignore_failure=True) + + vm.RemoteCommand('rm /tmp/{}'.format(kernel_devel)) + if yum_rc: + return False + + return True + + +def _Install(vm): + # check input file exists asap, if supplied from command line as an optional flag + # error out if file does not exist + if FLAGS.emon_tarball: + _GetAbsPath(FLAGS.emon_tarball) + + if FLAGS.edp_events_file is not None: + _GetAbsPath(FLAGS.edp_events_file) + + if FLAGS.edp_config_file is not None: + _GetAbsPath(FLAGS.edp_config_file) + + """Installs emon on vm.""" + logging.info("Installing emon") + + # transfer tarball to the SUT + emon_tarball = _TransferEMONTarball(vm) + # install emon + vm.RemoteCommand('sudo rm -rf {0} && sudo mkdir -p {0}'.format(EMON_MAIN_DIR)) + vm.RemoteCommand('sudo mkdir -p {0}'.format(EMON_INSTALL_DIR)) + + vm.RemoteCommand('sudo tar -xf {}/{} -C {} --strip-components=1' + .format(INSTALL_DIR, emon_tarball, EMON_MAIN_DIR)) + + sub_dir = CheckIfExternalVersion(vm) + group = _GetGroup(vm) + _AddUserToGroup(vm, group) + + cmds = ['cd {0}'.format(EMON_MAIN_DIR), + './{0}-installer.sh -i -u -C {1} --accept-license -ni -g {2}' + .format(sub_dir, EMON_INSTALL_DIR, group)] + vm.RemoteCommand(' && '.join(cmds)) + + # quick sanity check + _DoSanityCheck(vm) + + +def Start(vm): + sub_dir = CheckIfExternalVersion(vm) + logging.info("Starting emon collection") + cmd = ('source {0}/{1}/sep_vars.sh; emon -collect-edp > {2}/emon.dat 2>&1 &' + .format(EMON_INSTALL_DIR, sub_dir, INSTALL_DIR)) + if FLAGS.edp_events_file: + edp_event_file_path = _GetAbsPath(FLAGS.edp_events_file) + _, edp_event_file_name = os.path.split(FLAGS.edp_events_file) + vm.RemoteCopy(edp_event_file_path, INSTALL_DIR, True) + cmd = ('source {0}/{1}/sep_vars.sh; cd {2};' + 'emon -collect-edp edp_file={3} > {2}/emon.dat 2>&1 &' + .format(EMON_INSTALL_DIR, sub_dir, INSTALL_DIR, edp_event_file_name)) + + stdout, stderr, retcode = vm.RemoteCommandWithReturnCode("bash -c '{}'".format(cmd)) + + +def Stop(vm): + """Stops emon collection on vm""" + logging.info("Stopping emon collection") + sub_dir = CheckIfExternalVersion(vm) + logging.info("Stopping emon") + cmds = ['source {0}/{1}/sep_vars.sh'.format(EMON_INSTALL_DIR, sub_dir), + 'emon -stop', + 'sleep 5', + 'pkill -9 -x emon'] + stdout, stderr, retcode = vm.RemoteCommandWithReturnCode("bash -c '{}'".format(' ; '.join(cmds)), ignore_failure=True) + if not FLAGS.emon_post_process_skip: + _PostProcess(vm) + + +def _CheckRubyFile(vm): + _, _, retcode = vm.RemoteCommandWithReturnCode('sudo file -f {0}'.format(PKB_RUBY_FILE), + ignore_failure=True, suppress_warning=True) + if retcode == 0: + return True + return False + + +def _PostProcessingPackagesExist(vm): + 
_, _, retcode = vm.RemoteCommandWithReturnCode('sudo file -f {0}'.format(PKB_POSTPROCESS_FILE), + ignore_failure=True, suppress_warning=True) + if retcode == 0: + return True + return False + + +def _PostProcess(vm): + logging.info("Starting emon post processing") + if FLAGS.trace_skip_install and _PostProcessingPackagesExist(vm): + logging.info("Post processing packages present. Skipping installation") + else: + if FLAGS.edp_script_type == 'ruby': + logging.info("Installing ruby ...") + vm.Install('ruby') + elif FLAGS.edp_script_type == 'python3': + vm.InstallPackages('python3-pip') + if 'centos' in vm.OS_TYPE: + vm.InstallPackages('python3-devel') + vm.RemoteCommand('sudo pip3 install xlsxwriter pandas numpy pytz defusedxml tdigest dataclasses') + vm.RemoteCommand('sudo touch {0}'.format(PKB_POSTPROCESS_FILE)) + + sub_dir = CheckIfExternalVersion(vm) + cmd = 'source {0}/{1}/sep_vars.sh; cd {2};'.format(EMON_INSTALL_DIR, sub_dir, INSTALL_DIR) + default_edp_config_file = " {0}/{1}/config/edp/edp_config.txt".format(EMON_INSTALL_DIR, sub_dir) + + if FLAGS.edp_script_type == 'ruby': + if _CheckRubyFile(vm): + cmd = cmd + 'rvm use ruby-{0}; '.format(FLAGS.ruby_version) + cmd = cmd + "emon -process-edp" + elif FLAGS.edp_script_type == 'python3': + cmd = cmd + "emon -process-pyedp" + default_edp_config_file = " {0}/{1}/config/edp/pyedp_config.txt".format(EMON_INSTALL_DIR, sub_dir) + + if FLAGS.edp_config_file: + edp_config_file_full_path = _GetAbsPath(FLAGS.edp_config_file) + _, edp_config_file_name = os.path.split(FLAGS.edp_config_file) + vm.RemoteCopy(edp_config_file_full_path, INSTALL_DIR, True) + cmd = cmd + ' {0}'.format(edp_config_file_name) + else: + cmd = cmd + default_edp_config_file + stdout, stderr, retcode = vm.RemoteCommandWithReturnCode("bash -c '{}'".format(cmd)) + + if FLAGS.emon_debug: + if stdout != '' or stderr != '': + logging.info("Emon post process generated stdout ({}) and stderr ({})" + .format(stdout, stderr)) + + +def FetchResults(vm): + """Copies emon data to PKB host.""" + logging.info('Fetching emon results') + + # TODO: tag vm with machine category, such as server, client, single_machine + # if vm.tag is not None and vm.tag is not '': + # local_dir = os.path.join(vm_util.GetTempDir(), vm.name + '-' + vm.tag + '-emon') + # e.g.: pkb-5c37bc7a-0-client-emon + # e.g.: pkb-5c37bc7a-1-server-emon + # else: + local_dir = os.path.join(vm_util.GetTempDir(), vm.name + '-emon') + # e.g.: pkb-5c37bc7a-0-emon + cmd = ['mkdir', '-p', local_dir] + vm_util.IssueCommand(cmd) + tar_pkgs = [('*emon*', EMON_RESULT_TARBALL)] + if not FLAGS.emon_post_process_skip: + edp_files = '*edp*.csv' + if FLAGS.edp_script_type == 'python3': + edp_files += ' summary.xlsx' + tar_pkgs.append((edp_files, EMON_EDP_TARBALL)) + # tar command below will cause an exception if edp fails to generate the output files as expected, + # and PKB process will be shutdown by the framework + # this is desired if edp post process fails, which could be due to multiple reasons, + # such as EMON data corruption, and we should quit + + for remote_output_files, remote_output_tarfile in tar_pkgs: + remote_output_tarfile = os.path.join("/tmp", remote_output_tarfile) + vm.RemoteCommand('cd {} && sudo -E tar cvzf {} {}' + .format(INSTALL_DIR, remote_output_tarfile, remote_output_files)) + vm.PullFile(local_dir, remote_output_tarfile) + + +def EmonCleanup(vm): + sub_dir = CheckIfExternalVersion(vm) + group = _GetGroup(vm) + cmds = ['cd {0}'.format(EMON_MAIN_DIR), + './{1}-installer.sh -u -C {0} --accept-license -ni -g {2} > 
/dev/null 2>&1 &' + .format(EMON_INSTALL_DIR, sub_dir, group)] + vm.RemoteCommand(' && '.join(cmds)) + if not FLAGS.emon_post_process_skip: + emon_edp_data = posixpath.join(INSTALL_DIR, '*edp*.csv') + if FLAGS.edp_script_type == 'python3': + emon_edp_data += ' summary.xlsx' + vm.RemoteCommand('sudo rm -f {}'.format(emon_edp_data), ignore_failure=False) + # rm emon shell scripts and output emon*.dat files + emon_all_files = posixpath.join(INSTALL_DIR, '*emon*') + vm.RemoteCommand('sudo rm -f {}'.format(emon_all_files), ignore_failure=False) + + # rm entire emon install folder + vm.RemoteCommand('sudo rm -fr {}'.format(EMON_MAIN_DIR)) + + +def EmonDirsExist(vm): + _, _, retVal = vm.RemoteCommandWithReturnCode('test -d {0}'.format(EMON_INSTALL_DIR), + ignore_failure=True) + # They do not exist + if retVal != 0: + return False + return True + + +def YumInstall(vm): + vm.InstallPackageGroup('Development Tools') + _, _, rc = vm.RemoteCommandWithReturnCode('test -d /usr/src/kernels/$(uname -r)', ignore_failure=True) + if rc != 0: + pkg_name = "kernel-devel-$(uname -r)" + if vm.HasPackage(pkg_name): + vm.InstallPackages(pkg_name) + elif _InstallCentosKernelDev(vm): + pass + else: + raise Exception("\n Could not find the kernel headers to match your kernel ! Please install it manually \n " + "There are two approaches to solve this :- \n" + "1) Get the kernel details with 'uname-r' and search for kernel headers on " + "CentOS repo and download them.\n" + "2)'sudo yum -y update' and then 'sudo reboot' - Please note that this may \n" + "update other packages on your system which may not be desirable to you\n") + vm.RemoteCommand('sudo yum -y install bzip2') + _Install(vm) + + +def AptInstall(vm): + vm.InstallPackages(' '.join(UBUNTU_PKGS)) + # since we always install the exact matching kernel headers by UBUNTU_PKGS + # there is no need to search for it like in YUM based kernel + _Install(vm) diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/gcc.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/gcc.py new file mode 100644 index 0000000..f4178eb --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/gcc.py @@ -0,0 +1,77 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Module containing gcc installation and cleanup functions.""" + +""" This is a generic gcc installer and must offer installation + methods for various versions of gcc """ + +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker.linux_packages import INSTALL_DIR +import logging + +FLAGS = flags.FLAGS + + +def _Install(vm): + """Installs the gcc package on the VM.""" + pass + + +def YumInstall(vm): + """Installs the gcc package on the VM.""" + # TODO: Figure out how to install gcc with yum + raise NotImplementedError + + +def AptInstall(vm): + """ Install gcc from the ppa ubuntu-toolchain repo """ + # On Ubuntu this is a symlink, save it for uninstall + vm.RemoteCommand('test -L /usr/bin/gcc && readlink /usr/bin/gcc > {0}/gcc-symlink-target'.format(INSTALL_DIR), + ignore_failure=True) + try: + # try first if distributions/versions have gcc-8 in their default repository + vm.AptUpdate() + vm.RemoteCommand('sudo apt-get -y install {0}-{1}' + .format(FLAGS.compiler, FLAGS.compiler_version)) + except errors.VirtualMachine.RemoteCommandError: + # try again if distributions/versions do NOT have gcc-8 in their default repository + vm.RemoteCommand('sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y') + vm.AptUpdate() + vm.RemoteCommand('sudo apt-get -y install {0}-{1}' + .format(FLAGS.compiler, FLAGS.compiler_version)) + + try: + vm.RemoteCommand('if [[ -s {0}/gcc-symlink-target ]]; then cat {0}/gcc-symlink-target; fi'.format(INSTALL_DIR))[0].rstrip('\n') + if vm.RemoteCommandWithReturnCode('test -L /usr/bin/gcc')[2] == 0: + vm.RemoteCommand('sudo rm -f /usr/bin/gcc && sudo ln -s /usr/bin/gcc-{0} /usr/bin/gcc'.format(FLAGS.compiler_version)) + else: + vm.RemoteCommand('sudo ln -s /usr/bin/gcc-{0} /usr/bin/gcc'.format(FLAGS.compiler_version)) + except errors.VirtualMachine.RemoteCommandError: + # Install the distro default for gcc + # This should ALWAYS work, unless there are bigger issues + logging.warn("Falling back to your distro's default gcc!") + vm.InstallPackages('gcc') + + +def SwupdInstall(vm): + """ Installs a gcc containing bundle on the Clear Linux VM """ + raise NotImplementedError + + +def Uninstall(vm): + gcc = vm.RemoteCommand('if [[ -s {0}/gcc-symlink-target ]]; then cat {0}/gcc-symlink-target; fi'.format(INSTALL_DIR))[0].rstrip('\n') + if gcc != '': + vm.RemoteCommand('sudo rm /usr/bin/gcc && sudo ln -s /usr/bin/{0} /usr/bin/gcc'.format(gcc), ignore_failure=True) diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/google_cloud_sdk.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/google_cloud_sdk.py new file mode 100644 index 0000000..c7d24a5 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/google_cloud_sdk.py @@ -0,0 +1,43 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Module containing google cloud sdk installation function.""" + +import os + +from perfkitbenchmarker import vm_util + + +SDK_REPO = 'https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz' +SDK_DIR = '%s/google-cloud-sdk' % vm_util.VM_TMP_DIR +SDK_INSTALL_FILE = '%s/install.sh' % SDK_DIR +GCLOUD_PATH = '%s/bin/gcloud' % SDK_DIR +GSUTIL_PATH = '%s/bin/gsutil' % SDK_DIR +KUBECTL_PATH = '%s/bin/kubectl' % SDK_DIR + + +def RunGcloud(vm, cmd): + return vm.RemoteCommand('export CLOUDSDK_CORE_DISABLE_PROMPTS=1 && %s %s ' + '--project %s --format json' % (GCLOUD_PATH, cmd, + vm.project)) + + +def Install(vm): + """Installs google cloud sdk on the VM.""" + vm.Install('wget') + vm.RemoteCommand('cd {0} && wget {1} && tar xzf {2} && rm {2}'.format( + vm_util.VM_TMP_DIR, SDK_REPO, os.path.basename(SDK_REPO))) + vm.RemoteCommand('%s --disable-installation-options --usage-report=false ' + '--path-update=false --bash-completion=false' + % SDK_INSTALL_FILE) diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/habana.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/habana.py new file mode 100644 index 0000000..983acd5 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/habana.py @@ -0,0 +1,143 @@ +from absl import flags +from perfkitbenchmarker import os_types + +FLAGS = flags.FLAGS +flags.DEFINE_string("habana_version", "1.3.0-499", "Specify the Habana driver version") + +TEST_HABANA_YAML = """ +apiVersion: batch/v1 +kind: Job +metadata: + name: test-habana +spec: + template: + spec: + containers: + - name: test-habana + image: busybox + command: [ "true" ] + resources: + limits: + habana.ai/gaudi: 1 + requests: + habana.ai/gaudi: 1 + restartPolicy: Never +""".replace('\n', '\\n') + +HABANA_REPO = '''[vault] +name=Habana Vault +baseurl=https://vault.habana.ai/artifactory/centos/8/8.3 +enabled=1 +gpgcheck=0 +gpgkey=https://vault.habana.ai/artifactory/centos/8/8.3/repodata/repomod.xml.key +repo_gpgcheck=0''' + +CONTAINERD_CONFIG = '''disabled_plugins = [] +version = 2 + + [plugins] + [plugins."io.containerd.grpc.v1.cri"] + [plugins."io.containerd.grpc.v1.cri".containerd] + default_runtime_name = "habana" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.habana] + runtime_type = "io.containerd.runc.v2" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.habana.options] + BinaryName = "/usr/bin/habana-container-runtime" + [plugins."io.containerd.runtime.v1.linux"] + runtime = "habana-container-runtime"''' + +DOCKER_DAEMON = '''{ + "default-runtime": "habana", + "runtimes": { + "habana": { + "path": "/usr/bin/habana-container-runtime", + "runtimeArgs": [] + } + } +}''' + + +def RegisterKubernetesPlugins(vm): + vm.RemoteCommand("kubectl apply -f https://vault.habana.ai/artifactory/docker-k8s-device-plugin/habana-k8s-device-plugin.yaml") + vm.RemoteCommand(f"printf '{TEST_HABANA_YAML}' | kubectl apply -f -") + vm.RemoteCommand("timeout 300s bash -c 'until kubectl wait job test-habana --for=condition=complete; do sleep 1s; done'", ignore_failure=True) + vm.RemoteCommand(f"printf '{TEST_HABANA_YAML}' | kubectl delete -f -") + + +def RegisterWithContainerD(vm): + vm.RemoteCommand(f"echo '{CONTAINERD_CONFIG}' | sudo tee /etc/containerd/config.toml") + vm.RemoteCommand("sudo systemctl restart containerd") + vm.RemoteCommand("sleep 5s") + vm.RemoteCommand("sudo systemctl restart kubelet") + vm.RemoteCommand("sleep 5s") + + +def RegisterWithDocker(vm): + vm.RemoteCommand(f"echo 
'{DOCKER_DAEMON}' | sudo tee /etc/docker/daemon.json") + vm.RemoteCommand("sudo systemctl restart docker") + vm.RemoteCommand("sleep 5s") + + +def _InstallCentosKernelDev(vm): + mirror_base = 'https://mirrors.portworx.com/mirrors/http/mirror.centos.org/centos/' + if os_types.CENTOS7 == vm.OS_TYPE: + os_repo = '7' + elif os_types.CENTOS8 == vm.OS_TYPE: + os_repo = '8' + elif os_types.CENTOS_STREAM8 == vm.OS_TYPE: + os_repo = '8-stream' + else: + return False + + base_arq_pkg = '/BaseOS/x86_64/os/Packages/' + kernel_devel = 'kernel-devel-$(uname -r).rpm' + vm.InstallPackages('wget') + pkg_url = mirror_base + os_repo + base_arq_pkg + kernel_devel + _, _, wget_rc = vm.RemoteCommandWithReturnCode('wget {} -O /tmp/{}'.format(pkg_url, kernel_devel), + ignore_failure=True) + if wget_rc: + return False + + _, _, yum_rc = vm.RemoteCommandWithReturnCode('sudo yum -y install /tmp/{}'.format(kernel_devel), + ignore_failure=True) + vm.RemoteCommand('rm /tmp/{}'.format(kernel_devel)) + + if yum_rc: + return False + + return True + + +def YumInstall(vm): + if vm.OS_TYPE != os_types.CENTOS_STREAM8: + raise Exception("Only CentOS 8 Stream is supported!") + + # Install kernel devel for CentOS + if not _InstallCentosKernelDev(vm): + raise Exception("Failed to install kernel-devel for CentOS") + + habana_centos_version = FLAGS.habana_version + ".el8" + vm.RemoteCommand(f"echo '{HABANA_REPO}' | sudo tee /etc/yum.repos.d/Habana-Vault.repo") + vm.RemoteCommand("sudo yum makecache") + vm.InstallPackages("--enablerepo=extras epel-release") + vm.InstallPackages("dkms habanalabs-firmware-{0} habanalabs-firmware-tools-{0}". + format(habana_centos_version)) + vm.RemoteCommand("sudo yum install -y habanalabs-{0} habanalabs-graph-{0} habanalabs-container-runtime-{0}". + format(habana_centos_version)) + vm.RemoteCommand("sudo modprobe habanalabs_en") + vm.RemoteCommand("sudo modprobe habanalabs") + + +def AptInstall(vm): + vm.RemoteCommand('curl -X GET https://vault.habana.ai/artifactory/api/gpg/key/public | sudo apt-key add - && echo "deb https://vault.habana.ai/artifactory/debian $(grep VERSION_CODENAME= /etc/os-release | cut -f2 -d=) main" | sudo tee -a /etc/apt/sources.list.d/artifactory.list > /dev/null && sudo dpkg --configure -a') + vm.AptUpdate() + vm.InstallPackages("dkms libelf-dev") + vm.InstallPackages("habanalabs-firmware={0} habanalabs-firmware-tools={0}".format(FLAGS.habana_version)) + vm.InstallPackages("--allow-downgrades habanalabs-thunk={0} habanalabs-dkms={0} habanalabs-graph={0} habanalabs-container-runtime={0}".format(FLAGS.habana_version)) + vm.RemoteCommand("sudo modprobe habanalabs_en") + vm.RemoteCommand("sudo modprobe habanalabs") + + +def Uninstall(vm): + pass diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/hadoop.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/hadoop.py new file mode 100644 index 0000000..c62b0f0 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/hadoop.py @@ -0,0 +1,406 @@ +# Copyright 2016 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing Hadoop installation and cleanup functions. + +For documentation of commands to run at startup and shutdown, see: +http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/ClusterSetup.html#Hadoop_Startup +""" +import functools +import logging +import os +import posixpath +import re +import time +from absl import flags +from perfkitbenchmarker import data +from perfkitbenchmarker import linux_packages +from perfkitbenchmarker import regex_util +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.linux_packages import aws_credentials + +FLAGS = flags.FLAGS + +flags.DEFINE_string('hadoop_version', '3.3.1', 'Version of Hadoop.') +flags.DEFINE_string('hadoop_bin_url', None, + 'Specify to override url from HADOOP_URL_BASE.') + +DATA_FILES = [ + 'hadoop/core-site.xml.j2', 'hadoop/yarn-site.xml.j2', + 'hadoop/hdfs-site.xml', 'hadoop/mapred-site.xml.j2', + 'hadoop/hadoop-env.sh.j2', 'hadoop/workers.j2' +] +START_HADOOP_SCRIPT = 'hadoop/start-hadoop.sh.j2' + +HADOOP_URL_BASE = 'https://downloads.apache.org/hadoop/common' + +HADOOP_DIR = posixpath.join(linux_packages.INSTALL_DIR, 'hadoop') +HADOOP_BIN = posixpath.join(HADOOP_DIR, 'bin') +HADOOP_SBIN = posixpath.join(HADOOP_DIR, 'sbin') +HADOOP_CONF_DIR = posixpath.join(HADOOP_DIR, 'etc', 'hadoop') +HADOOP_PRIVATE_KEY = posixpath.join(HADOOP_CONF_DIR, 'hadoop_keyfile') +HADOOP_URL = 'https://archive.apache.org/dist/hadoop/core/hadoop-{0}/hadoop-{0}.tar.gz' +HADOOP_LOCAL_SCRATCH = posixpath.join(vm_util.VM_TMP_DIR, 'local_scratch', 'hadoop') + +PACKAGE_NAME = 'hadoop' +PREPROVISIONED_DATA = { + 'hadoop-{0}.tar.gz'.format('2.9.2'): + '3d2023c46b1156c1b102461ad08cbc17c8cc53004eae95dab40a1f659839f28a', + 'hadoop-{0}.tar.gz'.format('3.2.1'): + 'f66a3a4115b8f16c1077d1a198a06854dbef0e4233291712ed08d0a10629ed37' +} +PACKAGE_DATA_URL = { + 'hadoop-{0}.tar.gz'.format('2.9.2'): HADOOP_URL.format('2.9.2'), + 'hadoop-{0}.tar.gz'.format('3.2.1'): HADOOP_URL.format('3.2.1') +} +HADOOP_LIB_DIR = posixpath.join(HADOOP_DIR, 'share', 'hadoop', 'common', 'lib') +HADOOP_TOOLS_DIR = posixpath.join(HADOOP_DIR, 'share', 'hadoop', 'tools', 'lib') + +HADOOP_CMD = posixpath.join(HADOOP_BIN, 'hadoop') +HDFS_CMD = posixpath.join(HADOOP_BIN, 'hdfs') +YARN_CMD = posixpath.join(HADOOP_BIN, 'yarn') + + +def _GetHadoopURL(): + """Gets the Hadoop download url based on flags. + + The default is to look for the version `--hadoop_version` to download. + + Returns: + The Hadoop download url. + """ + + return '{0}/hadoop-{1}/hadoop-{1}.tar.gz'.format(HADOOP_URL_BASE, + FLAGS.hadoop_version) + + +def CheckPrerequisites(): + """Verifies that the required resources are present. + + Raises: + perfkitbenchmarker.data.ResourceNotFound: On missing resource. 
+ """ + for resource in DATA_FILES + [START_HADOOP_SCRIPT]: + data.ResourcePath(resource) + + +def _Install(vm): + vm.Install('openjdk') + vm.Install('curl') + hadoop_url = HADOOP_URL.format(FLAGS.hadoop_version) + hadoop_tar = hadoop_url.split('/')[-1] + if hadoop_tar not in PREPROVISIONED_DATA: + PREPROVISIONED_DATA[hadoop_tar] = '' # will only work with preprovision_ignore_checksum + PACKAGE_DATA_URL[hadoop_tar] = hadoop_url + vm.InstallPreprovisionedPackageData( + PACKAGE_NAME, + [hadoop_tar], + linux_packages.INSTALL_DIR + ) + hadoop_remote_path = posixpath.join(linux_packages.INSTALL_DIR, hadoop_tar) + # Intel - rather than overwrite, to ensure a pristine Hadoop, remove first then unpack + vm.RemoteCommand('test -d {0} && rm -rf {0}; mkdir {0} && tar -C {0} --strip-component=1 -xzf {1}'.format( + HADOOP_DIR, hadoop_remote_path)) + + +def YumInstall(vm): + """Installs Hadoop on the VM.""" + vm.InstallPackages('snappy') + _Install(vm) + + +def AptInstall(vm): + """Installs Hadoop on the VM.""" + libsnappy = 'libsnappy1' + if not vm.HasPackage(libsnappy): + # libsnappy's name on ubuntu16.04 is libsnappy1v5. Let's try that instead. + libsnappy = 'libsnappy1v5' + vm.InstallPackages(libsnappy) + _Install(vm) + + +def InstallGcsConnector(vm, install_dir=HADOOP_LIB_DIR): + """Install the GCS connector for Hadoop, which allows I/O to GCS.""" + connector_url = ('https://storage.googleapis.com/hadoop-lib/gcs/' + 'gcs-connector-hadoop{}-latest.jar'.format( + FLAGS.hadoop_version[0])) + vm.RemoteCommand('cd {0} && curl -O {1}'.format(install_dir, connector_url)) + + +# Scheduling constants. +# Give 90% of VM memory to YARN for scheduling. +# This is roguhly consistent with Dataproc 2.0+ +YARN_MEMORY_FRACTION = 0.9 +# Give 80% of the memory YARN schedules to the JVM Heap space. +# This is probably conservative on more memory mahcines, but is a traditonal +# rule of thumb. +HEAP_MEMORY_RATIO = 0.8 + +# Schedule slightly more tasks than vCPUs. This was found to be optimal for +# sorting 240 GB using standard GCE virtual machines with sufficient disk. +# Using a grid seach. +# TODO(pclay): Confirm results generalize to larger data sizes. +MAP_SLOTS_PER_CORE = 1.5 +REDUCE_SLOTS_PER_CORE = 4 / 3 + + +def _RenderConfig(vm, + master, + workers, + memory_fraction=YARN_MEMORY_FRACTION, + configure_s3=False, + extra_config={}): + """Load Hadoop Condfiguration on VM.""" + # Use first worker to get worker configuration + worker = workers[0] + num_workers = len(workers) + worker_cores = worker.NumCpusForBenchmark() + yarn_memory_mb = int((vm.total_memory_kb / 1024) * memory_fraction) + # Reserve 1 GB per worker for AppMaster containers. + usable_memory_mb = yarn_memory_mb - 1024 + + # YARN generally schedules based on memory (and ignores cores). We invert this + # by calculating memory in terms of cores. This means that changing + # machine memory will not change scheduling simply change the memory given to + # each task. + maps_per_node = int(worker_cores * MAP_SLOTS_PER_CORE) + map_memory_mb = usable_memory_mb // maps_per_node + map_heap_mb = int(map_memory_mb * HEAP_MEMORY_RATIO) + + reduces_per_node = int(worker_cores * REDUCE_SLOTS_PER_CORE) + reduce_memory_mb = usable_memory_mb // reduces_per_node + reduce_heap_mb = int(reduce_memory_mb * HEAP_MEMORY_RATIO) + + # This property is only used for generating data like teragen. + # Divide 2 to avoid tiny files on large clusters. 
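+  # Hypothetical sizing example: four 16-vCPU workers give maps_per_node = int(16 * 1.5) = 24,
+  # and therefore 24 * 4 = 96 map tasks for data generation.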
+ num_map_tasks = maps_per_node * num_workers + # This determines the number of reduce tasks in Terasort and is critical to + # scale with the cluster. + num_reduce_tasks = reduces_per_node * num_workers + + if vm.scratch_disks: + # TODO(pclay): support multiple scratch disks. A current suboptimal + # workaround is RAID0 local_ssds with --num_striped_disks. + scratch_dir = posixpath.join(vm.GetScratchDir(), 'hadoop') + else: + # Intel change + scratch_dir = HADOOP_LOCAL_SCRATCH + # End Intel change + + aws_access_key = None + aws_secret_key = None + optional_tools = None + if configure_s3: + aws_access_key, aws_secret_key = aws_credentials.GetCredentials() + optional_tools = 'hadoop-aws' + + context = { + 'master_ip': master.internal_ip, + 'worker_ips': [vm.internal_ip for vm in workers], + 'scratch_dir': scratch_dir, + 'worker_vcpus': worker_cores, + 'hadoop_private_key': HADOOP_PRIVATE_KEY, + 'user': vm.user_name, + 'yarn_memory_mb': yarn_memory_mb, + 'map_memory_mb': map_memory_mb, + 'map_heap_mb': map_heap_mb, + 'num_map_tasks': num_map_tasks, + 'reduce_memory_mb': reduce_memory_mb, + 'reduce_heap_mb': reduce_heap_mb, + 'num_reduce_tasks': num_reduce_tasks, + 'aws_access_key': aws_access_key, + 'aws_secret_key': aws_secret_key, + 'optional_tools': optional_tools + } + context.update(extra_config) + for file_name in DATA_FILES: + file_path = data.ResourcePath(file_name) + if (file_name == 'hadoop/workers.j2' and + FLAGS.hadoop_version.split('.')[0] < '3'): + file_name = 'hadoop/slaves.j2' + remote_path = posixpath.join(HADOOP_CONF_DIR, os.path.basename(file_name)) + if file_name.endswith('.j2'): + vm.RenderTemplate(file_path, os.path.splitext(remote_path)[0], context) + else: + vm.RemoteCopy(file_path, remote_path) + + +def _GetHDFSOnlineNodeCount(master): + cmd = HDFS_CMD + ' dfsadmin -report' + stdout = master.RemoteCommand(cmd)[0] + avail_str = regex_util.ExtractGroup(r'Live datanodes\s+\((\d+)\):', stdout) + return int(avail_str) + + +def _GetYARNOnlineNodeCount(master): + cmd = YARN_CMD + ' node -list -all' + stdout = master.RemoteCommand(cmd)[0] + return len(re.findall(r'RUNNING', stdout)) + + +def _WaitForNodes(vm, expected_nodes, GetJoinedNodesFunc): + num_tries = 5 + healthy = False + for _ in range(num_tries): + logging.info('Sleeping 5s to wait for nodes to join.') + time.sleep(5) + online_count = GetJoinedNodesFunc(vm) + if online_count == expected_nodes: + logging.info('Service running on all %d workers', expected_nodes) + healthy = True + break + else: + logging.info('Only {0} out of {1} nodes are up. 
Retrying'.format( + online_count, expected_nodes)) + if not healthy: + raise ValueError('Not all nodes running: {0} < {1}'.format( + online_count, expected_nodes)) + + +def _PopulateHostEntries(vm): + hostname = vm.RemoteCommand('hostname')[0] + host_entry = '{0} {1}'.format(vm.internal_ip, hostname.rstrip()) + cmd = "grep -qxF '{0}' {1} || echo '{0}' | sudo tee -a {1}".format( + host_entry, '/etc/hosts') + vm.RemoteCommand(cmd) + + +def _SetupHosts(vms): + vm_util.RunThreaded(lambda vm: _PopulateHostEntries(vm), vms) + + +def StartHadoop(master, workers, start_yarn=True, extra_config={}): + vms = [master] + workers + context = {'hadoop_dir': HADOOP_DIR, + 'vm_ips': [vm.internal_ip for vm in vms], + 'start_yarn': start_yarn} + + # make sure /etc/hosts is properly populated + _SetupHosts(vms) + + # HDFS setup and formatting, YARN startup + script_path = posixpath.join(HADOOP_DIR, 'start-hadoop.sh') + master.RenderTemplate(data.ResourcePath(START_HADOOP_SCRIPT), + script_path, context=context) + master.RemoteCommand('bash {0}'.format(script_path), should_log=True) + logging.info('Checking HDFS status.') + _WaitForNodes(master, len(workers), _GetHDFSOnlineNodeCount) + + if start_yarn: + logging.info('Checking YARN status.') + _WaitForNodes(master, len(workers), _GetYARNOnlineNodeCount) + + +def CleanHadoopTmp(vm, mountpoints): + """Delete Hadoop data from 'vm'.""" + for _, mountpoint in enumerate(mountpoints): + vm.RemoteCommand('rm -rf {0}'.format( + posixpath.join(mountpoint, 'hadoop_tmp'))) + + +def ConfigureAndStart(master, workers, start_yarn=True, configure_s3=False, extra_config={}): + """Configure hadoop on a cluster. + + Args: + master: VM. Master VM - will be the HDFS NameNode, YARN ResourceManager. + workers: List of VMs. Each VM will run an HDFS DataNode, YARN node. + start_yarn: bool. Start YARN and JobHistory server? Set to False if HDFS is + the only service required. Default: True. + configure_s3: Whether to configure Hadoop to access S3. 
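+    extra_config: dict. Extra key/value pairs merged into the template context
+        used when rendering the Hadoop configuration files.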
+ """ + vms = [master] + workers + fn = functools.partial( + _RenderConfig, master=master, workers=workers, configure_s3=configure_s3, extra_config=extra_config) + vm_util.RunThreaded(fn, vms) + + master.RemoteCommand("rm -f {0} && ssh-keygen -q -t rsa -N '' -f {0}".format( + HADOOP_PRIVATE_KEY)) + + public_key = master.RemoteCommand('cat {0}.pub'.format(HADOOP_PRIVATE_KEY))[0] + + def AddKey(vm): + vm.RemoteCommand('echo "{0}" >> ~/.ssh/authorized_keys'.format(public_key)) + + vm_util.RunThreaded(AddKey, vms) + + context = { + 'hadoop_dir': HADOOP_DIR, + 'vm_ips': [vm.internal_ip for vm in vms], + 'start_yarn': start_yarn + } + + # HDFS setup and formatting, YARN startup + script_path = posixpath.join(HADOOP_DIR, 'start-hadoop.sh') + master.RenderTemplate( + data.ResourcePath(START_HADOOP_SCRIPT), script_path, context=context) + master.RemoteCommand('bash {0}'.format(script_path), should_log=True) + + logging.info('Sleeping 10s for Hadoop nodes to join.') + time.sleep(10) + + logging.info('Checking HDFS status.') + hdfs_online_count = _GetHDFSOnlineNodeCount(master) + if hdfs_online_count != len(workers): + raise ValueError('Not all nodes running HDFS: {0} < {1}'.format( + hdfs_online_count, len(workers))) + else: + logging.info('HDFS running on all %d workers', len(workers)) + + if start_yarn: + logging.info('Checking YARN status.') + yarn_online_count = _GetYARNOnlineNodeCount(master) + if yarn_online_count != len(workers): + raise ValueError('Not all nodes running YARN: {0} < {1}'.format( + yarn_online_count, len(workers))) + else: + logging.info('YARN running on all %d workers', len(workers)) + try: + StartHadoop(master, workers, start_yarn, extra_config) + except: + raise Exception("Caught exception while starting Hadoop!") + + +def StopYARN(master): + """Stop YARN on all nodes.""" + master.RemoteCommand(posixpath.join(HADOOP_SBIN, 'stop-yarn.sh'), + ignore_failure=True) + + +def StopHDFS(master): + """Stop HDFS on all nodes.""" + master.RemoteCommand(posixpath.join(HADOOP_SBIN, 'stop-dfs.sh'), + ignore_failure=True) + + +def StopHistoryServer(master): + """Stop the MapReduce JobHistory daemon.""" + master.RemoteCommand('{0} stop historyserver'.format( + posixpath.join(HADOOP_SBIN, 'mr-jobhistory-daemon.sh')), + ignore_failure=True) + + +def StopAll(master): + """Stop HDFS and YARN. + + Args: + master: VM. HDFS NameNode/YARN ResourceManager. 
+ """ + StopHistoryServer(master) + StopYARN(master) + StopHDFS(master) + + +def CleanDatanode(vm): + """Delete Hadoop data from 'vm'.""" + vm.RemoteCommand('rm -rf {0}'.format( + posixpath.join(vm.GetScratchDir(), 'hadoop'))) diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/k8s.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/k8s.py new file mode 100644 index 0000000..6f1b930 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/k8s.py @@ -0,0 +1,331 @@ +from absl import flags +from perfkitbenchmarker import vm_util +from perfkitbenchmarker import os_types +from perfkitbenchmarker.linux_packages import docker_ce +from yaml import safe_load_all, dump_all +import logging +import posixpath +import uuid +import os + +FLAGS = flags.FLAGS +flags.DEFINE_string("k8s_repo_key_url", "https://packages.cloud.google.com/apt/doc/apt-key.gpg", + "Specify the installation repo GPG key url") +flags.DEFINE_string("k8s_repo_url", "http://apt.kubernetes.io/", + "Specify the installation repo url") +flags.DEFINE_string("k8s_version", "1.21", + "Specify the installation repo url") +flags.DEFINE_string("k8s_nfd_version", "0.10.1", + "Specify the node feature discovery version") +flags.DEFINE_boolean("k8s_kubevirt", False, + "Specify whether to install kubevirt") +flags.DEFINE_string("k8s_kubevirt_version", "v0.55.0", + "Specify the kubevert version") +flags.DEFINE_list("k8s_kubeadm_options", [], + "Specify the kubeadm options") +flags.DEFINE_list("k8s_nfd_scripts", [], + "Specify any extra node feature discovery scripts") +flags.DEFINE_list("k8s_image_mirrors", [], + "Specify docker image mirrors") +flags.DEFINE_enum("k8s_cni", "flannel", + ["flannel", "calico"], + "Specify the CNI") +flags.DEFINE_string("k8s_flannel_version", "v0.18.1", + "Specify the flannel CNI version") +flags.DEFINE_string("k8s_calico_version", "v3.23", + "Specify the calico CNI version") +flags.DEFINE_string("k8s_cni_options", "", + "Specify the CNI options") + + +NFD_SOURCE_D = "/etc/kubernetes/node-feature-discovery/source.d" +REGISTRY_CERTS_DIR = "registry-certs" +REGISTRY_YAML = """ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: registry + labels: + app: registry +spec: + replicas: 1 + selector: + matchLabels: + app: registry + template: + metadata: + labels: + app: registry + spec: + volumes: + - name: cert + secret: + secretName: registry-cert + containers: + - image: registry:2 + name: registry + imagePullPolicy: IfNotPresent + env: + - name: REGISTRY_HTTP_TLS_CERTIFICATE + value: "/certs/tls.crt" + - name: REGISTRY_HTTP_TLS_KEY + value: "/certs/tls.key" + ports: + - containerPort: 5000 + volumeMounts: + - name: cert + mountPath: /certs + tolerations: + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + - effect: NoSchedule + key: node-role.kubernetes.io/master + nodeSelector: + node-role.kubernetes.io/control-plane: "" +--- +apiVersion: v1 +kind: Service +metadata: + name: registry-service + labels: + app: registry-service +spec: + ports: + - port: {HOSTPORT} + targetPort: 5000 + externalIPs: + - "{HOSTIP}" + selector: + app: registry +""".replace("\n", "\\n") +POD_NETWORK_CIDR = "10.244.0.0/16" + + +def YumInstall(vm): + raise Exception("Not Implemented") + + +def AptInstall(vm): + vm.InstallPackages(f'apt-transport-https ca-certificates curl') + vm.RemoteCommand(f'curl {FLAGS.k8s_repo_key_url} | sudo apt-key add -') + vm.RemoteCommand(f"bash -c 'sudo -E add-apt-repository \"deb [arch=$(dpkg --print-architecture)] {FLAGS.k8s_repo_url} kubernetes-xenial 
main\"'") + vm.AptUpdate() + version, _ = vm.RemoteCommand(f"sudo apt-cache madison kubelet | grep {FLAGS.k8s_version} | cut -f2 -d'|' | tr -d ' ' | sort -V -r | head -n 1") + version = version.strip() + vm.InstallPackages(f'kubeadm={version} kubelet={version} kubectl={version}') + + +def _InstallNFDScripts(vm): + for script1 in FLAGS.k8s_nfd_scripts: + basename = os.path.basename(script1) + remote_file = posixpath.join(INSTALL_DIR, basename) + vm.RemoteCopy(script1, remote_file) + vm.RemoteCommand(f'sudo mv -f {remote_file} {NFD_SOURCE_D}') + vm.RemoteCommand(f'sudo chmod -R a+rx {NFD_SOURCE_D}') + vm.RemoteCommand('sudo systemctl restart kubelet') + + +@vm_util.Retry() +def _RetagImage(vm, image, image_retag): + if docker_ce.IsDocker(vm): + vm.RemoteCommand(f"sudo -E docker pull {image}") + vm.RemoteCommand(f"sudo -E docker tag {image} {image_retag}") + else: + vm.RemoteCommand(f"sudo ctr -n k8s.io i pull {image}") + vm.RemoteCommand(f"sudo ctr -n k8s.io i tag {image} {image_retag}") + + +def _PrepareSystem(vm): + vm.RemoteCommand("sudo systemctl stop named", ignore_failure=True) + vm.RemoteCommand("sudo systemctl disable named", ignore_failure=True) + + vm.RemoteCommand("sudo swapoff -a") + vm.RemoteCommand("sudo sed -ri 's/.*swap.*/#&/' /etc/fstab") + + vm.RemoteCommand("sudo modprobe overlay") + vm.RemoteCommand("sudo modprobe br_netfilter") + vm.RemoteCommand("printf 'overlay\\nbr_netfilter\\n' | sudo tee /etc/modules-load.d/k8s.conf") + + vm.RemoteCommand("printf 'net.bridge.bridge-nf-call-ip6tables=1\\nnet.bridge.bridge-nf-call-iptables=1\\nnet.ipv4.ip_forward=1\\nnet.netfilter.nf_conntrack_max=1000000' | sudo tee /etc/sysctl.d/k8s.conf") + vm.RemoteCommand("sudo sysctl --system") + + version = FLAGS.k8s_version.split(".") + if int(version[0]) <= 1 and int(version[1]) < 24: + vm.Install("docker_ce") + else: + vm.Install("containerd") + + for i in range(0,len(FLAGS.k8s_image_mirrors),2): + _RetagImage(vm, FLAGS.k8s_image_mirrors[i], FLAGS.k8s_image_mirrors[i+1]) + + vm.Install("k8s") + + +def _SetCNIOptions(): + if FLAGS.k8s_cni == "flannel": + FLAGS.k8s_kubeadm_options.append(f"--pod-network-cidr={POD_NETWORK_CIDR}") + elif FLAGS.k8s_cni == "calico": + FLAGS.k8s_kubeadm_options.append(f"--pod-network-cidr={POD_NETWORK_CIDR}") + + +def _InstallCNI(vm): + if FLAGS.k8s_cni == "flannel": + vm.RemoteCommand(f'kubectl create -f https://raw.githubusercontent.com/flannel-io/flannel/{FLAGS.k8s_flannel_version}/Documentation/kube-flannel.yml') + + elif FLAGS.k8s_cni == "calico": + if "vxlan" in FLAGS.k8s_cni_options: + manifest = f"https://projectcalico.docs.tigera.io/archive/{FLAGS.k8s_calico_version}/manifests/calico-vxlan.yaml" + manifest_mod = "-e \"/CALICO_IPV[4|6]POOL_VXLAN/{n;s|\\\"CrossSubnet\\\"|\\\"Always\\\"|}\"" + else: + manifest = f"https://projectcalico.docs.tigera.io/archive/{FLAGS.k8s_calico_version}/manifests/calico.yaml" + manifest_mod = "" + + vm.RemoteCommand(f"bash -c 'kubectl apply -f <(curl -o - {manifest} | sed -e \"s|^\\(\\s*\\)env:\\s*$|\\1env:\\n\\1 - name: CALICO_IPV4POOL_CIDR\\n\\1 value: \\\"{POD_NETWORK_CIDR}\\\"\\n\\1 - name: IP_AUTODETECTION_METHOD\\n\\1 value: \\\"can-reach={vm.internal_ip}\\\"|\" {manifest_mod})'") + + _RobustRemoteCommand(vm, "kubectl wait --namespace=kube-system pod --for=condition=ready -l k8s-app=calico-node") + else: + raise Exception(f"Kubernetes: Unsupported CNI {FLAGS.k8s_cni}") + + +def _InstallNFD(vm, all_vms): + _RobustRemoteCommand(vm, f"kubectl apply -k 
'https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref=v{FLAGS.k8s_nfd_version}'") + _RobustRemoteCommand(vm, "kubectl --namespace=node-feature-discovery wait pod --for=condition=ready -l app=nfd-worker") + if FLAGS.k8s_nfd_scripts: + vm_util.RunThreaded(lambda vm1: _InstallNFDScripts(vm1), all_vms) + + +def _InstallKubeVirt(vm): + if FLAGS.k8s_kubevirt: + vm.RemoteCommand(f"kubectl apply -f https://github.com/kubevirt/kubevirt/releases/download/{FLAGS.k8s_kubevirt_version}/kubevirt-operator.yaml") + vm.RemoteCommand(f"kubectl apply -f https://github.com/kubevirt/kubevirt/releases/download/{FLAGS.k8s_kubevirt_version}/kubevirt-cr.yaml") + _RobustRemoteCommand(vm, "kubectl -n kubevirt wait kv kubevirt --for condition=Available") + + +def _UpdateKubeadmConfigFile(): + cluster_config = { + "apiVersion": "kubeadm.k8s.io/v1beta2", + "kind": "ClusterConfiguration", + "networking": {}, + } + config_file = None + options = [] + for option1 in FLAGS.k8s_kubeadm_options: + if option1.startswith("--pod-network-cidr="): + _, _, cluster_config["networking"]["podSubnet"] = option1.partition("=") + elif option1.startswith("--image-repository="): + _, _, cluster_config["imageRepository"] = option1.partition("=") + elif option1.startswith("--config="): + _, _, config_file = option1.partition("=") + else: + options.append(option1) + + if config_file: + docs = [] + with open(config_file, "r") as fd: + cluster_config_doc = False + for doc1 in safe_load_all(fd): + if doc1 and "kind" in doc1: + if doc1["kind"] == "ClusterConfiguration": + doc1.update(cluster_config) + cluster_config_doc = True + docs.append(doc1) + if not cluster_config_doc: + docs.append(cluster_config) + + config_yaml = dump_all(docs).replace("\n","\\n").replace('"','\\"') + options.append("--config=<(printf \"{}\")".format(config_yaml)) + FLAGS.k8s_kubeadm_options = options + + +def CreateCluster(vm, workers, taint=True): + all_vms = list(set([vm] + workers)) + vm_util.RunThreaded(lambda vm1: _PrepareSystem(vm1), all_vms) + + _SetCNIOptions() + + _UpdateKubeadmConfigFile() + vm.RemoteCommand("sudo bash -c 'kubeadm init "+ ' '.join(FLAGS.k8s_kubeadm_options) + "'") + vm.RemoteCommand('mkdir -p $HOME/.kube') + vm.RemoteCommand('sudo cp -f /etc/kubernetes/admin.conf $HOME/.kube/config') + vm.RemoteCommand("bash -c 'sudo chown $(id -u):$(id -g) $HOME/.kube/config'") + + cmd, _ = vm.RemoteCommand('sudo kubeadm token create --print-join-command') + if not cmd.startswith("kubeadm join "): + raise Exception(f"Invalid kubeadm command: {cmd}") + vm_util.RunThreaded(lambda vm1: _RobustRemoteCommand(vm1, f'sudo {cmd}'), workers) + + _InstallCNI(vm) + _RobustRemoteCommand(vm, "kubectl --namespace=kube-system wait pod --all --for=condition=ready") + + _InstallNFD(vm, all_vms) + _InstallKubeVirt(vm) + + if not taint: + vm.RemoteCommand(f'kubectl taint node --all --overwrite node-role.kubernetes.io/master-') + vm.RemoteCommand(f'kubectl taint node --all --overwrite node-role.kubernetes.io/control-plane-') + + +def _OpenSSLConf(vm): + if vm.BASE_OS_TYPE == os_types.DEBIAN: + return "/etc/ssl/openssl.cnf" + if vm.BASE_OS_TYPE == os_types.RHEL: + return "/etc/pki/tls/openssl.conf" + raise Exception(f"{vm.BASE_OS_TYPE} Not Supported") + + +def _CopyCertsToWorker(vm, certs_dir): + vm.RemoteCommand(f"mkdir -p {certs_dir}") + vm.PushFile(f"{certs_dir}/client.cert", f"{certs_dir}/client.cert") + + if vm.BASE_OS_TYPE == os_types.DEBIAN: + vm.RemoteCommand(f"sudo cp -f {certs_dir}/client.cert 
/usr/local/share/ca-certificates/registry.crt") + vm.RemoteCommand("sudo update-ca-certificates") + elif vm.BASE_OS_TYPE == os_types.RHEL: + vm.RemoteCommand(f"sudo cp -f {certs_dir}/client.cert /etc/pki/ca-trust/source/anchors/registry.crt") + vm.RemoteCommand("sudo update-ca-trust") + else: + raise Exception(f"{vm.BASE_OS_TYPE} not supported") + + if docker_ce.IsDocker(vm): + vm.RemoteCommand("sudo systemctl restart docker") + else: + vm.RemoteCommand("sudo systemctl restart containerd") + vm.RemoteCommand("sudo systemctl restart kubelet") + + +def CreateRegistry(vm, workers, port=5000): + registry_url = f"{vm.internal_ip}:{port}" + certs_dir = vm_util.PrependTempDir(f"{REGISTRY_CERTS_DIR}/{registry_url}") + + vm_util.IssueCommand(["mkdir", "-p", certs_dir]) + vm_util.IssueCommand(["openssl", "req", "-newkey", "rsa:4096", "-nodes", "-sha256", "-keyout", f"{certs_dir}/client.key", "--addext", f"subjectAltName = IP:{vm.internal_ip}", "-x509", "-days", "365", "-out", f"{certs_dir}/client.cert", "-subj", f"/CN={vm.internal_ip}"]) + vm_util.IssueCommand(["chmod", "400", f"{certs_dir}/client.key"]) + #vm_util.IssueCommand(["cp", "-f", f"{certs_dir}/client.cert", f"{certs_dir}/ca.crt"]) + + vm_util.RunThreaded(lambda vm1: _CopyCertsToWorker(vm1, certs_dir), list(set(workers+[vm]))) + + vm.PushFile(f"{certs_dir}/client.key", f"{certs_dir}/client.key") + _RobustRemoteCommand(vm, f"kubectl create secret tls registry-cert --cert={certs_dir}/client.cert --key={certs_dir}/client.key") + + registry_yaml = REGISTRY_YAML.format(HOSTIP=vm.internal_ip, HOSTPORT=port).replace('"', '\\"') + vm.RemoteCommand(f"bash -c 'kubectl create -f <(printf \"{registry_yaml}\")'") + _RobustRemoteCommand(vm, f"kubectl wait pod --for=condition=ready -l app=registry") + return registry_url + + +@vm_util.Retry() +def _RobustRemoteCommand(vm, cmd): + vm.RemoteCommand(cmd) + + +def Uninstall(vm): + try: + vm.RemoteCommand("sudo kubeadm reset --force") + except: + vm.RemoteCommand("sudo rm -rf /etc/kubernetes $HOME/.config", ignore_failure=True) + vm.RemoteCommand("sudo kubeadm reset --force", ignore_failure=True) + vm.RemoteCommand("sudo ip link delete cni0", ignore_failure=True) + vm.RemoteCommand("sudo rm -rf /etc/cni /var/lib/cni", ignore_failure=True) + + if FLAGS.k8s_cni == "Flannel": + vm.RemoteCommand("sudo ip link delete flannel.1", ignore_failure=True) diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/memcached_server.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/memcached_server.py new file mode 100644 index 0000000..b127f33 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/memcached_server.py @@ -0,0 +1,151 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +"""Module containing memcached server installation and cleanup functions.""" + +import logging +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import linux_packages +from perfkitbenchmarker import vm_util + +FLAGS = flags.FLAGS + +MEMCACHED_PORT = 11211 + +flags.DEFINE_integer('memcached_size_mb', 64, + 'Size of memcached cache in megabytes.') + +flags.DEFINE_integer('memcached_num_threads', 4, + 'Number of worker threads.') +flags.DEFINE_string('memcached_version', '1.6.9', + 'Memcached version to use.') + +DIR = linux_packages.INSTALL_DIR + + +def _Install(vm): + """Installs the memcached server on the VM.""" + vm.InstallPackages('wget') + vm.Install('build_tools') + vm.Install('event') + vm.RemoteCommand( + f'cd {DIR}; ' + 'wget https://www.memcached.org/files/' + f'memcached-{FLAGS.memcached_version}.tar.gz --no-check-certificate; ' + f'tar -zxvf memcached-{FLAGS.memcached_version}.tar.gz; ' + f'cd memcached-{FLAGS.memcached_version}; ' + './configure && make && sudo make install') + + +def YumInstall(vm): + """Installs the memcache package on the VM.""" + _Install(vm) + + +def AptInstall(vm): + """Installs the memcache package on the VM.""" + _Install(vm) + + +@vm_util.Retry(poll_interval=5, timeout=300, + retryable_exceptions=(errors.Resource.RetryableCreationError)) +def _WaitForServerUp(vm, port=MEMCACHED_PORT): + """Block until the memcached server is up and responsive. + + Will timeout after 5 minutes, and raise an exception. Before the timeout + expires any exceptions are caught and the status check is retried. + + We check the status of the server by issuing a 'stats' command. This should + return many lines of form 'STAT ' if the server is up and + running. + + Args: + vm: VirtualMachine memcached has been installed on. + port: int. Memcached port to use. + + Raises: + errors.Resource.RetryableCreationError when response is not as expected or + if there is an error connecting to the port or otherwise running the + remote check command. + """ + address = vm.internal_ip + + logging.info('Trying to connect to memcached at %s:%s', address, port) + try: + out, _ = vm.RemoteCommand( + '(echo -e "stats\n")| netcat -q 1 %s %s' % (address, port)) + if out.startswith('STAT '): + logging.info('memcached server stats received. Server up and running.') + return + except errors.VirtualMachine.RemoteCommandError as e: + raise errors.Resource.RetryableCreationError( + 'memcached server not up yet: %s.' % str(e)) + else: + raise errors.Resource.RetryableCreationError( + 'memcached server not up yet. Expected "STAT" but got "%s".' % out) + + +def ConfigureAndStart(vm, port=MEMCACHED_PORT, smp_affinity=False): + """Prepare the memcached server on a VM. + + Args: + vm: VirtualMachine to install and start memcached on. + port: int. Memcached port to use. + smp_affinity: Boolean. Whether or not to set smp_affinity. 
+ """ + vm.Install('memcached_server') + if smp_affinity: + vm.SetSmpAffinity() + + for scratch_disk in vm.scratch_disks: + vm.RemoteCommand('sudo umount %s' % scratch_disk.mount_point) + + vm.RemoteCommand( + 'ulimit -n 32768; ' + 'sudo nohup memcached ' + # Increase maximum memcached server connections + '-u $USER -c 32768 ' + f'-t {FLAGS.memcached_num_threads} ' + # update default port + f'-p {port} ' + # update memory size + f'-m {FLAGS.memcached_size_mb} ' + # update security config to allow incoming network + '-l 0.0.0.0 -v &> log &') + + _WaitForServerUp(vm, port) + logging.info('memcached server configured and started.') + + +def GetVersion(vm): + """Returns the version of the memcached server installed.""" + results, _ = vm.RemoteCommand('memcached -help |grep -m 1 "memcached"' + '| tr -d "\n"') + return results + + +def StopMemcached(vm): + vm.RemoteCommand('sudo pkill -9 memcached', ignore_failure=True) + + +def FlushMemcachedServer(ip, port): + vm_util.IssueCommand( + '(echo -e "flush_all\n" ; sleep 1)| netcat %s %s' % (ip, port)) + + +def AptUninstall(vm): + """Removes the memcache package on the VM.""" + del vm diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/proxy.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/proxy.py new file mode 100644 index 0000000..213f7a5 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/proxy.py @@ -0,0 +1,5 @@ + +def AddProxy(vm, service): + vm.RemoteCommand(f'sudo mkdir -p /etc/systemd/system/{service}.service.d') + vm.RemoteCommand(f'printf "[Service]\\nEnvironment=\"HTTP_PROXY=$http_proxy\" \"HTTPS_PROXY=$https_proxy\" \"NO_PROXY=$no_proxy\"\\n" | sudo tee /etc/systemd/system/{service}.service.d/proxy.conf') + diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/ruby.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/ruby.py new file mode 100644 index 0000000..1bd4ecc --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/ruby.py @@ -0,0 +1,54 @@ +from perfkitbenchmarker import errors +from absl import flags +import logging + + +FLAGS = flags.FLAGS + +flags.DEFINE_string('ruby_version', '2.7', + 'Version of ruby to be installed') +EMON_MAIN_DIR = '/opt/emon' + + +def _Install(vm): + cmds = ['curl -sSL https://rvm.io/mpapis.asc | gpg2 --import -', + 'curl -sSL https://rvm.io/pkuczynski.asc | gpg2 --import -', + 'curl -L get.rvm.io | bash -s stable', + 'sed -i "1 i\source $HOME/.rvm/scripts/rvm" ~/.bashrc'] + vm.RemoteCommand(' && '.join(cmds)) + cmds = ['rvm reload', + 'rvm requirements run', + 'rvm install {0} '.format(FLAGS.ruby_version)] + vm.RemoteCommand(' && '.join(cmds)) + + +def _CompatibleRubyVersion(vm): + installed_ruby_version = vm.RemoteCommand("ruby -v | awk -F' ' '{{print $2}}'")[0].rstrip("\n") + # If apt or yum downloads a compatible version, then don't download rvm + logging.info("Installed Version: {}, Version Needed: {}" + .format(installed_ruby_version, FLAGS.ruby_version)) + if FLAGS.ruby_version in installed_ruby_version: + return True + vm.RemoteCommand('sudo touch {0}/pkb_ruby_file'.format(EMON_MAIN_DIR)) + return False + + +def YumInstall(vm): + """Installs the package on the VM.""" + vm.RemoteCommand('sudo yum install -y ruby') + if not _CompatibleRubyVersion(vm): + logging.info("Installing a newer compatible verison of ruby") + vm.InstallPackages('gcc-c++ patch readline readline-devel zlib zlib-devel libffi-devel ' + 'openssl-devel make bzip2 autoconf automake libtool bison sqlite-devel') + _Install(vm) + + +def AptInstall(vm): + 
"""Installs the package on the VM.""" + vm.RemoteCommand('sudo apt-get install -y ruby') + if not _CompatibleRubyVersion(vm): + logging.info("Installing a newer compatible verison of ruby") + vm.InstallPackages('gnupg2 curl g++ gcc autoconf automake bison libc6-dev libffi-dev ' + 'libgdbm-dev libncurses5-dev libsqlite3-dev libtool libyaml-dev make ' + 'pkg-config sqlite3 zlib1g-dev libgmp-dev libreadline-dev libssl-dev') + _Install(vm) diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/runwith.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/runwith.py new file mode 100644 index 0000000..81364bf --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/runwith.py @@ -0,0 +1,95 @@ + +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.linux_packages import INSTALL_DIR +from absl import flags +import posixpath +import json + +FLAGS = flags.FLAGS +flags.DEFINE_boolean("run_with_vm", False, + "Whether to run with VM or docker") + + +def DockerRun(vm, options, image): + if FLAGS.run_with_vm: + stdout, _ = vm.RemoteCommand("sudo -E docker create {options} {image}".format(options=" ".join(options), image=image)) + container = stdout.strip() + + stdout, _ = vm.RemoteCommand(f"sudo -E docker container inspect {container}" + " -f '{{json .}}'") + inspect = json.loads(stdout) + + root_dir = posixpath.join(INSTALL_DIR, container) + vm.RemoteCommand(f"mkdir -p {root_dir}") + vm.RemoteCommand(f"sudo -E docker container export {container} | sudo tar xf - -C {root_dir}") + for mount1 in inspect["Mounts"]: + mount_path = posixpath.join(root_dir, mount1["Destination"][1:]) + vm.RemoteCommand(f"sudo mkdir -p {mount_path}") + vm.RemoteCommand("sudo mount --bind {src} {dst}".format(src=mount1["Source"], dst=mount_path)) + + mount_path = posixpath.join(root_dir, "proc") + vm.RemoteCommand(f"sudo mount -t proc /proc {mount_path}") + for mount1 in ["sys", "dev"]: + mount_path = posixpath.join(root_dir, mount1) + vm.RemoteCommand(f"sudo mount --rbind /{mount1} {mount_path}") + vm.RemoteCommand(f"sudo mount --make-rslave {mount_path}") + + cmds = [] + for cmd1 in inspect["Config"]["Cmd"]: + if " " in cmd1: + cmds.append("'{}'".format(cmd1.replace("'", "'\\''"))) + else: + cmds.append(cmd1) + + working_dir = inspect["Config"]["WorkingDir"] if inspect["Config"]["WorkingDir"] else "/" + logfile = posixpath.join(root_dir, ".logs") + user = inspect["Config"]["User"] if inspect["Config"]["User"] else "root" + stdout, _ = vm.RemoteCommand("sudo chroot --userspec={userspec} {root_dir} bash -c 'cd {dir};{envs} {cmd}' > {logfile} 2>&1 & echo $!".format(envs=" ".join(inspect["Config"]["Env"]), root_dir=root_dir, cmd=" ".join(cmds).replace("'", "'\\''"), dir=working_dir, logfile=logfile, userspec=user)) + pid = stdout.strip() + return (container, pid) + + stdout, _ = vm.RemoteCommand("sudo -E docker run --rm -d {options} {image}".format(options=" ".join(options), image=image)) + container = stdout.strip() + + # set perf cgroup + FLAGS.perf_options = FLAGS.perf_options.replace("%container%", f"docker/{container}") + + return (container, None) + + +def DockerWaitForCompletion(vm, container, timeout, logs_file): + if FLAGS.run_with_vm: + root_dir = posixpath.join(INSTALL_DIR, container) + export_logs = posixpath.join(root_dir, "export-logs") + vm.RemoteCommand(f"timeout {timeout}s cat {export_logs} > {logs_file}") + else: + vm.RemoteCommand(f"timeout {timeout}s sudo -E docker exec {container} cat /export-logs > {logs_file}") + + +def DockerLogsCmd(container, flags=""): + if 
FLAGS.run_with_vm: + root_dir = posixpath.join(INSTALL_DIR, container) + log_file = posixpath.join(root_dir, ".logs") + return f"sudo tail {flags} {log_file}" + + return f"sudo -E docker logs {flags} {container}" + + +def DockerRemove(vm, containers, container_id, pid): + if FLAGS.run_with_vm: + vm.RemoteCommand(f"sudo kill -9 {pid}", ignore_failure=True) + + stdout, _ = vm.RemoteCommand(f"sudo -E docker container inspect {container_id}" + " -f '{{json .}}'") + inspect = json.loads(stdout) + + root_dir = posixpath.join(INSTALL_DIR, container_id) + stdout, _ = vm.RemoteCommand(f"sudo mount | cut -f3 -d' ' | sort") + last_umounted = "na" + for mount_path in stdout.split("\n"): + if mount_path.startswith(root_dir + '/') and not mount_path.startswith(last_umounted): + vm.RemoteCommand(f"sudo umount -R {mount_path}", ignore_failure=True) + last_umounted = mount_path + + vm.RemoteCommand(f"sudo rm -rf {root_dir}") + + vm.RemoteCommand("sudo -E docker rm -v -f {}".format(" ".join(containers + [container_id])), ignore_failure = True) + diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/skopeo.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/skopeo.py new file mode 100644 index 0000000..313b726 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/skopeo.py @@ -0,0 +1,69 @@ + +from perfkitbenchmarker import vm_util +#from perfkitbenchmarker.linux_packages import k8s +from absl import flags + +FLAGS = flags.FLAGS +flags.DEFINE_list("skopeo_insecure_registries", [], + "Specify the skopeo command line options") +flags.DEFINE_list("skopeo_sut_accessible_registries", [], + "Specify the list of registries that the SUT has access.") +flags.DEFINE_string("skopeo_src_cert_dir", None, + "Specify the source certs directory") + + +def IsSUTAccessible(registry_url): + return registry_url in FLAGS.skopeo_sut_accessible_registries + + +def InspectImage(image, registry_url): + if registry_url: + if image.startswith(registry_url) and not IsSUTAccessible(registry_url): + options = [] + if FLAGS.skopeo_src_cert_dir: + options.append(f"--src-cert-dir={FLAGS.skopeo_src_cert_dir}") + for r1 in FLAGS.skopeo_insecure_registries: + r2 = r1 if r1.endswith("/") else r1 + "/" + if image.startswith(r2): + options.append("--src-tls-verify=false") + break + basename = image[len(registry_url):] + return (basename, options + [f"docker://{image}"]) + return None + + try: + vm_util.IssueCommand(["skopeo", "inspect", f"docker-daemon:{image}"]) + return (image, [f"docker-daemon:{image}"]) + except: + pass + return None + + +@vm_util.Retry() +def _RobustSkopeoCopy(cmd): + vm_util.IssueCommand(["sudo", "-E", "skopeo", "copy"] + cmd, timeout=None) + + +def CopyImagesToDocker(vm, images, port=12222): + daemon_url = f"localhost:{port}" + vm.RemoteCommand("", ssh_args = ["-fNL", f"{daemon_url}:/var/run/docker.sock"]) + for image1 in images: + _RobustSkopeoCopy([f"--dest-daemon-host=http://{daemon_url}"] + images[image1][1] + [f"docker-daemon:{image1}"]) + + # cancel forwaring + vm.RemoteCommand("", ssh_args = ["-O", "exit"]) + + +def CopyImagesToRegistry(vm, images, registry_url, port=16666): + local_registry_url = f"localhost:{port}" + vm.RemoteCommand("", ssh_args = ["-fNL", f"{local_registry_url}:{registry_url}"]) + + #certs_dir = vm_util.PrependTempDir(k8s.REGISTRY_CERTS_DIR) + for image1 in images: + local_image=f"{local_registry_url}/{images[image1][0]}" + #_RobustSkopeoCopy([f"--dest-cert-dir={certs_dir}", "--dest-tls-verify=false"] + images[image1][1] + [f"docker://{local_image}"]) + 
_RobustSkopeoCopy(["--dest-tls-verify=false"] + images[image1][1] + [f"docker://{local_image}"]) + + # cancel forwaring + vm.RemoteCommand("", ssh_args = ["-O", "exit"]) + diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/spark.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/spark.py new file mode 100644 index 0000000..8fe20a2 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/spark.py @@ -0,0 +1,237 @@ +# Copyright 2021 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing Apache Spark installation and configuration. + +For documentation of Spark Stalanone clusters, see: +https://spark.apache.org/docs/latest/spark-standalone.html +""" +import functools +import logging +import os +import posixpath +import time +from typing import Dict + +from absl import flags +from packaging import version +from perfkitbenchmarker import data +from perfkitbenchmarker import linux_packages +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.linux_packages import aws_credentials +from perfkitbenchmarker.linux_packages import hadoop + +FLAGS = flags.FLAGS + +SPARK_VERSION_FLAG = flags.DEFINE_string('spark_version', '3.1.2', + 'Version of spark.') + +DATA_FILES = [ + 'spark/spark-defaults.conf.j2', 'spark/spark-env.sh.j2', 'spark/workers.j2' +] + +SPARK_DIR = posixpath.join(linux_packages.INSTALL_DIR, 'spark') +SPARK_BIN = posixpath.join(SPARK_DIR, 'bin') +SPARK_SBIN = posixpath.join(SPARK_DIR, 'sbin') +SPARK_CONF_DIR = posixpath.join(SPARK_DIR, 'conf') +SPARK_PRIVATE_KEY = posixpath.join(SPARK_CONF_DIR, 'spark_keyfile') + +SPARK_SUBMIT = posixpath.join(SPARK_BIN, 'spark-submit') + + +def _SparkVersion() -> version.Version: + return version.Version(SPARK_VERSION_FLAG.value) + + +def _ScalaVersion() -> version.Version: + if _SparkVersion().major >= 3: + # https://spark.apache.org/docs/3.0.0/#downloading + return version.Version('2.12') + else: + # https://spark.apache.org/docs/2.4.0/#downloading + return version.Version('2.11') + + +def SparkExamplesJarPath() -> str: + return posixpath.join( + SPARK_DIR, 'examples/jars/', + f'spark-examples_{_ScalaVersion()}-{_SparkVersion()}.jar') + + +def CheckPrerequisites(): + """Verifies that the required resources are present. + + Raises: + perfkitbenchmarker.data.ResourceNotFound: On missing resource. + """ + for resource in DATA_FILES: + data.ResourcePath(resource) + + +def Install(vm): + vm.Install('openjdk') + vm.Install('python3') + vm.Install('curl') + # Needed for HDFS not as a dependency. + # Also used on Spark's classpath to support s3a client. + vm.Install('hadoop') + spark_url = ('https://downloads.apache.org/spark/spark-{0}/' + 'spark-{0}-bin-without-hadoop.tgz').format(FLAGS.spark_version) + vm.RemoteCommand( + ('mkdir {0} && curl -L {1} | ' + 'tar -C {0} --strip-components=1 -xzf -').format(SPARK_DIR, spark_url)) + + +# Scheduling constants. +# Give 90% of VM memory to Spark for scheduling. 
+# This is roughly consistent with Dataproc 2.0+ +SPARK_MEMORY_FRACTION = 0.9 +SPARK_DRIVER_MEMORY = 'spark.driver.memory' +SPARK_WORKER_MEMORY = 'spark.executor.memory' +SPARK_WORKER_VCPUS = 'spark.executor.cores' + + +def GetConfiguration(driver_memory_mb: int, + worker_memory_mb: int, + worker_cores: int, + num_workers: int, + configure_s3: bool = False) -> Dict[str, str]: + """Calculate Spark configuration. Shared between VMs and k8s.""" + conf = { + SPARK_DRIVER_MEMORY: f'{driver_memory_mb}m', + SPARK_WORKER_MEMORY: f'{worker_memory_mb}m', + SPARK_WORKER_VCPUS: str(worker_cores), + 'spark.executor.instances': str(num_workers), + # Tell spark not to run job if it can't schedule all workers. This would + # silently degrade performance. + 'spark.scheduler.minRegisteredResourcesRatio': '1' + } + if configure_s3: + # Configure S3A Hadoop's S3 filesystem + aws_access_key, aws_secret_key = aws_credentials.GetCredentials() + conf.update({ + # Use s3:// scheme to be consistent with EMR + 'spark.hadoop.fs.s3.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem', + 'spark.hadoop.fs.s3a.access.key': aws_access_key, + 'spark.hadoop.fs.s3a.secret.key': aws_secret_key, + }) + return conf + + +def _RenderConfig(vm, + leader, + workers, + memory_fraction=SPARK_MEMORY_FRACTION, + configure_s3=False): + """Load Spark Condfiguration on VM.""" + # Use first worker to get worker configuration + worker = workers[0] + worker_cores = worker.NumCpusForBenchmark() + worker_memory_mb = int((worker.total_memory_kb / 1024) * memory_fraction) + driver_memory_mb = int((leader.total_memory_kb / 1024) * memory_fraction) + + spark_conf = GetConfiguration( + driver_memory_mb=driver_memory_mb, + worker_memory_mb=worker_memory_mb, + worker_cores=worker_cores, + num_workers=len(workers), + configure_s3=configure_s3) + + if vm.scratch_disks: + # TODO(pclay): support multiple scratch disks. A current suboptimal + # workaround is RAID0 local_ssds with --num_striped_disks. + scratch_dir = posixpath.join(vm.GetScratchDir(), 'spark') + else: + scratch_dir = posixpath.join('/tmp/pkb/local_scratch', 'spark') + + optional_tools = None + if configure_s3: + optional_tools = 'hadoop-aws' + + context = { + 'spark_conf': spark_conf, + 'leader_ip': leader.internal_ip, + 'worker_ips': [vm.internal_ip for vm in workers], + 'scratch_dir': scratch_dir, + 'worker_vcpus': worker_cores, + 'spark_private_key': SPARK_PRIVATE_KEY, + 'worker_memory': spark_conf[SPARK_WORKER_MEMORY], + 'hadoop_cmd': hadoop.HADOOP_CMD, + 'python_cmd': 'python3', + 'optional_tools': optional_tools + } + + for file_name in DATA_FILES: + file_path = data.ResourcePath(file_name) + if file_name == 'spark/workers.j2': + # Spark calls its worker list slaves. + file_name = 'spark/slaves.j2' + remote_path = posixpath.join(SPARK_CONF_DIR, os.path.basename(file_name)) + if file_name.endswith('.j2'): + vm.RenderTemplate(file_path, os.path.splitext(remote_path)[0], context) + else: + vm.RemoteCopy(file_path, remote_path) + + +def _GetOnlineWorkerCount(leader): + """Curl Spark Master Web UI for worker status.""" + cmd = ('curl http://localhost:8080 ' + "| grep 'Alive Workers' " + "| grep -o '[0-9]\\+'") + stdout = leader.RemoteCommand(cmd)[0] + return int(stdout) + + +def ConfigureAndStart(leader, workers, configure_s3=False): + """Run Spark Standalone and HDFS on a cluster. + + Args: + leader: VM. leader VM - will be the HDFS NameNode, Spark Master. + workers: List of VMs. Each VM will run an HDFS DataNode, Spark Worker. + configure_s3: Whether to configure Spark to access S3. 
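+
+  Raises:
+    ValueError: If not all of the Spark workers register with the master.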
+ """ + # Start HDFS + hadoop.ConfigureAndStart(leader, workers, start_yarn=False) + + vms = [leader] + workers + # If there are no workers set up in pseudo-distributed mode, where the leader + # node runs the worker daemons. + workers = workers or [leader] + fn = functools.partial( + _RenderConfig, leader=leader, workers=workers, configure_s3=configure_s3) + vm_util.RunThreaded(fn, vms) + + leader.RemoteCommand("rm -f {0} && ssh-keygen -q -t rsa -N '' -f {0}".format( + SPARK_PRIVATE_KEY)) + + public_key = leader.RemoteCommand('cat {0}.pub'.format(SPARK_PRIVATE_KEY))[0] + + def AddKey(vm): + vm.RemoteCommand('echo "{0}" >> ~/.ssh/authorized_keys'.format(public_key)) + + vm_util.RunThreaded(AddKey, vms) + + # HDFS setup and formatting, Spark startup + leader.RemoteCommand( + 'bash {0}/start-all.sh'.format(SPARK_SBIN), should_log=True) + + logging.info('Sleeping 10s for Spark nodes to join.') + time.sleep(10) + + logging.info('Checking Spark status.') + worker_online_count = _GetOnlineWorkerCount(leader) + if worker_online_count != len(workers): + raise ValueError('Not all nodes running Spark: {0} < {1}'.format( + worker_online_count, len(workers))) + else: + logging.info('Spark running on all %d workers', len(workers)) diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/storage_tools.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/storage_tools.py new file mode 100644 index 0000000..cedbc7c --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/storage_tools.py @@ -0,0 +1,20 @@ +"""Module containing storage_tools installation. + +This provides nvme-cli among other useful storage tools. +""" + + +def _Install(vm): + vm.InstallPackages('nvme-cli') + + +def YumInstall(vm): + _Install(vm) + + +def AptInstall(vm): + _Install(vm) + + +def SwupdInstall(vm): + vm.InstallPackages('storage-utils') diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_packages/stream.py b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/stream.py new file mode 100644 index 0000000..685355e --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_packages/stream.py @@ -0,0 +1,96 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +"""Module containing STREAM installation and cleanup functions.""" + +from perfkitbenchmarker.linux_packages import INSTALL_DIR +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import os_types +from perfkitbenchmarker import vm_util + +FLAGS = flags.FLAGS + +STREAM_DIR = '%s/STREAM' % INSTALL_DIR +STREAM_PATH = STREAM_DIR + '/stream' +GIT_REPO = 'https://github.com/jeffhammond/STREAM.git' + + +def GetStreamExec(vm): + if FLAGS.stream_omp_num_threads == 0: + num_threads = vm.num_cpus + else: + num_threads = FLAGS.stream_omp_num_threads + + if num_threads <= 1: + raise errors.Setup.InvalidSetupError( + 'Stream could not be run on machine with CPU/Threads number <= 1') + + if FLAGS.stream_binary_url: + source_psxe_vars = "source /opt/intel/psxe_runtime/linux/bin/psxevars.sh" + return 'export OMP_NUM_THREADS={0};export GOMP_CPU_AFFINITY="0-{1}:1";{2} && {3}'.format(num_threads, + num_threads - 1, + source_psxe_vars, + STREAM_PATH) + else: + return 'export OMP_NUM_THREADS={0};export GOMP_CPU_AFFINITY="0-{1}:1";{2}'.format(num_threads, + num_threads - 1, + STREAM_PATH) + + +def _GetInternalResources(vm, url): + stream_path = vm_util.PrependTempDir('stream') + vm_util.IssueCommand("curl -SL {0} -o {1}".format(url, stream_path).split(), timeout=None) + vm.RemoteCommand("mkdir -p {0}".format(STREAM_DIR)) + vm.RemoteCopy(stream_path, STREAM_PATH) + vm.RemoteCommand("sudo chmod +x {0}".format(STREAM_PATH)) + + +def _Install(vm): + """Installs the stream package on the VM.""" + if FLAGS.stream_binary_url: + vm.Install('intel_parallel_studio_runtime') + _GetInternalResources(vm, FLAGS.stream_binary_url) + else: + vm.RemoteCommand('git clone {0} {1}'.format(GIT_REPO, STREAM_DIR)) + vm.RemoteCommand('cd {0}; {1} {2} ' + '-DSTREAM_ARRAY_SIZE={3} -DNTIMES={4} -DOFFSET={5} ' + 'stream.c -o stream'.format(STREAM_DIR, + FLAGS.compiler, + FLAGS.stream_compiler_flags, + FLAGS.stream_array_size, + FLAGS.stream_ntimes, + FLAGS.stream_offset) + ) + + +def YumInstall(vm): + """Installs the stream package on the VM.""" + # for RHEL 7 + if vm.OS_TYPE == os_types.RHEL: + raise NotImplementedError + vm.Install('build_tools') + _Install(vm) + + +def AptInstall(vm): + """Installs the stream package on the VM.""" + vm.Install('build_tools') + vm.Install('compiler') + _Install(vm) + + +def Uninstall(vm): + vm.RemoteCommand('sudo rm -rf {0}'.format(STREAM_DIR)) diff --git a/script/cumulus/pkb/perfkitbenchmarker/linux_virtual_machine.py b/script/cumulus/pkb/perfkitbenchmarker/linux_virtual_machine.py new file mode 100644 index 0000000..cd15599 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/linux_virtual_machine.py @@ -0,0 +1,3035 @@ +# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module containing mixin classes for linux virtual machines. + +These classes allow installation on both Debian and RHEL based linuxes. 
+They also handle some initial setup (especially on RHEL based linuxes +since by default sudo commands without a tty don't work) and +can restore the VM to the state it was in before packages were +installed. + +To install a package on a VM, just call vm.Install(package_name). +The package name is just the name of the package module (i.e. the +file name minus .py). The framework will take care of all cleanup +for you. +""" + +import abc +import collections +import copy +import logging +import os +import pipes +import posixpath +import re +import threading +import time +from typing import Dict, Set +import uuid + +from absl import flags +from perfkitbenchmarker import context +from perfkitbenchmarker import disk +from perfkitbenchmarker import errors +from perfkitbenchmarker import linux_packages +from perfkitbenchmarker import os_types +from perfkitbenchmarker import regex_util +from perfkitbenchmarker import virtual_machine +from perfkitbenchmarker import vm_util + +import yaml + +FLAGS = flags.FLAGS + + +OS_PRETTY_NAME_REGEXP = r'PRETTY_NAME="(.*)"' +CLEAR_BUILD_REGEXP = r'Installed version:\s*(.*)\s*' +UPDATE_RETRIES = 5 +DEFAULT_SSH_PORT = 22 +REMOTE_KEY_PATH = '~/.ssh/id_rsa' +CONTAINER_MOUNT_DIR = '/mnt' +CONTAINER_WORK_DIR = '/root' + +# This pair of scripts used for executing long-running commands, which will be +# resilient in the face of SSH connection errors. +# EXECUTE_COMMAND runs a command, streaming stdout / stderr to a file, then +# writing the return code to a file. An exclusive lock is acquired on the return +# code file, so that other processes may wait for completion. +EXECUTE_COMMAND = 'execute_command.py' +# WAIT_FOR_COMMAND waits on the file lock created by EXECUTE_COMMAND, +# then copies the stdout and stderr, exiting with the status of the command run +# by EXECUTE_COMMAND. +WAIT_FOR_COMMAND = 'wait_for_command.py' + +_DEFAULT_DISK_FS_TYPE = 'ext4' +_DEFAULT_DISK_MOUNT_OPTIONS = 'discard' +_DEFAULT_DISK_FSTAB_OPTIONS = 'defaults' + +# regex for parsing lscpu and /proc/cpuinfo +_COLON_SEPARATED_RE = re.compile(r'^\s*(?P.*?)\s*:\s*(?P.*?)\s*$') + +flags.DEFINE_bool('setup_remote_firewall', False, + 'Whether PKB should configure the firewall of each remote' + 'VM to make sure it accepts all internal connections.') + +flags.DEFINE_list('sysctl', [], + 'Sysctl values to set. This flag should be a comma-separated ' + 'list of path=value pairs. Each pair will be appended to' + '/etc/sysctl.conf. The presence of any items in this list ' + 'will cause a reboot to occur after VM prepare. ' + 'For example, if you pass ' + '--sysctls=vm.dirty_background_ratio=10,vm.dirty_ratio=25, ' + 'PKB will append "vm.dirty_background_ratio=10" and' + '"vm.dirty_ratio=25" on separate lines to /etc/sysctrl.conf' + ' and then the machine will be rebooted before starting' + 'the benchmark.') + +flags.DEFINE_list( + 'set_files', + [], + 'Arbitrary filesystem configuration. This flag should be a ' + 'comma-separated list of path=value pairs. Each value will ' + 'be written to the corresponding path. For example, if you ' + 'pass --set_files=/sys/kernel/mm/transparent_hugepage/enabled=always, ' + 'then PKB will write "always" to ' + '/sys/kernel/mm/transparent_hugepage/enabled before starting ' + 'the benchmark.') + +flags.DEFINE_bool('network_enable_BBR', False, + 'A shortcut to enable BBR congestion control on the network. 
' + 'equivalent to appending to --sysctls the following values ' + '"net.core.default_qdisc=fq, ' + '"net.ipv4.tcp_congestion_control=bbr" ' + 'As with other sysctrls, will cause a reboot to happen.') + +flags.DEFINE_integer('num_disable_cpus', None, + 'Number of CPUs to disable on the virtual machine.' + 'If the VM has n CPUs, you can disable at most n-1.', + lower_bound=1) +flags.DEFINE_integer('disk_fill_size', 0, + 'Size of file to create in GBs.') +flags.DEFINE_enum('disk_fs_type', _DEFAULT_DISK_FS_TYPE, + [_DEFAULT_DISK_FS_TYPE, 'xfs'], + 'File system type used to format disk.') +flags.DEFINE_integer( + 'disk_block_size', None, 'Block size to format disk with.' + 'Defaults to 4096 for ext4.') + +flags.DEFINE_bool( + 'enable_transparent_hugepages', None, 'Whether to enable or ' + 'disable transparent hugepages. If unspecified, the setting ' + 'is unchanged from the default in the OS.') + +flags.DEFINE_integer( + 'ssh_retries', 10, 'Default number of times to retry SSH.', lower_bound=0) + +flags.DEFINE_integer( + 'scp_connect_timeout', 30, 'timeout for SCP connection.', lower_bound=0) + +flags.DEFINE_string( + 'append_kernel_command_line', None, + 'String to append to the kernel command line. The presence of any ' + 'non-empty string will cause a reboot to occur after VM prepare. ' + 'If unspecified, the kernel command line will be unmodified.') + +flags.DEFINE_integer( + 'tcp_max_receive_buffer', None, + 'Changes the third component of the sysctl value net.ipv4.tcp_rmem. ' + 'This sets the maximum receive buffer for TCP socket connections in bytes. ' + 'Increasing this value may increase single stream TCP throughput ' + 'for high latency connections') + +flags.DEFINE_integer( + 'tcp_max_send_buffer', None, + 'Changes the third component of the sysctl value net.ipv4.tcp_wmem. ' + 'This sets the maximum send buffer for TCP socket connections in bytes. ' + 'Increasing this value may increase single stream TCP throughput ' + 'for high latency connections') + +flags.DEFINE_integer( + 'rmem_max', None, + 'Sets the sysctl value net.core.rmem_max. This sets the max OS ' + 'receive buffer size in bytes for all types of connections') + +flags.DEFINE_integer( + 'wmem_max', None, + 'Sets the sysctl value net.core.wmem_max. This sets the max OS ' + 'send buffer size in bytes for all types of connections') + +flags.DEFINE_bool( + 'skip_package_restore', False, + 'Skip package restoring in the Teardown phase') + +flags.DEFINE_boolean('gce_hpc_tools', False, + 'Whether to apply the hpc-tools environment script.') + +GOVERNORS = ["conservative", "ondemand", "userspace", "powersave", "performance", "schedutil"] +flags.DEFINE_enum('scaling_governor', None, GOVERNORS, 'If specified, sets the cpufreq governor.') + +flags.DEFINE_boolean('disable_smt', False, + 'Whether to disable SMT (Simultaneous Multithreading) ' + 'in BIOS.') + +flags.DEFINE_boolean('enable_rsync', False, + 'Whether to enable rsync as the remote copy app') + +_DISABLE_YUM_CRON = flags.DEFINE_boolean( + 'disable_yum_cron', True, 'Whether to disable the cron-run yum service.') + +RETRYABLE_SSH_RETCODE = 255 + + +class CpuVulnerabilities: + """The 3 different vulnerablity statuses from vm.cpu_vulernabilities. + + Example input: + /sys/devices/system/cpu/vulnerabilities/itlb_multihit:KVM: Vulnerable + Is put into vulnerability with a key of "itlb_multihit" and value "KVM" + + Unparsed lines are put into the unknown dict. 
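+
+  asdict then reports this entry as "vulnerability_itlb_multihit": "KVM" and
+  lists "itlb_multihit" under the "vulnerabilities" key.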
+ """ + + def __init__(self): + self.mitigations: Dict[str, str] = {} + self.vulnerabilities: Dict[str, str] = {} + self.notaffecteds: Set[str] = set() + self.unknowns: Dict[str, str] = {} + + def AddLine(self, full_line: str) -> None: + """Parses a line of output from the cpu/vulnerabilities/* files.""" + if not full_line: + return + file_path, line = full_line.split(':', 1) + file_name = posixpath.basename(file_path) + if self._AddMitigation(file_name, line): + return + if self._AddVulnerability(file_name, line): + return + if self._AddNotAffected(file_name, line): + return + self.unknowns[file_name] = line + + def _AddMitigation(self, file_name, line): + match = re.match('^Mitigation: (.*)', line) or re.match( + '^([^:]+): Mitigation: (.*)$', line) + if match: + self.mitigations[file_name] = ':'.join(match.groups()) + return True + + def _AddVulnerability(self, file_name, line): + match = re.match('^Vulnerable: (.*)', line) or re.match( + '^Vulnerable$', line) or re.match('^([^:]+): Vulnerable$', line) + if match: + self.vulnerabilities[file_name] = ':'.join(match.groups()) + return True + + def _AddNotAffected(self, file_name, line): + match = re.match('^Not affected$', line) + if match: + self.notaffecteds.add(file_name) + return True + + @property + def asdict(self) -> Dict[str, str]: + """Returns the parsed CPU vulnerabilities as a dict.""" + ret = {} + if self.mitigations: + ret['mitigations'] = ','.join(sorted(self.mitigations)) + for key, value in self.mitigations.items(): + ret[f'mitigation_{key}'] = value + if self.vulnerabilities: + ret['vulnerabilities'] = ','.join(sorted(self.vulnerabilities)) + for key, value in self.vulnerabilities.items(): + ret[f'vulnerability_{key}'] = value + if self.unknowns: + ret['unknowns'] = ','.join(self.unknowns) + for key, value in self.unknowns.items(): + ret[f'unknown_{key}'] = value + if self.notaffecteds: + ret['notaffecteds'] = ','.join(sorted(self.notaffecteds)) + return ret + + +class BaseLinuxMixin(virtual_machine.BaseOsMixin): + """Class that holds Linux related VM methods and attributes.""" + + # If multiple ssh calls are made in parallel using -t it will mess + # the stty settings up and the terminal will become very hard to use. + # Serializing calls to ssh with the -t option fixes the problem. + _pseudo_tty_lock = threading.Lock() + + # TODO(user): Remove all uses of Python 2. + PYTHON_2_PACKAGE = 'python2' + + def __init__(self, *args, **kwargs): + super(BaseLinuxMixin, self).__init__(*args, **kwargs) + # N.B. If you override ssh_port you must override remote_access_ports and + # primary_remote_access_port. + self.ssh_port = DEFAULT_SSH_PORT + self.remote_access_ports = [self.ssh_port] + self.primary_remote_access_port = self.ssh_port + self.has_private_key = False + + self._remote_command_script_upload_lock = threading.Lock() + self._has_remote_command_script = False + self._needs_reboot = False + self._lscpu_cache = None + self._partition_table = {} + self._proccpu_cache = None + self._smp_affinity_script = None + + def _Suspend(self): + """Suspends a VM.""" + raise NotImplementedError() + + def _Resume(self): + """Resumes a VM.""" + raise NotImplementedError() + + def _BeforeSuspend(self): + pass + + def _CreateVmTmpDir(self): + self.RemoteCommand('mkdir -p %s' % vm_util.VM_TMP_DIR) + + def _SetTransparentHugepages(self): + """Sets transparent hugepages based on --enable_transparent_hugepages. + + If the flag is unset (None), this is a nop. 
+ """ + if FLAGS.enable_transparent_hugepages is None: + return + setting = 'always' if FLAGS.enable_transparent_hugepages else 'never' + self.RemoteCommand( + 'echo %s | sudo tee /sys/kernel/mm/transparent_hugepage/enabled' % + setting) + self.os_metadata['transparent_hugepage'] = setting + + def _SetupRobustCommand(self): + """Sets up the RobustRemoteCommand tooling. + + This includes installing python3 and pushing scripts required by + RobustRemoteCommand to this VM. There is a check to skip if previously + installed. + """ + with self._remote_command_script_upload_lock: + if not self._has_remote_command_script: + # Python3 is needed for RobustRemoteCommands + self.Install('python3') + + for f in (EXECUTE_COMMAND, WAIT_FOR_COMMAND): + remote_path = os.path.join(vm_util.VM_TMP_DIR, os.path.basename(f)) + if os.path.basename(remote_path): + self.RemoteCommand('sudo rm -f ' + remote_path) + self.PushDataFile(f, remote_path) + self._has_remote_command_script = True + + def RobustRemoteCommand(self, command, should_log=False, timeout=None, + ignore_failure=False): + """Runs a command on the VM in a more robust way than RemoteCommand. + + This is used for long-running commands that might experience network issues + that would normally interrupt a RemoteCommand and fail to provide results. + Executes a command via a pair of scripts on the VM: + + * EXECUTE_COMMAND, which runs 'command' in a nohupped background process. + * WAIT_FOR_COMMAND, which first waits on confirmation that EXECUTE_COMMAND + has acquired an exclusive lock on a file with the command's status. This + is done by waiting for the existence of a file written by EXECUTE_COMMAND + once it successfully acquires an exclusive lock. Once confirmed, + WAIT_COMMAND waits to acquire the file lock held by EXECUTE_COMMAND until + 'command' completes, then returns with the stdout, stderr, and exit status + of 'command'. + + Temporary SSH failures (where ssh returns a 255) while waiting for the + command to complete will be tolerated and safely retried. However, if + remote command actually returns 255, SSH will return 1 instead to bypass + retry behavior. + + Args: + command: The command to run. + should_log: Whether to log the command's output at the info level. The + output is always logged at the debug level. + timeout: The timeout for the command in seconds. + ignore_failure: Ignore any failure if set to true. + + Returns: + A tuple of stdout, stderr from running the command. + + Raises: + RemoteCommandError: If there was a problem establishing the connection, or + the command fails. 
+ """ + self._SetupRobustCommand() + + execute_path = os.path.join(vm_util.VM_TMP_DIR, + os.path.basename(EXECUTE_COMMAND)) + wait_path = os.path.join(vm_util.VM_TMP_DIR, + os.path.basename(WAIT_FOR_COMMAND)) + + uid = uuid.uuid4() + file_base = os.path.join(vm_util.VM_TMP_DIR, 'cmd%s' % uid) + wrapper_log = file_base + '.log' + stdout_file = file_base + '.stdout' + stderr_file = file_base + '.stderr' + status_file = file_base + '.status' + exclusive_file = file_base + '.exclusive' + + if not isinstance(command, str): + command = ' '.join(command) + + start_command = ['nohup', 'python3', execute_path, + '--stdout', stdout_file, + '--stderr', stderr_file, + '--status', status_file, + '--exclusive', exclusive_file, + '--command', pipes.quote(command)] # pyformat: disable + if timeout: + start_command.extend(['--timeout', str(timeout)]) + + start_command = '%s 1> %s 2>&1 &' % (' '.join(start_command), + wrapper_log) + self.RemoteCommand(start_command) + # sleep 1 to prevent _WaitForCommand from starting before start_command + time.sleep(1) + + def _WaitForCommand(): + wait_command = ['python3', wait_path, + '--status', status_file, + '--exclusive', exclusive_file] # pyformat: disable + stdout = '' + while 'Command finished.' not in stdout: + stdout, _ = self.RemoteCommand( + ' '.join(wait_command), should_log=should_log, timeout=1800) + wait_command.extend([ + '--stdout', stdout_file, + '--stderr', stderr_file, + '--delete', + ]) # pyformat: disable + return self.RemoteCommand(' '.join(wait_command), should_log=should_log, + ignore_failure=ignore_failure) + + try: + return _WaitForCommand() + except errors.VirtualMachine.RemoteCommandError: + # In case the error was with the wrapper script itself, print the log. + stdout, _ = self.RemoteCommand('cat %s' % wrapper_log, should_log=False) + if stdout.strip(): + logging.warning('Exception during RobustRemoteCommand. 
' + 'Wrapper script log:\n%s', stdout) + raise + + def SetupRemoteFirewall(self): + """Sets up IP table configurations on the VM.""" + self.RemoteHostCommand('sudo iptables -A INPUT -j ACCEPT') + self.RemoteHostCommand('sudo iptables -A OUTPUT -j ACCEPT') + + def SetupProxy(self): + """Sets up proxy configuration variables for the cloud environment.""" + env_file = '/etc/environment' + commands = [] + command_template = "grep -qxF '{0}' {1} || echo '{0}' | sudo tee -a {1}" + + if FLAGS.http_proxy: + commands.append(command_template.format( + 'http_proxy=' + FLAGS.http_proxy, env_file)) + + if FLAGS.https_proxy: + commands.append(command_template.format( + 'https_proxy=' + FLAGS.https_proxy, env_file)) + + if FLAGS.ftp_proxy: + commands.append(command_template.format( + 'ftp_proxy=' + FLAGS.ftp_proxy, env_file)) + + if FLAGS.no_proxy: + commands.append(command_template.format( + 'no_proxy=' + FLAGS.no_proxy, env_file)) + + if commands: + self.RemoteCommand(';'.join(commands)) + if FLAGS.ssh_reuse_connections: + # This will stop the existing connection once current commands + # exit, and will allow a new login and /etc/environment evaluation + self.RemoteCommand('', ssh_args=['-O', 'stop']) + + def SetupPackageManager(self): + """Specific Linux flavors should override this.""" + pass + + def OfflineModePrepare(self): + """Specific Linux flavors should override this.""" + pass + + def PrepareVMEnvironment(self): + super(BaseLinuxMixin, self).PrepareVMEnvironment() + self.SetupProxy() + self._CreateVmTmpDir() + self._SetTransparentHugepages() + if FLAGS.setup_remote_firewall: + self.SetupRemoteFirewall() + if self.install_packages: + self._CreateInstallDir() + if self.is_static: + self.SnapshotPackages() + self.SetupPackageManager() + if FLAGS.enable_rsync: + self.InstallPackages('rsync') + self.SetFiles() + self.DoSysctls() + self._DoAppendKernelCommandLine() + self.DoConfigureNetworkForBBR() + self.DoConfigureTCPWindow() + self.UpdateEnvironmentPath() + self._RebootIfNecessary() + self._ApplyScalingGovernor() + self._DisableCpus() + self._RebootIfNecessary() + self.RecordAdditionalMetadata() + self.BurnCpu() + self.FillDisk() + + def _CreateInstallDir(self): + self.RemoteCommand( + ('sudo mkdir -p {0}; ' + 'sudo chmod a+rwxt {0}').format(linux_packages.INSTALL_DIR)) + + # LinuxMixins do not implement _Start or _Stop + def _Start(self): + """Starts the VM.""" + raise NotImplementedError() + + def _Stop(self): + """Stops the VM.""" + raise NotImplementedError() + + def SetFiles(self): + """Apply --set_files to the VM.""" + + for pair in FLAGS.set_files: + path, value = pair.split('=') + self.RemoteCommand('echo "%s" | sudo tee %s' % + (value, path)) + + def _ApplyScalingGovernor(self): + """Applies selected scaling governor to all CPUs + + This setting does not persist if the VM is rebooted. + + Raises: + ValueError: if cpufreq is not available or specified governor cannot be used + """ + if not FLAGS.scaling_governor: + return + + _, _, rc = self.RemoteCommandWithReturnCode('test -f /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor', + ignore_failure=True) + + if rc != 0: + raise ValueError("cpufreq is not available in the system") + + stdout, _ = self.RemoteCommand('cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_governors') + available_governors = stdout.split() + if FLAGS.scaling_governor not in available_governors: + msg = "Specified governor {} cannot be used. 
Available governors: {}".format(FLAGS.scaling_governor, + available_governors) + raise ValueError(msg) + + cmd = 'echo {} | sudo tee -a /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor'.format(FLAGS.scaling_governor) + self.RemoteCommand(cmd) + + def _DisableCpus(self): + """Apply num_disable_cpus to the VM. + + Raises: + ValueError: if num_disable_cpus is outside of (0 ... num_cpus-1) + inclusive + """ + if not FLAGS.num_disable_cpus: + return + + self.num_disable_cpus = FLAGS.num_disable_cpus + + if (self.num_disable_cpus <= 0 or + self.num_disable_cpus >= self.num_cpus): + raise ValueError('num_disable_cpus must be between 1 and ' + '(num_cpus - 1) inclusive. ' + 'num_disable_cpus: %i, num_cpus: %i' % + (self.num_disable_cpus, self.num_cpus)) + + # We can't disable cpu 0, starting from the last cpu in /proc/cpuinfo. + # On multiprocessor systems, we also attempt to disable cpus on each + # physical processor based on "physical id" in order to keep a similar + # number of cpus on each physical processor. + # In addition, for each cpu we disable, we will look for cpu with same + # "core id" in order to disable vcpu pairs. + cpus = copy.deepcopy(self.CheckProcCpu().mappings) + cpu_mapping = collections.defaultdict(list) + for cpu, info in cpus.items(): + numa = info.get('physical id') + cpu_mapping[int(numa)].append((cpu, int(info.get('core id')))) + + # Sort cpus based on 'core id' on each numa node + for numa in cpu_mapping: + cpu_mapping[numa] = sorted( + cpu_mapping[numa], + key=lambda cpu_info: (cpu_info[1], cpu_info[0])) + + def _GetNextCPUToDisable(num_disable_cpus): + """Get the next CPU id to disable.""" + numa_nodes = list(cpu_mapping) + while num_disable_cpus: + for numa in sorted(numa_nodes, reverse=True): + cpu_id, _ = cpu_mapping[numa].pop() + num_disable_cpus -= 1 + yield cpu_id + if not num_disable_cpus: + break + + for cpu_id in _GetNextCPUToDisable(self.num_disable_cpus): + self.RemoteCommand('sudo bash -c "echo 0 > ' + f'/sys/devices/system/cpu/cpu{cpu_id}/online"') + self._proccpu_cache = None + self._lscpu_cache = None + + def UpdateEnvironmentPath(self): + """Specific Linux flavors should override this.""" + pass + + def FillDisk(self): + """Fills the primary scratch disk with a zeros file.""" + if FLAGS.disk_fill_size: + out_file = posixpath.join(self.scratch_disks[0].mount_point, 'fill_file') + self.RobustRemoteCommand( + 'dd if=/dev/zero of={out_file} bs=1G count={fill_size}'.format( + out_file=out_file, fill_size=FLAGS.disk_fill_size)) + + def _ApplySysctlPersistent(self, sysctl_params): + """Apply "key=value" pairs to /etc/sysctl.conf and mark the VM for reboot. + + The reboot ensures the values take effect and remain persistent across + future reboots. + + Args: + sysctl_params: dict - the keys and values to write + """ + if not sysctl_params: + return + + for key, value in sysctl_params.items(): + self.RemoteCommand('sudo bash -c \'echo "%s=%s" >> /etc/sysctl.conf\'' + % (key, value)) + + self._needs_reboot = True + + def ApplySysctlPersistent(self, sysctl_params): + """Apply "key=value" pairs to /etc/sysctl.conf and reboot immediately. + + The reboot ensures the values take effect and remain persistent across + future reboots. + + Args: + sysctl_params: dict - the keys and values to write + """ + self._ApplySysctlPersistent(sysctl_params) + self._RebootIfNecessary() + + def DoSysctls(self): + """Apply --sysctl to the VM. + + The Sysctl pairs are written persistently so that if a reboot + occurs, the flags are not lost. 
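+
+    For example, --sysctl=vm.dirty_background_ratio=10,vm.dirty_ratio=25 is
+    parsed into {'vm.dirty_background_ratio': '10', 'vm.dirty_ratio': '25'}
+    and appended to /etc/sysctl.conf via _ApplySysctlPersistent.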
+ """ + sysctl_params = {} + for pair in FLAGS.sysctl: + key, value = pair.split('=') + sysctl_params[key] = value + self._ApplySysctlPersistent(sysctl_params) + + def DoConfigureNetworkForBBR(self): + """Apply --network_enable_BBR to the VM.""" + if not FLAGS.network_enable_BBR: + return + + if not KernelRelease(self.kernel_release).AtLeast(4, 9): + raise flags.ValidationError( + 'BBR requires a linux image with kernel 4.9 or newer') + + # if the current congestion control mechanism is already BBR + # then nothing needs to be done (avoid unnecessary reboot) + if self.TcpCongestionControl() == 'bbr': + return + + self._ApplySysctlPersistent({ + 'net.core.default_qdisc': 'fq', + 'net.ipv4.tcp_congestion_control': 'bbr' + }) + + def DoConfigureTCPWindow(self): + """Change TCP window parameters in sysctl.""" + + # Return if none of these flags are set + if all(x is None for x in [FLAGS.tcp_max_receive_buffer, + FLAGS.tcp_max_send_buffer, + FLAGS.rmem_max, + FLAGS.wmem_max]): + return + + # Get current values from VM + stdout, _ = self.RemoteCommand('cat /proc/sys/net/ipv4/tcp_rmem') + rmem_values = stdout.split() + stdout, _ = self.RemoteCommand('cat /proc/sys/net/ipv4/tcp_wmem') + wmem_values = stdout.split() + stdout, _ = self.RemoteCommand('cat /proc/sys/net/core/rmem_max') + rmem_max = int(stdout) + stdout, _ = self.RemoteCommand('cat /proc/sys/net/core/wmem_max') + wmem_max = int(stdout) + + # third number is max receive/send + max_receive = rmem_values[2] + max_send = wmem_values[2] + + # if flags are set, override current values from vm + if FLAGS.tcp_max_receive_buffer: + max_receive = FLAGS.tcp_max_receive_buffer + if FLAGS.tcp_max_send_buffer: + max_send = FLAGS.tcp_max_send_buffer + if FLAGS.rmem_max: + rmem_max = FLAGS.rmem_max + if FLAGS.wmem_max: + wmem_max = FLAGS.wmem_max + + # Add values to metadata + self.os_metadata['tcp_max_receive_buffer'] = max_receive + self.os_metadata['tcp_max_send_buffer'] = max_send + self.os_metadata['rmem_max'] = rmem_max + self.os_metadata['wmem_max'] = wmem_max + + rmem_string = '{} {} {}'.format(rmem_values[0], + rmem_values[1], + max_receive) + wmem_string = '{} {} {}'.format(wmem_values[0], + wmem_values[1], + max_send) + + self._ApplySysctlPersistent({ + 'net.ipv4.tcp_rmem': rmem_string, + 'net.ipv4.tcp_wmem': wmem_string, + 'net.core.rmem_max': rmem_max, + 'net.core.wmem_max': wmem_max + }) + + def _RebootIfNecessary(self): + """Will reboot the VM if self._needs_reboot has been set.""" + if self._needs_reboot: + self.Reboot() + self._needs_reboot = False + + def TcpCongestionControl(self): + """Return the congestion control used for tcp.""" + try: + resp, _ = self.RemoteCommand( + 'cat /proc/sys/net/ipv4/tcp_congestion_control') + return resp.rstrip('\n') + except errors.VirtualMachine.RemoteCommandError: + return 'unknown' + + def CheckLsCpu(self): + """Returns a LsCpuResults from the host VM.""" + if not self._lscpu_cache: + lscpu, _ = self.RemoteCommand('lscpu') + self._lscpu_cache = LsCpuResults(lscpu) + return self._lscpu_cache + + def CheckProcCpu(self): + """Returns a ProcCpuResults from the host VM.""" + if not self._proccpu_cache: + proccpu, _ = self.RemoteCommand('cat /proc/cpuinfo') + self._proccpu_cache = ProcCpuResults(proccpu) + return self._proccpu_cache + + def GetOsInfo(self): + """Returns information regarding OS type and version.""" + stdout, _ = self.RemoteCommand('grep PRETTY_NAME /etc/os-release') + return regex_util.ExtractGroup(OS_PRETTY_NAME_REGEXP, stdout) + + @property + def os_info(self): + """Get 
distribution-specific information.""" + if self.os_metadata.get('os_info'): + return self.os_metadata['os_info'] + else: + return self.GetOsInfo() + + @property + def kernel_release(self): + """Return kernel release number.""" + if self.os_metadata.get('kernel_release'): + return self.os_metadata.get('kernel_release') + else: + stdout, _ = self.RemoteCommand('uname -r') + return stdout.strip() + + @property + def kernel_command_line(self): + """Return the kernel command line.""" + return (self.os_metadata.get('kernel_command_line') or + self.RemoteCommand('cat /proc/cmdline')[0].strip()) + + @property + def partition_table(self): + """Return partition table information.""" + if not self._partition_table: + cmd = 'sudo fdisk -l' + partition_tables = self.RemoteCommand(cmd)[0] + try: + self._partition_table = { + dev: int(size) for (dev, size) in regex_util.ExtractAllMatches( + r'Disk\s*(.*):[\s\w\.]*,\s(\d*)\sbytes', partition_tables)} + except regex_util.NoMatchError: + # TODO(user): Use alternative methods to retrieve partition table. + logging.warning('Partition table not found with "%s".', cmd) + return self._partition_table + + @vm_util.Retry(log_errors=False, poll_interval=1) + def WaitForBootCompletion(self): + """Waits until the VM has booted.""" + # Test for listening on the port first, because this will happen strictly + # first. + if (FLAGS.cluster_boot_test_port_listening and + self.port_listening_time is None): + self.TestConnectRemoteAccessPort() + self.port_listening_time = time.time() + + self._WaitForSSH() + + if self.bootable_time is None: + self.bootable_time = time.time() + + @vm_util.Retry(log_errors=False, poll_interval=1) + def _WaitForSSH(self): + """Waits until the VM is ready.""" + # Always wait for remote host command to succeed, because it is necessary to + # run benchmarks + resp, _ = self.RemoteHostCommand('hostname', retries=1, + suppress_warning=True) + if self.hostname is None: + self.hostname = resp[:-1] + + def RecordAdditionalMetadata(self): + """After the VM has been prepared, store metadata about the VM.""" + self.tcp_congestion_control = self.TcpCongestionControl() + lscpu_results = self.CheckLsCpu() + self.numa_node_count = lscpu_results.numa_node_count + self.os_metadata['threads_per_core'] = lscpu_results.threads_per_core + self.os_metadata['os_info'] = self.os_info + self.os_metadata['kernel_release'] = self.kernel_release + self.os_metadata.update(self.partition_table) + if FLAGS.append_kernel_command_line: + self.os_metadata['kernel_command_line'] = self.kernel_command_line + self.os_metadata[ + 'append_kernel_command_line'] = FLAGS.append_kernel_command_line + + @vm_util.Retry(log_errors=False, poll_interval=1) + def VMLastBootTime(self): + """Returns the time the VM was last rebooted as reported by the VM. + + See + https://unix.stackexchange.com/questions/165002/how-to-reliably-get-timestamp-at-which-the-system-booted. 
+ """ + stdout, _ = self.RemoteHostCommand( + 'stat -c %z /proc/', retries=1, suppress_warning=True) + if stdout.startswith('1970-01-01'): + # Fix for ARM returning epochtime + date_fmt = '+%Y-%m-%d %H:%M:%S.%s %z' + date_cmd = "grep btime /proc/stat | awk '{print $2}'" + stdout, _ = self.RemoteHostCommand(f'date "{date_fmt}" -d@$({date_cmd})') + return stdout + + def SnapshotPackages(self): + """Grabs a snapshot of the currently installed packages.""" + pass + + def RestorePackages(self): + """Restores the currently installed packages to those snapshotted.""" + pass + + def ProxyCleanup(self): + """ Restore to a state before SetupProxy() executed """ + env_file = '/etc/environment' + commands = [] + command_template = "sudo sed -i '\#^{0}$#d' {1}" + + if FLAGS.http_proxy: + commands.append(command_template.format( + 'http_proxy=' + FLAGS.http_proxy, env_file)) + + if FLAGS.https_proxy: + commands.append(command_template.format( + 'https_proxy=' + FLAGS.https_proxy, env_file)) + + if FLAGS.ftp_proxy: + commands.append(command_template.format( + 'ftp_proxy=' + FLAGS.ftp_proxy, env_file)) + + if FLAGS.no_proxy: + commands.append(command_template.format( + 'no_proxy=' + FLAGS.no_proxy, env_file)) + + if commands: + self.RemoteCommand(';'.join(commands)) + + def PackageCleanup(self): + """Cleans up all installed packages. + + Deletes the temp directory, restores packages, and uninstalls all + PerfKit packages. + """ + for package_name in list(self._installed_packages): + try: + self.Uninstall(package_name) + except Exception as e: + logging.info('Got an exception ({0}) when uninstalling package {1}' + 'Attempting to continue to uninstall PerfKit package.'.format(e, package_name)) + + if not FLAGS.skip_package_restore: + self.RestorePackages() + self.RemoteCommand('sudo rm -rf %s' % linux_packages.INSTALL_DIR) + + def GetPathToConfig(self, package_name): + """Returns the path to the config file for PerfKit packages. + + This function is mostly useful when config files locations + don't match across distributions (such as mysql). Packages don't + need to implement it if this is not the case. + """ + pass + + def GetServiceName(self, package_name): + """Returns the service name of a PerfKit package. + + This function is mostly useful when service names don't + match across distributions (such as mongodb). Packages don't + need to implement it if this is not the case. + """ + pass + + @vm_util.Retry() + def FormatDisk(self, device_path, disk_type=None): + """Formats a disk attached to the VM.""" + # Some images may automount one local disk, but we don't + # want to fail if this wasn't the case. + if disk.NFS == disk_type: + return + if disk.SMB == disk_type: + return + umount_cmd = '[[ -d /mnt ]] && sudo umount /mnt; ' + # TODO(user): Allow custom disk formatting options. 
+ if FLAGS.disk_fs_type == 'xfs': + block_size = FLAGS.disk_block_size or 512 + fmt_cmd = ('sudo mkfs.xfs -f -i size={0} {1}'.format( + block_size, device_path)) + else: + block_size = FLAGS.disk_block_size or 4096 + fmt_cmd = ('sudo mke2fs -F -E lazy_itable_init=0,discard -O ' + '^has_journal -t ext4 -b {0} {1}'.format( + block_size, device_path)) + self.os_metadata['disk_filesystem_type'] = FLAGS.disk_fs_type + self.os_metadata['disk_filesystem_blocksize'] = block_size + self.RemoteHostCommand(umount_cmd + fmt_cmd) + + @vm_util.Retry( + timeout=vm_util.DEFAULT_TIMEOUT, + retryable_exceptions=(errors.VirtualMachine.RemoteCommandError,)) + def MountDisk(self, + device_path, + mount_path, + disk_type=None, + mount_options=disk.DEFAULT_MOUNT_OPTIONS, + fstab_options=disk.DEFAULT_FSTAB_OPTIONS): + """Mounts a formatted disk in the VM.""" + mount_options = '-o %s' % mount_options if mount_options else '' + if disk.NFS == disk_type: + mount_options = '-t nfs %s' % mount_options + fs_type = 'nfs' + elif disk.SMB == disk_type: + mount_options = '-t cifs %s' % mount_options + fs_type = 'smb' + else: + fs_type = FLAGS.disk_fs_type + fstab_options = fstab_options or '' + mnt_cmd = ('sudo mkdir -p {mount_path};' + 'sudo mount {mount_options} {device_path} {mount_path} && ' + 'sudo chown $USER:$USER {mount_path};').format( + mount_path=mount_path, + device_path=device_path, + mount_options=mount_options) + self.RemoteHostCommand(mnt_cmd) + # add to /etc/fstab to mount on reboot + mnt_cmd = ('echo "{device_path} {mount_path} {fs_type} {fstab_options}" ' + '| sudo tee -a /etc/fstab').format( + device_path=device_path, + mount_path=mount_path, + fs_type=fs_type, + fstab_options=fstab_options) + self.RemoteHostCommand(mnt_cmd) + + def LogVmDebugInfo(self): + """Logs the output of calling dmesg on the VM.""" + if FLAGS.log_dmesg: + self.RemoteCommand('hostname && dmesg', should_log=True) + + def BackupFiles(self, files): + """Creates a backup of the files in the files dict. A file + "f.txt" will be saved as "f.txt.". Works in tandem + with RestoreFiles. + + Args: + files: Dictionary containing 2 lists of files, one for "system" + and one for "user" files. The difference is mave because + you only need to use sudo for "system" files. + + example: FILES_TO_SAVE = { + "system_files": [ + "/etc/hosts", + "/etc/bash.bashrc" + ], + "user_files": [ + "~/.ssh/authorized_keys" + ] + } + """ + cmds = [] + for category in files: + for path in files[category]: + _, _, retcode = self.RemoteCommandWithReturnCode( + "test -f {0}.{1}".format(path, FLAGS.run_uri), + ignore_failure=True) + if retcode == 0: + logging.warn("File {} already backed up, SKIPPING".format(path)) + else: + cmd = "" + if category == "system_files": + cmd += "sudo " + cmd += "cp -f {0} {0}.{1} 2>/dev/null".format(path, FLAGS.run_uri) + cmds.append(cmd) + if len(cmds) > 0: + self.RemoteCommand(' ; '.join(cmds), ignore_failure=True) + + def RestoreFiles(self, files): + """Works in tandem with BackupFiles. Reverts files that were previously + backed up. + + Args: + files: Dictionary containing 2 lists of files, one for "system" + and one for "user" files. The difference is mave because + you only need to use sudo for "system" files. 
+ + example: FILES_TO_SAVE = { + "system_files": [ + "/etc/hosts", + "/etc/bash.bashrc" + ], + "user_files": [ + "~/.ssh/authorized_keys" + ] + } + """ + cmds = [] + for category in files: + for path in files[category]: + cmd = "" + if category == "system_files": + cmd += "sudo " + cmd += "mv -f {0}.{1} {0} 2>/dev/null".format(path, FLAGS.run_uri) + cmds.append(cmd) + if len(cmds) > 0: + self.RemoteCommand(' ; '.join(cmds), ignore_failure=True) + + def RemoteCopy(self, file_path, remote_path='', copy_to=True): + self.RemoteHostCopy(file_path, remote_path, copy_to) + + def RemoteHostCopy(self, file_path, remote_path='', copy_to=True): + """Copies a file to or from the VM. + + Args: + file_path: Local path to file. + remote_path: Optional path of where to copy file on remote host. + copy_to: True to copy to vm, False to copy from vm. + + Raises: + RemoteCommandError: If there was a problem copying the file. + """ + if vm_util.RunningOnWindows(): + if ':' in file_path: + # scp doesn't like colons in paths. + file_path = file_path.split(':', 1)[1] + # Replace the last instance of '\' with '/' to make scp happy. + file_path = '/'.join(file_path.rsplit('\\', 1)) + remote_ip = '[%s]' % self.GetConnectionIp() + remote_location = '%s@%s:%s' % ( + self.user_name, remote_ip, remote_path) + # An scp is not retried, so increase the connection timeout. + ssh_private_key = (self.ssh_private_key if self.is_static else + vm_util.GetPrivateKeyPath()) + + if FLAGS.enable_rsync: + remote_shell = ["ssh", "-p", str(self.ssh_port)] + remote_shell.extend(vm_util.GetSshOptions(ssh_private_key, connect_timeout=FLAGS.scp_connect_timeout)) + scp_env = {"RSYNC_RSH": " ".join(remote_shell)} + scp_cmd = ['rsync', '-aztpr'] + else: + scp_env = None + scp_cmd = ['scp', '-P', str(self.ssh_port), '-pr'] + scp_cmd.extend(vm_util.GetSshOptions( + ssh_private_key, connect_timeout=FLAGS.scp_connect_timeout)) + + if copy_to: + scp_cmd.extend([file_path, remote_location]) + else: + scp_cmd.extend([remote_location, file_path]) + + stdout, stderr, retcode = vm_util.IssueCommand(scp_cmd, timeout=None, env=scp_env, + raise_on_failure=False) + + if retcode: + full_cmd = ' '.join(scp_cmd) + error_text = ('Got non-zero return code (%s) executing %s\n' + 'STDOUT: %sSTDERR: %s' % + (retcode, full_cmd, stdout, stderr)) + raise errors.VirtualMachine.RemoteCommandError(error_text) + + def RemoteCommand(self, *args, **kwargs): + """Runs a command on the VM. + + Args: + *args: Arguments passed directly to RemoteCommandWithReturnCode. + **kwargs: Keyword arguments passed directly to + RemoteCommandWithReturnCode. + + Returns: + A tuple of stdout, stderr from running the command. + + Raises: + RemoteCommandError: If there was a problem establishing the connection. + """ + return self.RemoteCommandWithReturnCode(*args, **kwargs)[:2] + + def RemoteCommandWithReturnCode(self, *args, **kwargs): + """Runs a command on the VM. + + Args: + *args: Arguments passed directly to RemoteHostCommandWithReturnCode. + **kwargs: Keyword arguments passed directly to + RemoteHostCommandWithReturnCode. + + Returns: + A tuple of stdout, stderr, return_code from running the command. + + Raises: + RemoteCommandError: If there was a problem establishing the connection. + """ + return self.RemoteHostCommandWithReturnCode(*args, **kwargs) + + def RemoteHostCommandWithReturnCode(self, + command, + should_log=False, + retries=None, + ignore_failure=False, + login_shell=False, + suppress_warning=False, + timeout=None, + ssh_args=None): + """Runs a command on the VM. 
+ + This is guaranteed to run on the host VM, whereas RemoteCommand might run + within i.e. a container in the host VM. + + Args: + command: A valid bash command. + should_log: A boolean indicating whether the command result should be + logged at the info level. Even if it is false, the results will + still be logged at the debug level. + retries: The maximum number of times RemoteCommand should retry SSHing + when it receives a 255 return code. If None, it defaults to the value + of the flag ssh_retries. + ignore_failure: Ignore any failure if set to true. + login_shell: Run command in a login shell. + suppress_warning: Suppress the result logging from IssueCommand when the + return code is non-zero. + timeout: The timeout for IssueCommand. + + Returns: + A tuple of stdout, stderr, return_code from running the command. + + Raises: + RemoteCommandError: If there was a problem establishing the connection. + """ + if retries is None: + retries = FLAGS.ssh_retries + if vm_util.RunningOnWindows(): + # Multi-line commands passed to ssh won't work on Windows unless the + # newlines are escaped. + command = command.replace('\n', '\\n') + ip_address = self.GetConnectionIp() + user_host = '%s@%s' % (self.user_name, ip_address) + ssh_cmd = ['ssh', '-A', '-p', str(self.ssh_port), user_host] + ssh_private_key = (self.ssh_private_key if self.is_static else + vm_util.GetPrivateKeyPath()) + ssh_cmd.extend(vm_util.GetSshOptions(ssh_private_key)) + if ssh_args: + ssh_cmd.extend(ssh_args) + try: + if login_shell: + ssh_cmd.extend(['-t', '-t', 'bash -l -c "%s"' % command]) + self._pseudo_tty_lock.acquire() + else: + ssh_cmd.append(command) + + for _ in range(retries): + stdout, stderr, retcode = vm_util.IssueCommand( + ssh_cmd, force_info_log=should_log, + suppress_warning=suppress_warning, + timeout=timeout, raise_on_failure=False) + # Retry on 255 because this indicates an SSH failure + if retcode != RETRYABLE_SSH_RETCODE: + break + finally: + if login_shell: + self._pseudo_tty_lock.release() + + if retcode: + full_cmd = ' '.join(ssh_cmd) + error_text = ('Got non-zero return code (%s) executing %s\n' + 'Full command: %s\nSTDOUT: %sSTDERR: %s' % + (retcode, command, full_cmd, stdout, stderr)) + if not ignore_failure: + raise errors.VirtualMachine.RemoteCommandError(error_text) + + return (stdout, stderr, retcode) + + def RemoteHostCommand(self, *args, **kwargs): + """Runs a command on the VM. + + This is guaranteed to run on the host VM, whereas RemoteCommand might run + within i.e. a container in the host VM. + + Args: + *args: Arguments passed directly to RemoteHostCommandWithReturnCode. + **kwargs: Keyword arguments passed directly to + RemoteHostCommandWithReturnCode. + + Returns: + A tuple of stdout, stderr from running the command. + + Raises: + RemoteCommandError: If there was a problem establishing the connection. + """ + return self.RemoteHostCommandWithReturnCode(*args, **kwargs)[:2] + + def _CheckRebootability(self): + if not self.IS_REBOOTABLE: + raise errors.VirtualMachine.VirtualMachineError( + "Trying to reboot a VM that isn't rebootable.") + + def _Reboot(self): + """OS-specific implementation of reboot command.""" + self._CheckRebootability() + self.RemoteCommand('sudo reboot', ignore_failure=True) + + def _AfterReboot(self): + """Performs any OS-specific setup on the VM following reboot. + + This will be called after every call to Reboot(). 
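+
+    Re-creates the temp and install directories, re-applies the transparent
+    hugepage and disabled-CPU settings, and refreshes cached metadata such as
+    the lscpu results.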
+ """ + # clear out os_info and kernel_release as might have changed + previous_os_info = self.os_metadata.pop('os_info', None) + previous_kernel_release = self.os_metadata.pop('kernel_release', None) + previous_kernel_command = self.os_metadata.pop('kernel_command_line', None) + if previous_os_info or previous_kernel_release or previous_kernel_command: + self.RecordAdditionalMetadata() + if self._lscpu_cache: + self._lscpu_cache = None + self.CheckLsCpu() + if self.install_packages: + self._CreateInstallDir() + self._CreateVmTmpDir() + self._SetTransparentHugepages() + self._has_remote_command_script = False + self._DisableCpus() + + def MoveFile(self, target, source_path, remote_path=''): + self.MoveHostFile(target, source_path, remote_path) + + def MoveHostFile(self, target, source_path, remote_path=''): + """Copies a file from one VM to a target VM. + + Args: + target: The target BaseVirtualMachine object. + source_path: The location of the file on the REMOTE machine. + remote_path: The destination of the file on the TARGET machine, default + is the home directory. + """ + self.AuthenticateVm() + + # TODO(user): For security we may want to include + # -o UserKnownHostsFile=/dev/null in the scp command + # however for the moment, this has happy side effects + # ie: the key is added to know known_hosts which allows + # OpenMPI to operate correctly. + remote_location = '%s@%s:%s' % ( + target.user_name, target.ip_address, remote_path) + self.RemoteHostCommand('scp -P %s -o StrictHostKeyChecking=no -i %s %s %s' % + (target.ssh_port, REMOTE_KEY_PATH, source_path, + remote_location)) + + def AuthenticateVm(self): + """Authenticate a remote machine to access all peers.""" + if not self.is_static and not self.has_private_key: + self.RemoteHostCopy(vm_util.GetPrivateKeyPath(), + REMOTE_KEY_PATH) + self.RemoteCommand( + 'echo "Host *\n StrictHostKeyChecking no\n" > ~/.ssh/config') + self.RemoteCommand('chmod 600 ~/.ssh/config') + self.has_private_key = True + + def TestAuthentication(self, peer): + """Tests whether the VM can access its peer. + + Raises: + AuthError: If the VM cannot access its peer. + """ + if not self.TryRemoteCommand('ssh %s hostname' % peer.internal_ip): + raise errors.VirtualMachine.AuthError( + 'Authentication check failed. If you are running with Static VMs, ' + 'please make sure that %s can ssh into %s without supplying any ' + 'arguments except the ip address.' % (self, peer)) + + def CheckJavaVersion(self): + """Check the version of java on remote machine. + + Returns: + The version of Java installed on remote machine. + """ + version, _ = self.RemoteCommand('java -version 2>&1 >/dev/null | ' + 'grep version | ' + 'awk \'{print $3}\'') + return version[:-1] + + def RemoveFile(self, filename): + """Deletes a file on a remote machine. + + Args: + filename: Path to the file to delete. + """ + self.RemoteCommand('sudo rm -rf %s' % filename) + + def GetDeviceSizeFromPath(self, path): + """Gets the size of the a drive that contains the path specified. + + Args: + path: The function will return the amount of space on the file system + that contains this file name. + + Returns: + The size in 1K blocks of the file system containing the file. 
+ """ + df_command = "df -k -P %s | tail -n +2 | awk '{ print $2 }'" % path + stdout, _ = self.RemoteCommand(df_command) + return int(stdout) + + def DropCaches(self): + """Drops the VM's caches.""" + drop_caches_command = 'sync; sudo /sbin/sysctl vm.drop_caches=3' + self.RemoteCommand(drop_caches_command) + + def _GetNumCpus(self): + """Returns the number of logical CPUs on the VM. + + This method does not cache results (unlike "num_cpus"). + """ + stdout, _ = self.RemoteCommand( + 'cat /proc/cpuinfo | grep processor | wc -l') + return int(stdout) + + def _GetTotalFreeMemoryKb(self): + """Calculate amount of free memory in KB of the given vm. + + Free memory is calculated as sum of free, cached, and buffers + as output from /proc/meminfo. + + Args: + vm: vm to check + + Returns: + free memory on the vm in KB + """ + stdout, _ = self.RemoteCommand(""" + awk ' + BEGIN {total =0} + /MemFree:/ {total += $2} + /Cached:/ {total += $2} + /Buffers:/ {total += $2} + END {print total} + ' /proc/meminfo + """) + return int(stdout) + + def _GetTotalMemoryKb(self): + """Returns the amount of physical memory on the VM in Kilobytes. + + This method does not cache results (unlike "total_memory_kb"). + """ + meminfo_command = 'cat /proc/meminfo | grep MemTotal | awk \'{print $2}\'' + stdout, _ = self.RemoteCommand(meminfo_command) + return int(stdout) + + def _TestReachable(self, ip): + """Returns True if the VM can reach the ip address and False otherwise.""" + return self.TryRemoteCommand('ping -c 1 %s' % ip) + + def SetupLocalDisks(self): + """Performs Linux specific setup of local disks.""" + pass + + def CreateRamDisk(self, disk_spec): + """Performs Linux specific setup of ram disk.""" + assert disk_spec.mount_point + ramdisk = self.RamDisk(disk_spec) + ramdisk.Mount(self) + self.scratch_disks.append(ramdisk) + + class RamDisk(disk.MountableDisk): + """Linux specific setup of ram disk.""" + + def Mount(self, vm): + logging.info('Mounting and creating Ram Disk %s, %s', + self.mount_point, self.disk_size) + mnt_cmd = ('sudo mkdir -p {0};sudo mount -t tmpfs -o size={1}g tmpfs {0};' + 'sudo chown -R $USER:$USER {0};').format( + self.mount_point, self.disk_size) + vm.RemoteHostCommand(mnt_cmd) + + def _CreateScratchDiskFromDisks(self, disk_spec, disks): + """Helper method to prepare data disks. + + Given a list of BaseDisk objects, this will do most of the work creating, + attaching, striping, formatting, and mounting them. If multiple BaseDisk + objects are passed to this method, it will stripe them, combining them + into one 'logical' data disk (it will be treated as a single disk from a + benchmarks perspective). This is intended to be called from within a cloud + specific VM's CreateScratchDisk method. + + Args: + disk_spec: The BaseDiskSpec object corresponding to the disk. + disks: A list of the disk(s) to be created, attached, striped, + formatted, and mounted. If there is more than one disk in + the list, then they will be striped together. + """ + if len(disks) > 1: + # If the disk_spec called for a striped disk, create one. 
+ disk_spec.device_path = '/dev/md%d' % len(self.scratch_disks) + data_disk = disk.StripedDisk(disk_spec, disks) + else: + data_disk = disks[0] + + self.scratch_disks.append(data_disk) + + if data_disk.disk_type != disk.LOCAL: + data_disk.Create() + data_disk.Attach(self) + + if data_disk.is_striped: + device_paths = [d.GetDevicePath() for d in data_disk.disks] + self.StripeDisks(device_paths, data_disk.GetDevicePath()) + + if disk_spec.mount_point: + if isinstance(data_disk, disk.MountableDisk): + data_disk.Mount(self) + else: + self.FormatDisk(data_disk.GetDevicePath(), disk_spec.disk_type) + self.MountDisk(data_disk.GetDevicePath(), disk_spec.mount_point, + disk_spec.disk_type, data_disk.mount_options, + data_disk.fstab_options) + + @vm_util.Retry(max_retries=3, poll_interval=5) + def StripeDisks(self, devices, striped_device): + """Raids disks together using mdadm. + + Args: + devices: A list of device paths that should be striped together. + striped_device: The path to the device that will be created. + """ + self.Install('mdadm') + stripe_cmd = ('yes | sudo mdadm --create %s --level=stripe --raid-devices=' + '%s %s' % (striped_device, len(devices), ' '.join(devices))) + self.RemoteHostCommand(stripe_cmd) + + # Save the RAID layout on the disk + cmd = ('sudo mdadm --detail --scan | ' + + 'sudo tee -a /etc/mdadm/mdadm.conf') + self.RemoteHostCommand(cmd) + + # Make the disk available during reboot + cmd = 'sudo update-initramfs -u' + self.RemoteHostCommand(cmd) + + # Automatically mount the disk after reboot + cmd = ('echo \'/dev/md0 /mnt/md0 ext4 defaults,nofail' + ',discard 0 0\' | sudo tee -a /etc/fstab') + self.RemoteHostCommand(cmd) + + def BurnCpu(self, burn_cpu_threads=None, burn_cpu_seconds=None): + """Burns vm cpu for some amount of time and dirty cache. + + Args: + burn_cpu_threads: Number of threads to burn cpu. + burn_cpu_seconds: Amount of time in seconds to burn cpu. + """ + burn_cpu_threads = burn_cpu_threads or FLAGS.burn_cpu_threads + burn_cpu_seconds = burn_cpu_seconds or FLAGS.burn_cpu_seconds + if burn_cpu_seconds: + self.Install('sysbench') + end_time = time.time() + burn_cpu_seconds + self.RemoteCommand( + 'nohup sysbench --num-threads=%s --test=cpu --cpu-max-prime=10000000 ' + 'run 1> /dev/null 2> /dev/null &' % burn_cpu_threads) + if time.time() < end_time: + time.sleep(end_time - time.time()) + self.RemoteCommand('pkill -9 sysbench') + + def SetSmpAffinity(self): + """Set SMP IRQ affinity.""" + if self._smp_affinity_script: + self.PushDataFile(self._smp_affinity_script) + self.RemoteCommand('sudo bash %s' % self._smp_affinity_script) + else: + raise NotImplementedError() + + def SetReadAhead(self, num_sectors, devices): + """Set read-ahead value for block devices. + + Args: + num_sectors: int. Number of sectors of read ahead. + devices: list of strings. A list of block devices. + """ + self.RemoteCommand( + 'sudo blockdev --setra {0} {1}; sudo blockdev --setfra {0} {1};'.format( + num_sectors, ' '.join(devices))) + + def GetSha256sum(self, path, filename): + """Gets the sha256sum hash for a filename in a path on the VM. + + Args: + path: string; Path on the VM. + filename: string; Name of the file in the path. + + Returns: + string; The sha256sum hash. + """ + stdout, _ = self.RemoteCommand( + 'sha256sum %s' % posixpath.join(path, filename)) + sha256sum, _ = stdout.split() + return sha256sum + + def _GetSmbService(self): + """Returns the SmbService created in the benchmark spec. 
+ + Before calling this method check that the disk.disk_type is equal to + disk.SMB or else an exception will be raised. + + Returns: + The smb_service.BaseSmbService service for this cloud. + + Raises: + CreationError: If no SMB service was created. + """ + smb = getattr(context.GetThreadBenchmarkSpec(), 'smb_service') + if smb is None: + raise errors.Resource.CreationError('No SMB Service created') + return smb + + def AppendKernelCommandLine(self, command_line, reboot=True): + """Appends the provided command-line to the VM and reboots by default. + + This method should be overwritten by the desired Linux flavor to be useful. + Most (all?) Linux flavors modify the kernel command line by updating the + GRUB configuration files and rebooting. + + Args: + command_line: The string to append to the kernel command line. + reboot: Whether or not to reboot to have the change take effect. + """ + raise NotImplementedError( + 'Kernel command-line appending for given Linux flavor not implemented.') + + def _DoAppendKernelCommandLine(self): + """If the flag is set, attempts to append the provided kernel command line. + + In addition, to consolidate reboots during VM prepare, this method sets the + needs reboot bit instead of immediately rebooting. + """ + if FLAGS.disable_smt and self.CheckLsCpu().threads_per_core != 1: + FLAGS.append_kernel_command_line = ' '.join( + (FLAGS.append_kernel_command_line, + 'nosmt')) if FLAGS.append_kernel_command_line else 'nosmt' + if FLAGS.append_kernel_command_line: + self.AppendKernelCommandLine( + FLAGS.append_kernel_command_line, reboot=False) + self._needs_reboot = True + + @abc.abstractmethod + def InstallPackages(self, packages: str) -> None: + """Installs packages using the OS's package manager.""" + pass + + def _IsSmtEnabled(self): + """Whether simultaneous multithreading (SMT) is enabled on the vm. + + Looks for the "nosmt" attribute in the booted linux kernel command line + parameters. + + Returns: + Whether SMT is enabled on the vm. + """ + return not bool(re.search(r'\bnosmt\b', self.kernel_command_line)) + + @property + def cpu_vulnerabilities(self) -> CpuVulnerabilities: + """Returns a CpuVulnerabilities of CPU vulnerabilities. + + Output of "grep . .../cpu/vulnerabilities/*" looks like this: + /sys/devices/system/cpu/vulnerabilities/itlb_multihit:KVM: Vulnerable + /sys/devices/system/cpu/vulnerabilities/l1tf:Mitigation: PTE Inversion + Which gets turned into + CpuVulnerabilities(vulnerabilities={'itlb_multihit': 'KVM'}, + mitigations= {'l1tf': 'PTE Inversion'}) + """ + text, _ = self.RemoteCommand( + 'sudo grep . /sys/devices/system/cpu/vulnerabilities/*', + ignore_failure=True) + vuln = CpuVulnerabilities() + if not text: + logging.warning('No text response when getting CPU vulnerabilities') + return vuln + for line in text.splitlines(): + vuln.AddLine(line) + return vuln + + +class ClearMixin(BaseLinuxMixin): + """Class holding Clear Linux specific VM methods and attributes.""" + + OS_TYPE = os_types.CLEAR + BASE_OS_TYPE = os_types.CLEAR + PYTHON_2_PACKAGE = 'python-basic' + + def OnStartup(self): + """Eliminates the need to have a tty to run sudo commands.""" + super(ClearMixin, self).OnStartup() + self.RemoteHostCommand('sudo swupd autoupdate --disable') + self.RemoteHostCommand('sudo mkdir -p /etc/sudoers.d') + self.RemoteHostCommand('echo \'Defaults:{0} !requiretty\' | ' + 'sudo tee /etc/sudoers.d/pkb'.format(self.user_name), + login_shell=True) + + def PackageCleanup(self): + """Cleans up all installed packages. 
+ + Performs the normal package cleanup, then deletes the file + added to the /etc/sudoers.d directory during startup. + """ + super(ClearMixin, self).PackageCleanup() + self.RemoteCommand('sudo rm /etc/sudoers.d/pkb') + + def SnapshotPackages(self): + """See base class.""" + self.RemoteCommand('sudo swupd bundle-list > {0}/bundle_list'.format( + linux_packages.INSTALL_DIR)) + + def RestorePackages(self): + """See base class.""" + self.RemoteCommand( + 'sudo swupd bundle-list | grep --fixed-strings --line-regexp --invert-match --file ' + '{0}/bundle_list | xargs --no-run-if-empty sudo swupd bundle-remove' + .format(linux_packages.INSTALL_DIR), + ignore_failure=True) + + def HasPackage(self, package): + """Returns True iff the package is available for installation.""" + return self.TryRemoteCommand( + 'sudo swupd bundle-list --all | grep {0}'.format(package), + suppress_warning=True) + + def InstallPackages(self, packages: str) -> None: + """Installs packages using the swupd bundle manager.""" + self.RemoteCommand('sudo swupd bundle-add {0}'.format(packages)) + + def Install(self, package_name): + """Installs a PerfKit package on the VM.""" + if not self.install_packages: + return + if package_name not in self._installed_packages: + package = linux_packages.PACKAGES[package_name] + if hasattr(package, 'SwupdInstall'): + package.SwupdInstall(self) + elif hasattr(package, 'Install'): + package.Install(self) + else: + raise KeyError( + 'Package {0} has no install method for Clear Linux.'.format( + package_name)) + self._installed_packages.add(package_name) + + def Uninstall(self, package_name): + """Uninstalls a PerfKit package on the VM.""" + package = linux_packages.PACKAGES[package_name] + if hasattr(package, 'SwupdUninstall'): + package.SwupdUninstall(self) + elif hasattr(package, 'Uninstall'): + package.Uninstall(self) + + def GetPathToConfig(self, package_name): + """See base class.""" + package = linux_packages.PACKAGES[package_name] + return package.SwupdGetPathToConfig(self) + + def GetServiceName(self, package_name): + """See base class.""" + package = linux_packages.PACKAGES[package_name] + return package.SwupdGetServiceName(self) + + def GetOsInfo(self): + """See base class.""" + stdout, _ = self.RemoteCommand('swupd info | grep Installed') + return 'Clear Linux build: {0}'.format( + regex_util.ExtractGroup(CLEAR_BUILD_REGEXP, stdout)) + + def SetupProxy(self): + """Sets up proxy configuration variables for the cloud environment.""" + super(ClearMixin, self).SetupProxy() + profile_file = '/etc/profile' + commands = [] + + if FLAGS.http_proxy: + commands.append("echo 'export http_proxy=%s' | sudo tee -a %s" % ( + FLAGS.http_proxy, profile_file)) + + if FLAGS.https_proxy: + commands.append("echo 'https_proxy=%s' | sudo tee -a %s" % ( + FLAGS.https_proxy, profile_file)) + + if FLAGS.ftp_proxy: + commands.append("echo 'ftp_proxy=%s' | sudo tee -a %s" % ( + FLAGS.ftp_proxy, profile_file)) + + if FLAGS.no_proxy: + commands.append("echo 'export no_proxy=%s' | sudo tee -a %s" % ( + FLAGS.no_proxy, profile_file)) + if commands: + self.RemoteCommand(';'.join(commands)) + + def RemoteCommand(self, command, **kwargs): + """Runs a command inside the container. + + Args: + command: Arguments passed directly to RemoteHostCommandWithReturnCode. + **kwargs: Keyword arguments passed directly to + RemoteHostCommandWithReturnCode. + + Returns: + A tuple of stdout and stderr from running the command. + """ + # Escapes bash sequences + command = '. 
/etc/profile; %s' % (command) + return self.RemoteHostCommand(command, **kwargs)[:2] + + +class BaseContainerLinuxMixin(BaseLinuxMixin): + """Class holding VM methods for minimal container-based OSes like Core OS. + + These operating systems have SSH like other Linux OSes, but no package manager + to run Linux benchmarks without Docker. + + Because they cannot install packages, they only support VM life cycle + benchmarks like cluster_boot. + """ + + def InstallPackages(self, package_name): + raise NotImplementedError('Container OSes have no package managers.') + + def HasPackage(self, package: str) -> bool: + return False + + # Install could theoretically be supported. A hermetic architecture + # appropriate binary could be copied into the VM and run. + # However because curl, wget, and object store clients cannot be installed and + # may or may not be present, copying the binary is non-trivial so simply + # block trying. + + def Install(self, package_name): + raise NotImplementedError('Container OSes have no package managers.') + + def Uninstall(self, package_name): + raise NotImplementedError('Container OSes have no package managers.') + + def PrepareVMEnvironment(self): + # Don't try to install packages as normal, because it will fail. + pass + + +class BaseRhelMixin(BaseLinuxMixin): + """Class holding RHEL/CentOS specific VM methods and attributes.""" + + # OS_TYPE = os_types.RHEL + BASE_OS_TYPE = os_types.RHEL + + def OnStartup(self): + """Eliminates the need to have a tty to run sudo commands.""" + super(BaseRhelMixin, self).OnStartup() + self.RemoteHostCommand('echo \'Defaults:%s !requiretty\' | ' + 'sudo tee /etc/sudoers.d/pkb' % self.user_name, + login_shell=True) + if FLAGS.gce_hpc_tools: + self.InstallGcpHpcTools() + if _DISABLE_YUM_CRON.value: + # yum cron can stall causing yum commands to hang + self.RemoteHostCommand('sudo systemctl disable yum-cron.service', + ignore_failure=True) + + def InstallGcpHpcTools(self): + """Installs the GCP HPC tools.""" + self.Install('gce_hpc_tools') + + def InstallEpelRepo(self): + """Installs the Extra Packages for Enterprise Linux repository.""" + self.Install('epel_release') + + def PackageCleanup(self): + """Cleans up all installed packages. + + Performs the normal package cleanup, then deletes the file + added to the /etc/sudoers.d directory during startup. 
+ """ + super(BaseRhelMixin, self).PackageCleanup() + self.RemoteCommand('sudo rm -f /etc/sudoers.d/pkb') + + def SnapshotPackages(self): + """Grabs a snapshot of the currently installed packages.""" + self.RemoteCommand('rpm -qa > %s/rpm_package_list' + % linux_packages.INSTALL_DIR) + + def RestorePackages(self): + """Restores the currently installed packages to those snapshotted.""" + self.RemoteCommand( + 'rpm -qa | grep --fixed-strings --line-regexp --invert-match --file ' + '%s/rpm_package_list | xargs --no-run-if-empty sudo rpm -e' % + linux_packages.INSTALL_DIR, + ignore_failure=True) + + def HasPackage(self, package): + """Returns True iff the package is available for installation.""" + return self.TryRemoteCommand('sudo yum info %s' % package, + suppress_warning=True) + + # yum talks to the network on each request so transient issues may fix + # themselves on retry + @vm_util.Retry(max_retries=UPDATE_RETRIES) + def InstallPackages(self, packages): + """Installs packages using the yum package manager.""" + self.RemoteCommand('sudo yum install -y %s' % packages) + + @vm_util.Retry() + def InstallPackageGroup(self, package_group): + """Installs a 'package group' using the yum package manager.""" + self.RemoteCommand('sudo yum groupinstall -y "%s"' % package_group) + + def Install(self, package_name): + """Installs a PerfKit package on the VM.""" + if not self.install_packages: + return + if package_name not in self._installed_packages: + package = linux_packages.PACKAGES[package_name] + if hasattr(package, 'YumInstall'): + package.YumInstall(self) + elif hasattr(package, 'Install'): + package.Install(self) + else: + raise KeyError('Package %s has no install method for RHEL.' % + package_name) + self._installed_packages.add(package_name) + + def Uninstall(self, package_name): + """Uninstalls a PerfKit package on the VM.""" + package = linux_packages.PACKAGES[package_name] + if hasattr(package, 'YumUninstall'): + package.YumUninstall(self) + elif hasattr(package, 'Uninstall'): + package.Uninstall(self) + + def GetPathToConfig(self, package_name): + """Returns the path to the config file for PerfKit packages. + + This function is mostly useful when config files locations + don't match across distributions (such as mysql). Packages don't + need to implement it if this is not the case. + """ + package = linux_packages.PACKAGES[package_name] + return package.YumGetPathToConfig(self) + + def GetServiceName(self, package_name): + """Returns the service name of a PerfKit package. + + This function is mostly useful when service names don't + match across distributions (such as mongodb). Packages don't + need to implement it if this is not the case. 
+ """ + package = linux_packages.PACKAGES[package_name] + return package.YumGetServiceName(self) + + def SetupProxy(self): + """Sets up proxy configuration variables for the cloud environment.""" + super(BaseRhelMixin, self).SetupProxy() + yum_proxy_file = '/etc/yum.conf' + + if FLAGS.http_proxy: + self.RemoteCommand("grep -qx 'proxy={0}' {1} || echo -e 'proxy={0}' |\ + sudo tee -a {1}".format(FLAGS.http_proxy, yum_proxy_file)) + + def ProxyCleanup(self): + """ Restore to a state before SetupProxy() executed """ + super(BaseRhelMixin, self).ProxyCleanup() + yum_proxy_file = '/etc/yum.conf' + command = "sudo sed -i '\#^proxy={0}$#d' {1}" + + if FLAGS.http_proxy: + self.RemoteCommand(command.format(FLAGS.http_proxy, yum_proxy_file)) + + def AppendKernelCommandLine(self, command_line, reboot=True): + """Appends the provided command-line to the VM and reboots by default.""" + self.RemoteCommand( + r'echo GRUB_CMDLINE_LINUX_DEFAULT=\"\${GRUB_CMDLINE_LINUX_DEFAULT} %s\"' + ' | sudo tee -a /etc/default/grub' % command_line) + self.RemoteCommand('sudo grub2-mkconfig -o /boot/grub2/grub.cfg') + self.RemoteCommand('sudo grub2-mkconfig -o /etc/grub2.cfg') + if reboot: + self.Reboot() + + def AllowPortOsFirewall(self, start_port, end_port=None): + out, stderr, retcode = self.RemoteCommandWithReturnCode('sudo firewall-cmd --state', ignore_failure=True) + if retcode == 1: + logging.info("Firewalld does not appear to be installed, " + "skipping opening port(s) {}-{}.".format(start_port, end_port)) + else: + if out.startswith('running'): + cmd = 'sudo firewall-cmd --permanent --add-port {0}/tcp && ' \ + 'sudo firewall-cmd --permanent --add-port {0}/udp && ' \ + 'sudo firewall-cmd --reload'.format(start_port) + if end_port: + cmd = 'sudo firewall-cmd --permanent --add-port {0}-{1}/tcp && ' \ + 'sudo firewall-cmd --permanent --add-port {0}-{1}/udp && ' \ + 'sudo firewall-cmd --reload'.format(start_port, end_port) + self.RemoteCommand(cmd) + else: + logging.info("Firewalld does not appear to be running, " + "skipping opening port(s) {}-{}.".format(start_port, end_port)) + + def _AssignAdditionalPrivateIpAddresses(self, netmask, gateway): + self.InstallPackages("net-tools iproute") + private_addresses = self.additional_private_ip_addresses + stderr = "" + route_cmd = "{0} | awk '/^default/{{print $NF}}' | sort | head -n 1" + route_paths = ["/usr/sbin/route", "route"] + for route_path in route_paths: + out, stderr, retcode = self.RemoteCommandWithReturnCode(route_cmd.format(route_path), + ignore_failure=True) + if retcode == 0: + break + else: + raise errors.VirtualMachine.RemoteCommandError(stderr) + interface_name = str(out.strip()) + out, _ = self.RemoteCommand("cat /sys/class/net/{0}/address".format(interface_name)) + mac_address = str(out.strip()) + dir_path = posixpath.join("/etc", "sysconfig", "network-scripts") + append_to_file = 'echo "{{0}}" | sudo tee -a {0}' + for i, addr in enumerate(private_addresses): + interface_num = i + 1 + subinterface_name = "{0}:{1}".format(interface_name, interface_num) + file_path = posixpath.join(dir_path, "ifcfg-{0}".format(subinterface_name)) + append_to_local_file = append_to_file.format(file_path) + cmds = [ + append_to_local_file.format('DEVICE="{0}"'.format(subinterface_name)), + append_to_local_file.format("BOOTPROTO=static"), + append_to_local_file.format("ONBOOT=YES"), + append_to_local_file.format("TYPE=Ethernet"), + append_to_local_file.format("IPADDR={0}".format(addr)), + append_to_local_file.format("NETMASK={0}".format(netmask)), + 
append_to_local_file.format("GATEWAY={0}".format(gateway)), + append_to_local_file.format("HWADDR={0}".format(mac_address)), + "sudo ifup {0}".format(subinterface_name) + ] + self.RemoteCommand(' && '.join(cmds)) + + +class AmazonLinux2Mixin(BaseRhelMixin): + """Class holding Amazon Linux 2 VM methods and attributes.""" + OS_TYPE = os_types.AMAZONLINUX2 + + +class Rhel7Mixin(BaseRhelMixin): + """Class holding RHEL 7 specific VM methods and attributes.""" + OS_TYPE = os_types.RHEL7 + + +class Rhel8Mixin(BaseRhelMixin): + """Class holding RHEL 8 specific VM methods and attributes.""" + OS_TYPE = os_types.RHEL8 + + +class CentOs7Mixin(BaseRhelMixin): + """Class holding CentOS 7 specific VM methods and attributes.""" + OS_TYPE = os_types.CENTOS7 + + +class CentOs8Mixin(BaseRhelMixin, virtual_machine.DeprecatedOsMixin): + """Class holding CentOS 8 specific VM methods and attributes.""" + OS_TYPE = os_types.CENTOS8 + END_OF_LIFE = '2021-12-31' + ALTERNATIVE_OS = f'{os_types.CENTOS_STREAM8} or {os_types.CENTOS_STREAM9}' + + +class CentOsStream8Mixin(BaseRhelMixin): + """Class holding CentOS Stream 8 specific VM methods and attributes.""" + OS_TYPE = os_types.CENTOS_STREAM8 + + +class CentOsStream9Mixin(BaseRhelMixin): + """Class holding CentOS Stream 9 specific VM methods and attributes.""" + OS_TYPE = os_types.CENTOS_STREAM9 + + +class RockyLinux8Mixin(BaseRhelMixin): + """Class holding Rocky Linux 8 specific VM methods and attributes.""" + OS_TYPE = os_types.ROCKY_LINUX8 + + +class ContainerOptimizedOsMixin(BaseContainerLinuxMixin): + """Class holding COS specific VM methods and attributes.""" + OS_TYPE = os_types.COS + BASE_OS_TYPE = os_types.CORE_OS + + def PrepareVMEnvironment(self): + super(ContainerOptimizedOsMixin, self).PrepareVMEnvironment() + # COS mounts /home and /tmp with -o noexec, which blocks running benchmark + # binaries. + # TODO(user): Support reboots + self.RemoteCommand('sudo mount -o remount,exec /home') + self.RemoteCommand('sudo mount -o remount,exec /tmp') + + +class CoreOsMixin(BaseContainerLinuxMixin): + """Class holding CoreOS Container Linux specific VM methods and attributes.""" + OS_TYPE = os_types.CORE_OS + BASE_OS_TYPE = os_types.CORE_OS + + +class BaseDebianMixin(BaseLinuxMixin): + """Class holding Debian specific VM methods and attributes.""" + + OS_TYPE = 'base-only' + BASE_OS_TYPE = os_types.DEBIAN + _PACKAGES = {} + + def __init__(self, *args, **kwargs): + super(BaseDebianMixin, self).__init__(*args, **kwargs) + + # Whether or not apt-get update has been called. + # We defer running apt-get update until the first request to install a + # package. + self._apt_updated = False + + @vm_util.Retry(max_retries=UPDATE_RETRIES) + def AptUpdate(self): + """Updates the package lists on VMs using apt.""" + try: + # setting the timeout on the apt-get to 5 minutes because + # it is known to get stuck. In a normal update this + # takes less than 30 seconds. + self.RemoteCommand('sudo apt-get update', timeout=300) + except errors.VirtualMachine.RemoteCommandError as e: + # If there is a problem, remove the lists in order to get rid of + # "Hash Sum mismatch" errors (the files will be restored when + # apt-get update is run again). 
+ self.RemoteCommand('sudo rm -r /var/lib/apt/lists/*') + raise e + + def SnapshotPackages(self): + """Grabs a snapshot of the currently installed packages.""" + initial_packages, _ = self.RemoteCommand('dpkg-query --show') + initial_packages = str(initial_packages.strip()) + initial_packages = initial_packages.splitlines() + package_infos = '(?:\s+)?(.*?)(?::(.*?))?\s+(.*)' + + for line in initial_packages: + package = re.search(package_infos, line).group(1) + architecture = re.search(package_infos, line).group(2) + version = re.search(package_infos, line).group(3) + package_pair = (version, architecture) + self._PACKAGES[package] = package_pair + + def RestorePackages(self): + """Restores the currently installed packages to those snapshotted.""" + initial_package_names = self._PACKAGES.keys() + # If snapshots are not available, do nothing + if len(initial_package_names) == 0: + logging.info("No package snapshot information available, could not restore system to its original state") + return + + unavailable_package_names = [] + MAX_PACKAGE_PER_CMD = 50 + + def _PurgePackages(current_packages, initial_package_names): + '''Purge unwanted packages that are in addition over default.''' + remove_packages = 'sudo DEBIAN_FRONTEND=noninteractive apt-get --purge remove -y' + package_idx = 0 + commands_list = [] + checked_packages = {} + validate_package = False + madison_package_infos = '(?:\s+)?(.*?)(?::.*?)?\s+\|\s+(.*?)(?=\s+\|)\s+\|\s+(?:.*?)\s+(?:.*?)\s+(?:(.*?)\s+)?' + current_package_infos = '(?:\s+)?(.*?)(?::(.*?))?\s+(.*)' + while package_idx < len(current_packages): + cmd = 'apt-cache madison' + # Batching packages is needed because SSH have a limit of how long the string can be + for count in range(MAX_PACKAGE_PER_CMD): + if package_idx < len(current_packages): + package = re.search(current_package_infos, current_packages[package_idx]).group(1) + cmd += " " + package + package_idx += 1 + else: + break + commands_list.append(cmd) + for madison_pack in commands_list: + check_package, _ = self.RemoteCommand(madison_pack) + check_package = str(check_package.strip()) + check_package = check_package.splitlines() + for line in check_package: + if not line or "|" not in line: + continue + found = re.search(madison_package_infos, line) + if found: + package = found.group(1) + version = found.group(2) + architecture = found.group(3) + package_pair = (version, architecture) + if package not in checked_packages.keys(): + checked_packages[package] = [package_pair] + else: + checked_packages[package].append(package_pair) + else: + logging.warn('Unmatched package found: [%s] ', line) + for element in reversed(current_packages): + package_name = re.search(current_package_infos, element).group(1) + # for packages do not have apt-cache madison info + if package_name not in checked_packages: + continue + # remove packages that do not belong to initial setup + if package_name not in initial_package_names: + remove_packages += " " + package_name + unavailable_package_names.append(package_name) + current_packages.remove(element) + del checked_packages[package_name] + continue + for pair in checked_packages[package_name]: + try: + if self._PACKAGES[package_name][0] == pair[0] and self._PACKAGES[package_name][1] == pair[1]: + validate_package = True + except TypeError: + if self._PACKAGES[package_name][0] == pair[0]: + validate_package = True + if validate_package is False: + unavailable_package_names.append(package_name) + else: + validate_package = False + self.RemoteCommand(remove_packages, 
ignore_failure=True, suppress_warning=True) + return current_packages + + def _GetAvailablePackages(current_packages): + '''Create commands batches to downgrade all default packages to their initial state''' + package_idx = 0 + commands_list = [] + while package_idx < len(current_packages): + cmd = 'sudo DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-downgrades' + # Batching packages is needed because SSH have a limit of how long the string can be + for count in range(MAX_PACKAGE_PER_CMD): + if package_idx < len(current_packages): + package = re.search(r'(?:\s+)?(.*?)(?::(.*?))?\s+(.*)', current_packages[package_idx]).group(1) + if package not in unavailable_package_names and package in self._PACKAGES: + cmd += " " + package + "=" + self._PACKAGES[package][0] + package_idx += 1 + else: + break + commands_list.append(cmd) + return commands_list + current_packages, _ = self.RemoteCommand('dpkg-query --show') + current_packages = str(current_packages.strip()) + current_packages = current_packages.splitlines() + current_packages = _PurgePackages(current_packages, initial_package_names) + commands_list = _GetAvailablePackages(current_packages) + for pack in commands_list: + self.RemoteCommand(pack, ignore_failure=True, suppress_warning=True) + self.RemoteCommand('sudo apt-get -y autoremove', ignore_failure=True, suppress_warning=True) + + def HasPackage(self, package): + """Returns True iff the package is available for installation.""" + return self.TryRemoteCommand('apt-get install --just-print %s' % package, + suppress_warning=True) + + @vm_util.Retry() + def InstallPackages(self, packages): + """Installs packages using the apt package manager.""" + if not self._apt_updated: + self.AptUpdate() + self._apt_updated = True + try: + install_command = ('sudo DEBIAN_FRONTEND=\'noninteractive\' ' + '/usr/bin/apt-get -y install %s' % (packages)) + self.RemoteCommand(install_command) + except errors.VirtualMachine.RemoteCommandError as e: + # TODO(user): Remove code below after Azure fix their package repository, + # or add code to recover the sources.list + self.RemoteCommand( + 'sudo sed -i.bk "s/azure.archive.ubuntu.com/archive.ubuntu.com/g" ' + '/etc/apt/sources.list') + logging.info('Installing "%s" failed on %s. This may be transient. ' + 'Updating package list.', packages, self) + self.AptUpdate() + raise e + + def Install(self, package_name): + """Installs a PerfKit package on the VM.""" + if not self.install_packages: + return + + if not self._apt_updated: + self.AptUpdate() + self._apt_updated = True + + if package_name not in self._installed_packages: + package = linux_packages.PACKAGES[package_name] + if hasattr(package, 'AptInstall'): + package.AptInstall(self) + elif hasattr(package, 'Install'): + package.Install(self) + else: + raise KeyError('Package %s has no install method for Debian.' % + package_name) + self._installed_packages.add(package_name) + + def Uninstall(self, package_name): + """Uninstalls a PerfKit package on the VM.""" + package = linux_packages.PACKAGES[package_name] + if hasattr(package, 'AptUninstall'): + package.AptUninstall(self) + elif hasattr(package, 'Uninstall'): + package.Uninstall(self) + self._installed_packages.discard(package_name) + + def GetPathToConfig(self, package_name): + """Returns the path to the config file for PerfKit packages. + + This function is mostly useful when config files locations + don't match across distributions (such as mysql). Packages don't + need to implement it if this is not the case. 
+ + Args: + package_name: the name of the package. + """ + package = linux_packages.PACKAGES[package_name] + return package.AptGetPathToConfig(self) + + def GetServiceName(self, package_name): + """Returns the service name of a PerfKit package. + + This function is mostly useful when service names don't + match across distributions (such as mongodb). Packages don't + need to implement it if this is not the case. + + Args: + package_name: the name of the package. + """ + package = linux_packages.PACKAGES[package_name] + return package.AptGetServiceName(self) + + def AllowPortOsFirewall(self, start_port, end_port=None): + out, stderr, retcode = self.RemoteCommandWithReturnCode('sudo ufw status', ignore_failure=True) + if retcode == 1: + logging.info("Ufw does not appear to be installed, " + "skipping opening port(s) {}-{}.".format(start_port, end_port)) + else: + if out.startswith('Status: active'): + cmd = 'sudo ufw allow {}'.format(start_port) + if end_port: + cmd = 'sudo ufw allow {}:{}'.format(start_port, end_port) + self.RemoteCommand(cmd) + else: + logging.info("Ufw does not appear to be running, " + "skipping opening port(s) {}-{}.".format(start_port, end_port)) + + def SetupProxy(self): + """Sets up proxy configuration variables for the cloud environment.""" + super(BaseDebianMixin, self).SetupProxy() + apt_proxy_file = '/etc/apt/apt.conf' + commands = [] + + if FLAGS.http_proxy: + commands.append("egrep -q 'http::proxy\s+\"{0}\";$' {1}" + "|| echo -e 'Acquire::http::proxy \"{0}\";' |" + "sudo tee -a {1}".format(FLAGS.http_proxy, apt_proxy_file)) + + if FLAGS.https_proxy: + commands.append("egrep -q 'https::proxy\s+\"{0}\";$' {1}" + "|| echo -e 'Acquire::https::proxy \"{0}\";' |" + "sudo tee -a {1}".format(FLAGS.https_proxy, apt_proxy_file)) + + if commands: + self.RemoteCommand(";".join(commands)) + + def ProxyCleanup(self): + """ Restore to a state before SetupProxy() executed """ + super(BaseDebianMixin, self).ProxyCleanup() + apt_proxy_file = '/etc/apt/apt.conf' + commands = [] + command_template = "sudo sed -i '\#{0}\";$#d' {1}" + + if FLAGS.http_proxy: + commands.append(command_template.format(FLAGS.http_proxy, apt_proxy_file)) + + if FLAGS.https_proxy: + commands.append(command_template.format(FLAGS.https_proxy, apt_proxy_file)) + + if commands: + self.RemoteCommand(';'.join(commands)) + + def IncreaseSSHConnection(self, target): + """Increase maximum number of ssh connections on vm. + + Args: + target: int. The max number of ssh connection. 
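+
+ Illustrative example: IncreaseSSHConnection(100) rewrites any MaxStartups
+ line in /etc/ssh/sshd_config to 'MaxStartups 100' and then restarts the
+ ssh service so the new limit takes effect.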
+ """ + self.RemoteCommand(r'sudo sed -i -e "s/.*MaxStartups.*/MaxStartups {0}/" ' + '/etc/ssh/sshd_config'.format(target)) + self.RemoteCommand('sudo service ssh restart') + + def AppendKernelCommandLine(self, command_line, reboot=True): + """Appends the provided command-line to the VM and reboots by default.""" + self.RemoteCommand( + r'echo GRUB_CMDLINE_LINUX_DEFAULT=\"\${GRUB_CMDLINE_LINUX_DEFAULT} %s\"' + r' | sudo tee -a /etc/default/grub' % command_line) + self.RemoteCommand('sudo update-grub') + if reboot: + self.Reboot() + + +class Debian9Mixin(BaseDebianMixin): + """Class holding Debian9 specific VM methods and attributes.""" + OS_TYPE = os_types.DEBIAN9 + # https://packages.debian.org/stretch/python + PYTHON_2_PACKAGE = 'python' + + +class Debian10Mixin(BaseDebianMixin): + """Class holding Debian 10 specific VM methods and attributes.""" + OS_TYPE = os_types.DEBIAN10 + + +class Debian11Mixin(BaseDebianMixin): + """Class holding Debian 11 specific VM methods and attributes.""" + OS_TYPE = os_types.DEBIAN11 + + def PrepareVMEnvironment(self): + # Missing in some images. Required by PrepareVMEnvironment to determine + # partitioning. + self.InstallPackages('fdisk') + super().PrepareVMEnvironment() + + +class BaseUbuntuMixin(BaseDebianMixin): + """Class holding Ubuntu specific VM methods and attributes.""" + + def AppendKernelCommandLine(self, command_line, reboot=True): + """Appends the provided command-line to the VM and reboots by default.""" + self.RemoteCommand( + r'echo GRUB_CMDLINE_LINUX_DEFAULT=\"\${GRUB_CMDLINE_LINUX_DEFAULT} %s\"' + r' | sudo tee -a /etc/default/grub.d/50-cloudimg-settings.cfg' % + command_line) + self.RemoteCommand('sudo update-grub') + if reboot: + self.Reboot() + + def _AssignAdditionalPrivateIpAddresses(self, netmask, gateway): + if self.OS_TYPE > os_types.UBUNTU1604: + if self.HasPackage("ifupdown"): + self.InstallPackages("ifupdown") + if self.OS_TYPE == os_types.UBUNTU2004: + if self.HasPackage("net-tools"): + self.InstallPackages('net-tools') + out, _ = self.RemoteCommand("route | awk '/^default/{{print $NF}}' | sort | head -n 1") + interface_name = str(out.strip()) + private_addresses = self.additional_private_ip_addresses + pkb_net_cfg = posixpath.join("/etc", "network", "interfaces") + append_to_file = 'echo "{{0}}" | sudo tee -a {0}'.format(pkb_net_cfg) + subinterface_names = [] + for i, addr in enumerate(private_addresses): + interface_num = i + 1 + subinterface_name = "{0}:{1}".format(interface_name, interface_num) + subinterface_names.append(subinterface_name) + cmds = [ + append_to_file.format("auto {0}".format(subinterface_name)), + append_to_file.format("iface {0}:{1} inet static".format(interface_name, interface_num)), + append_to_file.format("address {0}".format(addr)), + append_to_file.format("netmask {0}".format(netmask)), + append_to_file.format(""), + ] + self.RemoteCommand(' && '.join(cmds)) + cmds = ["sudo ifup {0}".format(subiface) for subiface in subinterface_names] + self.RemoteCommand(' && '.join(cmds)) + + +class Ubuntu1604Mixin(BaseUbuntuMixin, virtual_machine.DeprecatedOsMixin): + """Class holding Ubuntu1604 specific VM methods and attributes.""" + OS_TYPE = os_types.UBUNTU1604 + PYTHON_2_PACKAGE = 'python' + END_OF_LIFE = '2021-05-01' + ALTERNATIVE_OS = os_types.UBUNTU1804 + + +class Ubuntu1804Mixin(BaseUbuntuMixin): + """Class holding Ubuntu1804 specific VM methods and attributes.""" + OS_TYPE = os_types.UBUNTU1804 + # https://packages.ubuntu.com/bionic/python + PYTHON_2_PACKAGE = 'python' + + +class 
Ubuntu1804EfaMixin(Ubuntu1804Mixin): + """Class holding EFA specific VM methods and attributes.""" + OS_TYPE = os_types.UBUNTU1804_EFA + + +# Inherit Ubuntu 18's idiosyncracies. +# Note https://bugs.launchpad.net/snappy/+bug/1659719 is also marked not fix in +# focal. +class Ubuntu2004Mixin(Ubuntu1804Mixin): + """Class holding Ubuntu2004 specific VM methods and attributes.""" + OS_TYPE = os_types.UBUNTU2004 + # https://packages.ubuntu.com/focal/python2 + PYTHON_2_PACKAGE = 'python2' + + +class Ubuntu2204Mixin(BaseUbuntuMixin): + """Class holding Ubuntu2204 specific VM methods and attributes.""" + OS_TYPE = os_types.UBUNTU2204 + + +class Ubuntu1604Cuda9Mixin(Ubuntu1604Mixin): + """Class holding NVIDIA CUDA specific VM methods and attributes.""" + OS_TYPE = os_types.UBUNTU1604_CUDA9 + + +class ContainerizedDebianMixin(BaseDebianMixin): + """Class representing a Containerized Virtual Machine. + + A Containerized Virtual Machine is a VM that runs remote commands + within a Docker Container. + Any call to RemoteCommand() will be run within the container + whereas any call to RemoteHostCommand() will be run in the VM itself. + """ + + OS_TYPE = os_types.UBUNTU_CONTAINER + BASE_DOCKER_IMAGE = 'ubuntu:xenial' + + def __init__(self, *args, **kwargs): + super(ContainerizedDebianMixin, self).__init__(*args, **kwargs) + self.docker_id = None + + def _CheckDockerExists(self): + """Returns whether docker is installed or not.""" + resp, _ = self.RemoteHostCommand('command -v docker', ignore_failure=True, + suppress_warning=True) + if resp.rstrip() == '': + return False + return True + + def PrepareVMEnvironment(self): + """Initializes docker before proceeding with preparation.""" + if not self._CheckDockerExists(): + self.Install('docker') + # We need to explicitly create VM_TMP_DIR in the host because + # otherwise it will be implicitly created by Docker in InitDocker() + # (because of the -v option) and owned by root instead of perfkit, + # causing permission problems. + self.RemoteHostCommand('mkdir -p %s' % vm_util.VM_TMP_DIR) + self.InitDocker() + # This will create the VM_TMP_DIR in the container. + # Has to be done after InitDocker() because it needs docker_id. + self._CreateVmTmpDir() + + super(ContainerizedDebianMixin, self).PrepareVMEnvironment() + + def InitDocker(self): + """Initializes the docker container daemon.""" + init_docker_cmd = ['sudo docker run -d ' + '--rm ' + '--net=host ' + '--workdir=%s ' + '-v %s:%s ' % (CONTAINER_WORK_DIR, + vm_util.VM_TMP_DIR, + CONTAINER_MOUNT_DIR)] + for sd in self.scratch_disks: + init_docker_cmd.append('-v %s:%s ' % (sd.mount_point, sd.mount_point)) + init_docker_cmd.append('%s sleep infinity ' % self.BASE_DOCKER_IMAGE) + init_docker_cmd = ''.join(init_docker_cmd) + + resp, _ = self.RemoteHostCommand(init_docker_cmd) + self.docker_id = resp.rstrip() + return self.docker_id + + def RemoteCommand(self, command, **kwargs): + """Runs a command inside the container. + + Args: + command: A valid bash command. + **kwargs: Keyword arguments passed directly to RemoteHostCommand. + + Returns: + A tuple of stdout and stderr from running the command. + """ + # Escapes bash sequences + command = command.replace("'", r"'\''") + + logging.info('Docker running: %s', command) + command = "sudo docker exec %s bash -c '%s'" % (self.docker_id, command) + return self.RemoteHostCommand(command, **kwargs) + + def ContainerCopy(self, file_name, container_path='', copy_to=True): + """Copies a file to or from container_path to the host's vm_util.VM_TMP_DIR. 
+ + Args: + file_name: Name of the file in the host's vm_util.VM_TMP_DIR. + container_path: Optional path of where to copy file on container. + copy_to: True to copy to container, False to copy from container. + Raises: + RemoteExceptionError: If the source container_path is blank. + """ + if copy_to: + if container_path == '': + container_path = CONTAINER_WORK_DIR + + # Everything in vm_util.VM_TMP_DIR is directly accessible + # both in the host and in the container + source_path = posixpath.join(CONTAINER_MOUNT_DIR, file_name) + command = 'cp %s %s' % (source_path, container_path) + self.RemoteCommand(command) + else: + if container_path == '': + raise errors.VirtualMachine.RemoteExceptionError('Cannot copy ' + 'from blank target') + destination_path = posixpath.join(CONTAINER_MOUNT_DIR, file_name) + command = 'cp %s %s' % (container_path, destination_path) + self.RemoteCommand(command) + + @vm_util.Retry( + poll_interval=1, max_retries=3, + retryable_exceptions=(errors.VirtualMachine.RemoteCommandError,)) + def RemoteCopy(self, file_path, remote_path='', copy_to=True): + """Copies a file to or from the container in the remote VM. + + Args: + file_path: Local path to file. + remote_path: Optional path of where to copy file inside the container. + copy_to: True to copy to VM, False to copy from VM. + """ + if copy_to: + file_name = os.path.basename(file_path) + tmp_path = posixpath.join(vm_util.VM_TMP_DIR, file_name) + self.RemoteHostCopy(file_path, tmp_path, copy_to) + self.ContainerCopy(file_name, remote_path, copy_to) + else: + file_name = posixpath.basename(remote_path) + tmp_path = posixpath.join(vm_util.VM_TMP_DIR, file_name) + self.ContainerCopy(file_name, remote_path, copy_to) + self.RemoteHostCopy(file_path, tmp_path, copy_to) + + def MoveFile(self, target, source_path, remote_path=''): + """Copies a file from one VM to a target VM. + + Copies a file from a container in the source VM to a container + in the target VM. + + Args: + target: The target ContainerizedVirtualMachine object. + source_path: The location of the file on the REMOTE machine. + remote_path: The destination of the file on the TARGET machine, default + is the root directory. + """ + file_name = posixpath.basename(source_path) + + # Copies the file to vm_util.VM_TMP_DIR in source + self.ContainerCopy(file_name, source_path, copy_to=False) + + # Moves the file to vm_util.VM_TMP_DIR in target + source_host_path = posixpath.join(vm_util.VM_TMP_DIR, file_name) + target_host_dir = vm_util.VM_TMP_DIR + self.MoveHostFile(target, source_host_path, target_host_dir) + + # Copies the file to its final destination in the container + target.ContainerCopy(file_name, remote_path) + + def SnapshotPackages(self): + """Grabs a snapshot of the currently installed packages.""" + pass + + def PackageCleanup(self): + """Cleans up all installed packages. + + Stop the docker container launched with --rm. + """ + if self.docker_id: + self.RemoteHostCommand('docker stop %s' % self.docker_id) + + +class KernelRelease(object): + """Holds the contents of the linux kernel version returned from uname -r.""" + + def __init__(self, uname): + """KernelVersion Constructor. 
+ + Args: + uname: A string in the format of "uname -r" command + """ + + # example format would be: "4.5.0-96-generic" + # or "3.10.0-514.26.2.el7.x86_64" for centos + # major.minor.Rest + # in this example, major = 4, minor = 5 + major_string, minor_string, _ = uname.split('.', 2) + self.major = int(major_string) + self.minor = int(minor_string) + + def AtLeast(self, major, minor): + """Check If the kernel version meets a minimum bar. + + The kernel version needs to be at least as high as the major.minor + specified in args. + + Args: + major: The major number to test, as an integer + minor: The minor number to test, as an integer + + Returns: + True if the kernel version is at least as high as major.minor, + False otherwise + """ + if self.major < major: + return False + if self.major > major: + return True + return self.minor >= minor + + +def _ParseTextProperties(text): + """Parses raw text that has lines in "key:value" form. + + When comes across an empty line will return a dict of the current values. + + Args: + text: Text of lines in "key:value" form. + + Yields: + Dict of [key,value] values for a section. + """ + current_data = {} + for line in (line.strip() for line in text.splitlines()): + if line: + m = _COLON_SEPARATED_RE.match(line) + if m: + current_data[m.group('key')] = m.group('value') + else: + logging.debug('Ignoring bad line "%s"', line) + else: + # Hit a section break + if current_data: + yield current_data + current_data = {} + if current_data: + yield current_data + + +class LsCpuResults(object): + """Holds the contents of the command lscpu.""" + + def __init__(self, lscpu): + """LsCpuResults Constructor. + + The lscpu command on Ubuntu 16.04 does *not* have the "--json" option for + json output, so keep on using the text format. + + Args: + lscpu: A string in the format of "lscpu" command + + Raises: + ValueError: if the format of lscpu isnt what was expected for parsing + + Example value of lscpu is: + Architecture: x86_64 + CPU op-mode(s): 32-bit, 64-bit + Byte Order: Little Endian + CPU(s): 12 + On-line CPU(s) list: 0-11 + Thread(s) per core: 2 + Core(s) per socket: 6 + Socket(s): 1 + NUMA node(s): 1 + Vendor ID: GenuineIntel + CPU family: 6 + Model: 79 + Stepping: 1 + CPU MHz: 1202.484 + BogoMIPS: 7184.10 + Virtualization: VT-x + L1d cache: 32K + L1i cache: 32K + L2 cache: 256K + L3 cache: 15360K + NUMA node0 CPU(s): 0-11 + """ + self.data = {} + for stanza in _ParseTextProperties(lscpu): + self.data.update(stanza) + + def GetInt(key): + if key in self.data and self.data[key].isdigit(): + return int(self.data[key]) + raise ValueError('Could not find integer "{}" in {}'.format( + key, sorted(self.data))) + + if 'NUMA node(s)' in self.data: + self.numa_node_count = GetInt('NUMA node(s)') + else: + self.numa_node_count = None + self.cores_per_socket = GetInt('Core(s) per socket') + self.socket_count = GetInt('Socket(s)') + self.threads_per_core = GetInt('Thread(s) per core') + + +class ProcCpuResults(object): + """Parses /proc/cpuinfo text into grouped values. + + Most of the cpuinfo is repeated per processor. Known ones that change per + processor are listed in _PER_CPU_KEYS and are processed separately to make + reporting easier. 
+ + Example metadata for metric='proccpu': + |bugs:spec_store_bypass spectre_v1 spectre_v2 swapgs|, + |cache size:25344 KB| + + Example metadata for metric='proccpu_mapping': + |proc_0:apicid=0;core id=0;initial apicid=0;physical id=0|, + |proc_1:apicid=2;core id=1;initial apicid=2;physical id=0| + + Attributes: + text: The /proc/cpuinfo text. + mappings: Dict of [processor id: dict of values that change with cpu] + attributes: Dict of /proc/cpuinfo entries that are not in mappings. + """ + + # known attributes that vary with the processor id + _PER_CPU_KEYS = ['core id', 'initial apicid', 'apicid', 'physical id'] + # attributes that should be sorted, for example turning the 'flags' value + # of "popcnt avx512bw" to "avx512bw popcnt" + _SORT_VALUES = ['flags', 'bugs'] + + def __init__(self, text): + self.mappings = {} + self.attributes = collections.defaultdict(set) + for stanza in _ParseTextProperties(text): + processor_id, single_values, multiple_values = self._ParseStanza(stanza) + if processor_id is None: # can be 0 + continue + if processor_id in self.mappings: + logging.warning('Processor id %s seen twice in %s', processor_id, text) + continue + self.mappings[processor_id] = single_values + for key, value in multiple_values.items(): + self.attributes[key].add(value) + + def GetValues(self): + """Dict of cpuinfo keys to its values. + + Multiple values are joined by semicolons. + + Returns: + Dict of [cpuinfo key:value string] + """ + cpuinfo = { + key: ';'.join(sorted(values)) + for key, values in self.attributes.items() + } + cpuinfo['proccpu'] = ','.join(sorted(self.attributes.keys())) + return cpuinfo + + def _ParseStanza(self, stanza): + """Parses the cpuinfo section for an individual CPU. + + Args: + stanza: Dict of the /proc/cpuinfo results for an individual CPU. + + Returns: + Tuple of (processor_id, dict of values that are known to change with + each CPU, dict of other cpuinfo results). + """ + singles = {} + if 'processor' not in stanza: + return None, None, None + processor_id = int(stanza.pop('processor')) + for key in self._PER_CPU_KEYS: + if key in stanza: + singles[key] = stanza.pop(key) + for key in self._SORT_VALUES: + if key in stanza: + stanza[key] = ' '.join(sorted(stanza[key].split())) + return processor_id, singles, stanza + + +class JujuMixin(BaseDebianMixin): + """Class to allow running Juju-deployed workloads. + + Bootstraps a Juju environment using the manual provider: + https://jujucharms.com/docs/stable/config-manual + """ + + # TODO: Add functionality to tear down and uninstall Juju + # (for pre-provisioned) machines + JujuUninstall for packages using charms. + + OS_TYPE = os_types.JUJU + + is_controller = False + + # A reference to the juju controller, useful when operations occur against + # a unit's VM but need to be preformed from the controller. + controller = None + + vm_group = None + + machines = {} + units = [] + + installation_lock = threading.Lock() + + environments_yaml = """ + default: perfkit + + environments: + perfkit: + type: manual + bootstrap-host: {0} + """ + + def _Bootstrap(self): + """Bootstrap a Juju environment.""" + resp, _ = self.RemoteHostCommand('juju bootstrap') + + def JujuAddMachine(self, unit): + """Adds a manually-created virtual machine to Juju. + + Args: + unit: An object representing the unit's BaseVirtualMachine. + """ + resp, _ = self.RemoteHostCommand('juju add-machine ssh:%s' % + unit.internal_ip) + + # We don't know what the machine's going to be used for yet, + # but track it's placement for easier access later. 
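+ # (Illustrative assumption about the CLI output: a confirmation such as
+ # 'created machine 3' would make the slicing below store this unit under
+ # machine id '3'.)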
+ # We're looking for the output: created machine %d + machine_id = _[_.rindex(' '):].strip() + self.machines[machine_id] = unit + + def JujuConfigureEnvironment(self): + """Configure a bootstrapped Juju environment.""" + if self.is_controller: + resp, _ = self.RemoteHostCommand('mkdir -p ~/.juju') + + with vm_util.NamedTemporaryFile() as tf: + tf.write(self.environments_yaml.format(self.internal_ip)) + tf.close() + self.PushFile(tf.name, '~/.juju/environments.yaml') + + def JujuEnvironment(self): + """Get the name of the current environment.""" + output, _ = self.RemoteHostCommand('juju switch') + return output.strip() + + def JujuRun(self, cmd): + """Run a command on the virtual machine. + + Args: + cmd: The command to run. + """ + output, _ = self.RemoteHostCommand(cmd) + return output.strip() + + def JujuStatus(self, pattern=''): + """Return the status of the Juju environment. + + Args: + pattern: Optionally match machines/services with a pattern. + """ + output, _ = self.RemoteHostCommand('juju status %s --format=json' % + pattern) + return output.strip() + + def JujuVersion(self): + """Return the Juju version.""" + output, _ = self.RemoteHostCommand('juju version') + return output.strip() + + def JujuSet(self, service, params=[]): + """Set the configuration options on a deployed service. + + Args: + service: The name of the service. + params: A list of key=values pairs. + """ + output, _ = self.RemoteHostCommand( + 'juju set %s %s' % (service, ' '.join(params))) + return output.strip() + + @vm_util.Retry(poll_interval=30, timeout=3600) + def JujuWait(self): + """Wait for all deployed services to be installed, configured, and idle.""" + status = yaml.safe_load(self.JujuStatus()) + for service in status['services']: + ss = status['services'][service]['service-status']['current'] + + # Accept blocked because the service may be waiting on relation + if ss not in ['active', 'unknown']: + raise errors.Juju.TimeoutException( + 'Service %s is not ready; status is %s' % (service, ss)) + + if ss in ['error']: + # The service has failed to deploy. + debuglog = self.JujuRun('juju debug-log --limit 200') + logging.warning(debuglog) + raise errors.Juju.UnitErrorException( + 'Service %s is in an error state' % service) + + for unit in status['services'][service]['units']: + unit_data = status['services'][service]['units'][unit] + ag = unit_data['agent-state'] + if ag != 'started': + raise errors.Juju.TimeoutException( + 'Service %s is not ready; agent-state is %s' % (service, ag)) + + ws = unit_data['workload-status']['current'] + if ws not in ['active', 'unknown']: + raise errors.Juju.TimeoutException( + 'Service %s is not ready; workload-state is %s' % (service, ws)) + + def JujuDeploy(self, charm, vm_group): + """Deploy (and scale) this service to the machines in its vm group. + + Args: + charm: The charm to deploy, i.e., cs:trusty/ubuntu. + vm_group: The name of vm_group the unit(s) should be deployed to. 
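+
+ Illustrative example: JujuDeploy('cs:trusty/mysql', 'servers') deploys the
+ charm to one machine previously registered for the 'servers' vm_group and
+ then adds a unit on each remaining machine in that group ('cs:trusty/mysql'
+ and 'servers' are hypothetical values).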
+ """ + + # Find the already-deployed machines belonging to this vm_group + machines = [] + for machine_id, unit in self.machines.items(): + if unit.vm_group == vm_group: + machines.append(machine_id) + + # Deploy the first machine + resp, _ = self.RemoteHostCommand( + 'juju deploy %s --to %s' % (charm, machines.pop())) + + # Get the name of the service + service = charm[charm.rindex('/') + 1:] + + # Deploy to the remaining machine(s) + for machine in machines: + resp, _ = self.RemoteHostCommand( + 'juju add-unit %s --to %s' % (service, machine)) + + def JujuRelate(self, service1, service2): + """Create a relation between two services. + + Args: + service1: The first service to relate. + service2: The second service to relate. + """ + resp, _ = self.RemoteHostCommand( + 'juju add-relation %s %s' % (service1, service2)) + + def Install(self, package_name): + """Installs a PerfKit package on the VM.""" + package = linux_packages.PACKAGES[package_name] + try: + # Make sure another unit doesn't try + # to install the charm at the same time + with self.controller.installation_lock: + if package_name not in self.controller._installed_packages: + package.JujuInstall(self.controller, self.vm_group) + self.controller._installed_packages.add(package_name) + except AttributeError as e: + logging.warning('Failed to install package %s, falling back to Apt (%s)', + package_name, e) + if package_name not in self._installed_packages: + if hasattr(package, 'AptInstall'): + package.AptInstall(self) + elif hasattr(package, 'Install'): + package.Install(self) + else: + raise KeyError('Package %s has no install method for Juju machines.' % + package_name) + self._installed_packages.add(package_name) + + def SetupPackageManager(self): + if self.is_controller: + resp, _ = self.RemoteHostCommand( + 'sudo add-apt-repository ppa:juju/stable' + ) + super(JujuMixin, self).SetupPackageManager() + + def PrepareVMEnvironment(self): + """Install and configure a Juju environment.""" + super(JujuMixin, self).PrepareVMEnvironment() + if self.is_controller: + self.InstallPackages('juju') + + self.JujuConfigureEnvironment() + + self.AuthenticateVm() + + self._Bootstrap() + + # Install the Juju agent on the other VMs + for unit in self.units: + unit.controller = self + self.JujuAddMachine(unit) + + +class BaseLinuxVirtualMachine(BaseLinuxMixin, + virtual_machine.BaseVirtualMachine): + """Linux VM for use with pytyping.""" diff --git a/script/cumulus/pkb/perfkitbenchmarker/log_util.py b/script/cumulus/pkb/perfkitbenchmarker/log_util.py new file mode 100644 index 0000000..54493d3 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/log_util.py @@ -0,0 +1,178 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
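+# A minimal usage sketch of the per-thread label machinery defined below
+# (the label values here are hypothetical):
+#
+#   SetThreadLogContext(ThreadLogContext())
+#   ctx = GetThreadLogContext()
+#   with ctx.ExtendLabel('run_uri=abc123'):
+#     with ctx.ExtendLabel('vm_1'):
+#       logging.info('booting')
+#
+# With PkbLogFilter attached (see ConfigureLogging), each record emitted
+# inside those blocks carries pkb_label 'run_uri=abc123 vm_1 '.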
+"""Utilities related to loggers and logging.""" + +from contextlib import contextmanager +import logging +import sys +import threading + +try: + import colorlog +except ImportError: + colorlog = None + + +DEBUG = 'debug' +INFO = 'info' +WARNING = 'warning' +ERROR = 'error' +LOG_LEVELS = { + DEBUG: logging.DEBUG, + INFO: logging.INFO, + WARNING: logging.WARNING, + ERROR: logging.ERROR +} + + +class ThreadLogContext(object): + """Per-thread context for log message prefix labels.""" + def __init__(self, thread_log_context=None): + """Constructs a ThreadLogContext by copying a previous ThreadLogContext. + + Args: + thread_log_context: A ThreadLogContext for an existing thread whose state + will be copied to initialize a ThreadLogContext for a new thread. + """ + if thread_log_context: + self._label_list = thread_log_context._label_list[:] + else: + self._label_list = [] + self._RecalculateLabel() + + @property + def label(self): + return self._label + + def _RecalculateLabel(self): + """Recalculate the string label used to to prepend log messages. + + The label is the concatenation of all non-empty strings in the _label_list. + """ + non_empty_string_list = [s for s in self._label_list if s] + if len(non_empty_string_list): + self._label = ' '.join(non_empty_string_list) + ' ' + else: + self._label = '' + + @contextmanager + def ExtendLabel(self, label_extension): + """Extends the string label used to prepend log messages. + + Args: + label_extension: A string appended to the end of the current label. + """ + self._label_list.append(label_extension) + self._RecalculateLabel() + yield + self._label_list.pop() + self._RecalculateLabel() + + +class _ThreadData(threading.local): + def __init__(self): + self.pkb_thread_log_context = ThreadLogContext() + +thread_local = _ThreadData() + + +def SetThreadLogContext(thread_log_context): + """Set the current thread's ThreadLogContext object. + + Args: + thread_log_context: A ThreadLogContext to be written to thread local + storage. + """ + thread_local.pkb_thread_log_context = thread_log_context + + +def GetThreadLogContext(): + """Get the current thread's ThreadLogContext object. + + Returns: + The ThreadLogContext previously written via SetThreadLogContext. + """ + return thread_local.pkb_thread_log_context + + +class PkbLogFilter(logging.Filter): + """Filter that injects a thread's ThreadLogContext label into log messages. + + Sets the LogRecord's pkb_label attribute with the ThreadLogContext label. + """ + def filter(self, record): + record.pkb_label = GetThreadLogContext().label + return True + + +def ConfigureBasicLogging(): + """Initializes basic python logging before a log file is available.""" + logging.basicConfig(format='%(levelname)-8s %(message)s', level=logging.INFO) + + +def ConfigureLogging(stderr_log_level, log_path, run_uri, + file_log_level=logging.DEBUG): + """Configure logging. + + Note that this will destroy existing logging configuration! + + This configures python logging to emit messages to stderr and a log file. + + Args: + stderr_log_level: Messages at this level and above are emitted to stderr. + log_path: Path to the log file. + run_uri: A string containing the run_uri to be appended to the log prefix + labels. + file_log_level: Messages at this level and above are written to the log + file. + """ + # Build the format strings for the stderr and log file message formatters. 
+ stderr_format = ('%(asctime)s {} %(threadName)s %(pkb_label)s' + '%(levelname)-8s %(message)s').format(run_uri) + stderr_color_format = ('%(log_color)s%(asctime)s {} %(threadName)s ' + '%(pkb_label)s%(levelname)-8s%(reset)s ' + '%(message)s').format(run_uri) + file_format = ('%(asctime)s {} %(threadName)s %(pkb_label)s' + '%(filename)s:%(lineno)d %(levelname)-8s %(message)s') + file_format = file_format.format(run_uri) + + # Reset root logger settings. + logger = logging.getLogger() + logger.handlers = [] + logger.setLevel(logging.DEBUG) + + # Initialize the main thread's ThreadLogContext. This object must be + # initialized to use the PkbLogFilter, and it is used to derive the + # ThreadLogContext of other threads started through vm_util.RunThreaded. + SetThreadLogContext(ThreadLogContext()) + + # Add handler to output to stderr. + handler = logging.StreamHandler() + handler.addFilter(PkbLogFilter()) + handler.setLevel(stderr_log_level) + if colorlog is not None and sys.stderr.isatty(): + formatter = colorlog.ColoredFormatter(stderr_color_format, reset=True) + handler.setFormatter(formatter) + else: + handler.setFormatter(logging.Formatter(stderr_format)) + logger.addHandler(handler) + + # Add handler for output to log file. + logging.info('Verbose logging to: %s', log_path) + handler = logging.FileHandler(filename=log_path) + handler.addFilter(PkbLogFilter()) + handler.setLevel(file_log_level) + handler.setFormatter(logging.Formatter(file_format)) + logger.addHandler(handler) + logging.getLogger('requests').setLevel(logging.ERROR) diff --git a/script/cumulus/pkb/perfkitbenchmarker/managed_memory_store.py b/script/cumulus/pkb/perfkitbenchmarker/managed_memory_store.py new file mode 100644 index 0000000..8477981 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/managed_memory_store.py @@ -0,0 +1,144 @@ +# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for cloud managed memory stores.""" + +import abc +import logging +from absl import flags +from perfkitbenchmarker import resource + +# List of memory store types +REDIS = 'REDIS' +MEMCACHED = 'MEMCACHED' + +FLAGS = flags.FLAGS + + +class Failover(object): + """Enum for redis failover options.""" + FAILOVER_NONE = 'failover_none' + FAILOVER_SAME_ZONE = 'failover_same_zone' + FAILOVER_SAME_REGION = 'failover_same_region' + + +flags.DEFINE_enum( + 'redis_failover_style', Failover.FAILOVER_NONE, [ + Failover.FAILOVER_NONE, Failover.FAILOVER_SAME_ZONE, + Failover.FAILOVER_SAME_REGION + ], 'Failover behavior of cloud redis cluster. Acceptable values are:' + 'failover_none, failover_same_zone, and failover_same_region') + +# List of redis versions +REDIS_3_2 = 'redis_3_2' +REDIS_4_0 = 'redis_4_0' +REDIS_5_0 = 'redis_5_0' +REDIS_6_X = 'redis_6_x' +REDIS_VERSIONS = [REDIS_3_2, REDIS_4_0, REDIS_5_0, REDIS_6_X] + +flags.DEFINE_string( + 'managed_memory_store_version', None, + 'The version of managed memory store to use. 
This flag ' + 'overrides Redis or Memcached version defaults that is set ' + 'in benchmark config. Defaults to None so that benchmark ' + 'config defaults are used.') +flags.DEFINE_string( + 'cloud_redis_region', 'us-central1', 'The region to spin up cloud redis in.' + 'Defaults to the GCP region of us-central1.') + +MEMCACHED_NODE_COUNT = 1 + + +def GetManagedMemoryStoreClass(cloud, memory_store): + """Gets the cloud managed memory store class corresponding to 'cloud'. + + Args: + cloud: String. Name of cloud to get the class for. + memory_store: String. Type of memory store to get the class for. + + Returns: + Implementation class corresponding to the argument cloud + + Raises: + Exception: An invalid cloud was provided + """ + return resource.GetResourceClass( + BaseManagedMemoryStore, CLOUD=cloud, MEMORY_STORE=memory_store) + + +def ParseReadableVersion(version): + """Parses Redis major and minor version number. + + Used for Azure and AWS versions. + + Args: + version: String. Version string to get parsed. + + Returns: + Parsed version + """ + if version.count('.') < 1: + logging.info( + 'Could not parse version string correctly,' + 'full Redis version returned: %s', version) + return version + return '.'.join(version.split('.', 2)[:2]) + + +class BaseManagedMemoryStore(resource.BaseResource): + """Object representing a cloud managed memory store.""" + + REQUIRED_ATTRS = ['CLOUD', 'MEMORY_STORE'] + RESOURCE_TYPE = 'BaseManagedMemoryStore' + + def __init__(self, spec): + """Initialize the cloud managed memory store object. + + Args: + spec: spec of the managed memory store. + """ + super(BaseManagedMemoryStore, self).__init__() + self.spec = spec + self.name = 'pkb-%s' % FLAGS.run_uri + self._ip = None + self._port = None + self._password = None + + def GetMemoryStoreIp(self): + """Returns the Ip address of the managed memory store.""" + if not self._ip: + self._PopulateEndpoint() + return self._ip + + def GetMemoryStorePort(self): + """Returns the port number of the managed memory store.""" + if not self._port: + self._PopulateEndpoint() + return self._port + + @abc.abstractmethod + def _PopulateEndpoint(self): + """Populates the endpoint information for the managed memory store.""" + raise NotImplementedError() + + def GetMemoryStorePassword(self): + """Returns the access password of the managed memory store, if any.""" + return self._password + + def MeasureCpuUtilization(self): + """Measures the CPU utilization of an instance using the cloud's API.""" + return NotImplementedError() + + def GetInstanceSize(self): + """Returns size of instance in gigabytes.""" + return NotImplementedError() diff --git a/script/cumulus/pkb/perfkitbenchmarker/memcache_service.py b/script/cumulus/pkb/perfkitbenchmarker/memcache_service.py new file mode 100644 index 0000000..a61d786 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/memcache_service.py @@ -0,0 +1,35 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +class MemcacheService(object): + CLOUD = None + + def __init__(self): + pass + + def Create(self): + raise NotImplementedError + + def Destroy(self): + raise NotImplementedError + + def Flush(self): + raise NotImplementedError + + def GetHosts(self): + raise NotImplementedError + + def GetMetadata(self): + raise NotImplementedError diff --git a/script/cumulus/pkb/perfkitbenchmarker/messaging_service.py b/script/cumulus/pkb/perfkitbenchmarker/messaging_service.py new file mode 100644 index 0000000..4049276 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/messaging_service.py @@ -0,0 +1,138 @@ +"""Common interface for messaging services resources. + +MessagingService class offers a common interface to provision resources, and to +run different phases of the benchmark [Prepare, Run, Cleanup]. The messaging +service benchmark uses the specific instance (from +messaging_service_util.py file) to run the phases of the benchmark on it. +Prepare and Cleanup phases runs from the benchmark VM, on the run phase the +benchmark VM send commands to the Client VM. Client VM's implementations that +runs the benchmark can be found on: /data/messaging_service. +""" + +import abc +import os +from typing import Any, Dict +from perfkitbenchmarker import resource + +MESSAGING_SERVICE_SCRIPTS_VM_PKB = os.path.join('~', 'perfkitbenchmarker') +MESSAGING_SERVICE_SCRIPTS_VM_BIN_DIR = '~' +MESSAGING_SERVICE_SCRIPTS_VM_LIB_DIR = os.path.join( + '~', 'perfkitbenchmarker', 'scripts', 'messaging_service_scripts') +MESSAGING_SERVICE_SCRIPTS_VM_COMMON_DIR = os.path.join( + MESSAGING_SERVICE_SCRIPTS_VM_LIB_DIR, 'common') +MESSAGING_SERVICE_SCRIPTS_COMMON_PREFIX = 'messaging_service_scripts/common/' +MESSAGING_SERVICE_SCRIPTS_COMMON_FILES = [ + '__init__.py', + 'app.py', + 'client.py', + 'errors.py', + 'runners.py', + 'e2e/__init__.py', + 'e2e/latency_runner.py', + 'e2e/main_process.py', + 'e2e/protocol.py', + 'e2e/publisher.py', + 'e2e/receiver.py', + 'e2e/worker_utils.py', +] + + +def GetMessagingServiceClass(cloud, delivery): + """Gets the underlying Messaging Service class.""" + return resource.GetResourceClass( + BaseMessagingService, CLOUD=cloud, DELIVERY=delivery) + + +class BaseMessagingService(resource.BaseResource): + """Common interface of a messaging service resource. + + Attributes: + client: The client virtual machine that runs the benchmark. 
+ """ + + REQUIRED_ATTRS = ['CLOUD', 'DELIVERY'] + RESOURCE_TYPE = 'BaseMessagingService' + + # TODO(odiego): Move DELIVERY down to child classes when adding more options + DELIVERY = 'pull' + + END_TO_END_LATENCY = 'end_to_end_latency' + PUBLISH_LATENCY = 'publish_latency' + PULL_LATENCY = 'pull_latency' + + @classmethod + def FromSpec(cls, messaging_service_spec): + return cls() + + def setVms(self, vm_groups): + self.client_vm = vm_groups['clients' if 'clients' in + vm_groups else 'default'][0] + + def PrepareClientVm(self): + self._InstallCommonClientPackages() + self._InstallCloudClients() + + def _InstallCommonClientPackages(self): + """Installs common software for running benchmarks on the client VM.""" + # Install commom packages + self.client_vm.Install('python3') + self.client_vm.Install('pip3') + self.client_vm.RemoteCommand('sudo pip3 install absl-py numpy') + + # Upload common scripts + self.client_vm.RemoteCommand( + f'mkdir -p {MESSAGING_SERVICE_SCRIPTS_VM_LIB_DIR}') + self.client_vm.RemoteCommand(' '.join([ + 'find', MESSAGING_SERVICE_SCRIPTS_VM_PKB, '-type', 'd', '-exec', + 'touch', "'{}/__init__.py'", '\\;' + ])) + self._CopyFiles( + MESSAGING_SERVICE_SCRIPTS_COMMON_PREFIX, + MESSAGING_SERVICE_SCRIPTS_COMMON_FILES, + MESSAGING_SERVICE_SCRIPTS_VM_COMMON_DIR) + + def _CopyFiles(self, prefix, data_srcs, vm_dest_dir): + for subpath in data_srcs: + dirname = os.path.dirname(os.path.join(vm_dest_dir, subpath)) + self.client_vm.RemoteCommand(f'mkdir -p {dirname}') + self.client_vm.PushDataFile( + os.path.join(prefix, subpath), + os.path.join(vm_dest_dir, subpath)) + + @abc.abstractmethod + def _InstallCloudClients(self): + """Installs software for running benchmarks on the client VM. + + This method should be overriden by subclasses to install software specific + to the flavor of MessagingService they provide. + """ + raise NotImplementedError + + @abc.abstractmethod + def Run(self, benchmark_scenario: str, number_of_messages: str, + message_size: str) -> Dict[str, Any]: + """Runs remote commands on client VM - benchmark's run phase. + + Runs a benchmark that consists of first publishing messages and then + pulling messages from messaging service, based on the configuration + specified through the FLAGS: benchmark_scenario, number_of_messages, and + message_size. Specific implementations should override this method. + Different providers needs different info to run the benchmark - for GCP we + need 'topic_name' and 'subscription_name', while for AWS 'queue_name' + suffices. + + Args: + benchmark_scenario: Specifies which benchmark scenario to run. + number_of_messages: Number of messages to use on the benchmark. + message_size: Size of the messages that will be used on the benchmark. It + specifies the number of characters in those messages. + + Returns: + Dictionary with metric_name (mean_latency, p50_latency...) as key and the + results from the benchmark as the value: + results = { + 'mean_latency': 0.3423443... + ... + } + """ + raise NotImplementedError diff --git a/script/cumulus/pkb/perfkitbenchmarker/network.py b/script/cumulus/pkb/perfkitbenchmarker/network.py new file mode 100644 index 0000000..027fe2c --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/network.py @@ -0,0 +1,332 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module containing abstract classes related to VM networking. + +The Firewall class provides a way of opening VM ports. The Network class allows +VMs to communicate via internal ips and isolates PerfKitBenchmarker VMs from +others in the +same project. +""" + +import abc +import enum + +from absl import flags +from perfkitbenchmarker import context +from perfkitbenchmarker import errors +from perfkitbenchmarker import regex_util +from perfkitbenchmarker import resource + + +flags.DEFINE_integer('mtu', None, + 'Network MTU to set, if any. Only enabled for GCP.') + + +class NetType(enum.Enum): + DEFAULT = 'default' + SINGLE = 'single' + MULTI = 'multi' + + +class BaseFirewall(object): + """An object representing the Base Firewall.""" + + CLOUD = None + + @classmethod + def GetFirewall(cls): + """Returns a BaseFirewall. + + This method is used instead of directly calling the class's constructor. + It creates BaseFirewall instances and registers them. + If a BaseFirewall object has already been registered, that object + will be returned rather than creating a new one. This enables multiple + VMs to call this method and all share the same BaseFirewall object. + """ + if cls.CLOUD is None: + raise errors.Error('Firewalls should have CLOUD attributes.') + benchmark_spec = context.GetThreadBenchmarkSpec() + if benchmark_spec is None: + raise errors.Error('GetFirewall called in a thread without a ' + 'BenchmarkSpec.') + with benchmark_spec.firewalls_lock: + key = cls.CLOUD + if key not in benchmark_spec.firewalls: + benchmark_spec.firewalls[key] = cls() + return benchmark_spec.firewalls[key] + + def AllowIcmp(self, vm): + """Opens the ICMP protocol on the firewall. + + Args: + vm: The BaseVirtualMachine object to open the ICMP protocol for. + """ + pass + + def AllowPort(self, vm, start_port, end_port=None): + """Opens a port on the firewall. + + Args: + vm: The BaseVirtualMachine object to open the port for. + start_port: The first local port in a range of ports to open. + end_port: The last port in a range of ports to open. If None, only + start_port will be opened. + """ + pass + + def DisallowAllPorts(self): + """Closes all ports on the firewall.""" + pass + + +class BaseNetworkSpec(object): + """Object containing all information needed to create a Network.""" + + def __init__(self, zone=None, cidr=None): + """Initializes the BaseNetworkSpec. + + Args: + zone: The zone in which to create the network. + cidr: The subnet this network belongs to in CIDR notation + """ + self.zone = zone + self.cidr = cidr + + def __repr__(self): + return '%s(%r)' % (self.__class__, self.__dict__) + + +class BaseVpnGateway(object, metaclass=abc.ABCMeta): + """An object representing the Base VPN Gateway.""" + CLOUD = None + + def __init__(self, zone=None, cidr=None): + """Initializes the BaseVpnGateway. + + Args: + zone: The zone in which to create the VpnGateway. + cidr: The cidr for the VpnGateway. 
+ """ + self.zone = zone + self.cidr: str = cidr + # Set to True if we need target Gateway up front (AWS) + self.require_target_to_init = False + + @abc.abstractmethod + def IsTunnelReady(self, tunnel_id): + """Returns True if the tunnel is ready. + + Args: + tunnel_id: The id of the tunnel to check. + + Returns: + boolean. + """ + raise NotImplementedError() + + @abc.abstractmethod + def ConfigureTunnel(self, tunnel_config): + """Updates the tunnel_config object with new information. + + Each provider may require different information to setup a VPN tunnel, + and all information needed to configure the tunnel may not be available + up front. Incremental updates to tunnel_config are made by calling this + function on each endpoint until either both endpoint tunnels are configured + or no more updates can be made. + + Args: + tunnel_config: The tunnel_config object of the tunnel to configure. + """ + raise NotImplementedError() + + @abc.abstractmethod + def Create(self): + """Creates the actual VPN Gateway.""" + raise NotImplementedError() + + @abc.abstractmethod + def Delete(self): + """Deletes the actual VPN Gateway.""" + raise NotImplementedError() + + +class BaseNetwork(object): + """Object representing a Base Network.""" + + CLOUD = None + + def __init__(self, spec): + self.zone = spec.zone + self.cidr = spec.cidr + + @staticmethod + def _GetNetworkSpecFromVm(vm): + """Returns a BaseNetworkSpec created from VM attributes.""" + return BaseNetworkSpec(zone=vm.zone, cidr=vm.cidr) + + @classmethod + def _GetKeyFromNetworkSpec(cls, spec): + """Returns a key used to register Network instances.""" + if cls.CLOUD is None: + raise errors.Error('Networks should have CLOUD attributes.') + return (cls.CLOUD, spec.zone) + + @classmethod + def GetNetwork(cls, vm): + """Returns a BaseNetwork. + + This method is used instead of directly calling the class's constructor. + It creates BaseNetwork instances and registers them. If a BaseNetwork + object has already been registered with the same key, that object + will be returned rather than creating a new one. This enables multiple + VMs to call this method and all share the same BaseNetwork object. + + Args: + vm: The VM for which the Network is being created. + """ + return cls.GetNetworkFromNetworkSpec(cls._GetNetworkSpecFromVm(vm)) + + @staticmethod + def FormatCidrString(cidr_raw): + """Format CIDR string for use in resource name. + + Removes or replaces illegal characters from CIDR. + eg '10.128.0.0/9' -> '10-128-0-0-9' + + Args: + cidr_raw: The unformatted CIDR string. + Returns: + A CIDR string suitable for use in resource names. + Raises: + Error: Invalid CIDR format + """ + + delim = r'-' # Safe delimiter for most providers + int_regex = r'[0-9]+' + octets_mask = regex_util.ExtractAllMatches(int_regex, str(cidr_raw)) + if len(octets_mask) != 5: # expecting 4 octets plus 1 prefix mask. + raise ValueError('Invalid CIDR format: "{0}"'.format(cidr_raw)) + return delim.join(octets_mask) + + @classmethod + def GetNetworkFromNetworkSpec(cls, spec): + """Returns a BaseNetwork. + + This method is used instead of directly calling the class's constructor. + It creates BaseNetwork instances and registers them. If a BaseNetwork + object has already been registered with the same key, that object + will be returned rather than creating a new one. This enables multiple + VMs to call this method and all share the same BaseNetwork object. + + Args: + spec: The network spec for the network. 
+ """ + benchmark_spec = context.GetThreadBenchmarkSpec() + if benchmark_spec is None: + raise errors.Error('GetNetwork called in a thread without a ' + 'BenchmarkSpec.') + key = cls._GetKeyFromNetworkSpec(spec) + + # Grab the list of other networks to setup firewalls, forwarding, etc. + if not hasattr(spec, 'custom_subnets'): + spec.__setattr__('custom_subnets', benchmark_spec.custom_subnets) + + with benchmark_spec.networks_lock: + if key not in benchmark_spec.networks: + benchmark_spec.networks[key] = cls(spec) + return benchmark_spec.networks[key] + + def Create(self): + """Creates the actual network.""" + pass + + def Delete(self): + """Deletes the actual network.""" + pass + + def Peer(self, peering_network): + """Peers the network with the peering_network. + + This method is used for VPC peering. It will connect 2 VPCs together. + + Args: + peering_network: BaseNetwork. The network to peer with. + """ + pass + + +class BaseVPCPeeringSpec(object): + """Object containing all information needed to create a VPC Peering Object.""" + + def __init__(self, network_a=None, network_b=None): + """Initializes BaseVPCPeeringSpec. + + Args: + network_a: BaseNetwork. The network initiating the peering. + network_b: BaseNetwork. The network to be peered to. + """ + self.network_a = network_a + self.network_b = network_b + + def __repr__(self): + return '%s(%r)' % (self.__class__, self.__dict__) + + +class BaseVPCPeering(resource.BaseResource): + """Base class for VPC Peering. + + This class holds VPC Peering methods and attributes relating to the + VPC Peering as a cloud resource. + + Attributes: + network_a: BaseNetwork. The network initiating the peering. + network_b: BaseNetwork. The network to be peered to. + """ + + RESOURCE_TYPE = 'BaseVPCPeering' + + def __init__(self, vpc_peering_spec): + """Initialize BaseVPCPeering class. + + Args: + vpc_peering_spec: BaseVPCPeeringSpec. Spec for VPC peering object. + """ + super(BaseVPCPeering, self).__init__() + self.network_a = vpc_peering_spec.network_a + self.network_b = vpc_peering_spec.network_b + + +def GetCidrBlock(regional_index=0, subnet_index=0, mask_size=24): + """Returns a Cidr Block. + + Each cloud region should be assigned a unique IP Address Space. And each + Subnet within a regional cloud network should also have an unique space. This + function returns the IP Address allocation based on the regional and subnet + index given. It is expected that each cloud regional network will have a + unique regional index and each of its subnets will also have a unique index. + Regional cidr blocks should be large enough to cover the subnet cidr blocks. + Chose a mask_size for regional cidr block accordingly. For example, a + mask_size of 16 with regional starting block 10.0.0.0 will cover a subnet of + 10.0.1.0/24. + + Args: + regional_index: Int. The IP Address allocation dependent on the region. + Default index is 0. + subnet_index: Int. The IP Address section dependent on the subnet. + Default index is 0. + mask_size: Int. Mask size to request from cidr block. + Default index is 24. + """ + return '10.{}.{}.0/{}'.format(regional_index, subnet_index, mask_size) diff --git a/script/cumulus/pkb/perfkitbenchmarker/nfs_service.py b/script/cumulus/pkb/perfkitbenchmarker/nfs_service.py new file mode 100644 index 0000000..6c730ac --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/nfs_service.py @@ -0,0 +1,260 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Resource encapsulating provisioned cloud NFS services. + +Defines a resource for use in other benchmarks such as SpecSFS2014 and FIO. + +Example --benchmark_config_file: + +nfs_10_tb: &nfs_10_tb + AWS: + disk_type: nfs + mount_point: /scratch + +specsfs: + name: specsfs2014 + flags: + specsfs2014_num_runs: 1 + specsfs2014_load: 1 + vm_groups: + clients: + disk_spec: *nfs_10_tb + vm_count: 1 + os_type: rhel + gluster_servers: + vm_count: 0 +""" + +import abc +import logging +import re +from typing import Optional + +from absl import flags +from perfkitbenchmarker import disk +from perfkitbenchmarker import errors +from perfkitbenchmarker import os_types +from perfkitbenchmarker import resource +from perfkitbenchmarker import vm_util + +flags.DEFINE_string('nfs_tier', None, 'NFS Mode') +flags.DEFINE_string('nfs_version', None, 'NFS Version') + +FLAGS = flags.FLAGS + +_MOUNT_NFS_RE = re.compile(r'.*type nfs \((.*?)\)', re.MULTILINE) + +UNMANAGED = 'Unmanaged' + + +def GetNfsServiceClass(cloud): + """Get the NFS service corresponding to the cloud. + + Args: + cloud: The name of the cloud to supply the NFS service. + + Returns: + The NFS service class for this cloud. + + Raises: + NotImplementedError: No service found for this cloud. + """ + return resource.GetResourceClass(BaseNfsService, CLOUD=cloud) + + +class BaseNfsService(resource.BaseResource): + """Object representing an NFS Service.""" + + # subclasses must override this with a list or tuple for acceptable + # "nfs_tier" values if applicable. 
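+  # For example, a hypothetical provider subclass could declare:
+  #   NFS_TIERS = ('STANDARD', 'PREMIUM')
+  #   DEFAULT_TIER = 'STANDARD'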
+ CLOUD = 'Unknown' + NFS_TIERS = None + RESOURCE_TYPE = 'BaseNfsService' + DEFAULT_NFS_VERSION = None + DEFAULT_TIER = None + + def __init__(self, disk_spec: disk.BaseDiskSpec, zone): + super(BaseNfsService, self).__init__() + self.disk_spec = disk_spec + self.zone = zone + self.server_directory = '/' + self.nfs_tier = FLAGS.nfs_tier or self.DEFAULT_TIER + if self.nfs_tier and self.NFS_TIERS and self.nfs_tier not in self.NFS_TIERS: + # NFS service does not have to have a list of nfs_tiers nor does it have + # to be implemented by a provider + raise errors.Config.InvalidValue( + ('nfs_tier "%s" not in acceptable list "%s" ' + 'for cloud %s') % (self.nfs_tier, self.NFS_TIERS, self.CLOUD)) + logging.debug('%s NFS service with nfs_tier %s zone %s default version %s', + self.CLOUD, self.nfs_tier, self.zone, + self.DEFAULT_NFS_VERSION) + + def CreateNfsDisk(self): + mount_point = '%s:%s' % (self.GetRemoteAddress(), self.server_directory) + return disk.NfsDisk(self.disk_spec, mount_point, self.DEFAULT_NFS_VERSION, + self.nfs_tier) + + @abc.abstractmethod + def _IsReady(self): + """Boolean function to determine if disk is NFS mountable.""" + pass + + @abc.abstractmethod + def GetRemoteAddress(self): + """The NFS server's address.""" + pass + + +class StaticNfsService(BaseNfsService): + """Object allowing VMs to connect to a preprovisioned NFS endpoint.""" + CLOUD = 'Static' + + def __init__(self, disk_spec): + super(StaticNfsService, self).__init__(disk_spec, None) + self.ip_address = disk_spec.nfs_ip_address + self.server_directory = disk_spec.nfs_directory or '/' + + def _Create(self): + pass + + def _Delete(self): + pass + + def CreateNfsDisk(self): + mount_point = '%s:/%s' % (self.GetRemoteAddress(), self.server_directory) + return disk.NfsDisk(self.disk_spec, mount_point, None, None) + + def _IsReady(self): + """Boolean function to determine if disk is NFS mountable.""" + return True + + def GetRemoteAddress(self): + """The NFS server's address.""" + return self.ip_address + + +class UnmanagedNfsService(BaseNfsService): + """Object allowing VMs to connect to a local NFS disk.""" + CLOUD = UNMANAGED + + # Allows anybody to write to the NFS mount. + _EXPORT_FS_COMMAND = ' && '.join([ + 'sudo mkdir -p {export_dir}', + 'sudo chown $USER:$USER {export_dir}', + 'sudo chmod 777 {export_dir}', + ('echo "{export_dir} *(rw,sync,no_subtree_check,no_root_squash)" | ' + 'sudo tee -a /etc/exports'), + 'sudo exportfs -a' + ]) + + _NFS_NAME = { + os_types.RHEL: 'nfs-server', + os_types.DEBIAN: 'nfs-kernel-server', + } + _NFS_RESTART_CMD = 'sudo systemctl restart {nfs_name}' + + def __init__(self, + disk_spec: Optional[disk.BaseDiskSpec], + server_vm, + check_export_not_same_mount=True, + server_directory=None): + super(UnmanagedNfsService, self).__init__(disk_spec, None) + self.server_vm = server_vm + # Path on the server to export. Must be different from mount_point. 
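+    # Precedence: the explicit server_directory argument wins, then the disk
+    # spec's device_path, then a default directory under /.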
+ if server_directory: + self.server_directory = server_directory + elif disk_spec and disk_spec.device_path: + self.server_directory = disk_spec.device_path + else: + self.server_directory = '/pkb-nfs-server-directory' + logging.info('Exporting server directory %s', self.server_directory) + if check_export_not_same_mount and disk_spec: + assert self.server_directory != disk_spec.mount_point, ( + 'export server directory must be different from mount point') + + def GetRemoteAddress(self): + """The NFS server's address.""" + return self.server_vm.internal_ip + + def _ExportNfsDir(self, export_dir_path): + """Export a directory on the NFS server to be shared with NFS clients. + + Args: + export_dir_path: Path to the directory to export. + """ + if self.server_vm.TryRemoteCommand( + f'grep "^{export_dir_path} " /etc/exports'): + logging.info('Already NFS exported directory %s', export_dir_path) + else: + self.server_vm.RemoteCommand( + self._EXPORT_FS_COMMAND.format(export_dir=export_dir_path)) + nfs_name = self._NFS_NAME[self.server_vm.BASE_OS_TYPE] + self.server_vm.RemoteCommand( + self._NFS_RESTART_CMD.format(nfs_name=nfs_name)) + + def _Create(self): + assert self.server_vm, 'NFS server VM not created.' + self.server_vm.Install('nfs_server') + self._ExportNfsDir(self.server_directory) + # Restart NFS service upon reboot if required (Centos7) + self.server_vm.RemoteCommand( + 'sudo systemctl enable nfs', ignore_failure=True) + + def _Delete(self): + pass + + def _IsReady(self): + """Boolean function to determine if disk is NFS mountable.""" + return True + + +def NfsExport(server_vm, local_disk_path): + """NFS exports the directory on the VM.""" + service = UnmanagedNfsService(None, server_vm, False, local_disk_path) + service.Create() + + +def NfsMount(server_ip, client_vm, client_path, server_path=None) -> None: + """NFS mounts the server's path on the client. + + Args: + server_ip: IP address of the NFS server. + client_vm: The VM that will mount the NFS server's exported directory. + client_path: The mount point on the client. + server_path: The NFS exported directory on the server. Defaults to the same + as the client_path. + """ + client_vm.Install('nfs_utils') + fstab_line = (f'{server_ip}:{server_path or client_path} ' + f'{client_path} nfs defaults 0 0') + client_vm.RemoteCommand(f'sudo mkdir -p {client_path}; ' + f'sudo chown {client_vm.user_name} {client_path}; ' + f'echo "{fstab_line}\n" | sudo tee -a /etc/fstab; ' + 'sudo mount -a') + + +def NfsExportAndMount(vms, client_path, server_path=None) -> None: + """NFS exports from the first VM to the others. + + Args: + vms: List of VMs. First is the NFS server, the others will mount it. + client_path: The path on the client to mount the NFS export. + server_path: The path on the server to export. Default is the same as the + client_path + """ + nfs_server, clients = vms[0], vms[1:] + NfsExport(nfs_server, server_path or client_path) + vm_util.RunThreaded( + lambda vm: NfsMount(nfs_server.internal_ip, vm, client_path, server_path), + clients) diff --git a/script/cumulus/pkb/perfkitbenchmarker/non_relational_db.py b/script/cumulus/pkb/perfkitbenchmarker/non_relational_db.py new file mode 100644 index 0000000..24f3075 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/non_relational_db.py @@ -0,0 +1,87 @@ +# Copyright 2020 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing base class for non-relational databases.""" + +from typing import Dict, Optional + +from absl import flags +from perfkitbenchmarker import resource +from perfkitbenchmarker.configs import freeze_restore_spec +from perfkitbenchmarker.configs import option_decoders +from perfkitbenchmarker.configs import spec + +# List of nonrelational database types +DYNAMODB = 'dynamodb' +BIGTABLE = 'bigtable' + + +FLAGS = flags.FLAGS + + +class BaseNonRelationalDbSpec(freeze_restore_spec.FreezeRestoreSpec): + """Configurable options of a nonrelational database service.""" + + # Needed for registering the spec class and its subclasses. See BaseSpec. + SPEC_TYPE = 'BaseNonRelationalDbSpec' + SPEC_ATTRS = ['SERVICE_TYPE'] + + def __init__(self, + component_full_name: str, + flag_values: Optional[Dict[str, flags.FlagValues]] = None, + **kwargs): + super().__init__(component_full_name, flag_values=flag_values, **kwargs) + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword arguments + to construct in order to decode the named option. + """ + result = super()._GetOptionDecoderConstructions() + result.update({ + 'service_type': ( + option_decoders.EnumDecoder, + { + 'default': + None, + 'valid_values': [ + DYNAMODB, + BIGTABLE, + ], + }), + }) + return result + + +class BaseNonRelationalDb(resource.BaseResource): + """Object representing a nonrelational database.""" + REQUIRED_ATTRS = ['SERVICE_TYPE'] + RESOURCE_TYPE = 'BaseNonRelationalDb' + SERVICE_TYPE = 'Base' + + +def GetNonRelationalDbSpecClass( + service_type: str) -> Optional[spec.BaseSpecMetaClass]: + """Gets the non-relational db spec class corresponding to 'service_type'.""" + return spec.GetSpecClass(BaseNonRelationalDbSpec, SERVICE_TYPE=service_type) + + +def GetNonRelationalDbClass( + service_type: str) -> Optional[resource.AutoRegisterResourceMeta]: + """Gets the non-relational database class corresponding to 'service_type'.""" + return resource.GetResourceClass(BaseNonRelationalDb, + SERVICE_TYPE=service_type) diff --git a/script/cumulus/pkb/perfkitbenchmarker/num_gpus_map_util.py b/script/cumulus/pkb/perfkitbenchmarker/num_gpus_map_util.py new file mode 100644 index 0000000..52a058a --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/num_gpus_map_util.py @@ -0,0 +1,31 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""A map of machine types to their number of K80 GPUs""" + +gpus_per_vm = { + 'n1-standard-4-k80x1': 1, + 'n1-standard-8-k80x1': 1, + 'n1-standard-8-k80x2': 2, + 'n1-standard-16-k80x2': 2, + 'n1-standard-16-k80x4': 4, + 'n1-standard-32-k80x4': 4, + 'n1-standard-32-k80x8': 8, + 'p2.xlarge': 1, + 'p2.8xlarge': 8, + 'p2.16xlarge': 16, + 'Standard_NC6': 1, + 'Standard_NC12': 2, + 'Standard_NC24': 4 +} diff --git a/script/cumulus/pkb/perfkitbenchmarker/object_storage_service.py b/script/cumulus/pkb/perfkitbenchmarker/object_storage_service.py new file mode 100644 index 0000000..a308951 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/object_storage_service.py @@ -0,0 +1,383 @@ +# Copyright 2016 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""An interface to object storage services.""" + + +import abc +import logging +import os +import pathlib +from typing import Optional + +from absl import flags +from perfkitbenchmarker import errors +import six + +flags.DEFINE_string('object_storage_credential_file', None, + 'Directory of credential file.') +flags.DEFINE_string('boto_file_location', None, + 'The location of the boto file.') + +FLAGS = flags.FLAGS + +DEFAULT_BOTO_LOCATION_USER = '~/.boto' +DEFAULT_BOTO_LOCATION_MACHINE = '/etc/boto.cfg' +BOTO_LIB_VERSION = 'boto_lib_version' + +_OBJECT_STORAGE_REGISTRY = {} + + +class AutoRegisterObjectStorageMeta(abc.ABCMeta): + """Metaclass for auto registration.""" + STORAGE_NAME = None + + def __init__(cls, name, bases, dct): + super(AutoRegisterObjectStorageMeta, cls).__init__(name, bases, dct) + if cls.STORAGE_NAME in _OBJECT_STORAGE_REGISTRY: + logging.info( + "Duplicate storage implementations for name '%s'. " + 'Replacing %s with %s', cls.STORAGE_NAME, + _OBJECT_STORAGE_REGISTRY[cls.STORAGE_NAME].__name__, cls.__name__) + _OBJECT_STORAGE_REGISTRY[cls.STORAGE_NAME] = cls + + +class ObjectStorageService( + six.with_metaclass(AutoRegisterObjectStorageMeta, object)): + """Base class for ObjectStorageServices.""" + + # Keeping the location in the service object is not very clean, but + # a nicer solution would be more complex, and we only use different + # locations in a very limited way. Also, true multi-location + # providers would require another abstraction for Azure service + # accounts, which would add more complexity. + + # Service object lifecycle + + def PrepareService(self, location): + """Get ready to use object storage. + + This method should be called before any other method of this + class. Once it is called, all of this class' methods should work + with data in the given location. + + Args: + location: where to place our data. + """ + pass + + def CleanupService(self): + """Clean up what we did. + + No other method of this class should be called after + CleanupProvider. 
+ """ + pass + + # Bucket management + + @abc.abstractmethod + def MakeBucket(self, bucket, raise_on_failure=True): + """Make an object storage bucket. + + Args: + bucket: the name of the bucket to create. + raise_on_failure: Whether to raise errors.Benchmarks.BucketCreationError + if the bucket fails to be created. + """ + pass + + @abc.abstractmethod + def Copy(self, src_url, dst_url, recursive=False): + """Copy files, objects and directories. + + Note: Recursive copy behavior mimics gsutil cp -r where: + Copy(/foo/bar, /baz, True) copies the directory bar into /baz/bar whereas + aws s3 cp --recursive would copy the contents of bar into /baz. + + Args: + src_url: string, the source url path. + dst_url: string, the destination url path. + recursive: whether to copy directories. + """ + pass + + @abc.abstractmethod + def CopyToBucket(self, src_path, bucket, object_path): + """Copy a local file to a bucket. + + Args: + src_path: string, the local source path. + bucket: string, the destination bucket. + object_path: string, the object's path in the bucket. + """ + pass + + @abc.abstractmethod + def MakeRemoteCliDownloadUrl(self, bucket, object_path): + """Creates a download url for an object in a bucket. + + This is used by GenerateCliDownloadFileCommand(). + + Args: + bucket: string, the name of the bucket. + object_path: string, the path of the object in the bucket. + """ + pass + + @abc.abstractmethod + def GenerateCliDownloadFileCommand(self, src_url, local_path): + """Generates a CLI command to copy src_url to local_path. + + This is suitable for use in scripts e.g. startup scripts. + + Args: + src_url: string, the source url path. + local_path: string, the local path. + """ + pass + + @abc.abstractmethod + def List(self, bucket): + """List providers, buckets, or objects. + + Args: + bucket: the name of the bucket to list the contents of. + """ + pass + + def ListTopLevelSubfolders(self, bucket): + """Lists the top level folders (not files) in a bucket. + + Args: + bucket: Name of the bucket to list the top level subfolders of. + + Returns: + A list of top level subfolder names. Can be empty if there are no folders. + """ + return [] + + @abc.abstractmethod + def DeleteBucket(self, bucket): + """Delete an object storage bucket. + + This method should succeed even if bucket contains objects. + + Args: + bucket: the name of the bucket to delete. + """ + pass + + @abc.abstractmethod + def EmptyBucket(self, bucket): + """Empty an object storage bucket. + + Args: + bucket: the name of the bucket to empty. + """ + pass + + # Working with a VM + + def PrepareVM(self, vm): + """Prepare a VM to use object storage. + + Args: + vm: the VM to prepare. + """ + pass + + def CleanupVM(self, vm): + """Clean up a VM that was used in this benchmark. + + Args: + vm: the VM to clean up. + """ + pass + + # CLI commands + + @abc.abstractmethod + def CLIUploadDirectory(self, vm, directory, file_names, bucket): + """Upload directory contents to a bucket through the CLI. + + The VM must have had PrepareVM called on it first. The command + will be wrapped in 'time ...'. + + The caller must ensure that file_names is a full list of files in + the directory, so the provider implementation can either use a + generic "upload directory" command or use the file names. This + method *must* pass all file names to the CLI at once if possible, + not in a loop, to give it the chance to share connections and + overlap uploads. 
+ + Args: + vm: the VM to run commands on directory: the directory to + directory: the directory to upload files from + file_names: a list of paths (relative to directory) to upload + bucket: the bucket to upload the file to + + Returns: + A tuple of the (stdout, stderr) of the command. + """ + pass + + @abc.abstractmethod + def CLIDownloadBucket(self, vm, bucket, objects, dest): + """Download bucket contents to a folder. + + The VM must have had PrepareVM called on it first. The command + will be wrapped in 'time ...'. + + The caller must ensure that objects is a full list of objects in + the bucket, so the provider implementation can either use a + generic "download bucket" command or use the object names. This + method *must* pass all object names to the CLI at once if + possible, not in a loop, to give it the chance to share + connections and overlap downloads. + + Args: + vm: the VM to run commands on + bucket: the name of the bucket to download from + objects: a list of names of objects to download + dest: the name of the folder to download to + + Returns: + A tuple of the (stdout, stderr) of the command. + """ + pass + + # General methods + + def Metadata(self, vm): + """Provider-specific metadata for collected samples. + + Args: + vm: the VM we're running on. + + Returns: + A dict of key, value pairs to add to our sample metadata. + """ + + return {} + + def UpdateSampleMetadata(self, samples): + """Updates metadata of samples with provider specific information. + + Args: + samples: the samples that need the metadata to be updated with provider + specific information. + """ + pass + + def GetDownloadUrl(self, + bucket: str, + object_name: str, + use_https=True) -> str: + """Get the URL to download objects over HTTP(S). + + Args: + bucket: name of bucket + object_name: name of object + use_https: whether to use HTTPS or else HTTP + + Returns: + The URL to download objects over. + """ + raise NotImplementedError + + def GetUploadUrl(self, bucket: str, object_name: str, use_https=True) -> str: + """Get the URL to upload objects over HTTP(S). + + Args: + bucket: name of bucket + object_name: name of object + use_https: whether to use HTTPS or else HTTP + + Returns: + The URL to upload objects over. + """ + return self.GetDownloadUrl(bucket, object_name, use_https) + + # Different services require uploads to be POST or PUT. + UPLOAD_HTTP_METHOD: Optional[str] = None + + def MakeBucketPubliclyReadable(self, bucket: str, also_make_writable=False): + """Make a bucket readable and optionally writable by everyone.""" + raise NotImplementedError + + def APIScriptArgs(self): + """Extra arguments for the API test script. + + The service implementation has two parts - one that runs in the + PKB controller, and one that runs on worker VMs. This method is + how the controller communicates service-specific information to + the workers. + + Returns: + A list of strings, which will be passed as arguments to the API + test script. + """ + + return [] + + @classmethod + def APIScriptFiles(cls): + """Files to upload for the API test script. + + Returns: + A list of file names. These files will be uploaded to the remote + VM if this service's API is being benchmarked. 
+ """ + + return [] + + +def GetObjectStorageClass(storage_name) -> type(ObjectStorageService): + """Return the ObjectStorageService subclass corresponding to storage_name.""" + + return _OBJECT_STORAGE_REGISTRY[storage_name] + + +# TODO(user): Move somewhere more generic +def FindCredentialFile(default_location): + """Return the path to the credential file.""" + + credential_file = ( + FLAGS.object_storage_credential_file or default_location) + credential_file = os.path.expanduser(credential_file) + if not (os.path.isfile(credential_file) or + os.path.isdir(credential_file)): + raise errors.Benchmarks.MissingObjectCredentialException( + 'Credential cannot be found in %s' % credential_file) + + return credential_file + + +def FindBotoFile(): + """Return the path to the boto file.""" + paths_to_check = [ + FLAGS.boto_file_location, + DEFAULT_BOTO_LOCATION_USER, + DEFAULT_BOTO_LOCATION_MACHINE, + ] + + for path in paths_to_check: + if not path: + continue + if pathlib.Path(path).exists(): + return path + + raise errors.Benchmarks.MissingObjectCredentialException( + 'Boto file cannot be found in %s.' % paths_to_check) diff --git a/script/cumulus/pkb/perfkitbenchmarker/os_types.py b/script/cumulus/pkb/perfkitbenchmarker/os_types.py new file mode 100644 index 0000000..3e05c79 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/os_types.py @@ -0,0 +1,112 @@ +# Copyright 2016 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
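+# Example membership check a caller might perform (hypothetical):
+#   if FLAGS.os_type in os_types.LINUX_OS_TYPES:
+#     ...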
+"""Supported types of operating systems that a VM may host.""" + +from absl import flags + +AMAZONLINUX2 = 'amazonlinux2' +CENTOS7 = 'centos7' +CENTOS8 = 'centos8' # deprecated +CENTOS_STREAM8 = 'centos_stream8' +CENTOS_STREAM9 = 'centos_stream9' +CLEAR = 'clear' +COS = 'cos' +CORE_OS = 'core_os' +DEBIAN9 = 'debian9' +DEBIAN10 = 'debian10' +DEBIAN11 = 'debian11' +JUJU = 'juju' +RHEL7 = 'rhel7' +RHEL8 = 'rhel8' +ROCKY_LINUX8 = 'rocky_linux8' +UBUNTU_CONTAINER = 'ubuntu_container' +UBUNTU1604 = 'ubuntu1604' # deprecated +UBUNTU1604_CUDA9 = 'ubuntu1604_cuda9' +UBUNTU1804 = 'ubuntu1804' +UBUNTU1804_EFA = 'ubuntu1804_efa' +UBUNTU2004 = 'ubuntu2004' +UBUNTU2204 = 'ubuntu2204' +WINDOWS2012_CORE = 'windows2012_core' +WINDOWS2016_CORE = 'windows2016_core' +WINDOWS2019_CORE = 'windows2019_core' +WINDOWS2022_CORE = 'windows2022_core' +WINDOWS2012_DESKTOP = 'windows2012_desktop' +WINDOWS2016_DESKTOP = 'windows2016_desktop' +WINDOWS2019_DESKTOP = 'windows2019_desktop' +WINDOWS2022_DESKTOP = 'windows2022_desktop' +WINDOWS2019_SQLSERVER_2017_STANDARD = 'windows2019_desktop_sqlserver_2017_standard' +WINDOWS2019_SQLSERVER_2017_ENTERPRISE = 'windows2019_desktop_sqlserver_2017_enterprise' +WINDOWS2019_SQLSERVER_2019_STANDARD = 'windows2019_desktop_sqlserver_2019_standard' +WINDOWS2019_SQLSERVER_2019_ENTERPRISE = 'windows2019_desktop_sqlserver_2019_enterprise' +WINDOWS2022_SQLSERVER_2019_STANDARD = 'windows2022_desktop_sqlserver_2019_standard' +WINDOWS2022_SQLSERVER_2019_ENTERPRISE = 'windows2022_desktop_sqlserver_2019_enterprise' +# Base-only OS types +DEBIAN = 'debian' +RHEL = 'rhel' +WINDOWS = 'windows' + +# These operating systems have SSH like other Linux OSes, but no package manager +# to run Linux benchmarks without Docker. +# Because they cannot install packages, they only support VM life cycle +# benchmarks like cluster_boot. +CONTAINER_OS_TYPES = [ + CORE_OS, + COS, +] + +LINUX_OS_TYPES = CONTAINER_OS_TYPES + [ + AMAZONLINUX2, + CENTOS7, + CENTOS8, + CENTOS_STREAM8, + CENTOS_STREAM9, + CLEAR, + DEBIAN9, + DEBIAN10, + DEBIAN11, + JUJU, + RHEL7, + RHEL8, + ROCKY_LINUX8, + UBUNTU_CONTAINER, + UBUNTU1604, # deprecated + UBUNTU1604_CUDA9, + UBUNTU1804, + UBUNTU1804_EFA, + UBUNTU2004, + UBUNTU2204, +] +WINDOWS_OS_TYPES = [ + WINDOWS2012_CORE, + WINDOWS2016_CORE, + WINDOWS2019_CORE, + WINDOWS2022_CORE, + WINDOWS2012_DESKTOP, + WINDOWS2016_DESKTOP, + WINDOWS2019_DESKTOP, + WINDOWS2022_DESKTOP, + WINDOWS2019_SQLSERVER_2017_STANDARD, + WINDOWS2019_SQLSERVER_2017_ENTERPRISE, + WINDOWS2019_SQLSERVER_2019_STANDARD, + WINDOWS2019_SQLSERVER_2019_ENTERPRISE, + WINDOWS2022_SQLSERVER_2019_STANDARD, + WINDOWS2022_SQLSERVER_2019_ENTERPRISE, +] +ALL = LINUX_OS_TYPES + WINDOWS_OS_TYPES +BASE_OS_TYPES = [CLEAR, CORE_OS, DEBIAN, RHEL, WINDOWS] + +# May change from time to time. +DEFAULT = UBUNTU1804 + +flags.DEFINE_enum('os_type', DEFAULT, ALL, 'The VM\'s OS type.') diff --git a/script/cumulus/pkb/perfkitbenchmarker/package_lookup.py b/script/cumulus/pkb/perfkitbenchmarker/package_lookup.py new file mode 100644 index 0000000..f204bc3 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/package_lookup.py @@ -0,0 +1,47 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Function to lookup modules from package names. + +PackageModule: Returns a package module given its name. + +This module works around a circular import issue where we cannot import +benchmark_sets.py directly into virtual_machine.py. After SetUpPKB is called, +package_lookup.PackageModule is equivalent to benchmark_sets.PackageModule. +""" + +from perfkitbenchmarker import errors + +_global_package_module_function = None + + +def SetPackageModuleFunction(function): + """Sets the function called by PackageModule; See benchmark_sets.py.""" + global _global_package_module_function + _global_package_module_function = function + + +def PackageModule(package_name): + """Finds the module for a benchmark by name. + + Args: + package_name: The name of the package. + + Returns: + The package's module, or None if the package is invalid. + """ + if not _global_package_module_function: + raise errors.Setup.InvalidSetupError( + 'Cannot call package_lookup.py; Was SetUpPKB called?') + return _global_package_module_function(package_name) diff --git a/script/cumulus/pkb/perfkitbenchmarker/pkb.py b/script/cumulus/pkb/perfkitbenchmarker/pkb.py new file mode 100644 index 0000000..99885b1 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/pkb.py @@ -0,0 +1,1903 @@ +# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Runs all benchmarks in PerfKitBenchmarker. + +All benchmarks in PerfKitBenchmarker export the following interface: + +GetConfig: this returns, the name of the benchmark, the number of machines + required to run one instance of the benchmark, a detailed description + of the benchmark, and if the benchmark requires a scratch disk. +Prepare: this function takes a list of VMs as an input parameter. The benchmark + will then get all binaries required to run the benchmark and, if + required, create data files. +Run: this function takes a list of VMs as an input parameter. The benchmark will + then run the benchmark upon the machines specified. The function will + return a dictonary containing the results of the benchmark. +Cleanup: this function takes a list of VMs as an input parameter. The benchmark + will then return the machine to the state it was at before Prepare + was called. + +PerfKitBenchmarker has the following run stages: provision, prepare, + run, cleanup, teardown, and all. 
+ +provision: Read command-line flags, decide what benchmarks to run, and + create the necessary resources for each benchmark, including + networks, VMs, disks, and keys, and generate a run_uri, which can + be used to resume execution at later stages. +prepare: Execute the Prepare function of each benchmark to install + necessary software, upload datafiles, etc. +run: Execute the Run function of each benchmark and collect the + generated samples. The publisher may publish these samples + according to PKB's settings. The Run stage can be called multiple + times with the run_uri generated by the provision stage. +cleanup: Execute the Cleanup function of each benchmark to uninstall + software and delete data files. +teardown: Delete VMs, key files, networks, and disks created in the + 'provision' stage. + +all: PerfKitBenchmarker will run all of the above stages (provision, + prepare, run, cleanup, teardown). Any resources generated in the + provision stage will be automatically deleted in the teardown + stage, even if there is an error in an earlier stage. When PKB is + running in this mode, the run cannot be repeated or resumed using + the run_uri. +""" + + +import collections +import copy +import getpass +import itertools +import json +import logging +import multiprocessing +from os.path import isfile +import pickle +import random +import secrets +import re +import sys +import threading +import time +import types +from typing import Any, Dict, List, Optional, Sequence, Set, Tuple +import uuid +from threading import Timer, Lock + +from absl import flags +from perfkitbenchmarker import archive +from perfkitbenchmarker import background_tasks +from perfkitbenchmarker import benchmark_lookup +from perfkitbenchmarker import benchmark_sets +from perfkitbenchmarker import benchmark_spec as bm_spec +from perfkitbenchmarker import benchmark_status +from perfkitbenchmarker import configs +from perfkitbenchmarker import context +from perfkitbenchmarker import disk +from perfkitbenchmarker import errors +from perfkitbenchmarker import events +from perfkitbenchmarker import flag_util +from perfkitbenchmarker import linux_benchmarks +from perfkitbenchmarker import log_util +from perfkitbenchmarker import os_types +from perfkitbenchmarker import package_lookup +from perfkitbenchmarker import providers +from perfkitbenchmarker import publisher +from perfkitbenchmarker import requirements +from perfkitbenchmarker import sample +from perfkitbenchmarker import spark_service +from perfkitbenchmarker import stages +from perfkitbenchmarker import static_virtual_machine +from perfkitbenchmarker import timing_util +from perfkitbenchmarker import traces +from perfkitbenchmarker import version +from perfkitbenchmarker import vm_util +from perfkitbenchmarker import windows_benchmarks +try: + from perfkitbenchmarker import intel_publisher +except: + intel_publisher = None +from perfkitbenchmarker.configs import benchmark_config_spec +from perfkitbenchmarker.linux_benchmarks import cluster_boot_benchmark +from perfkitbenchmarker.linux_packages import build_tools +import six +from six.moves import zip + +LOG_FILE_NAME = 'pkb.log' +COMPLETION_STATUS_FILE_NAME = 'completion_statuses.json' +REQUIRED_INFO = ['scratch_disk', 'num_machines'] +REQUIRED_EXECUTABLES = frozenset(['ssh', 'ssh-keygen', 'scp', 'openssl']) +MAX_RUN_URI_LENGTH = 12 +EMON_EDP_TARBALL = 'emon_edp.tar.gz' +FLAGS = flags.FLAGS + +# Define patterns for help text processing. 
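As a concrete illustration of the benchmark interface described in the module docstring above (GetConfig, Prepare, Run, Cleanup), a skeleton benchmark module might look roughly like the sketch below. It is a simplified stand-alone outline, not a drop-in PKB benchmark: the `BENCHMARK_CONFIG` string and the tuple returned from `Run()` are illustrative stand-ins for a real config and for `sample.Sample` objects.

```python
# Sketch of the benchmark interface described above; simplified, illustrative.
import time

BENCHMARK_NAME = 'example'
BENCHMARK_CONFIG = """
example:
  description: Minimal illustration of the GetConfig/Prepare/Run/Cleanup hooks.
"""


def GetConfig(user_config):
    # PKB merges the module's config with user-supplied overrides; the sketch
    # simply hands the user config back to stay self-contained.
    return dict(user_config)


def Prepare(benchmark_spec):
    # Install packages and upload data files onto benchmark_spec.vms here.
    del benchmark_spec


def Run(benchmark_spec):
    # Execute the workload and return measurements; a real module would build
    # sample.Sample(metric, value, unit, metadata) objects instead of tuples.
    del benchmark_spec
    return [('dummy_latency', 1.23, 'seconds', {'timestamp': time.time()})]


def Cleanup(benchmark_spec):
    # Undo whatever Prepare() did.
    del benchmark_spec
```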
+BASE_RELATIVE = '../' # Relative path from markdown output to PKB home for link writing. +MODULE_REGEX = r'^\s+?(.*?):.*' # Pattern that matches module names. +FLAGS_REGEX = r'(^\s\s--.*?(?=^\s\s--|\Z))+?' # Pattern that matches each flag. +FLAGNAME_REGEX = r'^\s+?(--.*?)(:.*\Z)' # Pattern that matches flag name in each flag. +DOCSTRING_REGEX = r'"""(.*?|$)"""' # Pattern that matches triple quoted comments. + +flags.DEFINE_list('ssh_options', [], 'Additional options to pass to ssh.') +flags.DEFINE_boolean('use_ipv6', False, 'Whether to use ipv6 for ssh/scp.') +flags.DEFINE_list('benchmarks', [benchmark_sets.STANDARD_SET], + 'Benchmarks and/or benchmark sets that should be run. The ' + 'default is the standard set. For more information about ' + 'benchmarks and benchmark sets, see the README and ' + 'benchmark_sets.py.') +flags.DEFINE_boolean('multi_os_benchmark', False, 'Whether is benchmark will ' + 'involve multiple os types.') +flags.DEFINE_string('archive_bucket', None, + 'Archive results to the given S3/GCS bucket.') +flags.DEFINE_string('project', None, 'GCP project ID under which ' + 'to create the virtual machines') +flags.DEFINE_multi_string( + 'zone', [], + 'Similar to the --zones flag, but allows the flag to be specified ' + 'multiple times on the commandline. For example, --zone=a --zone=b is ' + 'equivalent to --zones=a,b. Furthermore, any values specified by --zone ' + 'will be appended to those specfied by --zones.') +flags.DEFINE_list( + 'zones', [], + 'A list of zones within which to run PerfKitBenchmarker. ' + 'This is specific to the cloud provider you are running on. ' + 'If multiple zones are given, PerfKitBenchmarker will create 1 VM in ' + 'zone, until enough VMs are created as specified in each ' + 'benchmark. The order in which this flag is applied to VMs is ' + 'undefined.') +flags.DEFINE_list( + 'extra_zones', [], + 'Zones that will be appended to the "zones" list. This is functionally ' + 'the same, but allows flag matrices to have two zone axes.') +# TODO(user): note that this is currently very GCE specific. Need to create a +# module which can translate from some generic types to provider specific +# nomenclature. +flags.DEFINE_string('machine_type', None, 'Machine ' + 'types that will be created for benchmarks that don\'t ' + 'require a particular type.') +flags.DEFINE_integer('num_vms', 1, 'For benchmarks which can make use of a ' + 'variable number of machines, the number of VMs to use.') +flags.DEFINE_string('image', None, 'Default image that will be ' + 'linked to the VM') +flags.DEFINE_string('run_uri', None, 'Name of the Run. If provided, this ' + 'should be alphanumeric and less than or equal to %d ' + 'characters in length.' % MAX_RUN_URI_LENGTH) +flags.DEFINE_boolean('use_pkb_logging', True, 'Whether to use PKB-specific ' + 'logging handlers. Disabling this will use the standard ' + 'ABSL logging directly.') +flags.DEFINE_boolean('log_dmesg', False, 'Whether to log dmesg from ' + 'each VM to the PKB log file before the VM is deleted.') +# Added by Cumulus +flags.DEFINE_boolean('get_benchmark_usage', False, 'Whether to display additional ' + 'usage information for the benchmark.') +# End Added by Cumulus +flags.DEFINE_boolean('always_teardown_on_exception', False, 'Whether to tear ' + 'down VMs when there is exception during the PKB run. 
If' + 'enabled, VMs will be torn down even if FLAGS.run_stage ' + 'does not specify teardown.') +_RESTORE_PATH = flags.DEFINE_string('restore', None, + 'Path to restore resources from.') +_FREEZE_PATH = flags.DEFINE_string('freeze', None, + 'Path to freeze resources to.') +_COLLECT_MEMINFO = flags.DEFINE_bool('collect_meminfo', False, + 'Whether to collect /proc/meminfo stats.') + + +def GetCurrentUser(): + """Get the current user name. + + On some systems the current user information may be unavailable. In these + cases we just need a string to tag the created resources with. It should + not be a fatal error. + + Returns: + User name OR default string if user name not available. + """ + try: + return getpass.getuser() + except KeyError: + return 'user_unknown' + + +flags.DEFINE_string( + 'owner', GetCurrentUser(), 'Owner name. ' + 'Used to tag created resources and performance records.') +flags.DEFINE_enum( + 'log_level', log_util.INFO, + list(log_util.LOG_LEVELS.keys()), + 'The log level to run at.') +flags.DEFINE_enum( + 'file_log_level', log_util.DEBUG, list(log_util.LOG_LEVELS.keys()), + 'Anything logged at this level or higher will be written to the log file.') +flags.DEFINE_integer('duration_in_seconds', None, + 'duration of benchmarks. ' + '(only valid for mesh_benchmark)') +flags.DEFINE_string('static_vm_file', None, + 'The file path for the Static Machine file. See ' + 'static_virtual_machine.py for a description of this file.') +flags.DEFINE_boolean('version', False, 'Display the version and exit.', + allow_override_cpp=True) +flags.DEFINE_boolean('time_commands', False, 'Times each command issued.') +flags.DEFINE_enum( + 'scratch_disk_type', None, + [disk.STANDARD, disk.REMOTE_SSD, disk.PIOPS, disk.LOCAL], + 'Type for all scratch disks. The default is standard') +flags.DEFINE_string( + 'data_disk_type', None, + 'Type for all data disks. If a provider keeps the operating system and ' + 'user data on separate disks, this only affects the user data disk(s).' + 'If the provider has OS and user data on the same disk, this flag affects' + 'that disk.') +flags.DEFINE_integer('scratch_disk_size', None, 'Size, in gb, for all scratch ' + 'disks.') +flags.DEFINE_list( + 'data_disk_zones', [], + 'The zone of the data disk. This is only used to provision regional pd with' + ' multiple zones on GCP.') +flags.DEFINE_integer('data_disk_size', None, 'Size, in gb, for all data disks.') +flags.DEFINE_integer('scratch_disk_iops', None, + 'IOPS for Provisioned IOPS (SSD) volumes in AWS.') +flags.DEFINE_integer('scratch_disk_throughput', None, + 'Throughput (MB/s) for volumes in AWS.') +flags.DEFINE_integer('num_striped_disks', None, + 'The number of data disks to stripe together to form one ' + '"logical" data disk. This defaults to 1 ' + '(except with local disks), which means no striping. ' + 'When using local disks, they default to striping ' + 'all disks together. The striped disks will appear as ' + 'one disk (data_disk_0) in the metadata.', + lower_bound=1) +flags.DEFINE_bool('install_packages', None, + 'Override for determining whether packages should be ' + 'installed. If this is false, no packages will be installed ' + 'on any VMs. This option should probably only ever be used ' + 'if you have already created an image with all relevant ' + 'packages installed.') +flags.DEFINE_bool( + 'stop_after_benchmark_failure', False, + 'Determines response when running multiple benchmarks serially and a ' + 'benchmark run fails. When True, no further benchmarks are scheduled, and ' + 'execution ends. 
When False, benchmarks continue to be scheduled. Does not ' + 'apply to keyboard interrupts, which will always prevent further ' + 'benchmarks from being scheduled.') +flags.DEFINE_boolean( + 'ignore_package_requirements', False, + 'Disables Python package requirement runtime checks.') +flags.DEFINE_enum('spark_service_type', None, + [spark_service.PKB_MANAGED, spark_service.PROVIDER_MANAGED], + 'Type of spark service to use') +flags.DEFINE_boolean( + 'publish_after_run', False, + 'If true, PKB will publish all samples available immediately after running ' + 'each benchmark. This may be useful in scenarios where the PKB run time ' + 'for all benchmarks is much greater than a single benchmark.') +flags.DEFINE_integer( + 'publish_period', None, + 'The period in seconds to publish samples from repeated run stages. ' + 'This will only publish samples if publish_after_run is True.') +flags.DEFINE_integer( + 'run_stage_time', 0, + 'PKB will run/re-run the run stage of each benchmark until it has spent ' + 'at least this many seconds. It defaults to 0, so benchmarks will only ' + 'be run once unless some other value is specified. This flag and ' + 'run_stage_iterations are mutually exclusive.') +flags.DEFINE_integer( + 'run_stage_iterations', 1, + 'PKB will run/re-run the run stage of each benchmark this many times. ' + 'It defaults to 1, so benchmarks will only be run once unless some other ' + 'value is specified. This flag and run_stage_time are mutually exclusive.') +flags.DEFINE_integer( + 'run_stage_retries', 0, + 'The number of allowable consecutive failures during the run stage. After ' + 'this number of failures any exceptions will cause benchmark termination. ' + 'If run_stage_time is exceeded, the run stage will not be retried even if ' + 'the number of failures is less than the value of this flag.') +_MAX_RETRIES = flags.DEFINE_integer( + 'retries', 0, 'The amount of times PKB should retry each benchmark.' + 'Use with --retry_substatuses to specify which failure substatuses to ' + 'retry on. Defaults to all valid substatuses.') +_RETRY_SUBSTATUSES = flags.DEFINE_multi_enum( + 'retry_substatuses', benchmark_status.FailedSubstatus.RETRYABLE_SUBSTATUSES, + benchmark_status.FailedSubstatus.RETRYABLE_SUBSTATUSES, + 'The failure substatuses to retry on. By default, failed runs are run with ' + 'the same previous config.') +_RETRY_DELAY_SECONDS = flags.DEFINE_integer( + 'retry_delay_seconds', 0, 'The time to wait in between retries.') +# Retries could also allow for a dict of failed_substatus: 'zone'|'region' +# retry method which would make the retry functionality more customizable. +_SMART_QUOTA_RETRY = flags.DEFINE_bool( + 'smart_quota_retry', False, + 'If True, causes the benchmark to rerun in a zone in a different region ' + 'in the same geo on a quota exception. Currently only works for benchmarks ' + 'that specify a single zone (via --zone or --zones). The zone is selected ' + 'at random and overrides the --zones flag or the --zone flag, depending on ' + 'which is provided. QUOTA_EXCEEDED must be in the list of retry ' + 'substatuses for this to work.') +_SMART_CAPACITY_RETRY = flags.DEFINE_bool( + 'smart_capacity_retry', False, + 'If True, causes the benchmark to rerun in a different zone in the same ' + 'region on a capacity/config exception. Currently only works for ' + 'benchmarks that specify a single zone (via --zone or --zones). The zone ' + 'is selected at random and overrides the --zones flag or the --zone flag, ' + 'depending on which is provided. 
INSUFFICIENT_CAPACITY and UNSUPPORTED ' + 'must be in the list of retry substatuses for this to work.') +flags.DEFINE_boolean( + 'boot_samples', False, + 'Whether to publish boot time samples for all tests.') +_MEASURE_DELETE = flags.DEFINE_boolean( + 'delete_samples', False, + 'Whether to publish delete time samples for all tests.') +flags.DEFINE_boolean( + 'gpu_samples', False, + 'Whether to publish GPU memcpy bandwidth samples for GPU tests.') +flags.DEFINE_integer( + 'run_processes', None, + 'The number of parallel processes to use to run benchmarks.', + lower_bound=1) +flags.DEFINE_float( + 'run_processes_delay', None, + 'The delay in seconds between parallel processes\' invocation. ' + 'Increasing this value may reduce provider throttling issues.', + lower_bound=0) +flags.DEFINE_string( + 'completion_status_file', None, + 'If specified, this file will contain the completion status of each ' + 'benchmark that ran (SUCCEEDED, FAILED, or SKIPPED). The file has one json ' + 'object per line, each with the following format:\n' + '{ "name": , "flags": , ' + '"status": }') +flags.DEFINE_string( + 'helpmatch', '', + 'Shows only flags defined in a module whose name matches the given regex.', + allow_override_cpp=True) +flags.DEFINE_string( + 'helpmatchmd', '', + 'helpmatch query with markdown friendly output. ' + 'Shows only flags defined in a module whose name matches the given regex.', + allow_override_cpp=True) +flags.DEFINE_boolean( + 'create_failed_run_samples', False, + 'If true, PKB will create a sample specifying that a run stage failed. ' + 'This sample will include metadata specifying the run stage that ' + 'failed, the exception that occurred, as well as all the flags that ' + 'were provided to PKB on the command line.') +_CREATE_STARTED_RUN_SAMPLE = flags.DEFINE_boolean( + 'create_started_run_sample', False, + 'Whether PKB will create a sample at the start of the provision phase of ' + 'the benchmark run.') +_CREATE_STARTED_STAGE_SAMPLES = flags.DEFINE_boolean( + 'create_started_stage_samples', False, + 'Whether PKB will create a sample at the start of the each stage of ' + 'the benchmark run.') +flags.DEFINE_integer( + 'failed_run_samples_error_length', 10240, + 'If create_failed_run_samples is true, PKB will truncate any error ' + 'messages at failed_run_samples_error_length.') +flags.DEFINE_boolean( + 'dry_run', False, + 'If true, PKB will print the flags configurations to be run and exit. ' + 'The configurations are generated from the command line flags, the ' + 'flag_matrix, and flag_zip.') +flags.DEFINE_string( + 'skip_pending_runs_file', None, + 'If file exists, any pending runs will be not be executed.') +flags.DEFINE_boolean( + 'use_vpn', False, + 'Creates VPN tunnels between vm_groups') +flags.DEFINE_integer( + 'after_prepare_sleep_time', 0, + 'The time in seconds to sleep after the prepare phase. This can be useful ' + 'for letting burst tokens accumulate.') +flags.DEFINE_integer( + 'after_run_sleep_time', 0, + 'The time in seconds to sleep after the run phase. This can be useful ' + 'for letting the VM sit idle after the bechmarking phase is complete.') +flags.DEFINE_bool( + 'before_run_pause', False, + 'If true, wait for command line input before executing the run phase. ' + 'This is useful for debugging benchmarks during development.') +flags.DEFINE_bool( + 'before_cleanup_pause', False, + 'If true, wait for command line input before executing the cleanup phase. 
' + 'This is useful for debugging benchmarks during development.') +flags.DEFINE_integer( + 'timeout_minutes', 240, + 'An upper bound on the time in minutes that the benchmark is expected to ' + 'run. This time is annotated or tagged on the resources of cloud ' + 'providers. Note that for retries, this applies to each individual retry.') +flags.DEFINE_integer( + 'persistent_timeout_minutes', 240, + 'An upper bound on the time in minutes that resources left behind by the ' + 'benchmark. Some benchmarks purposefully create resources for other ' + 'benchmarks to use. Persistent timeout specifies how long these shared ' + 'resources should live.') + +flags.DEFINE_integer( + 'trace_start_delay', 0, + 'delay in seconds before the trace will start collecting data', + lower_bound=0) +flags.DEFINE_integer( + 'trace_duration', None, + 'duration in seconds for which the trace will collect data') +flags.DEFINE_boolean( + 'trace_allow_benchmark_control', False, + 'Specifies to allow benchmark-level traces control. ' + 'Traces will not be started/stopped surround run phase and instead ' + 'benchmarks which are designed to start/stop traces themselves will do so.') + +flags.DEFINE_bool('disable_interrupt_moderation', False, + 'Turn off the interrupt moderation networking feature') +flags.DEFINE_bool('disable_rss', False, + 'Whether or not to disable the Receive Side Scaling feature.') +flags.DEFINE_boolean('record_lscpu', False, + 'Whether to record the lscpu output in a sample') +flags.DEFINE_boolean('record_proccpu', False, + 'Whether to record the /proc/cpuinfo output in a sample') +flags.DEFINE_boolean('record_cpu_vuln', False, + 'Whether to record the CPU vulnerabilities on linux VMs') +flags.DEFINE_boolean('record_gcc', False, + 'Whether to record the gcc version in a sample') +flags.DEFINE_boolean('record_glibc', False, + 'Whether to record the glibc version in a sample') +# Support for using a proxy in the cloud environment. +flags.DEFINE_string('http_proxy', '', + 'Specify a proxy for HTTP in the form ' + '[user:passwd@]proxy.server:port.') +flags.DEFINE_string('https_proxy', '', + 'Specify a proxy for HTTPS in the form ' + '[user:passwd@]proxy.server:port.') +flags.DEFINE_string('ftp_proxy', '', + 'Specify a proxy for FTP in the form ' + '[user:passwd@]proxy.server:port.') +flags.DEFINE_string('no_proxy', '', + 'Specify host(s) to exclude from proxy, e.g. ' + '--no_proxy=localhost,.example.com,192.168.0.1') +flags.DEFINE_bool('randomize_run_order', False, + 'When running with more than one benchmarks, ' + 'randomize order of the benchmarks.') +flags.DEFINE_string('compiler', 'gcc', + 'Specify compiler to use for workload:' + '--compiler=[gcc, aocc, icc]') +flags.DEFINE_string('compiler_version', '8', + 'Use specific version of compiler:' + '--compiler_version=2.1') +flags.DEFINE_string('compiler_path', '', + 'Specify compiler path prefix' + '--compiler_path=/usr/local/bin') +flags.DEFINE_string('compiler_flags', '', + 'override flags to pass to the compiler command line, e.g. 
' + '-O3 -mcmodel=medium -fopenmp -march=skylake-avx512') + +_TEARDOWN_EVENT = multiprocessing.Event() +_ANY_ZONE = 'any' + +events.initialization_complete.connect(traces.RegisterAll) + + +@flags.multi_flags_validator( + ['smart_quota_retry', 'smart_capacity_retry', 'retries', 'zones', 'zone'], + message='Smart zone retries requires exactly one single zone from --zones ' + 'or --zone, as well as retry count > 0.') +def ValidateSmartZoneRetryFlags(flags_dict): + """Validates smart zone retry flags.""" + if flags_dict['smart_quota_retry'] or flags_dict['smart_capacity_retry']: + if flags_dict['retries'] == 0: + return False + return (len(flags_dict['zones']) == 1 and + not flags_dict['zone']) or (len(flags_dict['zone']) == 1 and + not flags_dict['zones']) + return True + + +@flags.multi_flags_validator( + ['retries', 'run_stage'], + message='Retries requires running all stages of the benchmark.') +def ValidateRetriesAndRunStages(flags_dict): + if flags_dict['retries'] > 0 and flags_dict['run_stage'] != stages.STAGES: + return False + return True + + +def _InjectBenchmarkInfoIntoDocumentation(): + """Appends each benchmark's information to the main module's docstring.""" + # TODO: Verify if there is other way of appending additional help + # message. + # Inject more help documentation + # The following appends descriptions of the benchmarks and descriptions of + # the benchmark sets to the help text. + benchmark_sets_list = [ + '%s: %s' % + (set_name, benchmark_sets.BENCHMARK_SETS[set_name]['message']) + for set_name in benchmark_sets.BENCHMARK_SETS] + sys.modules['__main__'].__doc__ = ( + 'PerfKitBenchmarker version: {version}\n\n{doc}\n' + 'Benchmarks (default requirements):\n' + '\t{benchmark_doc}').format( + version=version.VERSION, + doc=__doc__, + benchmark_doc=_GenerateBenchmarkDocumentation()) + sys.modules['__main__'].__doc__ += ('\n\nBenchmark Sets:\n\t%s' + % '\n\t'.join(benchmark_sets_list)) + + +def _ParseFlags(argv=sys.argv): + """Parses the command-line flags.""" + try: + argv = FLAGS(argv) + except flags.Error as e: + logging.error(e) + logging.info('For usage instructions, use --helpmatch={module_name}') + logging.info('For example, ./pkb.py --helpmatch=benchmarks.fio') + sys.exit(1) + + +def _PrintHelp(matches=None): + """Prints help for flags defined in matching modules. + + Args: + matches: regex string or None. Filters help to only those whose name + matched the regex. If None then all flags are printed. + """ + if not matches: + print(FLAGS) + else: + flags_by_module = FLAGS.flags_by_module_dict() + modules = sorted(flags_by_module) + regex = re.compile(matches) + for module_name in modules: + if regex.search(module_name): + print(FLAGS.module_help(module_name)) + + +def _PrintHelpMD(matches=None): + """Prints markdown formatted help for flags defined in matching modules. + + Works just like --helpmatch. + + Args: + matches: regex string or None. Filters help to only those whose name matched + the regex. If None then all flags are printed. + Raises: + RuntimeError: If unable to find module help. 
+ Eg: + * all flags: `./pkb.py --helpmatchmd .*` > testsuite_docs/all.md + * linux benchmarks: `./pkb.py --helpmatchmd linux_benchmarks.*` > + testsuite_docs/linux_benchmarks.md * specific modules `./pkb.py + --helpmatchmd iperf` > testsuite_docs/iperf.md * windows packages + `./pkb.py --helpmatchmd windows_packages.*` > + testsuite_docs/windows_packages.md + * GCP provider: `./pkb.py --helpmatchmd providers.gcp.* > + testsuite_docs/providers_gcp.md` + """ + + flags_by_module = FLAGS.flags_by_module_dict() + modules = sorted(flags_by_module) + regex = re.compile(matches) + for module_name in modules: + if regex.search(module_name): + # Compile regex patterns. + module_regex = re.compile(MODULE_REGEX) + flags_regex = re.compile(FLAGS_REGEX, re.MULTILINE | re.DOTALL) + flagname_regex = re.compile(FLAGNAME_REGEX, re.MULTILINE | re.DOTALL) + docstring_regex = re.compile(DOCSTRING_REGEX, re.MULTILINE | re.DOTALL) + # Retrieve the helpmatch text to format. + helptext_raw = FLAGS.module_help(module_name) + + # Converts module name to github linkable string. + # eg: perfkitbenchmarker.linux_benchmarks.iperf_vpn_benchmark -> + # perfkitbenchmarker/linux_benchmarks/iperf_vpn_benchmark.py + match = re.search( + module_regex, + helptext_raw, + ) + if not match: + raise RuntimeError( + f'Unable to find "{module_regex}" in "{helptext_raw}"') + module = match.group(1) + module_link = module.replace('.', '/') + '.py' + # Put flag name in a markdown code block for visibility. + flags = re.findall(flags_regex, helptext_raw) + flags[:] = [flagname_regex.sub(r'`\1`\2', flag) for flag in flags] + # Get the docstring for the module without importing everything into our + # namespace. Probably a better way to do this + docstring = 'No description available' + # Only pull doststrings from inside pkb source files. + if isfile(module_link): + with open(module_link, 'r') as f: + source = f.read() + # Get the triple quoted matches. + docstring_match = re.search(docstring_regex, source) + # Some modules don't have docstrings. + # eg perfkitbenchmarker/providers/alicloud/flags.py + if docstring_match is not None: + docstring = docstring_match.group(1) + # Format output and print here. + if isfile(module_link): # Only print links for modules we can find. + print('### [' + module, '](' + BASE_RELATIVE + module_link + ')\n') + else: + print('### ' + module + '\n') + print('#### Description:\n\n' + docstring + '\n\n#### Flags:\n') + print('\n'.join(flags) + '\n') + + +def CheckVersionFlag(): + """If the --version flag was specified, prints the version and exits.""" + if FLAGS.version: + print(version.VERSION) + sys.exit(0) + + +def _InitializeRunUri(): + """Determines the PKB run URI and sets FLAGS.run_uri.""" + if FLAGS.run_uri is None: + if stages.PROVISION in FLAGS.run_stage: + FLAGS.run_uri = str(uuid.uuid4())[-8:] + else: + # Attempt to get the last modified run directory. + run_uri = vm_util.GetLastRunUri() + if run_uri: + FLAGS.run_uri = run_uri + logging.warning( + 'No run_uri specified. Attempting to run the following stages with ' + '--run_uri=%s: %s', FLAGS.run_uri, ', '.join(FLAGS.run_stage)) + else: + raise errors.Setup.NoRunURIError( + 'No run_uri specified. Could not run the following stages: %s' % + ', '.join(FLAGS.run_stage)) + elif not FLAGS.run_uri.isalnum() or len(FLAGS.run_uri) > MAX_RUN_URI_LENGTH: + raise errors.Setup.BadRunURIError('run_uri must be alphanumeric and less ' + 'than or equal to %d characters in ' + 'length.' 
% MAX_RUN_URI_LENGTH) + + +def _CreateBenchmarkSpecs(): + """Create a list of BenchmarkSpecs for each benchmark run to be scheduled. + + Returns: + A list of BenchmarkSpecs. + """ + specs = [] + benchmark_tuple_list = benchmark_sets.GetBenchmarksFromFlags() + benchmark_counts = collections.defaultdict(itertools.count) + for benchmark_module, user_config in benchmark_tuple_list: + # Construct benchmark config object. + name = benchmark_module.BENCHMARK_NAME + expected_os_types = None if FLAGS.multi_os_benchmark else ( + os_types.WINDOWS_OS_TYPES if FLAGS.os_type in os_types.WINDOWS_OS_TYPES + else os_types.LINUX_OS_TYPES) + with flag_util.OverrideFlags(FLAGS, user_config.get('flags')): + config_dict = benchmark_module.GetConfig(user_config) + config_spec_class = getattr( + benchmark_module, 'BENCHMARK_CONFIG_SPEC_CLASS', + benchmark_config_spec.BenchmarkConfigSpec) + config = config_spec_class(name, expected_os_types=expected_os_types, + flag_values=FLAGS, **config_dict) + + # Assign a unique ID to each benchmark run. This differs even between two + # runs of the same benchmark within a single PKB run. + uid = name + str(next(benchmark_counts[name])) + + # Optional step to check flag values and verify files exist. + check_prereqs = getattr(benchmark_module, 'CheckPrerequisites', None) + if check_prereqs: + try: + with config.RedirectFlags(FLAGS): + check_prereqs(config) + except: + logging.exception('Prerequisite check failed for %s', name) + raise + + specs.append( + bm_spec.BenchmarkSpec.GetBenchmarkSpec(benchmark_module, config, uid)) + + return specs + + +def _WriteCompletionStatusFile(benchmark_specs, status_file): + """Writes a completion status file. + + The file has one json object per line, each with the following format: + + { + "name": , + "status": , + "failed_substatus": , + "status_detail": , + "flags": + } + + Args: + benchmark_specs: The list of BenchmarkSpecs that ran. + status_file: The file object to write the json structures to. + """ + for spec in benchmark_specs: + # OrderedDict so that we preserve key order in json file + status_dict = collections.OrderedDict() + status_dict['name'] = spec.name + status_dict['status'] = spec.status + if spec.failed_substatus: + status_dict['failed_substatus'] = spec.failed_substatus + if spec.status_detail: + status_dict['status_detail'] = spec.status_detail + status_dict['flags'] = spec.config.flags + status_file.write(json.dumps(status_dict) + '\n') + + +def _SetRestoreSpec(spec: bm_spec.BenchmarkSpec) -> None: + """Unpickles the spec to restore resources from, if provided.""" + restore_path = _RESTORE_PATH.value + if restore_path: + logging.info('Using restore spec at path: %s', restore_path) + with open(restore_path, 'rb') as spec_file: + spec.restore_spec = pickle.load(spec_file) + + +def _SetFreezePath(spec: bm_spec.BenchmarkSpec) -> None: + """Sets the path to freeze resources to if provided.""" + if _FREEZE_PATH.value: + spec.freeze_path = _FREEZE_PATH.value + logging.info('Using freeze path, %s', spec.freeze_path) + + +def DoProvisionPhase(spec, timer): + """Performs the Provision phase of benchmark execution. + + Args: + spec: The BenchmarkSpec created for the benchmark. + timer: An IntervalTimer that measures the start and stop times of resource + provisioning. 
+ """ + logging.info('Provisioning resources for benchmark %s', spec.name) + events.before_phase.send(stages.PROVISION, benchmark_spec=spec) + spec.ConstructContainerCluster() + spec.ConstructContainerRegistry() + # spark service needs to go first, because it adds some vms. + spec.ConstructSparkService() + spec.ConstructDpbService() + spec.ConstructVirtualMachines() + spec.ConstructRelationalDb() + spec.ConstructSpanner() + spec.ConstructNonRelationalDb() + spec.ConstructMessagingService() + # CapacityReservations need to be constructed after VirtualMachines because + # it needs information about the VMs (machine type, count, zone, etc). The + # CapacityReservations will be provisioned before VMs. + spec.ConstructCapacityReservations() + spec.ConstructTpu() + spec.ConstructEdwService() + spec.ConstructVPNService() + spec.ConstructNfsService() + spec.ConstructSmbService() + spec.ConstructDataDiscoveryService() + # Pickle the spec before we try to create anything so we can clean + # everything up on a second run if something goes wrong. + spec.Pickle() + events.benchmark_start.send(benchmark_spec=spec) + try: + with timer.Measure('Resource Provisioning'): + spec.Provision() + finally: + # Also pickle the spec after the resources are created so that + # we have a record of things like AWS ids. Otherwise we won't + # be able to clean them up on a subsequent run. + spec.Pickle() + events.after_phase.send(stages.PROVISION, benchmark_spec=spec) + + +class InterruptChecker(): + """An class that check interrupt on VM.""" + + def __init__(self, vms): + """Start check interrupt thread. + + Args: + vms: A list of virtual machines. + """ + self.vms = vms + self.check_threads = [] + self.phase_status = threading.Event() + for vm in vms: + if vm.IsInterruptible(): + check_thread = threading.Thread(target=self.CheckInterrupt, args=(vm,)) + check_thread.start() + self.check_threads.append(check_thread) + + def CheckInterrupt(self, vm): + """Check interrupt. + + Args: + vm: the virtual machine object. + + Returns: + None + """ + while not self.phase_status.is_set(): + vm.UpdateInterruptibleVmStatus(use_api=False) + if vm.WasInterrupted(): + return + else: + self.phase_status.wait(vm.GetInterruptableStatusPollSeconds()) + + def EndCheckInterruptThread(self): + """End check interrupt thread.""" + self.phase_status.set() + + for check_thread in self.check_threads: + check_thread.join() + + def EndCheckInterruptThreadAndRaiseError(self): + """End check interrupt thread and raise error. + + Raises: + InsufficientCapacityCloudFailure when it catches interrupt. + + Returns: + None + """ + self.EndCheckInterruptThread() + if any(vm.IsInterruptible() and vm.WasInterrupted() for vm in self.vms): + raise errors.Benchmarks.InsufficientCapacityCloudFailure('Interrupt') + + +def DoPreparePhase(spec, timer): + """Performs the Prepare phase of benchmark execution. + + Args: + spec: The BenchmarkSpec created for the benchmark. + timer: An IntervalTimer that measures the start and stop times of the + benchmark module's Prepare function. 
+ """ + # ### Added by Cumulus for svrinfo b0 #### + interrupt_checker = InterruptChecker(spec.vms) + # ### End Cumulus svrinfo block 0 #### + logging.info('Preparing benchmark %s', spec.name) + events.before_phase.send(stages.PREPARE, benchmark_spec=spec) + with timer.Measure('BenchmarkSpec Prepare'): + spec.Prepare() + with timer.Measure('Benchmark Prepare'): + spec.BenchmarkPrepare(spec) + spec.StartBackgroundWorkload() + if FLAGS.after_prepare_sleep_time: + logging.info('Sleeping for %s seconds after the prepare phase.', + FLAGS.after_prepare_sleep_time) + time.sleep(FLAGS.after_prepare_sleep_time) + events.after_phase.send(stages.PREPARE, benchmark_spec=spec) + + interrupt_checker.EndCheckInterruptThread() + + +def DoRunPhase(spec, collector, timer): + """Performs the Run phase of benchmark execution. + + Args: + spec: The BenchmarkSpec created for the benchmark. + collector: The SampleCollector object to add samples to. + timer: An IntervalTimer that measures the start and stop times of the + benchmark module's Run function. + """ + if FLAGS.before_run_pause: + six.moves.input('Hit enter to begin Run.') + deadline = time.time() + FLAGS.run_stage_time + run_number = 0 + consecutive_failures = 0 + last_publish_time = time.time() + + def _IsRunStageFinished(): + if FLAGS.run_stage_time > 0: + return time.time() > deadline + else: + return run_number >= FLAGS.run_stage_iterations + + trace_start_delay = FLAGS.trace_start_delay + trace_duration = FLAGS.trace_duration + benchmark_control_traces = spec.control_traces and FLAGS.trace_allow_benchmark_control + + # Use mutable list under a lock as an atomic indicator of the traces being stopped + traces_stopped = [] + lock = Lock() + + def _StopTraces(): + """ + Stops all traces by sending on the after_phase signal from + the RUN_PHASE sender. RUN_PHASE is the sender all the modules in + perfkitbenchmarker/traces/ except svrinfo are listening to. 
+ """ + with lock: + if traces_stopped: + logging.info("traces already stopped") + return + events.stop_trace.send(stages.RUN, benchmark_spec=spec) + traces_stopped.append(1) + + while True: + samples = [] + + if not traces_stopped: + logging.info("Installing traces...") + events.before_phase.send(stages.RUN, benchmark_spec=spec) + trace_stop_thread = None + if benchmark_control_traces: + if trace_start_delay: + logging.warning('Benchmark {} controls it\'s own traces behavior and is ' + 'incompatible with --trace_start_delay.'.format(spec.name)) + elif trace_start_delay: + logging.info('Traces installed & will start after {0} seconds'.format(trace_start_delay)) + trace_start_thread = Timer(trace_start_delay, events.start_trace.send, [stages.RUN], + {"benchmark_spec": spec}) + trace_start_thread.start() + if trace_duration is not None: + logging.info('Traces will run for {0} seconds'.format(trace_duration)) + trace_stop_delay = trace_start_delay + trace_duration + trace_stop_thread = Timer(trace_stop_delay, _StopTraces) + trace_stop_thread.start() + else: + logging.info('Traces will run for the duration of the benchmark') + else: + events.start_trace.send(stages.RUN, benchmark_spec=spec) + + try: + logging.info('Running benchmark %s', spec.name) + with timer.Measure('Benchmark Run'): + samples = spec.BenchmarkRun(spec) + except Exception: + consecutive_failures += 1 + if consecutive_failures > FLAGS.run_stage_retries: + raise + logging.exception('Run failed (consecutive_failures=%s); retrying.', + consecutive_failures) + else: + consecutive_failures = 0 + finally: + if not benchmark_control_traces: + _StopTraces() + if trace_stop_thread is not None: + # This is to cover the case when the benchmark finishes running + # before the traces are stopped by the thread delayed by trace_duration seconds. + # Not cancelling the thread will not cause the traces to stop since we ensure + # the stop signal is only sent once in _StopTraces(). This just servers to + # clean up after the thread that is still waiting on the Timer. + trace_stop_thread.cancel() + + if not traces_stopped: + events.after_phase.send(stages.RUN, benchmark_spec=spec) + + if FLAGS.run_stage_time or FLAGS.run_stage_iterations: + for s in samples: + s.metadata['run_number'] = run_number + + # Add boot time metrics on the first run iteration. 
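+    # (They are gated on run_number == 0, so repeated run stages via
+    # --run_stage_time/--run_stage_iterations emit boot samples only once.)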
+ if run_number == 0 and (FLAGS.boot_samples or + spec.name == cluster_boot_benchmark.BENCHMARK_NAME): + samples.extend(cluster_boot_benchmark.GetTimeToBoot(spec.vms)) + + if FLAGS.record_lscpu: + samples.extend(_CreateLscpuSamples(spec.vms)) + + if FLAGS.record_proccpu: + samples.extend(_CreateProcCpuSamples(spec.vms)) + if FLAGS.record_cpu_vuln and run_number == 0: + samples.extend(_CreateCpuVulnerabilitySamples(spec.vms)) + + # Add workload uid to metadata + for s in samples: + s.metadata['uid'] = spec.uid + if FLAGS.record_gcc: + samples.extend(_CreateGccSamples(spec.vms)) + if FLAGS.record_glibc: + samples.extend(_CreateGlibcSamples(spec.vms)) + + events.samples_created.send( + stages.RUN, benchmark_spec=spec, samples=samples) + collector.AddSamples(samples, spec.name, spec) + if (FLAGS.publish_after_run and FLAGS.publish_period is not None and + FLAGS.publish_period < (time.time() - last_publish_time)): + collector.PublishSamples() + last_publish_time = time.time() + run_number += 1 + if _IsRunStageFinished(): + if FLAGS.after_run_sleep_time: + logging.info('Sleeping for %s seconds after the run phase.', + FLAGS.after_run_sleep_time) + time.sleep(FLAGS.after_run_sleep_time) + break + + +def DoCleanupPhase(spec, timer): + """Performs the Cleanup phase of benchmark execution. + + Cleanup phase work should be delegated to spec.BenchmarkCleanup to allow + non-PKB based cleanup if needed. + + Args: + spec: The BenchmarkSpec created for the benchmark. + timer: An IntervalTimer that measures the start and stop times of the + benchmark module's Cleanup function. + """ + if FLAGS.before_cleanup_pause: + six.moves.input('Hit enter to begin Cleanup.') + logging.info('Cleaning up benchmark %s', spec.name) + events.before_phase.send(stages.CLEANUP, benchmark_spec=spec) + if (spec.always_call_cleanup or any([vm.is_static for vm in spec.vms]) or + spec.dpb_service is not None): + events.before_phase.send(stages.CLEANUP, benchmark_spec=spec) + spec.StopBackgroundWorkload() + with timer.Measure('Benchmark Cleanup'): + spec.BenchmarkCleanup(spec) + events.after_phase.send(stages.CLEANUP, benchmark_spec=spec) + + +def DoTeardownPhase(spec, collector, timer): + """Performs the Teardown phase of benchmark execution. + + Teardown phase work should be delegated to spec.Delete to allow non-PKB based + teardown if needed. + + Args: + spec: The BenchmarkSpec created for the benchmark. + collector: The SampleCollector object to add samples to + (if collecting delete samples) + timer: An IntervalTimer that measures the start and stop times of + resource teardown. + """ + logging.info('Tearing down resources for benchmark %s', spec.name) + events.before_phase.send(stages.TEARDOWN, benchmark_spec=spec) + # Add delete time metrics after metadeta collected + if _MEASURE_DELETE.value: + samples = cluster_boot_benchmark.MeasureDelete(spec.vms) + collector.AddSamples(samples, spec.name, spec) + + with timer.Measure('Resource Teardown'): + spec.Delete() + events.after_phase.send(stages.TEARDOWN, benchmark_spec=spec) + + +def _SkipPendingRunsFile(): + if FLAGS.skip_pending_runs_file and isfile(FLAGS.skip_pending_runs_file): + logging.warning('%s exists. Skipping benchmark.', + FLAGS.skip_pending_runs_file) + return True + else: + return False + +_SKIP_PENDING_RUNS_CHECKS = [] + + +def RegisterSkipPendingRunsCheck(func): + """Registers a function to skip pending runs. + + Args: + func: A function which returns True if pending runs should be skipped. 
+ """ + _SKIP_PENDING_RUNS_CHECKS.append(func) + + +@events.before_phase.connect +def _PublishStageStartedSamples( + sender: str, + benchmark_spec: bm_spec.BenchmarkSpec): + """Publish the start of each stage.""" + if sender == stages.PROVISION and _CREATE_STARTED_RUN_SAMPLE.value: + _PublishRunStartedSample(benchmark_spec) + if _CREATE_STARTED_STAGE_SAMPLES.value: + _PublishEventSample( + benchmark_spec, + f'{sender.capitalize()} Stage Started') + + +def _PublishRunStartedSample(spec): + """Publishes a sample indicating that a run has started. + + This sample is published immediately so that there exists some metric for any + run (even if the process dies). + + Args: + spec: The BenchmarkSpec object with run information. + """ + metadata = { + 'flags': str(flag_util.GetProvidedCommandLineFlags()) + } + _PublishEventSample(spec, 'Run Started', metadata) + + +def _PublishEventSample(spec: bm_spec.BenchmarkSpec, + event: str, + metadata: Optional[Dict[str, Any]] = None, + collector: Optional[publisher.SampleCollector] = None): + """Publishes a sample indicating the progress of the benchmark. + + Value of sample is time of event in unix seconds + + Args: + spec: The BenchmarkSpec object with run information. + event: The progress event to publish. + metadata: optional metadata to publish about the event. + collector: the SampleCollector to use. + """ + # N.B. SampleCollector seems stateless so re-using vs creating a new one seems + # to have no effect. + if not collector: + collector = publisher.SampleCollector() + collector.AddSamples( + [sample.Sample(event, time.time(), 'seconds', metadata or {})], + spec.name, spec) + collector.PublishSamples() + + +def RunBenchmark(spec, collector): + """Runs a single benchmark and adds the results to the collector. + + Args: + spec: The BenchmarkSpec object with run information. + collector: The SampleCollector object to add samples to. + """ + + # Since there are issues with the handling SIGINT/KeyboardInterrupt (see + # further discussion in _BackgroundProcessTaskManager) this mechanism is + # provided for defense in depth to force skip pending runs after SIGINT. + for f in _SKIP_PENDING_RUNS_CHECKS: + if f(): + logging.warning('Skipping benchmark.') + return + + # Optional display of additional benchmark usage information, such as + # listing available benchmark tests. + if FLAGS.get_benchmark_usage is True: + if hasattr(spec, 'BenchmarkGetUsage'): + spec.BenchmarkGetUsage(spec) + else: + logging.info('Benchmark does not have additional usage information') + return + + spec.status = benchmark_status.FAILED + current_run_stage = stages.PROVISION + # Modify the logger prompt for messages logged within this function. 
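+  # The label has the form '<name>(<sequence>/<total>)', which keeps log lines
+  # attributable when several benchmarks run in the same PKB invocation.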
+ label_extension = '{}({}/{})'.format( + spec.name, spec.sequence_number, spec.total_benchmarks) + context.SetThreadBenchmarkSpec(spec) + log_context = log_util.GetThreadLogContext() + with log_context.ExtendLabel(label_extension): + with spec.RedirectGlobalFlags(): + end_to_end_timer = timing_util.IntervalTimer() + detailed_timer = timing_util.IntervalTimer() + interrupt_checker = None + try: + with end_to_end_timer.Measure('End to End'): + + _SetRestoreSpec(spec) + _SetFreezePath(spec) + + if stages.PROVISION in FLAGS.run_stage: + DoProvisionPhase(spec, detailed_timer) + + if stages.PREPARE in FLAGS.run_stage: + current_run_stage = stages.PREPARE + interrupt_checker = InterruptChecker(spec.vms) + DoPreparePhase(spec, detailed_timer) + interrupt_checker.EndCheckInterruptThreadAndRaiseError() + interrupt_checker = None + + if stages.RUN in FLAGS.run_stage: + current_run_stage = stages.RUN + interrupt_checker = InterruptChecker(spec.vms) + DoRunPhase(spec, collector, detailed_timer) + interrupt_checker.EndCheckInterruptThreadAndRaiseError() + interrupt_checker = None + + if stages.CLEANUP in FLAGS.run_stage: + current_run_stage = stages.CLEANUP + interrupt_checker = InterruptChecker(spec.vms) + DoCleanupPhase(spec, detailed_timer) + interrupt_checker.EndCheckInterruptThreadAndRaiseError() + interrupt_checker = None + + if stages.TEARDOWN in FLAGS.run_stage: + current_run_stage = stages.TEARDOWN + DoTeardownPhase(spec, collector, detailed_timer) + + # Add timing samples. + if (FLAGS.run_stage == stages.STAGES and + timing_util.EndToEndRuntimeMeasurementEnabled()): + collector.AddSamples( + end_to_end_timer.GenerateSamples(), spec.name, spec) + events.samples_created.send( + stages.RUN, benchmark_spec=spec, samples=end_to_end_timer.GenerateSamples()) + if timing_util.RuntimeMeasurementsEnabled(): + collector.AddSamples( + detailed_timer.GenerateSamples(), spec.name, spec) + + # Add resource related samples. + collector.AddSamples(spec.GetSamples(), spec.name, spec) + + # except block will clean up benchmark specific resources on exception. It + # may also clean up generic resources based on + # FLAGS.always_teardown_on_exception. + except (Exception, KeyboardInterrupt) as e: + # Log specific type of failure, if known + # TODO(dlott) Move to exception chaining with Python3 support + if (isinstance(e, errors.Benchmarks.InsufficientCapacityCloudFailure) + or 'InsufficientCapacityCloudFailure' in str(e)): + spec.failed_substatus = ( + benchmark_status.FailedSubstatus.INSUFFICIENT_CAPACITY) + elif (isinstance(e, errors.Benchmarks.QuotaFailure) + or 'QuotaFailure' in str(e)): + spec.failed_substatus = benchmark_status.FailedSubstatus.QUOTA + elif isinstance(e, errors.Benchmarks.KnownIntermittentError): + spec.failed_substatus = ( + benchmark_status.FailedSubstatus.KNOWN_INTERMITTENT) + elif (isinstance(e, errors.Benchmarks.UnsupportedConfigError) or + 'UnsupportedConfigError' in str(e)): + spec.failed_substatus = benchmark_status.FailedSubstatus.UNSUPPORTED + elif isinstance(e, errors.Resource.RestoreError): + spec.failed_substatus = ( + benchmark_status.FailedSubstatus.RESTORE_FAILED) + elif isinstance(e, errors.Resource.FreezeError): + spec.failed_substatus = ( + benchmark_status.FailedSubstatus.FREEZE_FAILED) + else: + spec.failed_substatus = ( + benchmark_status.FailedSubstatus.UNCATEGORIZED) + spec.status_detail = str(e) + + # Resource cleanup (below) can take a long time. Log the error to give + # immediate feedback, then re-throw. 
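+      # The failed_substatus assigned above also feeds _ShouldRetry(), which
+      # decides whether RunBenchmarkTask schedules another attempt.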
+ logging.exception('Error during benchmark %s', spec.name) + if FLAGS.create_failed_run_samples: + PublishFailedRunSample(spec, str(e), current_run_stage, collector) + + # If the particular benchmark requests us to always call cleanup, do it + # here. + if stages.CLEANUP in FLAGS.run_stage and spec.always_call_cleanup: + DoCleanupPhase(spec, detailed_timer) + + if (FLAGS.always_teardown_on_exception and + stages.TEARDOWN not in FLAGS.run_stage): + # Note that if TEARDOWN is specified, it will happen below. + DoTeardownPhase(spec, collector, detailed_timer) + raise + # finally block will only clean up generic resources if teardown is + # included in FLAGS.run_stage. + finally: + if interrupt_checker: + interrupt_checker.EndCheckInterruptThread() + # Deleting resources should happen first so any errors with publishing + # don't prevent teardown. + if stages.TEARDOWN in FLAGS.run_stage: + spec.Delete() + if FLAGS.publish_after_run: + collector.PublishSamples() + events.benchmark_end.send(benchmark_spec=spec) + # Pickle spec to save final resource state. + spec.Pickle() + spec.status = benchmark_status.SUCCEEDED + + +def PublishFailedRunSample( + spec: bm_spec.BenchmarkSpec, + error_message: str, + run_stage_that_failed: str, + collector: publisher.SampleCollector): + """Publish a sample.Sample representing a failed run stage. + + The sample metric will have the name 'Run Failed'; + the value will be the timestamp in Unix Seconds, and the unit will be + 'seconds'. + + The sample metadata will include the error message from the + Exception, the run stage that failed, as well as all PKB + command line flags that were passed in. + + Args: + spec: benchmark_spec + error_message: error message that was caught, resulting in the + run stage failure. + run_stage_that_failed: run stage that failed by raising an Exception + collector: the collector to publish to. + """ + # Note: currently all provided PKB command line flags are included in the + # metadata. We may want to only include flags specific to the benchmark that + # failed. This can be acomplished using gflag's FlagsByModuleDict(). 
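+  # Illustrative shape of the published sample (values are examples only):
+  #   metric='Run Failed', value=<unix seconds>, unit='seconds',
+  #   metadata={'error_message': ..., 'run_stage': ..., 'flags': ...}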
+ metadata = { + 'error_message': error_message[0:FLAGS.failed_run_samples_error_length], + 'run_stage': run_stage_that_failed, + 'flags': str(flag_util.GetProvidedCommandLineFlags()) + } + vm_util.RunThreaded(lambda vm: vm.UpdateInterruptibleVmStatus(use_api=True), + spec.vms) + + interruptible_vm_count = 0 + interrupted_vm_count = 0 + vm_status_codes = [] + for vm in spec.vms: + if vm.IsInterruptible(): + interruptible_vm_count += 1 + if vm.WasInterrupted(): + interrupted_vm_count += 1 + spec.failed_substatus = ( + benchmark_status.FailedSubstatus.INTERRUPTED) + status_code = vm.GetVmStatusCode() + if status_code: + vm_status_codes.append(status_code) + + if spec.failed_substatus: + metadata['failed_substatus'] = spec.failed_substatus + + if interruptible_vm_count: + metadata.update({'interruptible_vms': interruptible_vm_count, + 'interrupted_vms': interrupted_vm_count, + 'vm_status_codes': vm_status_codes}) + if interrupted_vm_count: + logging.error( + '%d interruptible VMs were interrupted in this failed PKB run.', + interrupted_vm_count) + _PublishEventSample(spec, 'Run Failed', metadata, collector) + + +def _ShouldRetry(spec: bm_spec.BenchmarkSpec) -> bool: + """Returns whether the benchmark run should be retried.""" + return (spec.status == benchmark_status.FAILED and + spec.failed_substatus in _RETRY_SUBSTATUSES.value) + + +def RunBenchmarkTask( + spec: bm_spec.BenchmarkSpec +) -> Tuple[Sequence[bm_spec.BenchmarkSpec], List[sample.SampleDict]]: + """Task that executes RunBenchmark. + + This is designed to be used with RunParallelProcesses. Note that + for retries only the last run has its samples published. + + Arguments: + spec: BenchmarkSpec. The spec to call RunBenchmark with. + + Returns: + A BenchmarkSpec for each run iteration and a list of samples from the + last run. + """ + # Many providers name resources using run_uris. When running multiple + # benchmarks in parallel, this causes name collisions on resources. + # By modifying the run_uri, we avoid the collisions. + if FLAGS.run_processes and FLAGS.run_processes > 1: + spec.config.flags['run_uri'] = FLAGS.run_uri + str(spec.sequence_number) + # Unset run_uri so the config value takes precedence. + FLAGS['run_uri'].present = 0 + + # Start Intel Contribution + collector = intel_publisher.IntelSampleCollector() if intel_publisher else publisher.SampleCollector() + # End Intel Contribution + + zone_retry_manager = ZoneRetryManager() + # Set the run count. + max_run_count = 1 + _MAX_RETRIES.value + + # Useful format string for debugging. + benchmark_info = ( + f'{spec.sequence_number}/{spec.total_benchmarks} ' + f'{spec.name} (UID: {spec.uid})' + ) + + result_specs = [] + for current_run_count in range(max_run_count): + # Attempt to return the most recent results. + if _TEARDOWN_EVENT.is_set(): + if result_specs and collector: + return result_specs, collector.samples + return [spec], [] + + run_start_msg = ('\n' + '-' * 85 + '\n' + + 'Starting benchmark %s attempt %s of %s' + '\n' + '-' * 85) + logging.info(run_start_msg, benchmark_info, current_run_count + 1, + max_run_count) + + # Start Intel Contribution + collector = intel_publisher.IntelSampleCollector() if intel_publisher else publisher.SampleCollector() + # End Intel Contribution + + # Make a new copy of the benchmark_spec for each run since currently a + # benchmark spec isn't compatible with multiple runs. In particular, the + # benchmark_spec doesn't correctly allow for a provision of resources + # after tearing down. 
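+    # A fresh deep copy per attempt also means state mutated by a failed run
+    # is discarded before any retry.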
+ spec_for_run = copy.deepcopy(spec) + result_specs.append(spec_for_run) + try: + RunBenchmark(spec_for_run, collector) + if intel_publisher: + # Added by Intel + collector.IntelPublishSamples() + # End added by Intel + except BaseException as e: # pylint: disable=broad-except + logging.exception('Exception running benchmark') + msg = f'Benchmark {benchmark_info} failed.' + if isinstance(e, KeyboardInterrupt) or FLAGS.stop_after_benchmark_failure: + logging.error('%s Execution will not continue.', msg) + _TEARDOWN_EVENT.set() + break + + + # Don't retry on the last run. + if _ShouldRetry(spec_for_run) and current_run_count != max_run_count - 1: + logging.info( + 'Benchmark should be retried. Waiting %s seconds before running.', + _RETRY_DELAY_SECONDS.value) + time.sleep(_RETRY_DELAY_SECONDS.value) + + # Handle smart retries if specified. + zone_retry_manager.HandleSmartRetries(spec_for_run) + + else: + logging.info( + 'Benchmark should not be retried. ' + 'Finished %s runs of %s', current_run_count + 1, max_run_count) + break + + # We need to return both the spec and samples so that we know + # the status of the test and can publish any samples that + # haven't yet been published. + return result_specs, collector.samples + + +class ZoneRetryManager(): + """Encapsulates state and functions for zone retries. + + Attributes: + original_zone: If specified, the original zone provided to the benchmark. + zones_tried: Zones that have already been tried in previous runs. + """ + + def __init__(self): + self._CheckFlag() + if not _SMART_CAPACITY_RETRY.value and not _SMART_QUOTA_RETRY.value: + return + self._zones_tried: Set[str] = set() + self._regions_tried: Set[str] = set() + self._utils: types.ModuleType = providers.LoadProviderUtils(FLAGS.cloud) + self._SetOriginalZoneAndFlag() + + def _GetCurrentZoneFlag(self): + return FLAGS[self._zone_flag].value[0] + + def _CheckFlag(self) -> None: + for zone_flag in ['zone', 'zones']: + if FLAGS[zone_flag].value: + self._zone_flag = zone_flag + if self._GetCurrentZoneFlag() == _ANY_ZONE: + FLAGS['smart_capacity_retry'].parse(True) + FLAGS['smart_quota_retry'].parse(True) + + def _SetOriginalZoneAndFlag(self) -> None: + """Records the flag name and zone value that the benchmark started with.""" + # This is guaranteed to set values due to flag validator. + self._supported_zones = self._utils.GetZonesFromMachineType() + if self._GetCurrentZoneFlag() == _ANY_ZONE: + if _MAX_RETRIES.value < 1: + FLAGS['retries'].parse(len(self._supported_zones)) + self._AssignNewZone() + self._original_zone = self._GetCurrentZoneFlag() + self._original_region = self._utils.GetRegionFromZone(self._original_zone) + + def HandleSmartRetries(self, spec: bm_spec.BenchmarkSpec) -> None: + """Handles smart zone retry flags if provided.""" + if (_SMART_QUOTA_RETRY.value and spec.failed_substatus + == benchmark_status.FailedSubstatus.QUOTA): + self._AssignZoneToNewRegion() + elif (_SMART_CAPACITY_RETRY.value and spec.failed_substatus in { + benchmark_status.FailedSubstatus.UNSUPPORTED, + benchmark_status.FailedSubstatus.INSUFFICIENT_CAPACITY + }): + self._AssignNewZone() + + def _AssignZoneToNewRegion(self) -> None: + """Changes zone to be a new zone in the different region.""" + region = self._utils.GetRegionFromZone(self._GetCurrentZoneFlag()) + self._regions_tried.add(region) + regions_to_try = set( + self._utils.GetRegionFromZone(zone) + for zone in self._supported_zones) - self._regions_tried + # Restart from empty if we've exhausted all alternatives. 
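+    # (falling back to the region originally requested by the benchmark)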
+ if not regions_to_try: + self._regions_tried.clear() + new_region = self._original_region + else: + new_region = secrets.choice(tuple(regions_to_try)) + logging.info('Retry using new region %s', new_region) + self._ChooseAndSetNewZone(self._utils.GetZonesInRegion(new_region)) + + def _AssignNewZone(self) -> None: + """Changes zone to be a new zone.""" + self._ChooseAndSetNewZone(self._supported_zones) + + def _ChooseAndSetNewZone(self, possible_zones: Set[str]) -> None: + """Saves the current _zone_flag and sets it to a new zone. + + Args: + possible_zones: The set of zones to choose from. + """ + current_zone = self._GetCurrentZoneFlag() + if current_zone != _ANY_ZONE: + self._zones_tried.add(current_zone) + zones_to_try = possible_zones - self._zones_tried + # Restart from empty if we've exhausted all alternatives. + if not zones_to_try: + self._zones_tried.clear() + new_zone = self._original_zone + else: + new_zone = secrets.choice(tuple(zones_to_try)) + logging.info('Retry using new zone %s', new_zone) + FLAGS[self._zone_flag].unparse() + FLAGS[self._zone_flag].parse([new_zone]) + + +def _LogCommandLineFlags(): + result = [] + for name in FLAGS: + flag = FLAGS[name] + if flag.present: + result.append(flag.serialize()) + logging.info('Flag values:\n%s', '\n'.join(result)) + + +def SetUpPKB(): + """Set globals and environment variables for PKB. + + After SetUpPKB() returns, it should be possible to call PKB + functions, like benchmark_spec.Prepare() or benchmark_spec.Run(). + + SetUpPKB() also modifies the local file system by creating a temp + directory and storing new SSH keys. + """ + try: + _InitializeRunUri() + except errors.Error as e: + logging.error(e) + sys.exit(1) + + # Initialize logging. + vm_util.GenTempDir() + if FLAGS.use_pkb_logging: + log_util.ConfigureLogging( + stderr_log_level=log_util.LOG_LEVELS[FLAGS.log_level], + log_path=vm_util.PrependTempDir(LOG_FILE_NAME), + run_uri=FLAGS.run_uri, + file_log_level=log_util.LOG_LEVELS[FLAGS.file_log_level]) + logging.info('PerfKitBenchmarker version: %s', version.VERSION) + + # Translate deprecated flags and log all provided flag values. + disk.WarnAndTranslateDiskFlags() + _LogCommandLineFlags() + + # Register skip pending runs functionality. + RegisterSkipPendingRunsCheck(_SkipPendingRunsFile) + + # Check environment. + if not FLAGS.ignore_package_requirements: + requirements.CheckBasicRequirements() + + for executable in REQUIRED_EXECUTABLES: + if not vm_util.ExecutableOnPath(executable): + raise errors.Setup.MissingExecutableError( + 'Could not find required executable "%s"' % executable) + + # Check mutually exclusive flags + if FLAGS.run_stage_iterations > 1 and FLAGS.run_stage_time > 0: + raise errors.Setup.InvalidFlagConfigurationError( + 'Flags run_stage_iterations and run_stage_time are mutually exclusive') + + vm_util.SSHKeyGen() + vm_util.InstallRsync() + + if FLAGS.static_vm_file: + with open(FLAGS.static_vm_file) as fp: + static_virtual_machine.StaticVirtualMachine.ReadStaticVirtualMachineFile( + fp) + + events.initialization_complete.send(parsed_flags=FLAGS) + + benchmark_lookup.SetBenchmarkModuleFunction(benchmark_sets.BenchmarkModule) + package_lookup.SetPackageModuleFunction(benchmark_sets.PackageModule) + + # Update max_concurrent_threads to use at least as many threads as VMs. This + # is important for the cluster_boot benchmark where we want to launch the VMs + # in parallel. 
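+  # For example, when --max_concurrent_threads is left unset and --num_vms is
+  # larger than background_tasks.MAX_CONCURRENT_THREADS, the pool below grows
+  # to num_vms threads so no VM waits behind the default cap while booting.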
+ if not FLAGS.max_concurrent_threads: + FLAGS.max_concurrent_threads = max( + background_tasks.MAX_CONCURRENT_THREADS, + FLAGS.num_vms) + logging.info('Setting --max_concurrent_threads=%d.', + FLAGS.max_concurrent_threads) + + +def RunBenchmarkTasksInSeries(tasks): + """Runs benchmarks in series. + + Arguments: + tasks: list of tuples of task: [(RunBenchmarkTask, (spec,), {}),] + + Returns: + list of tuples of func results + """ + return [func(*args, **kwargs) for func, args, kwargs in tasks] + + +def RunBenchmarks(): + """Runs all benchmarks in PerfKitBenchmarker. + + Returns: + Exit status for the process. + """ + benchmark_specs = _CreateBenchmarkSpecs() + if FLAGS.randomize_run_order: + random.shuffle(benchmark_specs) + if FLAGS.dry_run: + print('PKB will run with the following configurations:') + for spec in benchmark_specs: + print(spec) + print('') + return 0 + + benchmark_spec_lists = None + collector = publisher.SampleCollector() + try: + tasks = [(RunBenchmarkTask, (spec,), {}) + for spec in benchmark_specs] + if FLAGS.run_processes is None: + spec_sample_tuples = RunBenchmarkTasksInSeries(tasks) + else: + spec_sample_tuples = background_tasks.RunParallelProcesses( + tasks, FLAGS.run_processes, FLAGS.run_processes_delay) + benchmark_spec_lists, sample_lists = list(zip(*spec_sample_tuples)) + for sample_list in sample_lists: + collector.samples.extend(sample_list) + + finally: + if collector.samples: + collector.PublishSamples() + # Use the last run in the series of runs. + if benchmark_spec_lists: + benchmark_specs = [spec_list[-1] for spec_list in benchmark_spec_lists] + if benchmark_specs: + logging.info(benchmark_status.CreateSummary(benchmark_specs)) + + logging.info('Complete logs can be found at: %s', + vm_util.PrependTempDir(LOG_FILE_NAME)) + logging.info('Completion statuses can be found at: %s', + vm_util.PrependTempDir(COMPLETION_STATUS_FILE_NAME)) + + if stages.TEARDOWN not in FLAGS.run_stage: + logging.info( + 'To run again with this setup, please use --run_uri=%s', FLAGS.run_uri) + + if FLAGS.archive_bucket: + archive.ArchiveRun(vm_util.GetTempDir(), FLAGS.archive_bucket, + gsutil_path=FLAGS.gsutil_path, + prefix=FLAGS.run_uri + '_') + + # Start Intel Contribution + if intel_publisher: + if intel_publisher.IntelSampleCollector.PublishEnabled() and FLAGS.intel_publisher_s3_archive_bucket_url: + patterns_to_ignore = [] + # If emon is enabled and user selects not to publish EDP data, add it to the patterns_to_ignore list + if not FLAGS.edp_publish: + patterns_to_ignore.append(EMON_EDP_TARBALL) + intel_publisher.ArchiveToS3(vm_util.GetTempDir(), FLAGS.run_uri + '.zip', patterns_to_ignore) + # End Intel Contribution + + # Write completion status file(s) + completion_status_file_name = ( + vm_util.PrependTempDir(COMPLETION_STATUS_FILE_NAME)) + with open(completion_status_file_name, 'w') as status_file: + _WriteCompletionStatusFile(benchmark_specs, status_file) + if FLAGS.completion_status_file: + with open(FLAGS.completion_status_file, 'w') as status_file: + _WriteCompletionStatusFile(benchmark_specs, status_file) + + all_benchmarks_succeeded = all(spec.status == benchmark_status.SUCCEEDED + for spec in benchmark_specs) + + return 0 if all_benchmarks_succeeded else 1 + + +def _GenerateBenchmarkDocumentation(): + """Generates benchmark documentation to show in --help.""" + benchmark_docs = [] + for benchmark_module in (linux_benchmarks.BENCHMARKS + + windows_benchmarks.BENCHMARKS): + benchmark_config = configs.LoadMinimalConfig( + benchmark_module.BENCHMARK_CONFIG, 
benchmark_module.BENCHMARK_NAME) + vm_groups = benchmark_config.get('vm_groups', {}) + total_vm_count = 0 + vm_str = '' + scratch_disk_str = '' + for group in six.itervalues(vm_groups): + group_vm_count = group.get('vm_count', 1) + if group_vm_count is None: + vm_str = 'variable' + else: + total_vm_count += group_vm_count + if group.get('disk_spec'): + scratch_disk_str = ' with scratch volume(s)' + + name = benchmark_module.BENCHMARK_NAME + if benchmark_module in windows_benchmarks.BENCHMARKS: + name += ' (Windows)' + benchmark_docs.append('%s: %s (%s VMs%s)' % + (name, + benchmark_config['description'], + vm_str or total_vm_count, + scratch_disk_str)) + return '\n\t'.join(benchmark_docs) + + +def _CreateLscpuSamples(vms): + """Creates samples from linux VMs of lscpu output.""" + samples = [] + for vm in vms: + if vm.OS_TYPE in os_types.LINUX_OS_TYPES: + metadata = {'node_name': vm.name} + metadata.update(vm.CheckLsCpu().data) + samples.append(sample.Sample('lscpu', 0, '', metadata)) + return samples + + +def _CreateProcCpuSamples(vms): + """Creates samples from linux VMs of lscpu output.""" + samples = [] + for vm in vms: + if vm.OS_TYPE not in os_types.LINUX_OS_TYPES: + continue + data = vm.CheckProcCpu() + metadata = {'node_name': vm.name} + metadata.update(data.GetValues()) + samples.append(sample.Sample('proccpu', 0, '', metadata)) + metadata = {'node_name': vm.name} + for processor_id, raw_values in data.mappings.items(): + values = ['%s=%s' % item for item in raw_values.items()] + metadata['proc_{}'.format(processor_id)] = ';'.join(sorted(values)) + samples.append(sample.Sample('proccpu_mapping', 0, '', metadata)) + return samples + + +def _CreateCpuVulnerabilitySamples(vms) -> List[sample.Sample]: + """Returns samples of the VMs' CPU vulernabilites.""" + + def CreateSample(vm) -> Optional[sample.Sample]: + metadata = {'vm_name': vm.name} + metadata.update(vm.cpu_vulnerabilities.asdict) + return sample.Sample('cpu_vuln', 0, '', metadata) + + linux_vms = [vm for vm in vms if vm.OS_TYPE in os_types.LINUX_OS_TYPES] + return vm_util.RunThreaded(CreateSample, linux_vms) + + +def _CreateGccSamples(vms): + """Creates samples from linux VMs of gcc version output.""" + + def _GetGccMetadata(vm): + return { + 'name': vm.name, + 'versiondump': build_tools.GetVersion(vm, 'gcc'), + 'versioninfo': build_tools.GetVersionInfo(vm, 'gcc') + } + + return [ + sample.Sample('gcc_version', 0, '', metadata) + for metadata in vm_util.RunThreaded(_GetGccMetadata, vms) + ] + + +def _CreateGlibcSamples(vms): + """Creates glibc samples from linux VMs of ldd output.""" + + def _GetGlibcVersionInfo(vm): + out, _ = vm.RemoteCommand('ldd --version', ignore_failure=True) + # return first line + return out.splitlines()[0] if out else None + + def _GetGlibcMetadata(vm): + return { + 'name': vm.name, + # TODO(user): Add glibc versiondump. + 'versioninfo': _GetGlibcVersionInfo(vm) + } + + return [ + sample.Sample('glibc_version', 0, '', metadata) + for metadata in vm_util.RunThreaded(_GetGlibcMetadata, vms) + ] + + +def _ParseMeminfo(meminfo_txt: str) -> Tuple[Dict[str, int], List[str]]: + """Returns the parsed /proc/meminfo data. + + Response has entries such as {'MemTotal' : 32887056, 'Inactive': 4576524}. If + the /proc/meminfo entry has two values such as + MemTotal: 32887056 kB + checks that the last value is 'kB' If it is not then adds that line to the + 2nd value in the tuple. 
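+
+  For example, "MemTotal:  32887056 kB" parses to {'MemTotal': 32887056}, and
+  a single-value line such as "HugePages_Total:  0" parses to
+  {'HugePages_Total': 0}; a line whose trailing unit is anything other than
+  'kB' is reported in the malformed list instead.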
+ + Args: + meminfo_txt: contents of /proc/meminfo + + Returns: + Tuple where the first entry is a dict of the parsed keys and the second + are unparsed lines. + """ + data: Dict[str, int] = {} + malformed: List[str] = [] + for line in meminfo_txt.splitlines(): + try: + key, full_value = re.split(r':\s+', line) + parts = full_value.split() + if len(parts) == 1 or (len(parts) == 2 and parts[1] == 'kB'): + data[key] = int(parts[0]) + else: + malformed.append(line) + except ValueError: + # If the line does not match "key: value" or if the value is not an int + malformed.append(line) + return data, malformed + + +@events.samples_created.connect +def _CollectMeminfoHandler(sender: str, benchmark_spec: bm_spec.BenchmarkSpec, + samples: List[sample.Sample]) -> None: + """Optionally creates /proc/meminfo samples. + + If the flag --collect_meminfo is set appends a sample.Sample of /proc/meminfo + data for every VM in the run. + + Parameter names cannot be changed as the method is called by events.send with + keyword arguments. + + Args: + sender: Unused sender. + benchmark_spec: The benchmark spec. + samples: Generated samples that can be appended to. + """ + del sender # Unused as appending to samples with VMs from benchmark_spec + if not _COLLECT_MEMINFO.value: + return + + def CollectMeminfo(vm): + txt, _ = vm.RemoteCommand('cat /proc/meminfo') + meminfo, malformed = _ParseMeminfo(txt) + meminfo.update({ + 'meminfo_keys': ','.join(sorted(meminfo)), + 'meminfo_vmname': vm.name, + 'meminfo_machine_type': vm.machine_type, + 'meminfo_os_type': vm.OS_TYPE, + }) + if malformed: + meminfo['meminfo_malformed'] = ','.join(sorted(malformed)) + return sample.Sample('meminfo', 0, '', meminfo) + + linux_vms = [ + vm for vm in benchmark_spec.vms if vm.OS_TYPE in os_types.LINUX_OS_TYPES + ] + + samples.extend(vm_util.RunThreaded(CollectMeminfo, linux_vms)) + + +def Main(): + log_util.ConfigureBasicLogging() + _InjectBenchmarkInfoIntoDocumentation() + _ParseFlags() + if FLAGS.helpmatch: + _PrintHelp(FLAGS.helpmatch) + return 0 + if FLAGS.helpmatchmd: + _PrintHelpMD(FLAGS.helpmatchmd) + return 0 + CheckVersionFlag() + SetUpPKB() + return RunBenchmarks() diff --git a/script/cumulus/pkb/perfkitbenchmarker/placement_group.py b/script/cumulus/pkb/perfkitbenchmarker/placement_group.py new file mode 100644 index 0000000..81257ce --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/placement_group.py @@ -0,0 +1,121 @@ +# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Class to represent Placement Group Object. + +Top-level Placement Group implementation. +Cloud specific implementations of Placement Group needed. 
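+
+Callers resolve the concrete classes at runtime via
+GetPlacementGroupSpecClass(cloud) and GetPlacementGroupClass(cloud); a
+cloud-specific subclass makes itself discoverable by setting its CLOUD
+attribute (for example CLOUD = 'AWS').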
+""" + + +from absl import flags +from perfkitbenchmarker import resource +from perfkitbenchmarker.configs import option_decoders +from perfkitbenchmarker.configs import spec + +FLAGS = flags.FLAGS + +PLACEMENT_GROUP_CLUSTER = 'cluster' +PLACEMENT_GROUP_SUPERCLUSTER = 'supercluster' +PLACEMENT_GROUP_SPREAD = 'spread' +PLACEMENT_GROUP_NONE = 'none' +PLACEMENT_GROUP_OPTIONS = frozenset([ + PLACEMENT_GROUP_CLUSTER, + PLACEMENT_GROUP_SPREAD, + PLACEMENT_GROUP_NONE +]) + +# Default placement group style is specified by Cloud Specific Placement Group. +flags.DEFINE_enum( + 'placement_group_style', None, + list(PLACEMENT_GROUP_OPTIONS) + [PLACEMENT_GROUP_SUPERCLUSTER], + 'The vm placement group option to use. Default set by cloud.') + + +def GetPlacementGroupSpecClass(cloud): + """Returns the PlacementGroupSpec class corresponding to 'cloud'.""" + return spec.GetSpecClass(BasePlacementGroupSpec, CLOUD=cloud) + + +def GetPlacementGroupClass(cloud): + """Returns the PlacementGroup class corresponding to 'cloud'.""" + return resource.GetResourceClass(BasePlacementGroup, + CLOUD=cloud) + + +class BasePlacementGroupSpec(spec.BaseSpec): + """Storing various data about a placement group. + + Attributes: + zone: The zone the in which the placement group will launch. + """ + + SPEC_TYPE = 'BasePlacementGroupSpec' + CLOUD = None + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Modifies config options based on runtime flag values. + + Can be overridden by derived classes to add support for specific flags. + + Args: + config_values: dict mapping config option names to provided values. May + be modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. + """ + super(BasePlacementGroupSpec, cls)._ApplyFlags(config_values, flag_values) + if FLAGS.placement_group_style: + config_values['placement_group_style'] = FLAGS.placement_group_style + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Can be overridden by derived classes to add options or impose additional + requirements on existing options. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword + arguments to construct in order to decode the named option. + """ + result = super(BasePlacementGroupSpec, cls)._GetOptionDecoderConstructions() + result.update({'zone': (option_decoders.StringDecoder, {'none_ok': True})}) + return result + + +class BasePlacementGroup(resource.BaseResource): + """Base class for Placement Groups. + + This class holds Placement Group methods and attributes relating to the + Placement Groups as a cloud + resource. + + Attributes: + zone: The zone the Placement Group was launched in. + """ + + RESOURCE_TYPE = 'BasePlacementGroup' + + def __init__(self, placement_group_spec): + """Initialize BasePlacementGroup class. + + Args: + placement_group_spec: placement_group.BasePlacementGroupSpec object of the + placement group. + """ + super(BasePlacementGroup, self).__init__() + self.zone = placement_group_spec.zone diff --git a/script/cumulus/pkb/perfkitbenchmarker/provider_info.py b/script/cumulus/pkb/perfkitbenchmarker/provider_info.py new file mode 100644 index 0000000..44cd972 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/provider_info.py @@ -0,0 +1,54 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module containing class for provider data. + +This contains the BaseProviderInfo class which is +used for IsBenchmarkSupported + +""" + +import six + + +_PROVIDER_INFO_REGISTRY = {} + + +def GetProviderInfoClass(cloud): + """Returns the provider info class corresponding to the cloud.""" + return _PROVIDER_INFO_REGISTRY.get(cloud, BaseProviderInfo) + + +class AutoRegisterProviderInfoMeta(type): + """Metaclass which allows ProviderInfos to automatically be registered.""" + + def __init__(cls, name, bases, dct): + super(AutoRegisterProviderInfoMeta, cls).__init__(name, bases, dct) + if hasattr(cls, 'CLOUD') and cls.CLOUD is not None: + _PROVIDER_INFO_REGISTRY[cls.CLOUD] = cls + + +class BaseProviderInfo(six.with_metaclass(AutoRegisterProviderInfoMeta)): + """Class that holds provider-related data.""" + + CLOUD = None + + UNSUPPORTED_BENCHMARKS = [] + + @classmethod + def IsBenchmarkSupported(cls, benchmark): + if benchmark in cls.UNSUPPORTED_BENCHMARKS: + return False + else: + return True diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/__init__.py b/script/cumulus/pkb/perfkitbenchmarker/providers/__init__.py new file mode 100644 index 0000000..58d0bf8 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/__init__.py @@ -0,0 +1,119 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Providers for PKB.""" +import importlib +import logging +import os +import types + +from perfkitbenchmarker import events +from perfkitbenchmarker import import_util +from perfkitbenchmarker import requirements +from perfkitbenchmarker.providers import aws +from perfkitbenchmarker.providers import azure +try: + from perfkitbenchmarker.providers import ibmcloud +except: + pass + +GCP = 'GCP' +AZURE = 'Azure' +AWS = 'AWS' +IBMCLOUD = 'IBMCloud' +ALICLOUD = 'AliCloud' +KUBERNETES = 'Kubernetes' +DIGITALOCEAN = 'DigitalOcean' +OPENSTACK = 'OpenStack' +CLOUDSTACK = 'CloudStack' +RACKSPACE = 'Rackspace' +MESOS = 'Mesos' +PROFITBRICKS = 'ProfitBricks' +TENCENT = 'Tencent' +DOCKER = 'Docker' +YANDEX = "Yandex" + +VALID_CLOUDS = (GCP, AZURE, AWS, ALICLOUD, TENCENT) + +_imported_providers = set() + + +def _GetProviderPackageName(cloud: str) -> str: + """Gets the name of the provider package that corresponds to the cloud.""" + return cloud.lower() + + +def LoadProviderFlags(providers): + """Imports just the flags module for each provider. 
+ + This allows PKB to load flag definitions from each provider to include in the + help text without actually loading any other provider-specific modules. + + Args: + providers: series of strings. Each element is a value from VALID_CLOUDS + indicating a cloud provider for which to import the flags module. + """ + for provider_name in providers: + normalized_name = _GetProviderPackageName(provider_name) + flags_module_name = '.'.join((__name__, normalized_name, 'flags')) + importlib.import_module(flags_module_name) + + +# Import flag definitions for all cloud providers. +LoadProviderFlags(VALID_CLOUDS) + + +def LoadProviderUtils(cloud: str) -> types.ModuleType: + util_module_name = '.'.join( + (__name__, _GetProviderPackageName(cloud), 'util')) + return importlib.import_module(util_module_name) + + +def LoadProvider(provider_name, ignore_package_requirements=True): + """Loads the all modules in the 'provider_name' package. + + This function first checks the specified provider's Python package + requirements file, if one exists, and verifies that all requirements are met. + Next, it loads all modules in the specified provider's package. By loading + these modules, relevant classes (e.g. VMs) will register themselves. + + Args: + provider_name: string chosen from VALID_CLOUDS. The name of the provider + whose modules should be loaded. + ignore_package_requirements: boolean. If True, the provider's Python package + requirements file is ignored. + """ + if provider_name in _imported_providers: + return + + # Check package requirements from the provider's pip requirements file. + normalized_name = _GetProviderPackageName(provider_name) + if not ignore_package_requirements: + requirements.CheckProviderRequirements(normalized_name) + + # Load all modules in the provider's directory. Simply loading those modules + # will cause relevant classes (e.g. VM and disk classes) to register + # themselves so that they can be instantiated during resource provisioning. + provider_package_path = os.path.join(__path__[0], normalized_name) + try: + modules = tuple(import_util.LoadModulesForPath( + [provider_package_path], __name__ + '.' + normalized_name)) + if not modules: + raise ImportError('No modules found for provider %s.' % provider_name) + except Exception: + logging.error('Unable to load provider %s.', provider_name) + raise + + # Signal that the provider's modules have been imported. + _imported_providers.add(provider_name) + events.provider_imported.send(provider_name) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/__init__.py b/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/__init__.py new file mode 100644 index 0000000..d90275b --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/ali_disk.py b/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/ali_disk.py new file mode 100644 index 0000000..692d7af --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/ali_disk.py @@ -0,0 +1,145 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module containing classes related to AliCloud disks. +""" + +import json +import logging +import string +import threading +from absl import flags +from perfkitbenchmarker import disk +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.alicloud import util + +FLAGS = flags.FLAGS + +DISK_TYPE = { + disk.STANDARD: 'cloud', + disk.REMOTE_SSD: 'cloud_ssd', + disk.PIOPS: 'cloud_efficiency', + disk.LOCAL: 'ephemeral_ssd', + disk.REMOTE_ESSD: 'cloud_essd', +} + + +class AliDisk(disk.BaseDisk): + """Object representing an AliCloud Disk.""" + + _lock = threading.Lock() + vm_devices = {} + + def __init__(self, disk_spec, zone): + super(AliDisk, self).__init__(disk_spec) + self.id = None + self.zone = zone + self.region = util.GetRegionByZone(self.zone) + self.attached_vm_id = None + + def _Create(self): + """Creates the disk.""" + create_cmd = util.ALI_PREFIX + [ + 'ecs', + 'CreateDisk', + '--RegionId %s' % self.region, + '--ZoneId %s' % self.zone, + '--Size %s' % self.disk_size, + '--DiskCategory %s' % DISK_TYPE[self.disk_type]] + if self.disk_type == disk.REMOTE_ESSD: + create_cmd.append('--PerformanceLevel %s' % FLAGS.ali_essd_performance_level) + + if FLAGS.ali_resource_group_id: + create_cmd.append('--ResourceGroupId %s' % FLAGS.ali_resource_group_id) + + create_cmd = util.GetEncodedCmd(create_cmd) + stdout, _, _ = vm_util.IssueCommand(create_cmd, raise_on_failure=False) + response = json.loads(stdout) + self.id = response['DiskId'] + + def _Delete(self): + """Deletes the disk.""" + delete_cmd = util.ALI_PREFIX + [ + 'ecs', + 'DeleteDisk', + '--DiskId %s' % self.id] + logging.info('Deleting AliCloud disk %s. This may fail if the disk is not ' + 'yet detached, but will be retried.', self.id) + delete_cmd = util.GetEncodedCmd(delete_cmd) + vm_util.IssueRetryableCommand(delete_cmd) + + def Attach(self, vm): + """Attaches the disk to a VM. + Args: + vm: The AliVirtualMachine instance to which the disk will be attached. 
+ """ + with self._lock: + self.attached_vm_id = vm.id + if self.attached_vm_id not in AliDisk.vm_devices: + AliDisk.vm_devices[self.attached_vm_id] = set( + string.ascii_lowercase[1:]) + self.device_letter = min(AliDisk.vm_devices[self.attached_vm_id]) + AliDisk.vm_devices[self.attached_vm_id].remove(self.device_letter) + + attach_cmd = util.ALI_PREFIX + [ + 'ecs', + 'AttachDisk', + '--InstanceId %s' % self.attached_vm_id, + '--DiskId %s' % self.id, + '--Device %s' % self.GetVirtualDevicePath()] + attach_cmd = util.GetEncodedCmd(attach_cmd) + vm_util.IssueRetryableCommand(attach_cmd) + + def Detach(self): + """Detaches the disk from a VM.""" + detach_cmd = util.ALI_PREFIX + [ + 'ecs', + 'DetachDisk', + '--InstanceId %s' % self.attached_vm_id, + '--DiskId %s' % self.id] + detach_cmd = util.GetEncodedCmd(detach_cmd) + vm_util.IssueRetryableCommand(detach_cmd) + + with self._lock: + assert self.attached_vm_id in AliDisk.vm_devices + AliDisk.vm_devices[self.attached_vm_id].add(self.device_letter) + self.attached_vm_id = None + self.device_letter = None + + def GetDevicePath(self): + """Returns the path to the device inside the VM.""" + return '/dev/vd%s' % self.device_letter + + def GetVirtualDevicePath(self): + """Returns the path to the device visible to console users.""" + return '/dev/xvd%s' % self.device_letter + + @vm_util.Retry(poll_interval=5, max_retries=30, log_errors=False) + def WaitForDiskStatus(self, status_list): + """Waits until disk is attach to the instance""" + logging.info('Waits until the disk\'s status is one of statuses: %s', + status_list) + describe_cmd = util.ALI_PREFIX + [ + 'ecs', + 'DescribeDisks', + '--RegionId %s' % self.region, + '--ZoneId %s' % self.zone, + '--DiskIds \'["%s"]\'' % self.id] + attach_cmd = util.GetEncodedCmd(describe_cmd) + stdout, _ = vm_util.IssueRetryableCommand(attach_cmd) + response = json.loads(stdout) + disk = response['Disks']['Disk'] + assert len(disk) == 1 + status = disk[0]['Status'] + assert status in status_list diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/ali_network.py b/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/ali_network.py new file mode 100644 index 0000000..79a9db3 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/ali_network.py @@ -0,0 +1,337 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing classes related to AliCloud VM networking. + +The Firewall class provides a way of opening VM ports. The Network class allows +VMs to communicate via internal ips and isolates PerfKitBenchmarker VMs from +others in the +same project. See https://developers.google.com/compute/docs/networking for +more information about AliCloud VM networking. 
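+
+In outline, when --ali_use_vpc is true (the default) AliNetwork provisions an
+AliVpc, waits for it to become Available, then creates an AliVSwitch inside it
+and a VPC-scoped AliSecurityGroup; with --ali_use_vpc=false only a
+classic-network security group is created.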
+""" + +import json +import logging +import threading +import uuid + +from absl import flags +from perfkitbenchmarker import network +from perfkitbenchmarker import providers +from perfkitbenchmarker import resource +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.alicloud import util +from six.moves import range + +FLAGS = flags.FLAGS +MAX_NAME_LENGTH = 128 + + +class AliVpc(resource.BaseResource): + """An object representing an AliCloud VPC.""" + + def __init__(self, name, region): + super(AliVpc, self).__init__() + self.region = region + self.id = None + self.name = name + + def _Create(self): + """Creates the VPC.""" + create_cmd = util.ALI_PREFIX + [ + 'ecs', + 'CreateVpc', + '--VpcName %s' % self.name, + '--RegionId %s' % self.region, + '--CidrBlock 10.0.0.0/16'] + create_cmd = util.GetEncodedCmd(create_cmd) + stdout, _, _ = vm_util.IssueCommand(create_cmd, raise_on_failure=False) + response = json.loads(stdout) + self.id = response['VpcId'] + + def _Exists(self): + """Returns true if the VPC exists.""" + describe_cmd = util.ALI_PREFIX + [ + 'ecs', + 'DescribeVpcs', + '--RegionId %s' % self.region, + '--VpcId %s' % self.id] + describe_cmd = util.GetEncodedCmd(describe_cmd) + stdout, _ = vm_util.IssueRetryableCommand(describe_cmd) + response = json.loads(stdout) + vpcs = response['Vpcs']['Vpc'] + assert len(vpcs) < 2, 'Too many VPCs.' + return len(vpcs) > 0 + + @vm_util.Retry(poll_interval=5, max_retries=30, log_errors=False) + def _WaitForVpcStatus(self, status_list): + """Waits until VPC's status is in status_list""" + logging.info('Waits until the status of VPC is in status_list: %s', + status_list) + describe_cmd = util.ALI_PREFIX + [ + 'ecs', + 'DescribeVpcs', + '--RegionId %s' % self.region, + '--VpcId %s' % self.id] + describe_cmd = util.GetEncodedCmd(describe_cmd) + stdout, _ = vm_util.IssueRetryableCommand(describe_cmd) + response = json.loads(stdout) + vpcs = response['Vpcs']['Vpc'] + assert len(vpcs) == 1 + vpc_status = response['Vpcs']['Vpc'][0]['Status'] + assert vpc_status in status_list + + def _Delete(self): + """Delete's the VPC.""" + delete_cmd = util.ALI_PREFIX + [ + 'ecs', + 'DeleteVpc', + '--RegionId %s' % self.region, + '--VpcId %s' % self.id] + delete_cmd = util.GetEncodedCmd(delete_cmd) + vm_util.IssueCommand(delete_cmd, raise_on_failure=False) + + +class AliVSwitch(resource.BaseResource): + """An object representing an AliCloud VSwitch.""" + + def __init__(self, name, zone, vpc_id): + super(AliVSwitch, self).__init__() + self.region = util.GetRegionByZone(zone) + self.id = None + self.vpc_id = vpc_id + self.zone = zone + self.name = name + + def _Create(self): + """Creates the VSwitch.""" + create_cmd = util.ALI_PREFIX + [ + 'ecs', + 'CreateVSwitch', + '--VSwitchName %s' % self.name, + '--ZoneId %s' % self.zone, + '--RegionId %s' % self.region, + '--CidrBlock 10.0.0.0/24', + '--VpcId %s' % self.vpc_id, + ] + create_cmd = util.GetEncodedCmd(create_cmd) + stdout, _, _ = vm_util.IssueCommand(create_cmd, raise_on_failure=False) + response = json.loads(stdout) + self.id = response['VSwitchId'] + + def _Delete(self): + """Deletes the VSwitch.""" + delete_cmd = util.ALI_PREFIX + [ + 'ecs', + 'DeleteVSwitch', + '--RegionId %s' % self.region, + '--VSwitchId %s' % self.id] + delete_cmd = util.GetEncodedCmd(delete_cmd) + vm_util.IssueCommand(delete_cmd, raise_on_failure=False) + + def _Exists(self): + """Returns true if the VSwitch exists.""" + describe_cmd = util.ALI_PREFIX + [ + 'ecs', + 'DescribeVSwitches', + '--RegionId %s' % self.region, + 
'--VpcId %s' % self.vpc_id, + '--ZoneId %s' % self.zone] + describe_cmd = util.GetEncodedCmd(describe_cmd) + stdout, _ = vm_util.IssueRetryableCommand(describe_cmd) + response = json.loads(stdout) + vswitches = response['VSwitches']['VSwitch'] + assert len(vswitches) < 2, 'Too many VSwitches.' + return len(vswitches) > 0 + + +class AliSecurityGroup(resource.BaseResource): + """Object representing an AliCloud Security Group.""" + + def __init__(self, name, region, use_vpc=True, vpc_id=None): + super(AliSecurityGroup, self).__init__() + self.name = name + self.region = region + self.use_vpc = use_vpc + self.vpc_id = vpc_id + + def _Create(self): + """Creates the security group.""" + create_cmd = util.ALI_PREFIX + [ + 'ecs', + 'CreateSecurityGroup', + '--SecurityGroupName %s' % self.name, + '--RegionId %s' % self.region] + if self.use_vpc: + create_cmd.append('--VpcId %s' % self.vpc_id) + create_cmd = util.GetEncodedCmd(create_cmd) + stdout, _ = vm_util.IssueRetryableCommand(create_cmd) + self.group_id = json.loads(stdout)['SecurityGroupId'] + + def _Delete(self): + """Deletes the security group.""" + delete_cmd = util.ALI_PREFIX + [ + 'ecs', + 'DeleteSecurityGroup', + '--RegionId %s' % self.region, + '--SecurityGroupId %s' % self.group_id] + delete_cmd = util.GetEncodedCmd(delete_cmd) + vm_util.IssueRetryableCommand(delete_cmd) + + def _Exists(self): + """Returns true if the security group exists.""" + show_cmd = util.ALI_PREFIX + [ + 'ecs', + 'DescribeSecurityGroups', + '--RegionId %s' % self.region, + '--SecurityGroupId %s' % self.group_id] + show_cmd = util.GetEncodedCmd(show_cmd) + stdout, _ = vm_util.IssueRetryableCommand(show_cmd) + response = json.loads(stdout) + securityGroups = response['SecurityGroups']['SecurityGroup'] + assert len(securityGroups) < 2, 'Too many securityGroups.' + if not securityGroups: + return False + return True + + +class AliFirewall(network.BaseFirewall): + """An object representing the AliCloud Firewall.""" + + CLOUD = providers.ALICLOUD + + def __init__(self): + self.firewall_set = set() + self._lock = threading.Lock() + + def AllowIcmp(self, vm): + """Opens the ICMP protocol on the firewall. + + Args: + vm: The BaseVirtualMachine object to open the ICMP protocol for. + """ + if vm.is_static: + return + with self._lock: + authorize_cmd = util.ALI_PREFIX + [ + 'ecs', + 'AuthorizeSecurityGroup', + '--IpProtocol ICMP', + '--PortRange -1/-1', + '--SourceCidrIp 0.0.0.0/0', + '--RegionId %s' % vm.region, + '--SecurityGroupId %s' % vm.group_id] + if FLAGS.ali_use_vpc: + authorize_cmd.append('--NicType intranet') + authorize_cmd = util.GetEncodedCmd(authorize_cmd) + vm_util.IssueRetryableCommand(authorize_cmd) + + def AllowPort(self, vm, start_port, end_port=None, source_range=None): + """Opens a port on the firewall. + + Args: + vm: The BaseVirtualMachine object to open the port for. + start_port: The first local port in a range of ports to open. + end_port: The last port in a range of ports to open. If None, only + start_port will be opened. + source_range: unsupported at present. + """ + + if not end_port: + end_port = start_port + + for port in range(start_port, end_port + 1): + self._AllowPort(vm, port) + + def _AllowPort(self, vm, port): + """Opens a port on the firewall. + + Args: + vm: The BaseVirtualMachine object to open the port for. + port: The local port to open. 
+ """ + if vm.is_static: + return + entry = (port, vm.group_id) + if entry in self.firewall_set: + return + with self._lock: + if entry in self.firewall_set: + return + for protocol in ('tcp', 'udp'): + authorize_cmd = util.ALI_PREFIX + [ + 'ecs', + 'AuthorizeSecurityGroup', + '--IpProtocol %s' % protocol, + '--PortRange %s/%s' % (port, port), + '--SourceCidrIp 0.0.0.0/0', + '--RegionId %s' % vm.region, + '--SecurityGroupId %s' % vm.group_id] + if FLAGS.ali_use_vpc: + authorize_cmd.append('--NicType intranet') + authorize_cmd = util.GetEncodedCmd(authorize_cmd) + vm_util.IssueRetryableCommand(authorize_cmd) + self.firewall_set.add(entry) + + def DisallowAllPorts(self): + """Closes all ports on the firewall.""" + pass + + +class AliNetwork(network.BaseNetwork): + """Object representing a AliCloud Network.""" + + CLOUD = providers.ALICLOUD + + def __init__(self, spec): + super(AliNetwork, self).__init__(spec) + self.name = ( + 'perfkit-%s-%s' % (FLAGS.run_uri, str(uuid.uuid4())[-12:])) + self.region = util.GetRegionByZone(spec.zone) + self.use_vpc = FLAGS.ali_use_vpc + if self.use_vpc: + self.vpc = AliVpc(self.name, self.region) + self.vswitch = None + self.security_group = None + else: + self.security_group = \ + AliSecurityGroup(self.name, self.region, use_vpc=False) + + @vm_util.Retry() + def Create(self): + """Creates the network.""" + if self.use_vpc: + self.vpc.Create() + self.vpc._WaitForVpcStatus(['Available']) + if self.vswitch is None: + self.vswitch = AliVSwitch(self.name, self.zone, self.vpc.id) + self.vswitch.Create() + + if self.security_group is None: + self.security_group = AliSecurityGroup(self.name, + self.region, + use_vpc=True, + vpc_id=self.vpc.id) + self.security_group.Create() + else: + self.security_group.Create() + + def Delete(self): + """Deletes the network.""" + if self.use_vpc: + self.security_group.Delete() + self.vswitch.Delete() + self.security_group.Delete() + self.vpc.Delete() + else: + self.security_group.Delete() diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/ali_virtual_machine.py b/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/ali_virtual_machine.py new file mode 100644 index 0000000..e7120ef --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/ali_virtual_machine.py @@ -0,0 +1,458 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Class to represent an Ali Virtual Machine object. +All VM specifics are self-contained and the class provides methods to +operate on the VM: boot, shutdown, etc. 
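+
+The lifecycle PKB drives is roughly: _CreateDependencies (import the SSH key
+pair), _Create (CreateInstance, allocate a public or EIP address,
+StartInstance), _PostCreate (wait for Running, open the SSH port, tag the
+instance), then _Delete (StopInstance, DeleteInstance, release the EIP when
+VPC is used) and _DeleteDependencies on teardown.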
+""" + +import base64 +import json +import logging +import threading +from absl import flags +from perfkitbenchmarker import disk +from perfkitbenchmarker import linux_virtual_machine +from perfkitbenchmarker import providers +from perfkitbenchmarker import virtual_machine +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.alicloud import ali_disk +from perfkitbenchmarker.providers.alicloud import ali_network +from perfkitbenchmarker.providers.alicloud import util +import six + +FLAGS = flags.FLAGS +NON_HVM_PREFIXES = ['t1', 's1', 's2', 's3', 'm1'] + +DRIVE_START_LETTER = 'b' +DEFAULT_DISK_SIZE = 500 +INSTANCE = 'instance' +IMAGE = 'image' +SNAPSHOT = 'snapshot' +DISK = 'disk' +NONE = 'none' +IO_OPTIMIZED = 'io_optimized' +RESOURCE_TYPE = { + INSTANCE: 'instance', + IMAGE: 'image', + SNAPSHOT: 'snapshot', + DISK: 'disk', +} +SSH_PORT = 22 + + +NUM_LOCAL_VOLUMES = { + 'ecs.t1.small': 4, + 'ecs.s1.small': 4, + 'ecs.s1.medium': 4, + 'ecs.s2.small': 4, + 'ecs.s2.large': 4, + 'ecs.s2.xlarge': 4, + 'ecs.s3.medium': 4, + 'ecs.s3.large': 4, + 'ecs.m1.medium': 4, +} +INSTANCE_EXISTS_STATUSES = frozenset( + ['Starting', 'Running', 'Stopping', 'Stopped']) +INSTANCE_DELETED_STATUSES = frozenset([]) +INSTANCE_KNOWN_STATUSES = INSTANCE_EXISTS_STATUSES | INSTANCE_DELETED_STATUSES + + +class AliVirtualMachine(virtual_machine.BaseVirtualMachine): + """Object representing an AliCloud Virtual Machine.""" + + CLOUD = providers.ALICLOUD + DEFAULT_ZONE = 'cn-hangzhou-d' + DEFAULT_MACHINE_TYPE = 'ecs.s3.large' + + _lock = threading.Lock() + imported_keyfile_set = set() + deleted_keyfile_set = set() + + def __init__(self, vm_spec): + """Initialize a AliCloud virtual machine. + Args: + vm_spec: virtual_machine.BaseVirtualMachineSpec object of the VM. 
+ """ + super(AliVirtualMachine, self).__init__(vm_spec) + self.image = FLAGS.image + self.user_name = FLAGS.ali_user_name + self.key_pair_name = None + self.region = util.GetRegionByZone(self.zone) + self.bandwidth_in = FLAGS.ali_bandwidth_in + self.bandwidth_out = FLAGS.ali_bandwidth_out + self.scratch_disk_size = FLAGS.scratch_disk_size or DEFAULT_DISK_SIZE + self.system_disk_type = FLAGS.ali_system_disk_type + self.system_disk_size = FLAGS.ali_system_disk_size + self.eip_address_bandwidth = FLAGS.ali_eip_address_bandwidth + self.network = ali_network.AliNetwork.GetNetwork(self) + self.firewall = ali_network.AliFirewall.GetFirewall() + + @vm_util.Retry(poll_interval=1, log_errors=False) + def _WaitForInstanceStatus(self, status_list): + """Waits until the instance's status is in status_list.""" + logging.info('Waits until the instance\'s status is one of statuses: %s', + status_list) + describe_cmd = util.ALI_PREFIX + [ + 'ecs', + 'DescribeInstances', + '--RegionId %s' % self.region, + '--InstanceIds \'["%s"]\'' % self.id] + describe_cmd = util.GetEncodedCmd(describe_cmd) + stdout, _ = vm_util.IssueRetryableCommand(describe_cmd) + response = json.loads(stdout) + instances = response['Instances']['Instance'] + assert len(instances) == 1 + status = instances[0]['Status'] + assert status in status_list + + @vm_util.Retry(poll_interval=5, max_retries=30, log_errors=False) + def _WaitForEipStatus(self, status_list): + """Waits until the instance's status is in status_list.""" + logging.info('Waits until the eip\'s status is one of statuses: %s', + status_list) + describe_cmd = util.ALI_PREFIX + [ + 'ecs', + 'DescribeEipAddresses', + '--RegionId %s' % self.region, + '--AllocationId %s' % self.eip_id] + describe_cmd = util.GetEncodedCmd(describe_cmd) + stdout, _ = vm_util.IssueRetryableCommand(describe_cmd) + response = json.loads(stdout) + EipAddresses = response['EipAddresses']['EipAddress'] + assert len(EipAddresses) == 1 + status = EipAddresses[0]['Status'] + assert status in status_list + + def _AllocatePubIp(self, region, instance_id): + """Allocate a public ip address and associate it to the instance.""" + if FLAGS.ali_use_vpc: + allocatip_cmd = util.ALI_PREFIX + [ + 'ecs', + 'AllocateEipAddress', + '--RegionId %s' % region, + '--InternetChargeType PayByTraffic', + '--Bandwidth %s' % self.eip_address_bandwidth] + allocatip_cmd = util.GetEncodedCmd(allocatip_cmd) + stdout, _ = vm_util.IssueRetryableCommand(allocatip_cmd) + response = json.loads(stdout) + self.ip_address = response['EipAddress'] + self.eip_id = response['AllocationId'] + + self._WaitForInstanceStatus(['Stopped', 'Running']) + + associate_cmd = util.ALI_PREFIX + [ + 'ecs', + 'AssociateEipAddress', + '--RegionId %s' % region, + '--AllocationId %s' % self.eip_id, + '--InstanceId %s' % instance_id, + '--InstanceType EcsInstance'] + associate_cmd = util.GetEncodedCmd(associate_cmd) + vm_util.IssueRetryableCommand(associate_cmd) + + else: + allocatip_cmd = util.ALI_PREFIX + [ + 'ecs', + 'AllocatePublicIpAddress', + '--RegionId %s' % region, + '--InstanceId %s' % instance_id] + allocatip_cmd = util.GetEncodedCmd(allocatip_cmd) + stdout, _ = vm_util.IssueRetryableCommand(allocatip_cmd) + response = json.loads(stdout) + self.ip_address = response['IpAddress'] + + @classmethod + def _GetDefaultImage(cls, region): + """Returns the default image given the machine type and region. + If no default is configured, this will return None. 
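+
+    For example, Ubuntu2004BasedAliVirtualMachine sets IMAGE_NAME_FILTER to
+    'ubuntu_20_04_x64*alibase*.vhd'; DescribeImages is filtered by that
+    pattern and the match with the lexicographically largest ImageName (i.e.
+    the most recent build date) is returned.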
+ """ + if cls.IMAGE_NAME_FILTER is None: + return None + + describe_cmd = util.ALI_PREFIX + [ + 'ecs', + 'DescribeImages', + '--RegionId %s' % region, + '--ImageName \'%s\'' % cls.IMAGE_NAME_FILTER] + describe_cmd = util.GetEncodedCmd(describe_cmd) + stdout, _ = vm_util.IssueRetryableCommand(describe_cmd) + + if not stdout: + return None + + images = json.loads(stdout)['Images']['Image'] + # We want to return the latest version of the image, and since the wildcard + # portion of the image name is the image's creation date, we can just take + # the image with the 'largest' name. + return max(images, key=lambda image: image['ImageName'])['ImageId'] + + @vm_util.Retry() + def _PostCreate(self): + """Get the instance's data and tag it.""" + describe_cmd = util.ALI_PREFIX + [ + 'ecs', + 'DescribeInstances', + '--RegionId %s' % self.region, + '--InstanceIds \'["%s"]\'' % self.id] + logging.info('Getting instance %s public IP. This will fail until ' + 'a public IP is available, but will be retried.', self.id) + describe_cmd = util.GetEncodedCmd(describe_cmd) + stdout, _ = vm_util.IssueRetryableCommand(describe_cmd) + response = json.loads(stdout) + instance = response['Instances']['Instance'][0] + if self.network.use_vpc: + pub_ip_address = instance['EipAddress']['IpAddress'] + self.internal_ip = \ + instance['VpcAttributes']['PrivateIpAddress']['IpAddress'][0] + else: + pub_ip_address = instance['PublicIpAddress']['IpAddress'][0] + self.internal_ip = instance['InnerIpAddress']['IpAddress'][0] + assert self.ip_address == pub_ip_address + self.group_id = instance['SecurityGroupIds']['SecurityGroupId'][0] + + self._WaitForInstanceStatus(['Running']) + + self.firewall.AllowPort(self, SSH_PORT) + tags = {} + tags.update(self.vm_metadata) + util.AddTags(self.id, RESOURCE_TYPE[INSTANCE], self.region, **tags) + util.AddDefaultTags(self.id, RESOURCE_TYPE[INSTANCE], self.region) + + def _CreateDependencies(self): + """Create VM dependencies.""" + self.key_pair_name = AliCloudKeyFileManager.ImportKeyfile(self.region) + + def _DeleteDependencies(self): + """Delete VM dependencies.""" + if self.key_pair_name: + AliCloudKeyFileManager.DeleteKeyfile(self.region, self.key_pair_name) + + def _Create(self): + """Create a VM instance.""" + + if self.image is None: + # This is here and not in the __init__ method bceauese _GetDefaultImage + # does a nontrivial amount of work (it calls the aliyuncli). 
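+      # Deferring the lookup also means a run that passes --image explicitly
+      # never pays for the extra DescribeImages round trip.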
+ self.image = self._GetDefaultImage(self.region) + + create_cmd = util.ALI_PREFIX + [ + 'ecs', + 'CreateInstance', + '--InstanceName perfkit-%s' % FLAGS.run_uri, + '--RegionId %s' % self.region, + '--ZoneId %s' % self.zone, + '--ImageId %s' % self.image, + '--InstanceType %s' % self.machine_type, + '--SecurityGroupId %s' % self.network.security_group.group_id, + '--KeyPairName %s' % self.key_pair_name, + '--SystemDisk.Category %s' % self.system_disk_type, + '--SystemDisk.Size %s' % self.system_disk_size] + + if FLAGS.scratch_disk_type == disk.LOCAL: + disk_cmd = [ + '--DataDisk1Category ephemeral_ssd', + '--DataDisk1Size %s' % self.scratch_disk_size, + '--DataDisk1Device %s%s' % (util.GetDrivePathPrefix(), + DRIVE_START_LETTER)] + create_cmd.extend(disk_cmd) + + if FLAGS.ali_io_optimized is not None: + create_cmd.extend(['--IoOptimized optimized']) + + if FLAGS.ali_use_vpc: + create_cmd.extend(['--VSwitchId %s' % self.network.vswitch.id]) + else: + create_cmd.extend([ + '--InternetChargeType PayByTraffic', + '--InternetMaxBandwidthIn %s' % self.bandwidth_in, + '--InternetMaxBandwidthOut %s' % self.bandwidth_out]) + + # Create user and add SSH key + public_key = AliCloudKeyFileManager.GetPublicKey() + user_data = util.ADD_USER_TEMPLATE.format(user_name=self.user_name, + public_key=public_key) + logging.debug('encoding startup script: %s', user_data) + create_cmd.extend(['--UserData', six.ensure_str( + base64.b64encode(user_data.encode('utf-8')))]) + + if FLAGS.ali_resource_group_id: + create_cmd.extend(['--ResourceGroupId %s' % FLAGS.ali_resource_group_id]) + + create_cmd = util.GetEncodedCmd(create_cmd) + stdout, _ = vm_util.IssueRetryableCommand(create_cmd) + response = json.loads(stdout) + self.id = response['InstanceId'] + + self._AllocatePubIp(self.region, self.id) + + start_cmd = util.ALI_PREFIX + [ + 'ecs', + 'StartInstance', + '--InstanceId %s' % self.id] + start_cmd = util.GetEncodedCmd(start_cmd) + vm_util.IssueRetryableCommand(start_cmd) + + def _Delete(self): + """Delete a VM instance.""" + stop_cmd = util.ALI_PREFIX + [ + 'ecs', + 'StopInstance', + '--InstanceId %s' % self.id] + stop_cmd = util.GetEncodedCmd(stop_cmd) + vm_util.IssueRetryableCommand(stop_cmd) + + self._WaitForInstanceStatus(['Stopped']) + + delete_cmd = util.ALI_PREFIX + [ + 'ecs', + 'DeleteInstance', + '--InstanceId %s' % self.id] + delete_cmd = util.GetEncodedCmd(delete_cmd) + vm_util.IssueRetryableCommand(delete_cmd) + + if FLAGS.ali_use_vpc: + self._WaitForEipStatus(['Available']) + release_eip_cmd = util.ALI_PREFIX + [ + 'ecs', + 'ReleaseEipAddress', + '--RegionId %s' % self.region, + '--AllocationId %s' % self.eip_id] + release_eip_cmd = util.GetEncodedCmd(release_eip_cmd) + vm_util.IssueRetryableCommand(release_eip_cmd) + + def _Exists(self): + """Returns true if the VM exists.""" + describe_cmd = util.ALI_PREFIX + [ + 'ecs', + 'DescribeInstances', + '--RegionId %s' % self.region, + '--InstanceIds \'["%s"]\'' % str(self.id)] + describe_cmd = util.GetEncodedCmd(describe_cmd) + stdout, _ = vm_util.IssueRetryableCommand(describe_cmd) + response = json.loads(stdout) + instances = response['Instances']['Instance'] + assert len(instances) < 2, 'Too many instances.' + if not instances: + return False + assert len(instances) == 1, 'Wrong number of instances.' + status = instances[0]['Status'] + assert status in INSTANCE_KNOWN_STATUSES, status + return status in INSTANCE_EXISTS_STATUSES + + def CreateScratchDisk(self, disk_spec): + """Create a VM's scratch disk. 
+ Args: + disk_spec: virtual_machine.BaseDiskSpec object of the disk. + """ + data_disk = ali_disk.AliDisk(disk_spec, self.zone) + self.scratch_disks.append(data_disk) + + if disk_spec.disk_type != disk.LOCAL: + data_disk.Create() + data_disk.Attach(self) + data_disk.WaitForDiskStatus(['In_use']) + else: + data_disk.device_letter = DRIVE_START_LETTER + + self.FormatDisk(data_disk.GetDevicePath(), disk_spec.disk_type) + self.MountDisk(data_disk.GetDevicePath(), disk_spec.mount_point, + disk_spec.disk_type, data_disk.mount_options, + data_disk.fstab_options) + + def AddMetadata(self, **kwargs): + """Adds metadata to the VM.""" + util.AddTags(self.id, RESOURCE_TYPE[INSTANCE], self.region, **kwargs) + + +class AliCloudKeyFileManager(object): + """Object for managing AliCloud Keyfiles.""" + _lock = threading.Lock() + imported_keyfile_set = set() + deleted_keyfile_set = set() + run_uri_key_names = {} + + @classmethod + def ImportKeyfile(cls, region): + """Imports the public keyfile to AliCloud.""" + with cls._lock: + if FLAGS.run_uri in cls.run_uri_key_names: + return cls.run_uri_key_names[FLAGS.run_uri] + public_key = cls.GetPublicKey() + key_name = cls.GetKeyNameForRun() + import_cmd = util.ALI_PREFIX + [ + 'ecs', + 'ImportKeyPair', + '--RegionId', region, + '--KeyPairName', key_name, + '--PublicKeyBody', json.dumps(public_key)] + vm_util.IssueRetryableCommand(import_cmd) + cls.run_uri_key_names[FLAGS.run_uri] = key_name + return key_name + + @classmethod + def DeleteKeyfile(cls, region, key_name): + """Deletes the imported KeyPair for a run_uri.""" + with cls._lock: + if FLAGS.run_uri not in cls.run_uri_key_names: + return + delete_cmd = util.ALI_PREFIX + [ + 'ecs', + 'DeleteKeyPairs', + '--RegionId', region, + '--KeyPairNames', json.dumps([key_name])] + vm_util.IssueRetryableCommand(delete_cmd) + del cls.run_uri_key_names[FLAGS.run_uri] + + @classmethod + def GetKeyNameForRun(cls): + return 'perfkit_key_{0}'.format(FLAGS.run_uri) + + @classmethod + def GetPublicKey(cls): + cat_cmd = ['cat', + vm_util.GetPublicKeyPath()] + keyfile, _ = vm_util.IssueRetryableCommand(cat_cmd) + return keyfile.strip() + + +class Ubuntu1604BasedAliVirtualMachine(AliVirtualMachine, + linux_virtual_machine.Ubuntu1604Mixin): + IMAGE_NAME_FILTER = 'ubuntu_16_04_64*alibase*.vhd' + + +class Ubuntu1804BasedAliVirtualMachine(AliVirtualMachine, + linux_virtual_machine.Ubuntu1804Mixin): + IMAGE_NAME_FILTER = 'ubuntu_18_04_x64*alibase*.vhd' + + +class Ubuntu2004BasedAliVirtualMachine(AliVirtualMachine, + linux_virtual_machine.Ubuntu2004Mixin): + IMAGE_NAME_FILTER = 'ubuntu_20_04_x64*alibase*.vhd' + + +# TODO to be verified +class Ubuntu2204BasedAliVirtualMachine(AliVirtualMachine, + linux_virtual_machine.Ubuntu2204Mixin): + IMAGE_NAME_FILTER = 'ubuntu_22_04_x64*alibase*.vhd' + + +class CentOs7BasedAliVirtualMachine(AliVirtualMachine, + linux_virtual_machine.CentOs7Mixin): + IMAGE_NAME_FILTER = 'centos_7_09_x64*alibase*.vhd' + + +class CentOs8BasedAliVirtualMachine(AliVirtualMachine, + linux_virtual_machine.CentOs8Mixin): + IMAGE_NAME_FILTER = 'centos_8_2_x64*alibase*.vhd' diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/flags.py b/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/flags.py new file mode 100644 index 0000000..4b5f966 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/flags.py @@ -0,0 +1,44 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from absl import flags + +flags.DEFINE_string('ali_user_name', 'ubuntu', + 'This determines the user name that Perfkit will ' + 'attempt to use. This must be changed in order to ' + 'use any image other than ubuntu.') +flags.DEFINE_integer('ali_bandwidth_in', 100, 'Inbound Bandwidth') +flags.DEFINE_integer('ali_bandwidth_out', 100, 'Outbound Bandwidth') +flags.DEFINE_string('ali_io_optimized', None, + 'IO optimized for disk in AliCloud. The default is ' + 'None which means no IO optimized ' + '"optimized" means use IO optimized. If you ' + 'choose optimized, you must specify the system disk type') +flags.DEFINE_string('ali_system_disk_type', 'cloud_ssd', + 'System disk category for AliCloud. The default is ' + '"cloud" for General cloud disk, ' + '"cloud_ssd" for cloud ssd disk, ' + '"cloud_essd" for enhanced cloud ssd disk, ' + '"cloud_efficiency" for efficiency cloud disk, ' + '"ephemeral_ssd" for local ssd disk') +flags.DEFINE_integer('ali_system_disk_size', 50, + 'System disk size in GB. Default is 50 GB.') +flags.DEFINE_boolean('ali_use_vpc', True, + 'Use VPC to create networks') +flags.DEFINE_integer('ali_eip_address_bandwidth', 100, + 'The rate limit of the EIP in Mbps.') +flags.DEFINE_enum('ali_essd_performance_level', 'PL1', ['PL1', 'PL2', 'PL3'], + 'Performance level for disk when using --data_disk_type=remote_essd.') +flags.DEFINE_string('ali_resource_group_id', '', + 'Specify the resource group id.') diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/provider_info.py b/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/provider_info.py new file mode 100644 index 0000000..c5a3507 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/provider_info.py @@ -0,0 +1,24 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Provider info for AliCloud.""" + +from perfkitbenchmarker import provider_info +from perfkitbenchmarker import providers + + +class AliCloudProviderInfo(provider_info.BaseProviderInfo): + + UNSUPPORTED_BENCHMARKS = ['mysql_service'] + CLOUD = providers.ALICLOUD diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/requirements.txt b/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/requirements.txt new file mode 100644 index 0000000..be9b05e --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/requirements.txt @@ -0,0 +1,15 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Requirements for running PerfKit Benchmarker on AliCloud. diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/util.py b/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/util.py new file mode 100644 index 0000000..9362234 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/alicloud/util.py @@ -0,0 +1,107 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for working with AliCloud Web Services resources.""" + + +import shlex + +from absl import flags +from perfkitbenchmarker import vm_util +import six + +ALI_PREFIX = ['aliyun'] +ROOT = 'root' +FLAGS = flags.FLAGS +PASSWD_LEN = 20 + + +REGION_HZ = 'cn-hangzhou' + + +ADD_USER_TEMPLATE = """#!/bin/bash +echo "{user_name} ALL = NOPASSWD: ALL" >> /etc/sudoers +useradd {user_name} --home /home/{user_name} --shell /bin/bash -m +mkdir /home/{user_name}/.ssh +echo "{public_key}" >> /home/{user_name}/.ssh/authorized_keys +chown -R {user_name}:{user_name} /home/{user_name}/.ssh +chmod 700 /home/{user_name}/.ssh +chmod 600 /home/{user_name}/.ssh/authorized_keys +""" + + +def GetEncodedCmd(cmd): + cmd_line = ' '.join(cmd) + cmd_args = shlex.split(cmd_line) + return cmd_args + + +def GetRegionByZone(zone): + if zone.find(REGION_HZ) != -1: + return REGION_HZ + s = zone.split('-') + if s[0] == 'cn': + s.pop() + return '-'.join(s) + else: + return zone[:-1] + + +def AddTags(resource_id, resource_type, region, **kwargs): + """Adds tags to an AliCloud resource created by PerfKitBenchmarker. + + Args: + resource_id: An extant AliCloud resource to operate on. + resource_type: The type of the resource. + region: The AliCloud region 'resource_id' was created in. + **kwargs: dict. Key-value pairs to set on the instance. 
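+
+  Illustrative only: a hypothetical call might look like
+  AddTags('i-xxxxxxxx', 'instance', 'cn-hangzhou', owner='pkb').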
+ """ + if not kwargs: + return + + tag_cmd = ALI_PREFIX + [ + 'ecs', 'AddTags', + '--RegionId', region, + '--ResourceId', resource_id, + '--ResourceType', resource_type + ] + for index, (key, value) in enumerate(six.iteritems(kwargs)): + tag_cmd.extend([ + '--Tag.{0}.Key'.format(index + 1), str(key), + '--Tag.{0}.Value'.format(index + 1), str(value) + ]) + vm_util.IssueRetryableCommand(tag_cmd) + + +def AddDefaultTags(resource_id, resource_type, region): + """Adds tags to an AliCloud resource created by PerfKitBenchmarker. + + By default, resources are tagged with "owner" and "perfkitbenchmarker-run" + key-value + pairs. + + Args: + resource_id: An extant AliCloud resource to operate on. + resource_type: The type of the 'resource_id' + region: The AliCloud region 'resource_id' was created in. + """ + tags = {'owner': FLAGS.owner, 'perfkitbenchmarker-run': FLAGS.run_uri} + AddTags(resource_id, resource_type, region, **tags) + + +def GetDrivePathPrefix(): + if FLAGS.ali_io_optimized is None: + return '/dev/xvd' + elif FLAGS.ali_io_optimized: + return '/dev/vd' diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/__init__.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/__init__.py new file mode 100644 index 0000000..8955062 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Provider for AWS.""" diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/athena.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/athena.py new file mode 100644 index 0000000..5a59711 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/athena.py @@ -0,0 +1,382 @@ +# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Module containing class for AWS's Athena EDW service.""" + +import copy +import datetime +import json +import logging +import re +from typing import Dict, Text, Tuple + +from absl import flags +from perfkitbenchmarker import data +from perfkitbenchmarker import edw_service +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.aws import s3 +from perfkitbenchmarker.providers.aws import util + +LATEST_CLIENT_JAR = 'athena-java-client-2.1.jar' + +AWS_ATHENA_CMD_PREFIX = ['aws', 'athena'] +AWS_ATHENA_CMD_POSTFIX = ['--output', 'json'] +# TODO(user): Derive the full table set from the TPC suite. +TPC_H_TABLES = [ + 'customer', 'lineitem', 'nation', 'orders', 'part', 'partsupp', 'region', + 'supplier' +] +TPC_DS_TABLES = [ + 'call_center', 'catalog_page', 'catalog_returns', 'catalog_sales', + 'customer', 'customer_address', 'customer_demographics', 'date_dim', + 'dbgen_version', 'household_demographics', 'income_band', 'inventory', + 'item', 'promotion', 'reason', 'ship_mode', 'store', 'store_returns', + 'store_sales', 'time_dim', 'warehouse', 'web_page', 'web_returns', + 'web_sales', 'web_site' +] + +FLAGS = flags.FLAGS + + +class AthenaQueryError(RuntimeError): + pass + + +def GetAthenaClientInterface(database: str, output_bucket: str, + region: str) -> edw_service.EdwClientInterface: + """Builds and Returns the requested Athena client Interface. + + Args: + database: Name of the Athena database to execute queries against. + output_bucket: String name of the S3 bucket to store query output. + region: String aws region in which the database exists and client operations + are performed. + + Returns: + A concrete Client Interface object (subclass of EdwClientInterface) + + Raises: + RuntimeError: if an unsupported athena_client_interface is requested + """ + if FLAGS.athena_client_interface == 'JAVA': + return JavaClientInterface(database, output_bucket, region) + raise RuntimeError('Unknown Athena Client Interface requested.' + + FLAGS.athena_client_interface) + + +class GenericClientInterface(edw_service.EdwClientInterface): + """Generic Client Interface class for Athena. + + Attributes: + database: String name of the Athena database to execute queries against. + output_bucket: String name of the S3 bucket to store query output. + region: String aws region in which the database exists and client operations + are performed. + """ + + def __init__(self, database: str, output_bucket: str, region: str): + super(GenericClientInterface, self).__init__() + self.database = database + self.output_bucket = 's3://%s' % output_bucket + self.region = region + + def GetMetadata(self) -> Dict[str, str]: + """Gets the Metadata attributes for the Client Interface.""" + client_workgroup = FLAGS.athena_workgroup or 'dynamic' + return { + 'client': f'{FLAGS.athena_client_interface}_{client_workgroup}', + 'client_region': self.region + } + + +class JavaClientInterface(GenericClientInterface): + """Java Client Interface class for Athena. + """ + + def Prepare(self, package_name: str) -> None: + """Prepares the client vm to execute query. + + Installs the Java Execution Environment and a uber jar with + a) Athena Java client libraries, + b) An application to execute a query and gather execution details, and + collect CW metrics + c) their dependencies. + + Args: + package_name: String name of the package defining the preprovisioned data + (certificates, etc.) to extract and use during client vm preparation. 
+ """ + self.client_vm.Install('openjdk') + # Push the executable jar to the working directory on client vm + self.client_vm.InstallPreprovisionedPackageData( + package_name, [LATEST_CLIENT_JAR], '') + + def ExecuteQuery(self, query_name: Text) -> Tuple[float, Dict[str, str]]: + """Executes a query and returns performance details. + + Args: + query_name: String name of the query to execute + + Returns: + A tuple of (execution_time, run_metadata) + execution_time: A Float variable set to the query's completion time in + secs. -1.0 is used as a sentinel value implying the query failed. For a + successful query the value is expected to be positive. + run_metadata: A dictionary of query execution attributes eg. script name + """ + query_command = (f'java -cp {LATEST_CLIENT_JAR} ' + 'com.google.cloud.performance.edw.Single ' + f'--region {self.region} ' + f'--database {self.database} ' + f'--output_location {self.output_bucket} ' + f'--query_file {query_name} ' + f'--query_timeout_secs {FLAGS.athena_query_timeout} ' + f'--collect_metrics {FLAGS.athena_metrics_collection}') + + if not FLAGS.athena_metrics_collection: + # execute the query in requested persistent workgroup + query_command = f'{query_command} --workgroup {FLAGS.athena_workgroup} ' + query_command = f'{query_command} --delete_workgroup False' + else: + # the dynamic workgroup may have to live beyond the benchmark + query_command = (f'{query_command} ' + f'--delete_workgroup {FLAGS.athena_workgroup_delete}') + + stdout, _ = self.client_vm.RemoteCommand(query_command) + details = copy.copy(self.GetMetadata()) # Copy the base metadata + details.update(json.loads(stdout)['details']) + details['query_start'] = json.loads(stdout)['query_start'] + details['query_end'] = json.loads(stdout)['query_end'] + performance = json.loads(stdout)['query_wall_time_in_secs'] + return performance, details + + +def ReadScript(script_uri): + """Method to read a sql script based on its local path. + + Arguments: + script_uri: Local URI of file containing SQL query. + + Returns: + Query String contents of the URI location. + + Raises: + IOError: If the script cannot be read. + """ + with open(script_uri) as fp: + return fp.read() + + +def PrepareQueryString(query_string_template, substitutions): + """Method to read a template Athena script and substitute placeholders. + + Args: + query_string_template: Template version of the Athena query. + substitutions: A dictionary of string placeholder keys and corresponding + string values. + + Returns: + Materialized Athena query as a string. + """ + for key, value in substitutions.items(): + query_string = query_string_template.replace(key, value) + return query_string + + +def RunScriptCommand(script_command): + """Method to execute an AWS Athena cli command. + + Args: + script_command: Fully compiled AWS Athena cli command. + + Returns: + String stdout result of executing the query. + Script Command execution duration in seconds (rounded). + + Raises: + AthenaQueryError: If the return code does not indicate success. 
+ """ + start_time = datetime.datetime.now() + stdout, _, retcode = vm_util.IssueCommand( + script_command, raise_on_failure=False) + if retcode: + raise AthenaQueryError + end_time = datetime.datetime.now() + return stdout, int((end_time - start_time).total_seconds()) + + +class Athena(edw_service.EdwService): + """Object representing a Athena data warehouse.""" + + CLOUD = providers.AWS + SERVICE_TYPE = 'athena' + + def __init__(self, edw_service_spec): + super(Athena, self).__init__(edw_service_spec) + self.region = util.GetRegionFromZone(FLAGS.zones[0]) + self.output_bucket = '-'.join( + [FLAGS.athena_output_location_prefix, self.region, FLAGS.run_uri]) + self.client_interface = GetAthenaClientInterface(self.cluster_identifier, + self.output_bucket, + self.region) + self.s3_service = s3.S3Service() + self.s3_service.PrepareService(self.region) + self.s3_service.MakeBucket(self.output_bucket) + if FLAGS.provision_athena: + self.data_bucket = 'pkb' + self.cluster_identifier.replace('_', '') + self.tables = ( + TPC_H_TABLES if FLAGS.edw_tpc_dsb_type == 'tpc_h' else TPC_DS_TABLES) + self.athena_db_create_time = 0 + self.athena_table_create_time = 0 + + def BuildAthenaCommand(self, query_string, database=None): + """Method to compile a AWS Athena cli command. + + Arguments: + query_string: A string with the query that needs to be executed on Athena. + database: The Athena database against which the query should be executed. + + Returns: + Fully compiled AWS Athena cli command. + """ + cmd = [] + cmd.extend(AWS_ATHENA_CMD_PREFIX) + cmd.extend([ + '--region', self.region, + 'start-query-execution', + '--query-string', query_string + ]) + if database: + cmd.extend(['--query-execution-context', ('Database=%s' % database)]) + cmd.extend([ + '--result-configuration', + ('OutputLocation=s3://%s' % self.output_bucket) + ]) + cmd.extend(AWS_ATHENA_CMD_POSTFIX) + return cmd + + def _Create(self): + """Create a Athena data warehouse.""" + + def _EmptyDatabase(): + """Remove tables, if they exist, so they can be refreshed. + + If the database and/or tables don't already exist, the drop commands + will simply fail, which won't raise errors. + """ + drop_script_path = data.ResourcePath('edw/athena/%s/ddl/drop.sql' % + FLAGS.edw_tpc_dsb_type) + drop_script_contents = ReadScript(drop_script_path) + # Drop all tables so the database can be dropped. + for table in self.tables: + # Remove the folder backing each parquet table so they can be refreshed. + vm_util.IssueCommand([ + 'aws', 's3', 'rm', + 's3://%s/%s_parquet' % (self.data_bucket, table), '--recursive' + ], raise_on_failure=False) + # The parquet tables don't have the type suffix so that the queries can + # run as written without having to change the table names. 
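+        # For example, for the 'lineitem' table this drops both 'lineitem_csv'
+        # and the suffix-less parquet table 'lineitem'.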
+ for suffix in ['_csv', '']: + script_contents = PrepareQueryString(drop_script_contents, + {'{table}': table + suffix}) + script_command = self.BuildAthenaCommand( + script_contents, database=self.cluster_identifier) + RunScriptCommand(script_command) + + drop_database_query_string = PrepareQueryString( + 'drop database database_name', + {'database_name': self.cluster_identifier}) + script_command = self.BuildAthenaCommand(drop_database_query_string) + RunScriptCommand(script_command) + + def _CreateDatabase(): + create_database_query_string = PrepareQueryString( + 'create database database_name', + {'database_name': self.cluster_identifier}) + script_command = self.BuildAthenaCommand(create_database_query_string) + return RunScriptCommand(script_command) + + def _CreateTable(table_create_sql_template): + template_script_path = data.ResourcePath(table_create_sql_template) + template_script_contents = ReadScript(template_script_path) + script_contents = PrepareQueryString(template_script_contents, + {'{bucket}': self.data_bucket}) + script_command = self.BuildAthenaCommand( + script_contents, database=self.cluster_identifier) + return RunScriptCommand(script_command) + + def _CreateAllTables(): + """Create all TPC benchmarking tables.""" + cumulative_table_create_time = 0 + for table in self.tables: + for suffix in ['_csv', '_parquet']: + script = 'edw/athena/%s/ddl/%s.sql' % (FLAGS.edw_tpc_dsb_type, + table + suffix) + _, table_create_time = _CreateTable(script) + cumulative_table_create_time += table_create_time + return cumulative_table_create_time + + _EmptyDatabase() + _, self.athena_db_create_time = _CreateDatabase() + self.athena_table_create_time = _CreateAllTables() + + def _Exists(self): + """Method to validate the existence of a Athena data warehouse. + + Returns: + Boolean value indicating the existence of a Athena data warehouse. + """ + raise NotImplementedError + + def _Delete(self): + """Delete a Athena data warehouse.""" + if not FLAGS.teardown_athena: + logging.info('The current resource is requested to be long living.') + return + raise NotImplementedError + + def Cleanup(self): + # Direct cleanup is used instead of _DeleteDependencies because the Athena + # warehouse resource isn't created/deleted each time. + self.s3_service.DeleteBucket(self.output_bucket) + + def GetDataDetails(self) -> Dict[str, str]: + """Returns a dictionary with underlying data details. + + cluster_identifier = + Data details are extracted from the dataset_id that follows the format: + ___ + eg. + tpch100_parquet_uncompressed_unpartitoned + + Returns: + A dictionary set to underlying data's details (format, etc.) + """ + data_details = {} + # If the information isn't in the cluster identifier, skip collecting it. 
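+    # Otherwise an identifier such as 'tpch100_parquet_uncompressed_unpartitoned'
+    # yields format='parquet', compression='uncompressed',
+    # partitioning='unpartitoned'.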
+    if '_' not in self.cluster_identifier:
+      return data_details
+    parsed_id = re.split(r'_', self.cluster_identifier)
+    data_details['format'] = parsed_id[1]
+    data_details['compression'] = parsed_id[2]
+    data_details['partitioning'] = parsed_id[3]
+    return data_details
+
+  def GetMetadata(self):
+    """Return a dictionary of the metadata for the Athena data warehouse."""
+    basic_data = super(Athena, self).GetMetadata()
+    basic_data.update({'database': self.cluster_identifier})
+    basic_data.update(self.GetDataDetails())
+    basic_data.update(self.client_interface.GetMetadata())
+    return basic_data
diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_capacity_reservation.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_capacity_reservation.py
new file mode 100644
index 0000000..ccc9097
--- /dev/null
+++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_capacity_reservation.py
@@ -0,0 +1,200 @@
+# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""CapacityReservation for AWS virtual machines.
+
+AWS EC2 has the concept of capacity reservations which allow the
+user to request a reservation for a given number of VMs of a
+specified shape (machine type and os type) in a given zone, for
+an optionally-supplied duration. This module implements this functionality.
+
+A useful feature of using AwsCapacityReservation is that it allows the
+user to specify a region instead of a zone, and this module will automatically
+pick a zone that has capacity, and the VM(s) will then be launched in that zone.
+
+AwsCapacityReservation modifies all the VMs in a given vm_group in the
+following way:
+  1. The capacity_reservation_id attribute on the VM is set after the
+     reservation is created. The VM needs to reference this id during
+     creation.
+  2. If the user supplied a region instead of a zone, then this module
+     will update the zone attribute on the VM, as well as the zone
+     attribute on the VM's network instance.
+
+A run of PKB may have several capacity reservations; there is a 1:1 mapping
+from AWS vm_groups to AwsCapacityReservation instances. This is because all
+VMs in a VM group share the same shape and zone.
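+
+For example (illustrative values): a vm_group of two m5.large Linux VMs whose
+zone is given as the region us-east-1 results in one CapacityReservation with
+an instance count of 2, created in whichever us-east-1 availability zone has
+capacity; that zone is then written back to each VM and its network instance.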
+""" + +import datetime +import json +import logging + +from absl import flags +from perfkitbenchmarker import capacity_reservation +from perfkitbenchmarker import errors +from perfkitbenchmarker import os_types +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.aws import util + +FLAGS = flags.FLAGS +_INSUFFICIENT_CAPACITY = 'InsufficientInstanceCapacity' + + +class InvalidVmGroupSizeError(Exception): + pass + + +class UnsupportedOsTypeError(Exception): + pass + + +class CreationError(Exception): + pass + + +class AwsCapacityReservation(capacity_reservation.BaseCapacityReservation): + """An object representing an AWS EC2 CapacityReservation.""" + CLOUD = providers.AWS + + def __init__(self, vm_group): + if not vm_group: + raise InvalidVmGroupSizeError( + 'AwsCapacityReservation must be initialized with at least one ' + 'VM in the vm_group.') + + super(AwsCapacityReservation, self).__init__(vm_group) + self.zone_or_region = vm_group[0].zone + self.region = util.GetRegionFromZone(self.zone_or_region) + self.machine_type = vm_group[0].machine_type + self.os_type = vm_group[0].OS_TYPE + self.vm_count = len(vm_group) + + def _Create(self): + """Creates the AWS CapacaityReservation. + + A reservation will be created given the VM shape in self.vm_groups. + Count is determined by the number of VMs in said group. The reservation + will have a lifetime determined by the general PKB concept of + timeout_minutes. If the reservation exceeds this timeout, AWS will + cancel it automatically. The VMs in the reservation will not be deleted. + Note that an empty capacity reservation will encur costs for the + VM shape / count, even if no VMs are using it. + + After the reservation is created, this method updates all the VMs + in self.vm_groups by setting the capacity_reservation_id, as well + as the zone attributes on the VM, and the VM's network instance. + + Raises: + UnsupportedOsTypeError: If creating a capacity reservation for the + given os type is not supported. + CreationError: If a capacity reservation cannot be created in the + region (typically indicates a stockout). + """ + if self.os_type in os_types.LINUX_OS_TYPES: + instance_platform = 'Linux/UNIX' + elif self.os_type in os_types.WINDOWS_OS_TYPES: + instance_platform = 'Windows' + else: + raise UnsupportedOsTypeError( + 'Unsupported os_type for AWS CapacityReservation: %s.' + % self.os_type) + + # If the user did not specify an AZ, we need to try to create the + # CapacityReservation in a specifc AZ until it succeeds. + # Then update the zone attribute on all the VMs in the group, + # as well as the zone attribute on the VMs' network instance. + if util.IsRegion(self.zone_or_region): + zones_to_try = util.GetZonesInRegion(self.region) + else: + zones_to_try = [self.zone_or_region] + + end_date = ( + datetime.datetime.utcnow() + + datetime.timedelta(minutes=FLAGS.timeout_minutes)) + for zone in zones_to_try: + cmd = util.AWS_PREFIX + [ + 'ec2', + 'create-capacity-reservation', + '--instance-type=%s' % self.machine_type, + '--instance-platform=%s' % instance_platform, + '--availability-zone=%s' % zone, + '--instance-count=%s' % self.vm_count, + '--instance-match-criteria=targeted', + '--region=%s' % self.region, + '--end-date-type=limited', + '--end-date=%s' % end_date.isoformat(), + ] + stdout, stderr, retcode = vm_util.IssueCommand(cmd, + raise_on_failure=False) + if retcode: + logging.info('Unable to create CapacityReservation in %s. ' + 'This may be retried. 
Details: %s', zone, stderr) + if _INSUFFICIENT_CAPACITY in stderr: + logging.error(util.STOCKOUT_MESSAGE) + raise errors.Benchmarks.InsufficientCapacityCloudFailure( + util.STOCKOUT_MESSAGE + ' CapacityReservation in ' + zone) + continue + json_output = json.loads(stdout) + self.capacity_reservation_id = ( + json_output['CapacityReservation']['CapacityReservationId']) + self._UpdateVmsInGroup(self.capacity_reservation_id, zone) + return + raise CreationError('Unable to create CapacityReservation in any of the ' + 'following zones: %s.' % zones_to_try) + + def _Delete(self): + """Deletes the capacity reservation.""" + cmd = util.AWS_PREFIX + [ + 'ec2', + 'cancel-capacity-reservation', + '--capacity-reservation-id=%s' % self.capacity_reservation_id, + '--region=%s' % self.region, + ] + vm_util.IssueCommand(cmd, raise_on_failure=False) + + def _Exists(self): + """Returns true if the underlying reservation exists and is active.""" + cmd = util.AWS_PREFIX + [ + 'ec2', + 'describe-capacity-reservations', + '--capacity-reservation-id=%s' % self.capacity_reservation_id, + '--region=%s' % self.region, + ] + stdout, _, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + if retcode != 0: + return False + + json_output = json.loads(stdout) + return json_output['CapacityReservations'][0]['State'] == 'active' + + def _UpdateVmsInGroup(self, capacity_reservation_id, zone): + """Updates the VMs in a group with necessary reservation details. + + AWS virtual machines need to reference the capacity reservation id + during creation, so it is set on all VMs in the group. Additionally, + this class may determine which zone to run in, so that needs to be + updated too (on the VM, and the VM's network instance). + + Args: + capacity_reservation_id: ID of the reservation created by this instance. + zone: Zone chosen by this class, or if it was supplied, the zone + provided by the user. In the latter case, setting the zone is equivalent + to a no-op. + """ + for vm in self.vm_group: + vm.capacity_reservation_id = capacity_reservation_id + vm.zone = zone + vm.network.zone = zone diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_cluster_parameter_group.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_cluster_parameter_group.py new file mode 100644 index 0000000..d02290b --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_cluster_parameter_group.py @@ -0,0 +1,49 @@ +# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for AWS's Redshift Cluster Parameter Group.""" + +from absl import flags +from perfkitbenchmarker import resource +from perfkitbenchmarker import vm_util + +FLAGS = flags.FLAGS + + +class RedshiftClusterParameterGroup(resource.BaseResource): + """Cluster Parameter Group associated with a Redshift cluster. + + Attributes: + name: A string name of the cluster parameter group. 
+ """ + + def __init__(self, cmd_prefix): + super(RedshiftClusterParameterGroup, self).__init__(user_managed=False) + self.cmd_prefix = cmd_prefix + self.name = 'pkb-' + FLAGS.run_uri + + def _Create(self): + cmd = self.cmd_prefix + [ + 'redshift', 'create-cluster-parameter-group', '--parameter-group-name', + self.name, '--parameter-group-family', 'redshift-1.0', '--description', + 'Cluster Parameter group for run uri {}'.format(FLAGS.run_uri) + ] + vm_util.IssueCommand(cmd) + + def _Delete(self): + """Delete a redshift cluster parameter group.""" + cmd = self.cmd_prefix + [ + 'redshift', 'delete-cluster-parameter-group', '--parameter-group-name', + self.name + ] + vm_util.IssueCommand(cmd, raise_on_failure=False) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_cluster_subnet_group.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_cluster_subnet_group.py new file mode 100644 index 0000000..abdaed9 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_cluster_subnet_group.py @@ -0,0 +1,55 @@ +# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for AWS's Redshift Cluster Subnet Group.""" + +from absl import flags +from perfkitbenchmarker import resource +from perfkitbenchmarker import vm_util + +FLAGS = flags.FLAGS + + +class RedshiftClusterSubnetGroup(resource.BaseResource): + """Cluster Subnet Group associated with a Redshift cluster launched in a vpc. + + A cluster subnet group allows you to specify a set of subnets in your VPC. + + + Attributes: + name: A string name of the cluster subnet group. + subnet_id: A string name of the subnet id associated with the group. + """ + + def __init__(self, cmd_prefix): + super(RedshiftClusterSubnetGroup, self).__init__(user_managed=False) + self.cmd_prefix = cmd_prefix + self.name = 'pkb-' + FLAGS.run_uri + self.subnet_id = '' + + def _Create(self): + cmd = self.cmd_prefix + [ + 'redshift', 'create-cluster-subnet-group', + '--cluster-subnet-group-name', self.name, '--description', + 'Cluster Subnet Group for run uri {}'.format( + FLAGS.run_uri), '--subnet-ids', self.subnet_id + ] + vm_util.IssueCommand(cmd) + + def _Delete(self): + """Delete a redshift cluster subnet group.""" + cmd = self.cmd_prefix + [ + 'redshift', 'delete-cluster-subnet-group', + '--cluster-subnet-group-name', self.name + ] + vm_util.IssueCommand(cmd, raise_on_failure=False) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_container_service.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_container_service.py new file mode 100755 index 0000000..7575d99 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_container_service.py @@ -0,0 +1,575 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains classes/functions related to AWS container clusters.""" + +import json +import os +import uuid + +from absl import flags +from perfkitbenchmarker import container_service +from perfkitbenchmarker import context +from perfkitbenchmarker import errors +from perfkitbenchmarker import providers +from perfkitbenchmarker import resource +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.aws import aws_load_balancer +from perfkitbenchmarker.providers.aws import aws_logs +from perfkitbenchmarker.providers.aws import aws_network +from perfkitbenchmarker.providers.aws import s3 +from perfkitbenchmarker.providers.aws import util +import requests +import six +import yaml + +FLAGS = flags.FLAGS +_ECS_NOT_READY = frozenset(['PROVISIONING', 'PENDING']) + + +class EcrRepository(resource.BaseResource): + """Class representing an Elastic Container Registry image repository.""" + + def __init__(self, name, region): + super(EcrRepository, self).__init__() + self.name = name + self.region = region + + def _Create(self): + """Creates the image repository.""" + if self._Exists(): + self.user_managed = True + return + create_cmd = util.AWS_PREFIX + [ + 'ecr', 'create-repository', '--region', self.region, + '--repository-name', self.name + ] + _, stderr, retcode = vm_util.IssueCommand( + create_cmd, raise_on_failure=False) + if retcode: + if 'InsufficientInstanceCapacity' in stderr: + raise errors.Benchmarks.InsufficientCapacityCloudFailure(stderr) + if 'InstanceLimitExceeded' in stderr or 'VpcLimitExceeded' in stderr: + raise errors.Benchmarks.QuotaFailure(stderr) + raise errors.Resource.CreationError( + 'Failed to create EKS Cluster: {} return code: {}'.format( + retcode, stderr)) + + def _Exists(self): + """Returns True if the repository exists.""" + describe_cmd = util.AWS_PREFIX + [ + 'ecr', 'describe-repositories', '--region', self.region, + '--repository-names', self.name + ] + stdout, _, _ = vm_util.IssueCommand( + describe_cmd, suppress_warning=True, raise_on_failure=False) + if not stdout or not json.loads(stdout)['repositories']: + return False + return True + + def _Delete(self): + """Deletes the repository.""" + delete_cmd = util.AWS_PREFIX + [ + 'ecr', 'delete-repository', '--region', self.region, + '--repository-name', self.name, '--force' + ] + vm_util.IssueCommand(delete_cmd, raise_on_failure=False) + + +class ElasticContainerRegistry(container_service.BaseContainerRegistry): + """Class for building and storing container images on AWS.""" + + CLOUD = providers.AWS + + def __init__(self, registry_spec): + super(ElasticContainerRegistry, self).__init__(registry_spec) + self.account = self.project or util.GetAccount() + self.region = util.GetRegionFromZone(self.zone.split(',')[0]) + self.repositories = [] + + def _Delete(self): + """Deletes the repositories.""" + for repository in self.repositories: + repository.Delete() + + def PrePush(self, image): + """Prepares registry to push a given image.""" + repository_name = '{namespace}/{name}'.format( + namespace=self.name, name=image.name) + repository = EcrRepository(repository_name, self.region) + 
self.repositories.append(repository) + repository.Create() + + def GetFullRegistryTag(self, image): + """Gets the full tag of the image.""" + tag = '{account}.dkr.ecr.{region}.amazonaws.com/{namespace}/{name}'.format( + account=self.account, + region=self.region, + namespace=self.name, + name=image) + return tag + + def Login(self): + """Logs in to the registry.""" + get_login_cmd = util.AWS_PREFIX + [ + '--region', self.region, 'ecr', 'get-login', '--no-include-email' + ] + stdout, _, _ = vm_util.IssueCommand(get_login_cmd) + login_cmd = stdout.split() + vm_util.IssueCommand(login_cmd) + + def RemoteBuild(self, image): + """Build the image remotely.""" + # TODO(ehankland) use AWS codebuild to build the image. + raise NotImplementedError() + + +class TaskDefinition(resource.BaseResource): + """Class representing an AWS task definition.""" + + def __init__(self, name, container_spec, cluster): + super(TaskDefinition, self).__init__() + self.name = name + self.cpus = container_spec.cpus + self.memory = container_spec.memory + self.image = container_spec.image + self.container_port = container_spec.container_port + self.region = cluster.region + self.arn = None + self.log_group = aws_logs.LogGroup(self.region, 'pkb') + + def _CreateDependencies(self): + """Create the log group if it doesn't exist.""" + if not self.log_group.Exists(): + self.log_group.Create() + + def _Create(self): + """Create the task definition.""" + register_cmd = util.AWS_PREFIX + [ + '--region', self.region, 'ecs', 'register-task-definition', '--family', + self.name, '--execution-role-arn', 'ecsTaskExecutionRole', + '--network-mode', 'awsvpc', '--requires-compatibilities=FARGATE', + '--cpu', + str(int(1024 * self.cpus)), '--memory', + str(self.memory), '--container-definitions', + self._GetContainerDefinitions() + ] + stdout, _, _ = vm_util.IssueCommand(register_cmd) + response = json.loads(stdout) + self.arn = response['taskDefinition']['taskDefinitionArn'] + + def _Delete(self): + """Deregister the task definition.""" + if self.arn is None: + return + deregister_cmd = util.AWS_PREFIX + [ + '--region', self.region, 'ecs', 'deregister-task-definition', + '--task-definition', self.arn + ] + vm_util.IssueCommand(deregister_cmd) + + def _GetContainerDefinitions(self): + """Returns a JSON representation of the container definitions.""" + definitions = [{ + 'name': self.name, + 'image': self.image, + 'essential': True, + 'portMappings': [{ + 'containerPort': self.container_port, + 'protocol': 'TCP' + }], + 'logConfiguration': { + 'logDriver': 'awslogs', + 'options': { + 'awslogs-group': 'pkb', + 'awslogs-region': self.region, + 'awslogs-stream-prefix': 'pkb' + } + } + }] + return json.dumps(definitions) + + +class EcsTask(container_service.BaseContainer): + """Class representing an ECS/Fargate task.""" + + def __init__(self, name, container_spec, cluster): + super(EcsTask, self).__init__(container_spec) + self.name = name + self.task_def = cluster.task_defs[name] + self.arn = None + self.region = cluster.region + self.cluster_name = cluster.name + self.subnet_id = cluster.network.subnet.id + self.ip_address = None + self.security_group_id = ( + cluster.network.regional_network.vpc.default_security_group_id) + + def _GetNetworkConfig(self): + network_config = { + 'awsvpcConfiguration': { + 'subnets': [self.subnet_id], + 'securityGroups': [self.security_group_id], + 'assignPublicIp': 'ENABLED', + } + } + return json.dumps(network_config) + + def _GetOverrides(self): + """Returns a JSON representaion of task overrides. 
+ + While the container level resources can be overridden, they have no + effect on task level resources for Fargate tasks. This means + that modifying a container spec will only affect the command of any + new containers launched from it and not cpu/memory. + """ + overrides = { + 'containerOverrides': [{ + 'name': self.name, + }] + } + if self.command: + overrides['containerOverrides'][0]['command'] = self.command + return json.dumps(overrides) + + def _Create(self): + """Creates the task.""" + run_cmd = util.AWS_PREFIX + [ + '--region', self.region, 'ecs', 'run-task', '--cluster', + self.cluster_name, '--task-definition', self.task_def.arn, + '--launch-type', 'FARGATE', '--network-configuration', + self._GetNetworkConfig(), '--overrides', + self._GetOverrides() + ] + stdout, _, _ = vm_util.IssueCommand(run_cmd) + response = json.loads(stdout) + self.arn = response['tasks'][0]['taskArn'] + + def _PostCreate(self): + """Gets the tasks IP address.""" + container = self._GetTask()['containers'][0] + self.ip_address = container['networkInterfaces'][0]['privateIpv4Address'] + + def _DeleteDependencies(self): + """Delete the task def.""" + self.task_def.Delete() + + def _Delete(self): + """Deletes the task.""" + if self.arn is None: + return + stop_cmd = util.AWS_PREFIX + [ + '--region', self.region, 'ecs', 'stop-task', '--cluster', + self.cluster_name, '--task', self.arn + ] + vm_util.IssueCommand(stop_cmd) + + def _GetTask(self): + """Returns a dictionary representation of the task.""" + describe_cmd = util.AWS_PREFIX + [ + '--region', self.region, 'ecs', 'describe-tasks', '--cluster', + self.cluster_name, '--tasks', self.arn + ] + stdout, _, _ = vm_util.IssueCommand(describe_cmd) + response = json.loads(stdout) + return response['tasks'][0] + + def _IsReady(self): + """Returns true if the task has stopped pending.""" + return self._GetTask()['lastStatus'] not in _ECS_NOT_READY + + def WaitForExit(self, timeout=None): + """Waits until the task has finished running.""" + + @vm_util.Retry( + timeout=timeout, + retryable_exceptions=(container_service.RetriableContainerException,)) + def _WaitForExit(): + task = self._GetTask() + if task['lastStatus'] != 'STOPPED': + raise container_service.RetriableContainerException( + 'Task is not STOPPED.') + return task + + return _WaitForExit() + + def GetLogs(self): + """Returns the logs from the container.""" + task_id = self.arn.split('/')[-1] + log_stream = 'pkb/{name}/{task_id}'.format(name=self.name, task_id=task_id) + return six.text_type( + aws_logs.GetLogStreamAsString(self.region, log_stream, 'pkb')) + + +class EcsService(container_service.BaseContainerService): + """Class representing an ECS/Fargate service.""" + + def __init__(self, name, container_spec, cluster): + super(EcsService, self).__init__(container_spec) + self.client_token = str(uuid.uuid4())[:32] + self.name = name + self.task_def = cluster.task_defs[name] + self.arn = None + self.region = cluster.region + self.cluster_name = cluster.name + self.subnet_id = cluster.network.subnet.id + self.security_group_id = ( + cluster.network.regional_network.vpc.default_security_group_id) + self.load_balancer = aws_load_balancer.LoadBalancer( + [cluster.network.subnet]) + self.target_group = aws_load_balancer.TargetGroup( + cluster.network.regional_network.vpc, self.container_port) + self.port = 80 + + def _CreateDependencies(self): + """Creates the load balancer for the service.""" + self.load_balancer.Create() + self.target_group.Create() + listener = 
aws_load_balancer.Listener(self.load_balancer, self.target_group, + self.port) + listener.Create() + self.ip_address = self.load_balancer.dns_name + + def _DeleteDependencies(self): + """Deletes the service's load balancer.""" + self.task_def.Delete() + self.load_balancer.Delete() + self.target_group.Delete() + + # TODO(ferneyhough): Consider supporting the flag container_cluster_version. + def _Create(self): + """Creates the service.""" + create_cmd = util.AWS_PREFIX + [ + '--region', + self.region, + 'ecs', + 'create-service', + '--desired-count', + '1', + '--client-token', + self.client_token, + '--cluster', + self.cluster_name, + '--service-name', + self.name, + '--task-definition', + self.task_def.arn, + '--launch-type', + 'FARGATE', + '--network-configuration', + self._GetNetworkConfig(), + '--load-balancers', + self._GetLoadBalancerConfig(), + ] + vm_util.IssueCommand(create_cmd) + + def _Delete(self): + """Deletes the service.""" + update_cmd = util.AWS_PREFIX + [ + '--region', self.region, 'ecs', 'update-service', '--cluster', + self.cluster_name, '--service', self.name, '--desired-count', '0' + ] + vm_util.IssueCommand(update_cmd) + delete_cmd = util.AWS_PREFIX + [ + '--region', self.region, 'ecs', 'delete-service', '--cluster', + self.cluster_name, '--service', self.name + ] + vm_util.IssueCommand(delete_cmd, raise_on_failure=False) + + def _GetNetworkConfig(self): + network_config = { + 'awsvpcConfiguration': { + 'subnets': [self.subnet_id], + 'securityGroups': [self.security_group_id], + 'assignPublicIp': 'ENABLED', + } + } + return json.dumps(network_config) + + def _GetLoadBalancerConfig(self): + """Returns the JSON representation of the service load balancers.""" + load_balancer_config = [{ + 'targetGroupArn': self.target_group.arn, + 'containerName': self.name, + 'containerPort': self.container_port, + }] + return json.dumps(load_balancer_config) + + def _IsReady(self): + """Returns True if the Service is ready.""" + url = 'http://%s' % self.ip_address + try: + r = requests.get(url) + except requests.ConnectionError: + return False + if r.status_code == 200: + return True + return False + + +class FargateCluster(container_service.BaseContainerCluster): + """Class representing an AWS Fargate cluster.""" + + CLOUD = providers.AWS + CLUSTER_TYPE = 'Fargate' + + def __init__(self, cluster_spec): + super(FargateCluster, self).__init__(cluster_spec) + self.region = util.GetRegionFromZone(self.zone) + self.network = aws_network.AwsNetwork.GetNetwork(self) + self.firewall = aws_network.AwsFirewall.GetFirewall() + self.name = 'pkb-%s' % FLAGS.run_uri + self.task_defs = {} + self.arn = None + + def _Create(self): + """Creates the cluster.""" + create_cmd = util.AWS_PREFIX + [ + '--region', self.region, 'ecs', 'create-cluster', '--cluster-name', + self.name + ] + stdout, _, _ = vm_util.IssueCommand(create_cmd) + response = json.loads(stdout) + self.arn = response['cluster']['clusterArn'] + + def _Exists(self): + """Returns True if the cluster exists.""" + if not self.arn: + return False + describe_cmd = util.AWS_PREFIX + [ + '--region', self.region, 'ecs', 'describe-clusters', '--clusters', + self.arn + ] + stdout, _, _ = vm_util.IssueCommand(describe_cmd) + response = json.loads(stdout) + clusters = response['clusters'] + if not clusters or clusters[0]['status'] == 'INACTIVE': + return False + return True + + def _Delete(self): + """Deletes the cluster.""" + delete_cmd = util.AWS_PREFIX + [ + '--region', self.region, 'ecs', 'delete-cluster', '--cluster', self.name + ] + 
vm_util.IssueCommand(delete_cmd, raise_on_failure=False) + + def DeployContainer(self, name, container_spec): + """Deploys the container according to the spec.""" + if name not in self.task_defs: + task_def = TaskDefinition(name, container_spec, self) + self.task_defs[name] = task_def + task_def.Create() + task = EcsTask(name, container_spec, self) + self.containers[name].append(task) + task.Create() + + def DeployContainerService(self, name, container_spec): + """Deploys the container service according to the spec.""" + if name not in self.task_defs: + task_def = TaskDefinition(name, container_spec, self) + self.task_defs[name] = task_def + task_def.Create() + service = EcsService(name, container_spec, self) + self.services[name] = service + self.firewall.AllowPortInSecurityGroup(service.region, + service.security_group_id, + service.container_port) + service.Create() + + +class AwsKopsCluster(container_service.KubernetesCluster): + """Class representing a kops based Kubernetes cluster.""" + + CLOUD = providers.AWS + CLUSTER_TYPE = 'kops' + + def __init__(self, spec): + super(AwsKopsCluster, self).__init__(spec) + self.name += '.k8s.local' + self.config_bucket = 'kops-%s-%s' % (FLAGS.run_uri, str(uuid.uuid4())) + self.region = util.GetRegionFromZone(self.zone) + self.s3_service = s3.S3Service() + self.s3_service.PrepareService(self.region) + + def _CreateDependencies(self): + """Create the bucket to store cluster config.""" + self.s3_service.MakeBucket(self.config_bucket) + + def _DeleteDependencies(self): + """Delete the bucket that stores cluster config.""" + self.s3_service.DeleteBucket(self.config_bucket) + + def _Create(self): + """Creates the cluster.""" + # Create the cluster spec but don't provision any resources. + create_cmd = [ + FLAGS.kops, 'create', 'cluster', + '--name=%s' % self.name, + '--zones=%s' % self.zone, + '--node-count=%s' % self.num_nodes, + '--node-size=%s' % self.machine_type + ] + env = os.environ.copy() + env['KUBECONFIG'] = FLAGS.kubeconfig + env['KOPS_STATE_STORE'] = 's3://%s' % self.config_bucket + vm_util.IssueCommand(create_cmd, env=env) + + # Download the cluster spec and modify it. + get_cmd = [FLAGS.kops, 'get', 'cluster', self.name, '--output=yaml'] + stdout, _, _ = vm_util.IssueCommand(get_cmd, env=env) + spec = yaml.safe_load(stdout) + spec['metadata']['creationTimestamp'] = None + spec['spec']['api']['loadBalancer']['idleTimeoutSeconds'] = 3600 + benchmark_spec = context.GetThreadBenchmarkSpec() + spec['spec']['cloudLabels'] = { + 'owner': FLAGS.owner, + 'perfkitbenchmarker-run': FLAGS.run_uri, + 'benchmark': benchmark_spec.name, + 'perfkit_uuid': benchmark_spec.uuid, + 'benchmark_uid': benchmark_spec.uid + } + + # Replace the cluster spec. + with vm_util.NamedTemporaryFile() as tf: + yaml.dump(spec, tf) + tf.close() + replace_cmd = [FLAGS.kops, 'replace', '--filename=%s' % tf.name] + vm_util.IssueCommand(replace_cmd, env=env) + + # Create the actual cluster. 
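+    # 'kops update cluster --yes' applies the spec stored in the S3 state store
+    # (KOPS_STATE_STORE) and provisions the corresponding AWS resources.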
+ update_cmd = [FLAGS.kops, 'update', 'cluster', self.name, '--yes'] + vm_util.IssueCommand(update_cmd, env=env) + + def _Delete(self): + """Deletes the cluster.""" + super()._Delete() + delete_cmd = [ + FLAGS.kops, 'delete', 'cluster', + '--name=%s' % self.name, + '--state=s3://%s' % self.config_bucket, '--yes' + ] + vm_util.IssueCommand(delete_cmd, raise_on_failure=False) + + def _IsReady(self): + """Returns True if the cluster is ready, else False.""" + validate_cmd = [ + FLAGS.kops, 'validate', 'cluster', + '--name=%s' % self.name, + '--state=s3://%s' % self.config_bucket + ] + env = os.environ.copy() + env['KUBECONFIG'] = FLAGS.kubeconfig + _, _, retcode = vm_util.IssueCommand( + validate_cmd, env=env, suppress_warning=True, raise_on_failure=False) + return not retcode diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_dax.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_dax.py new file mode 100644 index 0000000..0104821 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_dax.py @@ -0,0 +1,173 @@ +# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for AWS' DAX cluster. + +DAX cluster can be created and deleted. +""" + + +import json +import logging +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import resource +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.aws import aws_iam_role +from perfkitbenchmarker.providers.aws import util + +_DAX_CLUSTER_NAME_TEMPLATE = 'pkb-dax-{uid}' +_DAX_SUBNET_GROUP_TEMPLATE = 'pkb-subnet-group-{uid}' +_DAX_SERVICE = 'dax.amazonaws.com' +_DAX_ACTION = 'dynamodb:*' +_DAX_TCP_PORT = 8011 +_DAX_ROLE_NAME_TEMPLATE = 'PkbDaxServiceRole{uid}' +_DAX_POLICY_NAME_TEMPLATE = 'PolicyForPkbDaxServiceRole{uid}' +_DAX_STATUS_AVAILABLE = 'available' +_DYNAMODB_RESOURCE_TEMPLATE = 'arn:aws:dynamodb:{region}:{account}:*' + +FLAGS = flags.FLAGS + + +class AwsDax(resource.BaseResource): + """Class representing an AWS Dax cluster.""" + + def __init__(self, benchmark_uid, zone, network): + super(AwsDax, self).__init__() + self.benchmark_uid = benchmark_uid + self.zone = zone + self.region = util.GetRegionFromZone(self.zone) + self.vpc = network.regional_network.vpc + self.subnet_id = network.subnet.id + self.account = util.GetAccount() + self.iam_role = aws_iam_role.AwsIamRole( + self.account, _DAX_ROLE_NAME_TEMPLATE.format(uid=self.benchmark_uid), + _DAX_POLICY_NAME_TEMPLATE.format(uid=self.benchmark_uid), _DAX_SERVICE, + _DAX_ACTION, + _DYNAMODB_RESOURCE_TEMPLATE.format( + region=self.region, account=self.account)) + self.cluster_endpoint = None + self.subnet_group_name = _DAX_SUBNET_GROUP_TEMPLATE.format( + uid=self.benchmark_uid) + self.cluster_name = _DAX_CLUSTER_NAME_TEMPLATE.format( + uid=self.benchmark_uid) + + def _CreateDependencies(self): + """See base class. + + Creates the IAM role and subnet group used by the DAX cluster. 
+ """ + + self.iam_role.Create() + cmd = util.AWS_PREFIX + [ + 'dax', 'create-subnet-group', '--subnet-group-name', + self.subnet_group_name, '--subnet-ids', self.subnet_id + ] + + _, stderror, retcode = vm_util.IssueCommand(cmd, raise_on_failure=True) + if retcode != 0: + logging.warning('Failed to create subnet group! %s', stderror) + + def _Create(self): + """See base class.""" + cmd = util.AWS_PREFIX + [ + 'dax', 'create-cluster', '--cluster-name', self.cluster_name, + '--node-type', FLAGS.aws_dax_node_type, '--replication-factor', + str(FLAGS.aws_dax_replication_factor), '--iam-role-arn', + self.iam_role.GetRoleArn(), '--subnet-group', self.subnet_group_name, + '--sse-specification', 'Enabled=true', '--region', self.region + ] + + _, stderror, retcode = vm_util.IssueCommand(cmd, raise_on_failure=True) + if retcode != 0: + logging.warning('Failed to create dax cluster! %s', stderror) + + def _DeleteDependencies(self): + """See base class.""" + cmd = util.AWS_PREFIX + [ + 'dax', 'delete-subnet-group', '--subnet-group-name', + self.subnet_group_name + ] + + _, stderror, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + if retcode != 0: + logging.warning('Failed to delete subnet group! %s', stderror) + + self.iam_role.Delete() + + def _Delete(self): + """See base class.""" + cmd = util.AWS_PREFIX + [ + 'dax', 'delete-cluster', '--cluster-name', self.cluster_name + ] + _, stderror, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + if retcode != 0: + logging.warning('Failed to delete dax cluster! %s', stderror) + + def _Exists(self): + """See base class.""" + cmd = util.AWS_PREFIX + [ + 'dax', 'describe-clusters', '--cluster-names', self.cluster_name + ] + _, _, retcode = vm_util.IssueCommand( + cmd, suppress_warning=True, raise_on_failure=False) + return not retcode + + def _IsReady(self): + """See base class. + + Returns: + True if the DAX cluster is ready. + """ + cmd = util.AWS_PREFIX + [ + 'dax', 'describe-clusters', '--cluster-names', self.cluster_name + ] + + stdout, _, retcode = vm_util.IssueCommand( + cmd, suppress_warning=True, raise_on_failure=False) + if retcode != 0 or not stdout: + return False + result = json.loads(stdout) + status = result['Clusters'][0]['Status'] + if not status: + return False + + if status == _DAX_STATUS_AVAILABLE and not self.cluster_endpoint: + endpoint = result['Clusters'][0]['ClusterDiscoveryEndpoint'] + self.cluster_endpoint = '{}:{}'.format(endpoint['Address'], + endpoint['Port']) + return status == _DAX_STATUS_AVAILABLE + + def _PostCreate(self): + """See base class. + + Enables the Dax Port on the security group's inbound rule. + """ + for security_group in self.vpc.GetSecurityGroups(): + if security_group['GroupName'] == 'default': + cmd = util.AWS_PREFIX + [ + 'ec2', 'authorize-security-group-ingress', '--group-id', + security_group['GroupId'], '--protocol', 'tcp', '--port', + str(_DAX_TCP_PORT) + ] + + _, stderror, retcode = vm_util.IssueCommand(cmd, raise_on_failure=True) + if retcode != 0: + logging.warning('Failed to config Dax port! 
%s', stderror) + + def GetClusterEndpoint(self): + """Returns the DAX cluster's endpoint.""" + if not self._IsReady(): + raise errors.Benchmarks.PrepareException( + 'GetEndpoint when preparing dax cluster: cluster not ready yet.') + return self.cluster_endpoint diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_disk.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_disk.py new file mode 100644 index 0000000..5fe8720 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_disk.py @@ -0,0 +1,565 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module containing classes related to AWS disks. + +Disks can be created, deleted, attached to VMs, and detached from VMs. +See http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html to +determine valid disk types. +See http://aws.amazon.com/ebs/details/ for more information about AWS (EBS) +disks. +""" + +import json +import logging +import string +import threading + +from perfkitbenchmarker import disk +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.configs import option_decoders +from perfkitbenchmarker.providers.aws import util + + +class AwsStateRetryableError(Exception): + """Error for retrying when an AWS disk is in a transitional state.""" + +VOLUME_EXISTS_STATUSES = frozenset(['creating', 'available', 'in-use', 'error']) +VOLUME_DELETED_STATUSES = frozenset(['deleting', 'deleted']) +VOLUME_KNOWN_STATUSES = VOLUME_EXISTS_STATUSES | VOLUME_DELETED_STATUSES + +STANDARD = 'standard' +GP2 = 'gp2' +GP3 = 'gp3' +IO1 = 'io1' +IO2 = 'io2' +ST1 = 'st1' +SC1 = 'sc1' + +DISK_TYPE = { + disk.STANDARD: STANDARD, + disk.REMOTE_SSD: GP2, + disk.PIOPS: IO1 +} + +DISK_METADATA = { + STANDARD: { + disk.MEDIA: disk.HDD, + disk.REPLICATION: disk.ZONE, + }, + GP2: { + disk.MEDIA: disk.SSD, + disk.REPLICATION: disk.ZONE, + }, + GP3: { + disk.MEDIA: disk.SSD, + disk.REPLICATION: disk.ZONE, + }, + IO1: { + disk.MEDIA: disk.SSD, + disk.REPLICATION: disk.ZONE, + }, + IO2: { + disk.MEDIA: disk.SSD, + disk.REPLICATION: disk.ZONE, + }, + ST1: { + disk.MEDIA: disk.HDD, + disk.REPLICATION: disk.ZONE + }, + SC1: { + disk.MEDIA: disk.HDD, + disk.REPLICATION: disk.ZONE + } +} + +LOCAL_SSD_METADATA = { + disk.MEDIA: disk.SSD, + disk.REPLICATION: disk.NONE, +} + +LOCAL_HDD_METADATA = { + disk.MEDIA: disk.HDD, + disk.REPLICATION: disk.NONE, +} + +LOCAL_HDD_PREFIXES = ['d2', 'hs1', 'h1', 'c1', 'cc2', 'm1', 'm2'] +# Following lists based on +# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html +NON_EBS_NVME_TYPES = [ + 'c4', + 'd2', + 'f1', + 'g3', + 'h1', + 'i3', + 'm4', + 'p2', + 'p3', + 'r4', + 't2', + 'x1', + 'x1e', + 'm1', + 'm3', + 'c1', + 'cc2', + 'c3', + 'm2', + 'cr1', + 'r3', + 'hs1', + 'i2', + 'g2', + 't1', +] +NON_LOCAL_NVME_TYPES = LOCAL_HDD_PREFIXES + [ + 'c3', 'cr1', 'g2', 'i2', 'm3', 'r3', 'x1', 'x1e'] + +# Following dictionary based on +# 
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html +NUM_LOCAL_VOLUMES = { + 'c1.medium': 1, + 'c1.xlarge': 4, + 'c3.large': 2, + 'c3.xlarge': 2, + 'c3.2xlarge': 2, + 'c3.4xlarge': 2, + 'c3.8xlarge': 2, + 'cc2.8xlarge': 4, + 'cg1.4xlarge': 2, + 'cr1.8xlarge': 2, + 'g2.2xlarge': 1, + 'hi1.4xlarge': 2, + 'hs1.8xlarge': 24, + 'i2.xlarge': 1, + 'i2.2xlarge': 2, + 'i2.4xlarge': 4, + 'i2.8xlarge': 8, + 'm1.small': 1, + 'm1.medium': 1, + 'm1.large': 2, + 'm1.xlarge': 4, + 'm2.xlarge': 1, + 'm2.2xlarge': 1, + 'm2.4xlarge': 2, + 'm3.medium': 1, + 'm3.large': 1, + 'm3.xlarge': 2, + 'm3.2xlarge': 2, + 'r3.large': 1, + 'r3.xlarge': 1, + 'r3.2xlarge': 1, + 'r3.4xlarge': 1, + 'r3.8xlarge': 2, + 'd2.xlarge': 3, + 'd2.2xlarge': 6, + 'd2.4xlarge': 12, + 'd2.8xlarge': 24, + 'd3.xlarge': 3, + 'd3.2xlarge': 6, + 'd3.4xlarge': 12, + 'd3.8xlarge': 24, + 'd3en.large': 1, + 'd3en.xlarge': 2, + 'd3en.2xlarge': 4, + 'd3en.4xlarge': 8, + 'd3en.6xlarge': 12, + 'd3en.8xlarge': 16, + 'd3en.12xlarge': 24, + 'i3.large': 1, + 'i3.xlarge': 1, + 'i3.2xlarge': 1, + 'i3.4xlarge': 2, + 'i3.8xlarge': 4, + 'i3.16xlarge': 8, + 'i3.metal': 8, + 'i4i.large': 1, + 'i4i.xlarge': 1, + 'i4i.2xlarge': 1, + 'i4i.4xlarge': 1, + 'i4i.8xlarge': 2, + 'i4i.16xlarge': 4, + 'i4i.32xlarge': 8, + 'is4gen.medium': 1, + 'is4gen.large': 1, + 'is4gen.xlarge': 1, + 'is4gen.2xlarge': 1, + 'is4gen.4xlarge': 2, + 'is4gen.8xlarge': 4, + 'im4gn.large': 1, + 'im4gn.xlarge': 1, + 'im4gn.2xlarge': 1, + 'im4gn.4xlarge': 1, + 'im4gn.8xlarge': 2, + 'im4gn.16xlarge': 4, + 'i3en.large': 1, + 'i3en.xlarge': 1, + 'i3en.2xlarge': 2, + 'i3en.3xlarge': 1, + 'i3en.6xlarge': 2, + 'i3en.12xlarge': 4, + 'i3en.24xlarge': 8, + 'i3en.metal': 8, + 'c5ad.large': 1, + 'c5ad.xlarge': 1, + 'c5ad.2xlarge': 1, + 'c5ad.4xlarge': 2, + 'c5ad.8xlarge': 2, + 'c5ad.12xlarge': 2, + 'c5ad.16xlarge': 2, + 'c5ad.24xlarge': 2, + 'c5d.large': 1, + 'c5d.xlarge': 1, + 'c5d.2xlarge': 1, + 'c5d.4xlarge': 1, + 'c5d.9xlarge': 1, + 'c5d.12xlarge': 2, + 'c5d.18xlarge': 2, + 'c5d.24xlarge': 4, + 'c5d.metal': 4, + 'c6gd.large': 1, + 'c6gd.xlarge': 1, + 'c6gd.2xlarge': 1, + 'c6gd.4xlarge': 1, + 'c6gd.8xlarge': 1, + 'c6gd.12xlarge': 2, + 'c6gd.16xlarge': 2, + 'c6gd.metal': 2, + 'm5d.large': 1, + 'm5d.xlarge': 1, + 'm5d.2xlarge': 1, + 'm5d.4xlarge': 2, + 'm5d.8xlarge': 2, + 'm5d.12xlarge': 2, + 'm5d.24xlarge': 4, + 'm5d.metal': 4, + 'm5ad.large': 1, + 'm5ad.xlarge': 1, + 'm5ad.2xlarge': 1, + 'm5ad.4xlarge': 2, + 'm5ad.8xlarge': 2, + 'm5ad.12xlarge': 2, + 'm5ad.16xlarge': 4, + 'm5ad.24xlarge': 4, + 'm6gd.large': 1, + 'm6gd.xlarge': 1, + 'm6gd.2xlarge': 1, + 'm6gd.4xlarge': 1, + 'm6gd.8xlarge': 1, + 'm6gd.12xlarge': 2, + 'm6gd.16xlarge': 2, + 'm6gd.metal': 2, + 'r5d.large': 1, + 'r5d.xlarge': 1, + 'r5d.2xlarge': 1, + 'r5d.4xlarge': 2, + 'r5d.12xlarge': 2, + 'r5d.24xlarge': 4, + 'z1d.large': 1, + 'z1d.xlarge': 1, + 'z1d.2xlarge': 1, + 'z1d.3xlarge': 2, + 'z1d.6xlarge': 1, + 'z1d.12xlarge': 2, + 'x1.16xlarge': 1, + 'x1.32xlarge': 2, + 'x1e.xlarge': 1, + 'x1e.2xlarge': 1, + 'x1e.4xlarge': 1, + 'x1e.8xlarge': 1, + 'x1e.16xlarge': 1, + 'x1e.32xlarge': 2, + 'f1.2xlarge': 1, + 'f1.4xlarge': 1, + 'f1.16xlarge': 4, + 'p3dn.24xlarge': 2, + 'm5d.metal': 4, + 'c5d.metal': 4, + 'm6gd.metal': 2, + 'm5dn.large': 1, + 'm5dn.xlarge': 1, + 'm5dn.2xlarge': 1, + 'm5dn.4xlarge': 2, + 'm5dn.8xlarge': 2, + 'm5dn.12xlarge': 2, + 'm5dn.16xlarge': 4, + 'm5dn.24xlarge': 4, + 'm5dn.metal': 4, + 'p4d.24xlarge': 8 +} + + +def LocalDiskIsHDD(machine_type): + """Check whether the local disks use spinning magnetic 
storage.""" + return machine_type.split('.')[0].lower() in LOCAL_HDD_PREFIXES + + +def LocalDriveIsNvme(machine_type): + """Check if the machine type uses NVMe driver.""" + return machine_type.split('.')[0].lower() not in NON_LOCAL_NVME_TYPES + + +def EbsDriveIsNvme(machine_type): + """Check if the machine type uses NVMe driver.""" + instance_family = machine_type.split('.')[0].lower() + return (instance_family not in NON_EBS_NVME_TYPES or + 'metal' in machine_type) + + +AWS = 'AWS' +disk.RegisterDiskTypeMap(AWS, DISK_TYPE) + + +class AwsDiskSpec(disk.BaseDiskSpec): + """Object holding the information needed to create an AwsDisk. + + Attributes: + iops: None or int. IOPS for Provisioned IOPS (SSD) volumes in AWS. + throughput: None or int. Throughput for (SSD) volumes in AWS. + """ + + CLOUD = providers.AWS + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Modifies config options based on runtime flag values. + + Can be overridden by derived classes to add support for specific flags. + + Args: + config_values: dict mapping config option names to provided values. May + be modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. + """ + super(AwsDiskSpec, cls)._ApplyFlags(config_values, flag_values) + if flag_values['aws_provisioned_iops'].present: + config_values['iops'] = flag_values.aws_provisioned_iops + if flag_values['aws_provisioned_throughput'].present: + config_values['throughput'] = flag_values.aws_provisioned_throughput + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword + arguments to construct in order to decode the named option. 
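+
+        For illustration (shape only; these are the AWS-specific entries
+        added by this override), an entry looks like:
+
+          'iops': (option_decoders.IntDecoder,
+                   {'default': None, 'none_ok': True})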
+ """ + result = super(AwsDiskSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'iops': (option_decoders.IntDecoder, { + 'default': None, + 'none_ok': True + }) + }) + result.update({ + 'throughput': (option_decoders.IntDecoder, { + 'default': None, + 'none_ok': True + }) + }) + return result + + +class AwsDisk(disk.BaseDisk): + """Object representing an Aws Disk.""" + + _lock = threading.Lock() + vm_devices = {} + + def __init__(self, disk_spec, zone, machine_type): + super(AwsDisk, self).__init__(disk_spec) + self.iops = disk_spec.iops + self.throughput = disk_spec.throughput + self.id = None + self.zone = zone + self.region = util.GetRegionFromZone(zone) + self.device_letter = None + self.attached_vm_id = None + self.machine_type = machine_type + if self.disk_type != disk.LOCAL: + self.metadata.update(DISK_METADATA.get(self.disk_type, {})) + else: + self.metadata.update((LOCAL_HDD_METADATA + if LocalDiskIsHDD(machine_type) + else LOCAL_SSD_METADATA)) + if self.iops: + self.metadata['iops'] = self.iops + if self.throughput: + self.metadata['throughput'] = self.throughput + + def AssignDeviceLetter(self, letter_suggestion, nvme_boot_drive_index): + if (LocalDriveIsNvme(self.machine_type) and + EbsDriveIsNvme(self.machine_type)): + first_device_letter = 'b' + local_drive_number = ord(letter_suggestion) - ord(first_device_letter) + logging.info('local drive number is: %d', local_drive_number) + if local_drive_number < nvme_boot_drive_index: + self.device_letter = letter_suggestion + else: + # skip the boot drive + self.device_letter = chr(ord(letter_suggestion) + 1) + else: + self.device_letter = letter_suggestion + + def _Create(self): + """Creates the disk.""" + create_cmd = util.AWS_PREFIX + [ + 'ec2', + 'create-volume', + '--region=%s' % self.region, + '--size=%s' % self.disk_size, + '--volume-type=%s' % self.disk_type] + if not util.IsRegion(self.zone): + create_cmd.append('--availability-zone=%s' % self.zone) + if self.disk_type in [IO1, IO2]: + create_cmd.append('--iops=%s' % self.iops) + if self.disk_type == GP3 and self.iops: + create_cmd.append('--iops=%s' % self.iops) + if self.disk_type == GP3 and self.throughput: + create_cmd.append('--throughput=%s' % self.throughput) + stdout, _, _ = vm_util.IssueCommand(create_cmd) + response = json.loads(stdout) + self.id = response['VolumeId'] + util.AddDefaultTags(self.id, self.region) + + def _Delete(self): + """Deletes the disk.""" + delete_cmd = util.AWS_PREFIX + [ + 'ec2', + 'delete-volume', + '--region=%s' % self.region, + '--volume-id=%s' % self.id] + logging.info('Deleting AWS volume %s. This may fail if the disk is not ' + 'yet detached, but will be retried.', self.id) + vm_util.IssueCommand(delete_cmd, raise_on_failure=False) + + def _Exists(self): + """Returns true if the disk exists.""" + describe_cmd = util.AWS_PREFIX + [ + 'ec2', + 'describe-volumes', + '--region=%s' % self.region, + '--filter=Name=volume-id,Values=%s' % self.id] + stdout, _ = util.IssueRetryableCommand(describe_cmd) + response = json.loads(stdout) + volumes = response['Volumes'] + assert len(volumes) < 2, 'Too many volumes.' + if not volumes: + return False + status = volumes[0]['State'] + assert status in VOLUME_KNOWN_STATUSES, status + return status in VOLUME_EXISTS_STATUSES + + @vm_util.Retry( + poll_interval=0.5, + log_errors=True, + retryable_exceptions=(AwsStateRetryableError,)) + def _WaitForAttachedState(self): + """Returns if the state of the disk is attached. + + Returns: + Whether the disk is in an attached state. 
If not, raises an + error. + + Raises: + AwsUnknownStatusError: If an unknown status is returned from AWS. + AwsStateRetryableError: If the disk attach is pending. This is retried. + """ + describe_cmd = util.AWS_PREFIX + [ + 'ec2', + 'describe-volumes', + '--region=%s' % self.region, + '--volume-ids=%s' % self.id, + ] + + stdout, _ = util.IssueRetryableCommand(describe_cmd) + response = json.loads(stdout) + status = response['Volumes'][0]['Attachments'][0]['State'] + if status.lower() != 'attached': + logging.info('Disk (id:%s) attaching to ' + 'VM (id:%s) has status %s.', + self.id, self.attached_vm_id, status) + + raise AwsStateRetryableError() + + def Attach(self, vm): + """Attaches the disk to a VM. + + Args: + vm: The AwsVirtualMachine instance to which the disk will be attached. + """ + with self._lock: + self.attached_vm_id = vm.id + if self.attached_vm_id not in AwsDisk.vm_devices: + AwsDisk.vm_devices[self.attached_vm_id] = set( + string.ascii_lowercase) + self.device_letter = min(AwsDisk.vm_devices[self.attached_vm_id]) + AwsDisk.vm_devices[self.attached_vm_id].remove(self.device_letter) + + device_name = '/dev/xvdb%s' % self.device_letter + attach_cmd = util.AWS_PREFIX + [ + 'ec2', + 'attach-volume', + '--region=%s' % self.region, + '--instance-id=%s' % self.attached_vm_id, + '--volume-id=%s' % self.id, + '--device=%s' % device_name] + logging.info('Attaching AWS volume %s. This may fail if the disk is not ' + 'ready, but will be retried.', self.id) + util.IssueRetryableCommand(attach_cmd) + self._WaitForAttachedState() + + def Detach(self): + """Detaches the disk from a VM.""" + detach_cmd = util.AWS_PREFIX + [ + 'ec2', + 'detach-volume', + '--region=%s' % self.region, + '--instance-id=%s' % self.attached_vm_id, + '--volume-id=%s' % self.id] + util.IssueRetryableCommand(detach_cmd) + + with self._lock: + assert self.attached_vm_id in AwsDisk.vm_devices + AwsDisk.vm_devices[self.attached_vm_id].add(self.device_letter) + self.attached_vm_id = None + self.device_letter = None + + def GetDevicePath(self): + """Returns the path to the device inside the VM.""" + if self.disk_type == disk.LOCAL: + if LocalDriveIsNvme(self.machine_type): + first_device_letter = 'b' + return '/dev/nvme%sn1' % str( + ord(self.device_letter) - ord(first_device_letter)) + return '/dev/xvd%s' % self.device_letter + else: + if EbsDriveIsNvme(self.machine_type): + first_device_letter = 'a' + # Modified by Cumulus - upstream code breaks on instances with EBS-only NVMe drives + if self.machine_type in NUM_LOCAL_VOLUMES: + return '/dev/nvme%sn1' % ( + 1 + NUM_LOCAL_VOLUMES[self.machine_type] + + ord(self.device_letter) - ord(first_device_letter)) + else: + return '/dev/nvme%sn1' % ( + 1 + ord(self.device_letter) - ord(first_device_letter)) + # End Cumulus mods + else: + return '/dev/xvdb%s' % self.device_letter diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_dpb_emr.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_dpb_emr.py new file mode 100644 index 0000000..87e90eb --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_dpb_emr.py @@ -0,0 +1,409 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for AWS's EMR service. + +Clusters can be created and deleted. +""" + +import collections +import json +import logging +from typing import Optional + +from absl import flags +from perfkitbenchmarker import disk +from perfkitbenchmarker import dpb_service +from perfkitbenchmarker import errors +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.aws import aws_disk +from perfkitbenchmarker.providers.aws import aws_network +from perfkitbenchmarker.providers.aws import aws_virtual_machine +from perfkitbenchmarker.providers.aws import s3 +from perfkitbenchmarker.providers.aws import util + +FLAGS = flags.FLAGS +flags.DEFINE_string('dpb_emr_release_label', None, + 'DEPRECATED use dpb_service.version.') + +INVALID_STATES = ['TERMINATED_WITH_ERRORS', 'TERMINATED'] +READY_CHECK_SLEEP = 30 +READY_CHECK_TRIES = 60 +READY_STATE = 'WAITING' +JOB_WAIT_SLEEP = 30 +EMR_TIMEOUT = 14400 + +disk_to_hdfs_map = { + aws_disk.ST1: 'HDD (ST1)', + aws_disk.GP2: 'SSD (GP2)', + disk.LOCAL: 'Local SSD', +} + +DATAPROC_TO_EMR_CONF_FILES = { + # https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-configure-apps.html + 'core': 'core-site', + 'hdfs': 'hdfs-site', + # https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-configure.html + 'spark': 'spark-defaults', +} + + +def _GetClusterConfiguration(): + """Return a JSON string containing dpb_cluster_properties.""" + properties = collections.defaultdict(lambda: {}) + for entry in FLAGS.dpb_cluster_properties: + file, kv = entry.split(':') + key, value = kv.split('=') + if file not in DATAPROC_TO_EMR_CONF_FILES: + raise errors.Config.InvalidValue( + 'Unsupported EMR configuration file "{}". '.format(file) + + 'Please add it to aws_dpb_emr.DATAPROC_TO_EMR_CONF_FILES.') + properties[DATAPROC_TO_EMR_CONF_FILES[file]][key] = value + json_conf = [] + for file, props in properties.items(): + json_conf.append({ + # https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-configure-apps.html + 'Classification': file, + 'Properties': props, + }) + return json.dumps(json_conf) + + +class EMRRetryableException(Exception): + pass + + +class AwsDpbEmr(dpb_service.BaseDpbService): + """Object representing a AWS EMR cluster. + + Attributes: + cluster_id: ID of the cluster. + project: ID of the project in which the cluster is being launched. + dpb_service_type: Set to 'emr'. + cmd_prefix: Setting default prefix for the emr commands (region optional). + network: Dedicated network for the EMR cluster + storage_service: Region specific instance of S3 for bucket management. + bucket_to_delete: Cluster associated bucket to be cleaned up. + dpb_version: EMR version to use. 
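+
+  The release label used for provisioning resolves from the deprecated
+  --dpb_emr_release_label flag, falling back to dpb_service.version
+  (illustrative form: an 'emr-x.y.z' style label).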
+ """ + + CLOUD = providers.AWS + SERVICE_TYPE = 'emr' + + def __init__(self, dpb_service_spec): + super(AwsDpbEmr, self).__init__(dpb_service_spec) + self.dpb_service_type = AwsDpbEmr.SERVICE_TYPE + self.project = None + self.cmd_prefix = list(util.AWS_PREFIX) + if self.dpb_service_zone: + self.region = util.GetRegionFromZone(self.dpb_service_zone) + else: + raise errors.Setup.InvalidSetupError( + 'dpb_service_zone must be provided, for provisioning.') + self.cmd_prefix += ['--region', self.region] + self.network = aws_network.AwsNetwork.GetNetworkFromNetworkSpec( + aws_network.AwsNetworkSpec(zone=self.dpb_service_zone)) + self.storage_service = s3.S3Service() + self.storage_service.PrepareService(self.region) + self.persistent_fs_prefix = 's3://' + self.bucket_to_delete = None + self.dpb_version = FLAGS.dpb_emr_release_label or self.dpb_version + self._cluster_create_time = None + if not self.dpb_version: + raise errors.Setup.InvalidSetupError( + 'dpb_service.version must be provided.') + + def GetClusterCreateTime(self) -> Optional[float]: + """Returns the cluster creation time. + + On this implementation, the time returned is based on the timestamps + reported by the EMR API (which is stored in the _cluster_create_time + attribute). + + Returns: + A float representing the creation time in seconds or None. + """ + return self._cluster_create_time + + @staticmethod + def CheckPrerequisites(benchmark_config): + del benchmark_config # Unused + + @property + def security_group_id(self): + """Returns the security group ID of this Cluster.""" + return self.network.regional_network.vpc.default_security_group_id + + def _CreateDependencies(self): + """Set up the ssh key.""" + super(AwsDpbEmr, self)._CreateDependencies() + aws_virtual_machine.AwsKeyFileManager.ImportKeyfile(self.region) + + def _Create(self): + """Creates the cluster.""" + name = 'pkb_' + FLAGS.run_uri + + # Set up ebs details if disk_spec is present in the config + ebs_configuration = None + if self.spec.worker_group.disk_spec: + # Make sure nothing we are ignoring is included in the disk spec + assert self.spec.worker_group.disk_spec.device_path is None + assert self.spec.worker_group.disk_spec.disk_number is None + assert self.spec.worker_group.disk_spec.iops is None + self.dpb_hdfs_type = disk_to_hdfs_map[ + self.spec.worker_group.disk_spec.disk_type] + if self.spec.worker_group.disk_spec.disk_type != disk.LOCAL: + ebs_configuration = {'EbsBlockDeviceConfigs': [ + {'VolumeSpecification': { + 'SizeInGB': self.spec.worker_group.disk_spec.disk_size, + 'VolumeType': self.spec.worker_group.disk_spec.disk_type}, + 'VolumesPerInstance': self.spec.worker_group.disk_count}]} + + # Create the specification for the master and the worker nodes + instance_groups = [] + core_instances = {'InstanceCount': self.spec.worker_count, + 'InstanceGroupType': 'CORE', + 'InstanceType': + self.spec.worker_group.vm_spec.machine_type} + if ebs_configuration: + core_instances.update({'EbsConfiguration': ebs_configuration}) + + master_instance = {'InstanceCount': 1, + 'InstanceGroupType': 'MASTER', + 'InstanceType': + self.spec.worker_group.vm_spec.machine_type} + if ebs_configuration: + master_instance.update({'EbsConfiguration': ebs_configuration}) + + instance_groups.append(core_instances) + instance_groups.append(master_instance) + + # Spark SQL needs to access Hive + cmd = self.cmd_prefix + ['emr', 'create-cluster', '--name', name, + '--release-label', self.dpb_version, + '--use-default-roles', + '--instance-groups', + json.dumps(instance_groups), 
+ '--application', 'Name=Spark', + 'Name=Hadoop', 'Name=Hive', + '--log-uri', self.base_dir] + + ec2_attributes = [ + 'KeyName=' + aws_virtual_machine.AwsKeyFileManager.GetKeyNameForRun(), + 'SubnetId=' + self.network.subnet.id, + # Place all VMs in default security group for simplicity and speed of + # provisioning + 'EmrManagedMasterSecurityGroup=' + self.security_group_id, + 'EmrManagedSlaveSecurityGroup=' + self.security_group_id, + ] + cmd += ['--ec2-attributes', ','.join(ec2_attributes)] + + if FLAGS.dpb_cluster_properties: + cmd += ['--configurations', _GetClusterConfiguration()] + + stdout, _, _ = vm_util.IssueCommand(cmd) + result = json.loads(stdout) + self.cluster_id = result['ClusterId'] + logging.info('Cluster created with id %s', self.cluster_id) + for tag_key, tag_value in util.MakeDefaultTags().items(): + self._AddTag(tag_key, tag_value) + + def _AddTag(self, key, value): + cmd = self.cmd_prefix + ['emr', 'add-tags', + '--resource-id', self.cluster_id, + '--tag', + '{}={}'.format(key, value)] + vm_util.IssueCommand(cmd) + + def _Delete(self): + if self.cluster_id: + delete_cmd = self.cmd_prefix + ['emr', + 'terminate-clusters', + '--cluster-ids', + self.cluster_id] + vm_util.IssueCommand(delete_cmd, raise_on_failure=False) + + def _DeleteDependencies(self): + super(AwsDpbEmr, self)._DeleteDependencies() + aws_virtual_machine.AwsKeyFileManager.DeleteKeyfile(self.region) + + def _Exists(self): + """Check to see whether the cluster exists.""" + if not self.cluster_id: + return False + cmd = self.cmd_prefix + ['emr', + 'describe-cluster', + '--cluster-id', + self.cluster_id] + stdout, _, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + if retcode != 0: + return False + result = json.loads(stdout) + if result['Cluster']['Status']['State'] in INVALID_STATES: + return False + else: + return True + + def _IsReady(self): + """Check to see if the cluster is ready.""" + logging.info('Checking _Ready cluster: %s', self.cluster_id) + cmd = self.cmd_prefix + ['emr', + 'describe-cluster', '--cluster-id', + self.cluster_id] + stdout, _, _ = vm_util.IssueCommand(cmd) + result = json.loads(stdout) + # TODO(saksena): Handle error outcomees when spinning up emr clusters + is_ready = result['Cluster']['Status']['State'] == READY_STATE + if is_ready: + self._cluster_create_time = self._ParseClusterCreateTime(result) + return is_ready + + @classmethod + def _ParseClusterCreateTime(cls, data) -> Optional[float]: + """Parses the cluster create time from an API response dict.""" + creation_ts = None + ready_ts = None + try: + creation_ts = data['Cluster']['Status']['Timeline']['CreationDateTime'] + ready_ts = data['Cluster']['Status']['Timeline']['ReadyDateTime'] + return ready_ts - creation_ts + except (LookupError, TypeError): + return None + + def _GetCompletedJob(self, job_id): + """See base class.""" + cmd = self.cmd_prefix + [ + 'emr', 'describe-step', '--cluster-id', self.cluster_id, '--step-id', + job_id + ] + stdout, stderr, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + if retcode: + if 'ThrottlingException' in stderr: + logging.warning('Rate limited while polling EMR step:\n%s\nRetrying.', + stderr) + return None + else: + raise errors.VmUtil.IssueCommandError( + f'Getting step status failed:\n{stderr}') + result = json.loads(stdout) + state = result['Step']['Status']['State'] + if state == 'FAILED': + raise dpb_service.JobSubmissionError( + result['Step']['Status']['FailureDetails']) + if state == 'COMPLETED': + pending_time = 
result['Step']['Status']['Timeline']['CreationDateTime'] + start_time = result['Step']['Status']['Timeline']['StartDateTime'] + end_time = result['Step']['Status']['Timeline']['EndDateTime'] + return dpb_service.JobResult( + run_time=end_time - start_time, + pending_time=start_time - pending_time) + + def SubmitJob(self, + jarfile=None, + classname=None, + pyspark_file=None, + query_file=None, + job_poll_interval=5, + job_arguments=None, + job_files=None, + job_jars=None, + job_stdout_file=None, + job_type=None, + properties=None): + """See base class.""" + if job_arguments: + # Escape commas in arguments + job_arguments = (arg.replace(',', '\\,') for arg in job_arguments) + + all_properties = self.GetJobProperties() + all_properties.update(properties or {}) + + if job_type == 'hadoop': + if not (jarfile or classname): + raise ValueError('You must specify jarfile or classname.') + if jarfile and classname: + raise ValueError('You cannot specify both jarfile and classname.') + arg_list = [] + # Order is important + if classname: + # EMR does not support passing classnames as jobs. Instead manually + # invoke `hadoop CLASSNAME` using command-runner.jar + jarfile = 'command-runner.jar' + arg_list = ['hadoop', classname] + # Order is important + arg_list += ['-D{}={}'.format(k, v) for k, v in all_properties.items()] + if job_arguments: + arg_list += job_arguments + arg_spec = 'Args=[' + ','.join(arg_list) + ']' + step_list = ['Jar=' + jarfile, arg_spec] + elif job_type == self.SPARK_JOB_TYPE: + arg_list = [] + if job_files: + arg_list += ['--files', ','.join(job_files)] + if job_jars: + arg_list += ['--jars', ','.join(job_jars)] + for k, v in all_properties.items(): + arg_list += ['--conf', '{}={}'.format(k, v)] + # jarfile must be last before args + arg_list += ['--class', classname, jarfile] + if job_arguments: + arg_list += job_arguments + arg_spec = '[' + ','.join(arg_list) + ']' + step_type_spec = 'Type=Spark' + step_list = [step_type_spec, 'Args=' + arg_spec] + elif job_type == self.PYSPARK_JOB_TYPE: + arg_list = [] + if job_files: + arg_list += ['--files', ','.join(job_files)] + if job_jars: + arg_list += ['--jars', ','.join(job_jars)] + for k, v in all_properties.items(): + arg_list += ['--conf', '{}={}'.format(k, v)] + # pyspark_file must be last before args + arg_list += [pyspark_file] + if job_arguments: + arg_list += job_arguments + arg_spec = 'Args=[{}]'.format(','.join(arg_list)) + step_list = ['Type=Spark', arg_spec] + elif job_type == self.SPARKSQL_JOB_TYPE: + assert not job_arguments + arg_list = [query_file] + jar_spec = 'Jar="command-runner.jar"' + for k, v in all_properties.items(): + arg_list += ['--conf', '{}={}'.format(k, v)] + arg_spec = 'Args=[spark-sql,-f,{}]'.format(','.join(arg_list)) + step_list = [jar_spec, arg_spec] + + step_string = ','.join(step_list) + + step_cmd = self.cmd_prefix + ['emr', + 'add-steps', + '--cluster-id', + self.cluster_id, + '--steps', + step_string] + stdout, _, _ = vm_util.IssueCommand(step_cmd) + result = json.loads(stdout) + step_id = result['StepIds'][0] + return self._WaitForJob(step_id, EMR_TIMEOUT, job_poll_interval) + + def DistributedCopy(self, source, destination): + """Method to copy data using a distributed job on the cluster.""" + job_arguments = ['s3-dist-cp'] + job_arguments.append('--src={}'.format(source)) + job_arguments.append('--dest={}'.format(destination)) + return self.SubmitJob( + 'command-runner.jar', + job_arguments=job_arguments, + job_type=dpb_service.BaseDpbService.HADOOP_JOB_TYPE) diff --git 
a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_dynamodb.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_dynamodb.py new file mode 100644 index 0000000..35282ea --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_dynamodb.py @@ -0,0 +1,494 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for AWS' dynamodb tables. + +Tables can be created and deleted. +""" + +import json +import logging +from typing import Any, Collection, Dict, List, Optional, Tuple, Sequence + +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import non_relational_db +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.configs import option_decoders +from perfkitbenchmarker.providers.aws import util + +FLAGS = flags.FLAGS +flags.DEFINE_string( + 'aws_dynamodb_primarykey', None, + 'The primaryKey of dynamodb table. This switches to sortkey if using sort.' + 'If testing GSI/LSI, use the range keyname of the index you want to test.' + 'Defaults to primary_key') +flags.DEFINE_boolean( + 'aws_dynamodb_use_sort', None, + 'Determine whether to use sort key or not. Defaults to False.') +flags.DEFINE_string( + 'aws_dynamodb_sortkey', None, + 'The sortkey of dynamodb table. This switches to primarykey if using sort.' + 'If testing GSI/LSI, use the primary keyname of the index you want to test.' + 'Defaults to sort_key.') +flags.DEFINE_enum( + 'aws_dynamodb_attributetype', None, ['S', 'N', 'B'], + 'The type of attribute, default to S (String).' + 'Alternates are N (Number) and B (Binary).' + 'Defaults to S.') +flags.DEFINE_integer('aws_dynamodb_read_capacity', None, + 'Set RCU for dynamodb table. Defaults to 25.') +flags.DEFINE_integer('aws_dynamodb_write_capacity', None, + 'Set WCU for dynamodb table. Defaults to 25.') +flags.DEFINE_integer('aws_dynamodb_lsi_count', None, + 'Set amount of Local Secondary Indexes. Only set 0-5.' + 'Defaults to 0.') +flags.DEFINE_integer('aws_dynamodb_gsi_count', None, + 'Set amount of Global Secondary Indexes. Only set 0-5.' 
+ 'Defaults to 0.') + +# Throughput constants +_FREE_TIER_RCU = 25 +_FREE_TIER_WCU = 25 + +_DEFAULT_ZONE = 'us-east-1b' + + +class DynamoDbSpec(non_relational_db.BaseNonRelationalDbSpec): + """Configurable options of a DynamoDB instance.""" + + SERVICE_TYPE = non_relational_db.DYNAMODB + + table_name: str + zone: str + rcu: int + wcu: int + primary_key: str + sort_key: str + attribute_type: str + lsi_count: int + gsi_count: int + use_sort: bool + + def __init__(self, component_full_name, flag_values, **kwargs): + super().__init__(component_full_name, flag_values=flag_values, **kwargs) + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes / constructor args for each configurable option.""" + result = super()._GetOptionDecoderConstructions() + none_ok = {'default': None, 'none_ok': False} + result.update({ + 'table_name': (option_decoders.StringDecoder, none_ok), + 'zone': (option_decoders.StringDecoder, none_ok), + 'rcu': (option_decoders.IntDecoder, none_ok), + 'wcu': (option_decoders.IntDecoder, none_ok), + 'primary_key': (option_decoders.StringDecoder, none_ok), + 'sort_key': (option_decoders.StringDecoder, none_ok), + 'attribute_type': (option_decoders.StringDecoder, none_ok), + 'lsi_count': (option_decoders.IntDecoder, none_ok), + 'gsi_count': (option_decoders.IntDecoder, none_ok), + 'use_sort': (option_decoders.BooleanDecoder, none_ok), + }) + return result + + @classmethod + def _ValidateConfig(cls, config_values) -> None: + if 'lsi_count' in config_values: + if not -1 < config_values['lsi_count'] < 6: + raise errors.Config.InvalidValue('lsi_count must be from 0-5') + if (not config_values.get('use_sort', False) and + config_values['lsi_count'] != 0): + raise errors.Config.InvalidValue('lsi_count requires use_sort=True') + if not -1 < config_values.get('gsi_count', 0) < 6: + raise errors.Config.InvalidValue('gsi_count must be from 0-5') + + @classmethod + def _ApplyFlags(cls, config_values, flag_values) -> None: + """Modifies config options based on runtime flag values. + + Can be overridden by derived classes to add support for specific flags. + + Args: + config_values: dict mapping config option names to provided values. May be + modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. + """ + super()._ApplyFlags(config_values, flag_values) + option_name_from_flag = { + 'aws_dynamodb_read_capacity': 'rcu', + 'aws_dynamodb_write_capacity': 'wcu', + 'aws_dynamodb_primarykey': 'primary_key', + 'aws_dynamodb_sortkey': 'sort_key', + 'aws_dynamodb_attributetype': 'attribute_type', + 'aws_dynamodb_lsi_count': 'lsi_count', + 'aws_dynamodb_gsi_count': 'gsi_count', + 'aws_dynamodb_use_sort': 'use_sort', + } + for flag_name, option_name in option_name_from_flag.items(): + if flag_values[flag_name].present: + config_values[option_name] = flag_values[flag_name].value + + # Handle the zone flag. 
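+    # Note: only the first zone is used; e.g. --zones=us-east-1a,us-east-1b
+    # yields config_values['zone'] == 'us-east-1a'.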
+ for zone_flag_name in ['zone', 'zones']: + if flag_values[zone_flag_name].present: + config_values['zone'] = flag_values[zone_flag_name].value[0] + + cls._ValidateConfig(config_values) + + def __repr__(self) -> str: + return str(self.__dict__) + + +class AwsDynamoDBInstance(non_relational_db.BaseNonRelationalDb): + """Class for working with DynamoDB.""" + SERVICE_TYPE = non_relational_db.DYNAMODB + + def __init__(self, + table_name: Optional[str] = None, + zone: Optional[str] = None, + rcu: Optional[int] = None, + wcu: Optional[int] = None, + primary_key: Optional[str] = None, + sort_key: Optional[str] = None, + attribute_type: Optional[str] = None, + lsi_count: Optional[int] = None, + gsi_count: Optional[int] = None, + use_sort: Optional[bool] = None, + **kwargs): + super(AwsDynamoDBInstance, self).__init__(**kwargs) + self.table_name = table_name or f'pkb-{FLAGS.run_uri}' + self.zone = zone or _DEFAULT_ZONE + self.region = util.GetRegionFromZone(self.zone) + self.resource_arn: str = None # Set during the _Exists() call. + + self.rcu = rcu or _FREE_TIER_RCU + self.wcu = wcu or _FREE_TIER_WCU + self.throughput = ( + f'ReadCapacityUnits={self.rcu},WriteCapacityUnits={self.wcu}') + + self.primary_key = primary_key or 'primary_key' + self.sort_key = sort_key or 'sort_key' + self.use_sort = use_sort or False + self.attribute_type = attribute_type or 'S' + + self.lsi_count = lsi_count or 0 + self.lsi_indexes = self._CreateLocalSecondaryIndex() + self.gsi_count = gsi_count or 0 + self.gsi_indexes = self._CreateGlobalSecondaryIndex() + + @classmethod + def FromSpec(cls, spec: DynamoDbSpec) -> 'AwsDynamoDBInstance': + return cls( + table_name=spec.table_name, + zone=spec.zone, + rcu=spec.rcu, + wcu=spec.wcu, + primary_key=spec.primary_key, + sort_key=spec.sort_key, + attribute_type=spec.attribute_type, + lsi_count=spec.lsi_count, + gsi_count=spec.gsi_count, + use_sort=spec.use_sort, + enable_freeze_restore=spec.enable_freeze_restore, + create_on_restore_error=spec.create_on_restore_error, + delete_on_freeze_error=spec.delete_on_freeze_error) + + def _CreateLocalSecondaryIndex(self) -> List[str]: + """Used to create local secondary indexes.""" + lsi_items = [] + lsi_entry = [] + attr_list = [] + for lsi in range(0, self.lsi_count): + lsi_item = json.dumps({ + 'IndexName': f'lsiidx{str(lsi)}', + 'KeySchema': [{ + 'AttributeName': self.primary_key, + 'KeyType': 'HASH' + }, { + 'AttributeName': f'lattr{str(lsi)}', + 'KeyType': 'RANGE' + }], + 'Projection': { + 'ProjectionType': 'KEYS_ONLY' + } + }) + lsi_entry.append(lsi_item) + attr_list.append( + json.dumps({ + 'AttributeName': f'lattr{str(lsi)}', + 'AttributeType': self.attribute_type + })) + lsi_items.append('[' + ','.join(lsi_entry) + ']') + lsi_items.append(','.join(attr_list)) + return lsi_items + + def _CreateGlobalSecondaryIndex(self) -> List[str]: + """Used to create global secondary indexes.""" + gsi_items = [] + gsi_entry = [] + attr_list = [] + for gsi in range(0, self.gsi_count): + gsi_item = json.dumps({ + 'IndexName': f'gsiidx{str(gsi)}', + 'KeySchema': [{ + 'AttributeName': f'gsikey{str(gsi)}', + 'KeyType': 'HASH' + }, { + 'AttributeName': f'gattr{str(gsi)}', + 'KeyType': 'RANGE' + }], + 'Projection': { + 'ProjectionType': 'KEYS_ONLY' + }, + 'ProvisionedThroughput': { + 'ReadCapacityUnits': 5, + 'WriteCapacityUnits': 5 + } + }) + gsi_entry.append(gsi_item) + attr_list.append( + json.dumps({ + 'AttributeName': f'gattr{str(gsi)}', + 'AttributeType': self.attribute_type + })) + attr_list.append( + json.dumps({ + 'AttributeName': 
f'gsikey{str(gsi)}', + 'AttributeType': self.attribute_type + })) + gsi_items.append('[' + ','.join(gsi_entry) + ']') + gsi_items.append(','.join(attr_list)) + return gsi_items + + def _SetAttrDefnArgs(self, cmd: List[str], args: Sequence[str]) -> None: + attr_def_args = _MakeArgs(args) + cmd[10] = f'[{attr_def_args}]' + logging.info('adding to --attribute-definitions') + + def _SetKeySchemaArgs(self, cmd: List[str], args: Sequence[str]) -> None: + key_schema_args = _MakeArgs(args) + cmd[12] = f'[{key_schema_args}]' + logging.info('adding to --key-schema') + + def _PrimaryKeyJson(self) -> str: + return json.dumps({'AttributeName': self.primary_key, 'KeyType': 'HASH'}) + + def _PrimaryAttrsJson(self) -> str: + return json.dumps({ + 'AttributeName': self.primary_key, + 'AttributeType': self.attribute_type + }) + + def _SortAttrsJson(self) -> str: + return json.dumps({ + 'AttributeName': self.sort_key, + 'AttributeType': self.attribute_type + }) + + def _SortKeyJson(self) -> str: + return json.dumps({'AttributeName': self.sort_key, 'KeyType': 'RANGE'}) + + def _Create(self) -> None: + """Creates the dynamodb table.""" + cmd = util.AWS_PREFIX + [ + 'dynamodb', + 'create-table', + '--region', self.region, + '--table-name', self.table_name, + '--attribute-definitions', self._PrimaryAttrsJson(), + '--key-schema', self._PrimaryKeyJson(), + '--provisioned-throughput', self.throughput, + '--tags' + ] + util.MakeFormattedDefaultTags() + if self.lsi_count > 0 and self.use_sort: + self._SetAttrDefnArgs(cmd, [ + self._PrimaryAttrsJson(), + self._SortAttrsJson(), self.lsi_indexes[1] + ]) + cmd.append('--local-secondary-indexes') + cmd.append(self.lsi_indexes[0]) + self._SetKeySchemaArgs( + cmd, [self._PrimaryKeyJson(), + self._SortKeyJson()]) + elif self.use_sort: + self._SetAttrDefnArgs( + cmd, [self._PrimaryAttrsJson(), + self._SortAttrsJson()]) + self._SetKeySchemaArgs( + cmd, [self._PrimaryKeyJson(), + self._SortKeyJson()]) + if self.gsi_count > 0: + self._SetAttrDefnArgs( + cmd, cmd[10].strip('[]').split(',') + [self.gsi_indexes[1]]) + cmd.append('--global-secondary-indexes') + cmd.append(self.gsi_indexes[0]) + _, stderror, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + if retcode != 0: + logging.warning('Failed to create table! 
%s', stderror) + + def _Delete(self) -> None: + """Deletes the dynamodb table.""" + cmd = util.AWS_PREFIX + [ + 'dynamodb', + 'delete-table', + '--region', self.region, + '--table-name', self.table_name] + logging.info('Attempting deletion: ') + vm_util.IssueCommand(cmd, raise_on_failure=False) + + def _IsReady(self) -> bool: + """Check if dynamodb table is ready.""" + logging.info('Getting table ready status for %s', self.table_name) + cmd = util.AWS_PREFIX + [ + 'dynamodb', + 'describe-table', + '--region', self.region, + '--table-name', self.table_name] + stdout, _, _ = vm_util.IssueCommand(cmd) + result = json.loads(stdout) + return result['Table']['TableStatus'] == 'ACTIVE' + + def _Exists(self) -> bool: + """Returns true if the dynamodb table exists.""" + logging.info('Checking if table %s exists', self.table_name) + result = self._DescribeTable() + if not result: + return False + if not self.resource_arn: + self.resource_arn = result['TableArn'] + return True + + def _DescribeTable(self) -> Dict[Any, Any]: + """Calls describe on dynamodb table.""" + cmd = util.AWS_PREFIX + [ + 'dynamodb', + 'describe-table', + '--region', self.region, + '--table-name', self.table_name] + stdout, stderr, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + if retcode != 0: + logging.info('Could not find table %s, %s', self.table_name, stderr) + return {} + return json.loads(stdout)['Table'] + + def GetEndPoint(self) -> str: + return f'http://dynamodb.{self.region}.amazonaws.com' + + def GetResourceMetadata(self) -> Dict[str, Any]: + """Returns a dict containing metadata about the dynamodb instance. + + Returns: + dict mapping string property key to value. + """ + return { + 'aws_dynamodb_primarykey': self.primary_key, + 'aws_dynamodb_use_sort': self.use_sort, + 'aws_dynamodb_sortkey': self.sort_key, + 'aws_dynamodb_attributetype': self.attribute_type, + 'aws_dynamodb_read_capacity': self.rcu, + 'aws_dynamodb_write_capacity': self.wcu, + 'aws_dynamodb_lsi_count': self.lsi_count, + 'aws_dynamodb_gsi_count': self.gsi_count, + } + + def SetThroughput(self, + rcu: Optional[int] = None, + wcu: Optional[int] = None) -> None: + """Updates the table's rcu and wcu.""" + if not rcu: + rcu = self.rcu + if not wcu: + wcu = self.wcu + cmd = util.AWS_PREFIX + [ + 'dynamodb', 'update-table', + '--table-name', self.table_name, + '--region', self.region, + '--provisioned-throughput', + f'ReadCapacityUnits={rcu},WriteCapacityUnits={wcu}', + ] + logging.info('Setting %s table provisioned throughput to %s rcu and %s wcu', + self.table_name, rcu, wcu) + util.IssueRetryableCommand(cmd) + while not self._IsReady(): + continue + + def _GetThroughput(self) -> Tuple[int, int]: + """Returns the current (rcu, wcu) of the table.""" + output = self._DescribeTable()['ProvisionedThroughput'] + return output['ReadCapacityUnits'], output['WriteCapacityUnits'] + + @vm_util.Retry(poll_interval=1, max_retries=3, + retryable_exceptions=(errors.Resource.CreationError)) + def _GetTagResourceCommand(self, tags: Sequence[str]) -> Sequence[str]: + """Returns the tag-resource command with the provided tags. + + This function will retry up to max_retries to allow for instance creation to + finish. + + Args: + tags: List of formatted tags to append to the instance. + + Returns: + A list of arguments for the 'tag-resource' command. + + Raises: + errors.Resource.CreationError: If the current instance does not exist. 
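+
+    Example (illustrative; mirrors how UpdateWithDefaultTags and
+    UpdateTimeout below use it):
+
+      tags = util.MakeFormattedDefaultTags()
+      cmd = self._GetTagResourceCommand(tags)
+      util.IssueRetryableCommand(cmd)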
+ """ + if not self._Exists(): + raise errors.Resource.CreationError( + f'Cannot get resource arn of non-existent instance {self.table_name}') + return util.AWS_PREFIX + [ + 'dynamodb', 'tag-resource', '--resource-arn', self.resource_arn, + '--region', self.region, '--tags' + ] + list(tags) + + def UpdateWithDefaultTags(self) -> None: + """Adds default tags to the table.""" + tags = util.MakeFormattedDefaultTags() + cmd = self._GetTagResourceCommand(tags) + logging.info('Setting default tags on table %s', self.table_name) + util.IssueRetryableCommand(cmd) + + def UpdateTimeout(self, timeout_minutes: int) -> None: + """Updates the timeout associated with the table.""" + tags = util.MakeFormattedDefaultTags(timeout_minutes) + cmd = self._GetTagResourceCommand(tags) + logging.info('Updating timeout tags on table %s with timeout minutes %s', + self.table_name, timeout_minutes) + util.IssueRetryableCommand(cmd) + + def _Freeze(self) -> None: + """See base class. + + Lowers provisioned throughput to free-tier levels. There is a limit to how + many times throughput on a table may by lowered per day. See: + https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/ProvisionedThroughput.html. + """ + # Check that we actually need to lower before issuing command. + rcu, wcu = self._GetThroughput() + if rcu > _FREE_TIER_RCU or wcu > _FREE_TIER_WCU: + logging.info('(rcu=%s, wcu=%s) is higher than free tier.', rcu, wcu) + self.SetThroughput(rcu=_FREE_TIER_RCU, wcu=_FREE_TIER_WCU) + + def _Restore(self) -> None: + """See base class. + + Restores provisioned throughput back to benchmarking levels. + """ + self.SetThroughput(self.rcu, self.wcu) + + +def _MakeArgs(args: Collection[str]) -> str: + return ','.join(args) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_elasticache_redis.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_elasticache_redis.py new file mode 100644 index 0000000..78d51eb --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_elasticache_redis.py @@ -0,0 +1,212 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for AWS' elasticache redis clusters. + +Clusters can be created and deleted. 
+""" +import json +import logging + +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import managed_memory_store +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.aws import aws_network +from perfkitbenchmarker.providers.aws import util + +FLAGS = flags.FLAGS +REDIS_VERSION_MAPPING = {'redis_3_2': '3.2.10', + 'redis_4_0': '4.0.10', + 'redis_5_0': '5.0.6', + 'redis_6_x': '6.x'} + + +class ElastiCacheRedis(managed_memory_store.BaseManagedMemoryStore): + """Object representing a AWS Elasticache redis instance.""" + + CLOUD = providers.AWS + MEMORY_STORE = managed_memory_store.REDIS + + def __init__(self, spec): + super(ElastiCacheRedis, self).__init__(spec) + self.subnet_group_name = 'subnet-%s' % self.name + self.version = REDIS_VERSION_MAPPING[spec.config.cloud_redis.redis_version] + self.node_type = FLAGS.cache_node_type + self.redis_region = FLAGS.cloud_redis_region + self.failover_zone = FLAGS.aws_elasticache_failover_zone + self.failover_subnet = None + self.failover_style = FLAGS.redis_failover_style + + @staticmethod + def CheckPrerequisites(benchmark_config): + if (FLAGS.managed_memory_store_version and + FLAGS.managed_memory_store_version not in + managed_memory_store.REDIS_VERSIONS): + raise errors.Config.InvalidValue('Invalid Redis version.') + if FLAGS.redis_failover_style in [ + managed_memory_store.Failover.FAILOVER_NONE, + managed_memory_store.Failover.FAILOVER_SAME_ZONE]: + if FLAGS.aws_elasticache_failover_zone: + raise errors.Config.InvalidValue( + 'The aws_elasticache_failover_zone flag is ignored. ' + 'There is no need for a failover zone when there is no failover. ' + 'Same zone failover will fail over to the same zone.') + else: + if (not FLAGS.aws_elasticache_failover_zone or + FLAGS.aws_elasticache_failover_zone[:-1] != FLAGS.cloud_redis_region): + raise errors.Config.InvalidValue( + 'Invalid failover zone. ' + 'A failover zone in %s must be specified. ' % + FLAGS.cloud_redis_region) + + def GetResourceMetadata(self): + """Returns a dict containing metadata about the instance. + + Returns: + dict mapping string property key to value. 
+ """ + result = { + 'cloud_redis_failover_style': + self.failover_style, + 'cloud_redis_version': + managed_memory_store.ParseReadableVersion(self.version), + 'cloud_redis_node_type': + self.node_type, + 'cloud_redis_region': + self.redis_region, + 'cloud_redis_primary_zone': + self.spec.vms[0].zone, + 'cloud_redis_failover_zone': + self.failover_zone, + } + return result + + def _CreateDependencies(self): + """Create the subnet dependencies.""" + subnet_id = self.spec.vms[0].network.subnet.id + cmd = ['aws', 'elasticache', 'create-cache-subnet-group', + '--region', self.redis_region, + '--cache-subnet-group-name', self.subnet_group_name, + '--cache-subnet-group-description', '"PKB redis benchmark subnet"', + '--subnet-ids', subnet_id] + + if self.failover_style == ( + managed_memory_store.Failover.FAILOVER_SAME_REGION): + regional_network = self.spec.vms[0].network.regional_network + vpc_id = regional_network.vpc.id + cidr = regional_network.vpc.NextSubnetCidrBlock() + self.failover_subnet = aws_network.AwsSubnet( + self.failover_zone, vpc_id, cidr_block=cidr) + self.failover_subnet.Create() + cmd += [self.failover_subnet.id] + + vm_util.IssueCommand(cmd) + + def _DeleteDependencies(self): + """Delete the subnet dependencies.""" + cmd = ['aws', 'elasticache', 'delete-cache-subnet-group', + '--region=%s' % self.redis_region, + '--cache-subnet-group-name=%s' % self.subnet_group_name] + vm_util.IssueCommand(cmd, raise_on_failure=False) + + if self.failover_subnet: + self.failover_subnet.Delete() + + def _Create(self): + """Creates the cluster.""" + cmd = ['aws', 'elasticache', 'create-replication-group', + '--engine', 'redis', + '--engine-version', self.version, + '--replication-group-id', self.name, + '--replication-group-description', self.name, + '--region', self.redis_region, + '--cache-node-type', self.node_type, + '--cache-subnet-group-name', self.subnet_group_name, + '--preferred-cache-cluster-a-zs', self.spec.vms[0].zone] + + if self.failover_style == managed_memory_store.Failover.FAILOVER_SAME_REGION: + cmd += [self.failover_zone] + + elif self.failover_style == managed_memory_store.Failover.FAILOVER_SAME_ZONE: + cmd += [self.spec.vms[0].zone] + + if self.failover_style != managed_memory_store.Failover.FAILOVER_NONE: + cmd += ['--automatic-failover-enabled', + '--num-cache-clusters', '2'] + + cmd += ['--tags'] + cmd += util.MakeFormattedDefaultTags() + _, stderr, _ = vm_util.IssueCommand(cmd, raise_on_failure=False) + + if 'InsufficientCacheClusterCapacity' in stderr: + raise errors.Benchmarks.InsufficientCapacityCloudFailure(stderr) + + def _Delete(self): + """Deletes the cluster.""" + cmd = ['aws', 'elasticache', 'delete-replication-group', + '--region', self.redis_region, + '--replication-group-id', self.name] + vm_util.IssueCommand(cmd, raise_on_failure=False) + + def _IsDeleting(self): + """Returns True if cluster is being deleted and false otherwise.""" + cluster_info = self.DescribeInstance() + return cluster_info.get('Status', '') == 'deleting' + + def _IsReady(self): + """Returns True if cluster is ready and false otherwise.""" + cluster_info = self.DescribeInstance() + return cluster_info.get('Status', '') == 'available' + + def _Exists(self): + """Returns true if the cluster exists and is not being deleted.""" + cluster_info = self.DescribeInstance() + return ('Status' in cluster_info and + cluster_info['Status'] not in ['deleting', 'create-failed']) + + def DescribeInstance(self): + """Calls describe on cluster. 
+ + Returns: + dict mapping string cluster_info property key to value. + """ + cmd = ['aws', 'elasticache', 'describe-replication-groups', + '--region', self.redis_region, + '--replication-group-id', self.name] + stdout, stderr, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + if retcode != 0: + logging.info('Could not find cluster %s, %s', self.name, stderr) + return {} + for cluster_info in json.loads(stdout)['ReplicationGroups']: + if cluster_info['ReplicationGroupId'] == self.name: + return cluster_info + return {} + + @vm_util.Retry(max_retries=5) + def _PopulateEndpoint(self): + """Populates address and port information from cluster_info. + + Raises: + errors.Resource.RetryableGetError: + Failed to retrieve information on cluster + """ + cluster_info = self.DescribeInstance() + if not cluster_info: + raise errors.Resource.RetryableGetError( + 'Failed to retrieve information on %s', self.name) + + primary_endpoint = cluster_info['NodeGroups'][0]['PrimaryEndpoint'] + self._ip = primary_endpoint['Address'] + self._port = primary_endpoint['Port'] diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_elasticached_memcached.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_elasticached_memcached.py new file mode 100644 index 0000000..b8f8478 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_elasticached_memcached.py @@ -0,0 +1,159 @@ +# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for AWS' elasticache memcached clusters.""" + + +import json +import logging + +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import managed_memory_store +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.aws import util + + +MEMCACHED_VERSIONS = ['1.5.10', '1.5.16', '1.6.6'] +FLAGS = flags.FLAGS + + +class ElastiCacheMemcached(managed_memory_store.BaseManagedMemoryStore): + """Object representing a AWS Elasticache memcached instance.""" + + CLOUD = providers.AWS + MEMORY_STORE = managed_memory_store.MEMCACHED + + def __init__(self, spec): + super(ElastiCacheMemcached, self).__init__(spec) + self.subnet_group_name = 'subnet-%s' % self.name + self.zone = self.spec.vms[0].zone + self.region = util.GetRegionFromZone(self.zone) + self.node_type = FLAGS.cache_node_type + self.version = FLAGS.managed_memory_store_version + + @staticmethod + def CheckPrerequisites(benchmark_config): + if (FLAGS.managed_memory_store_version and + FLAGS.managed_memory_store_version not in MEMCACHED_VERSIONS): + raise errors.Config.InvalidValue('Invalid Memcached version.') + + def GetResourceMetadata(self): + """Returns a dict containing metadata about the cache cluster. + + Returns: + dict mapping string property key to value. 
+ """ + result = { + 'cloud_memcached_version': self.version, + 'cloud_memcached_node_type': self.node_type, + } + return result + + def _CreateDependencies(self): + """Create the subnet dependencies.""" + subnet_id = self.spec.vms[0].network.subnet.id + cmd = ['aws', 'elasticache', 'create-cache-subnet-group', + '--region', self.region, + '--cache-subnet-group-name', self.subnet_group_name, + '--cache-subnet-group-description', '"memcached benchmark subnet"', + '--subnet-ids', subnet_id] + + vm_util.IssueCommand(cmd) + + def _DeleteDependencies(self): + """Delete the subnet dependencies.""" + cmd = ['aws', 'elasticache', 'delete-cache-subnet-group', + '--region', self.region, + '--cache-subnet-group-name', self.subnet_group_name] + vm_util.IssueCommand(cmd, raise_on_failure=False) + + def _Create(self): + """Creates the cache cluster.""" + cmd = ['aws', 'elasticache', 'create-cache-cluster', + '--engine', 'memcached', + '--region', self.region, + '--cache-cluster-id', self.name, + '--preferred-availability-zone', self.zone, + '--num-cache-nodes', str(managed_memory_store.MEMCACHED_NODE_COUNT), + '--cache-node-type', self.node_type, + '--cache-subnet-group-name', self.subnet_group_name] + + if self.version: + cmd += ['--engine-version', self.version] + + cmd += ['--tags'] + cmd += util.MakeFormattedDefaultTags() + vm_util.IssueCommand(cmd) + + def _Delete(self): + """Deletes the cache cluster.""" + cmd = ['aws', 'elasticache', 'delete-cache-cluster', + '--region', self.region, + '--cache-cluster-id', self.name] + vm_util.IssueCommand(cmd, raise_on_failure=False) + + def _IsDeleting(self): + """Returns True if cluster is being deleted and false otherwise.""" + cluster_info = self._DescribeInstance() + return cluster_info.get('CacheClusterStatus', '') == 'deleting' + + def _IsReady(self): + """Returns True if cluster is ready and false otherwise.""" + cluster_info = self._DescribeInstance() + if cluster_info.get('CacheClusterStatus', '') == 'available': + self.version = cluster_info.get('EngineVersion') + return True + return False + + def _Exists(self): + """Returns true if the cluster exists and is not being deleted.""" + cluster_info = self._DescribeInstance() + return cluster_info.get('CacheClusterStatus', '') not in [ + '', 'deleting', 'create-failed'] + + def _DescribeInstance(self): + """Calls describe on cluster. + + Returns: + dict mapping string cluster_info property key to value. + """ + cmd = ['aws', 'elasticache', 'describe-cache-clusters', + '--region', self.region, + '--cache-cluster-id', self.name] + stdout, stderr, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + if retcode != 0: + logging.info('Could not find cluster %s, %s', self.name, stderr) + return {} + for cluster_info in json.loads(stdout)['CacheClusters']: + if cluster_info['CacheClusterId'] == self.name: + return cluster_info + return {} + + @vm_util.Retry(max_retries=5) + def _PopulateEndpoint(self): + """Populates address and port information from cluster_info. 
+ + Raises: + errors.Resource.RetryableGetError: + Failed to retrieve information on cluster + """ + cluster_info = self._DescribeInstance() + if not cluster_info: + raise errors.Resource.RetryableGetError( + 'Failed to retrieve information on {0}.'.format(self.name)) + + endpoint = cluster_info['ConfigurationEndpoint'] + self._ip = endpoint['Address'] + self._port = endpoint['Port'] diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_emr.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_emr.py new file mode 100644 index 0000000..7469a8d --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_emr.py @@ -0,0 +1,405 @@ +# Copyright 2016 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for AWS's spark service. + +Spark clusters can be created and deleted. +""" + +import json +import logging + +from absl import flags +from perfkitbenchmarker import providers +from perfkitbenchmarker import resource +from perfkitbenchmarker import spark_service +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.aws import aws_network +from perfkitbenchmarker.providers.aws import util + + +FLAGS = flags.FLAGS + +DEFAULT_MACHINE_TYPE = 'm3.xlarge' +RELEASE_LABEL = 'emr-5.23.0' +READY_CHECK_SLEEP = 30 +READY_CHECK_TRIES = 60 +READY_STATE = 'WAITING' + +JOB_WAIT_SLEEP = 30 + +DELETED_STATES = ['TERMINATED_WITH_ERRORS', 'TERMINATED'] + +MANAGER_SG = 'EmrManagedMasterSecurityGroup' +WORKER_SG = 'EmrManagedSlaveSecurityGroup' + +# Certain machine types require a subnet. +NEEDS_SUBNET = ['m4', 'c4', 'm5', 'c5'] + + +class AwsSecurityGroup(resource.BaseResource): + """Object representing a AWS Security Group. + + A security group is created automatically when an Amazon EMR cluster + is created. It is not deleted automatically, and the subnet and VPN + cannot be deleted until the security group is deleted. + + Because of this, there's no _Create method, only a _Delete and an + _Exists method. + """ + + def __init__(self, cmd_prefix, group_id): + super(AwsSecurityGroup, self).__init__() + self.created = True + self.group_id = group_id + self.cmd_prefix = cmd_prefix + + def _Delete(self): + cmd = self.cmd_prefix + ['ec2', 'delete-security-group', + '--group-id=' + self.group_id] + vm_util.IssueCommand(cmd, raise_on_failure=False) + + def _Exists(self): + cmd = self.cmd_prefix + ['ec2', 'describe-security-groups', + '--group-id=' + self.group_id] + _, _, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + # if the security group doesn't exist, the describe command gives an error. + return retcode == 0 + + def _Create(self): + if not self.created: + raise NotImplementedError() + + +class AwsEMR(spark_service.BaseSparkService): + """Object representing a AWS EMR cluster. + + Attributes: + cluster_id: Cluster identifier, set in superclass. + project: Enclosing project for the cluster. 
+ cmd_prefix: emr prefix, including region + network: network to use; set if needed by machine type + bucket_to_delete: bucket name to delete when cluster is + terminated. + """ + + CLOUD = providers.AWS + SPARK_SAMPLE_LOCATION = '/usr/lib/spark/lib/spark-examples.jar' + SERVICE_NAME = 'emr' + + def __init__(self, spark_service_spec): + super(AwsEMR, self).__init__(spark_service_spec) + # TODO(hildrum) use availability zone when appropriate + worker_machine_type = self.spec.worker_group.vm_spec.machine_type + leader_machine_type = self.spec.master_group.vm_spec.machine_type + self.cmd_prefix = list(util.AWS_PREFIX) + + if self.zone: + region = util.GetRegionFromZone(self.zone) + self.cmd_prefix += ['--region', region] + + # Certain machine types require subnets. + if (self.spec.static_cluster_id is None and + (worker_machine_type[0:2] in NEEDS_SUBNET or + leader_machine_type[0:2] in NEEDS_SUBNET)): + # GetNetwork is supposed to take a VM, but all it uses + # from the VM is the zone attribute, which self has. + self.network = aws_network.AwsNetwork.GetNetwork(self) + else: + self.network = None + self.bucket_to_delete = None + + def _CreateLogBucket(self): + bucket_name = 's3://pkb-{0}-emr'.format(FLAGS.run_uri) + cmd = self.cmd_prefix + ['s3', 'mb', bucket_name] + _, _, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + if retcode != 0: + raise Exception('Error creating logs bucket') + self.bucket_to_delete = bucket_name + return bucket_name + + def _Create(self): + """Creates the cluster.""" + name = 'pkb_' + FLAGS.run_uri + logs_bucket = FLAGS.aws_emr_loguri or self._CreateLogBucket() + + instance_groups = [] + for group_type, group_spec in [ + ('CORE', self.spec.worker_group), + ('MASTER', self.spec.master_group)]: + instance_properties = {'InstanceCount': group_spec.vm_count, + 'InstanceGroupType': group_type, + 'InstanceType': group_spec.vm_spec.machine_type, + 'Name': group_type + ' group'} + if group_spec.disk_spec: + # Make sure nothing we are ignoring is included in the disk spec + assert group_spec.disk_spec.device_path is None + assert group_spec.disk_spec.disk_number is None + assert group_spec.disk_spec.mount_point is None + assert group_spec.disk_spec.iops is None + ebs_configuration = {'EbsBlockDeviceConfigs': [ + {'VolumeSpecification': + {'SizeInGB': group_spec.disk_spec.disk_size, + 'VolumeType': group_spec.disk_spec.disk_type}, + 'VolumesPerInstance': + group_spec.disk_spec.num_striped_disks}]} + instance_properties.update({'EbsConfiguration': ebs_configuration}) + instance_groups.append(instance_properties) + + # we need to store the cluster id. 
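+    # For reference, the command assembled below is roughly equivalent to
+    # (illustrative; actual values depend on the run and flags):
+    #   aws emr create-cluster --name pkb_<run_uri> \
+    #     --release-label emr-5.23.0 --use-default-roles \
+    #     --instance-groups '<instance_groups JSON>' \
+    #     --application Name=Spark Name=Hadoop --log-uri <logs_bucket>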
+ cmd = self.cmd_prefix + ['emr', 'create-cluster', '--name', name, + '--release-label', RELEASE_LABEL, + '--use-default-roles', + '--instance-groups', + json.dumps(instance_groups), + '--application', 'Name=Spark', + 'Name=Hadoop', + '--log-uri', logs_bucket] + if self.network: + cmd += ['--ec2-attributes', 'SubnetId=' + self.network.subnet.id] + stdout, _, _ = vm_util.IssueCommand(cmd) + result = json.loads(stdout) + self.cluster_id = result['ClusterId'] + logging.info('Cluster created with id %s', self.cluster_id) + for tag_key, tag_value in util.MakeDefaultTags().items(): + self._AddTag(tag_key, tag_value) + + def _AddTag(self, key, value): + """Add the key value pair as a tag to the emr cluster.""" + cmd = self.cmd_prefix + ['emr', 'add-tags', + '--resource-id', self.cluster_id, + '--tag', + '{}={}'.format(key, value)] + vm_util.IssueCommand(cmd) + + def _DeleteSecurityGroups(self): + """Delete the security groups associated with this cluster.""" + cmd = self.cmd_prefix + ['emr', 'describe-cluster', + '--cluster-id', self.cluster_id] + stdout, _, _ = vm_util.IssueCommand(cmd) + cluster_desc = json.loads(stdout) + sec_object = cluster_desc['Cluster']['Ec2InstanceAttributes'] + manager_sg = sec_object[MANAGER_SG] + worker_sg = sec_object[WORKER_SG] + + # the manager group and the worker group reference each other, so neither + # can be deleted. First we delete the references to the manager group in + # the worker group. Then we delete the manager group, and then, finally the + # worker group. + + # remove all references to the manager group from the worker group. + for proto, port in [('tcp', '0-65535'), ('udp', '0-65535'), ('icmp', '-1')]: + for group1, group2 in [(worker_sg, manager_sg), (manager_sg, worker_sg)]: + cmd = self.cmd_prefix + ['ec2', 'revoke-security-group-ingress', + '--group-id=' + group1, + '--source-group=' + group2, + '--protocol=' + proto, + '--port=' + port] + vm_util.IssueCommand(cmd) + + # Now we need to delete the manager, then the worker. + for group in manager_sg, worker_sg: + sec_group = AwsSecurityGroup(self.cmd_prefix, group) + sec_group.Delete() + + def _Delete(self): + """Deletes the cluster.""" + + cmd = self.cmd_prefix + ['emr', 'terminate-clusters', '--cluster-ids', + self.cluster_id] + vm_util.IssueCommand(cmd, raise_on_failure=False) + + def _DeleteDependencies(self): + if self.network: + self._DeleteSecurityGroups() + if self.bucket_to_delete: + bucket_del_cmd = self.cmd_prefix + ['s3', 'rb', '--force', + self.bucket_to_delete] + vm_util.IssueCommand(bucket_del_cmd) + + def _Exists(self): + """Check to see whether the cluster exists.""" + cmd = self.cmd_prefix + ['emr', 'describe-cluster', + '--cluster-id', self.cluster_id] + stdout, _, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + if retcode != 0: + return False + result = json.loads(stdout) + if result['Cluster']['Status']['State'] in DELETED_STATES: + return False + else: + return True + + def _IsReady(self): + """Check to see if the cluster is ready.""" + cmd = self.cmd_prefix + ['emr', 'describe-cluster', '--cluster-id', + self.cluster_id] + stdout, _, _ = vm_util.IssueCommand(cmd) + result = json.loads(stdout) + if result['Cluster']['Status']['State'] == 'TERMINATED_WITH_ERRORS': + reason = result['Cluster']['Status']['StateChangeReason']['Message'] + message = reason + if reason.startswith('Subnet is required'): + message = ('Cluster creation failed because this machine type requires ' + 'a subnet. 
To ensure PKB creates a subnet for this machine '
+                   'type, update the NEEDS_SUBNET variable of '
+                   'providers/aws/aws_emr.py to contain prefix of this machine '
+                   'type. Raw AWS message={0}'.format(reason))
+      raise Exception(message)
+    return result['Cluster']['Status']['State'] == READY_STATE
+
+  def _GetLogBase(self):
+    """Gets the base uri for the logs."""
+    cmd = self.cmd_prefix + ['emr', 'describe-cluster', '--cluster-id',
+                             self.cluster_id]
+    stdout, _, _ = vm_util.IssueCommand(cmd)
+    result = json.loads(stdout)
+    if 'LogUri' in result['Cluster']:
+      self.logging_enabled = True
+      log_uri = result['Cluster']['LogUri']
+      if log_uri.startswith('s3n'):
+        log_uri = 's3' + log_uri[3:]
+      return log_uri
+    else:
+      return None
+
+  def _CheckForFile(self, filename):
+    """Wait for file to appear on s3."""
+    cmd = self.cmd_prefix + ['s3', 'ls', filename]
+    _, _, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False)
+    return retcode == 0
+
+  def _IsStepDone(self, step_id):
+    """Determine whether the step is done.
+
+    Args:
+      step_id: The step id to query.
+    Returns:
+      A dictionary describing the step if the step is complete,
+      None otherwise.
+    """
+
+    cmd = self.cmd_prefix + ['emr', 'describe-step', '--cluster-id',
+                             self.cluster_id, '--step-id', step_id]
+    stdout, _, _ = vm_util.IssueCommand(cmd)
+    result = json.loads(stdout)
+    state = result['Step']['Status']['State']
+    if state == 'COMPLETED' or state == 'FAILED':
+      return result
+    else:
+      return None
+
+  def _MakeHadoopStep(self, jarfile, classname, job_arguments):
+    """Construct an EMR step with a type CUSTOM_JAR."""
+    step_list = ['Type=CUSTOM_JAR', 'Jar=' + jarfile]
+    if classname:
+      step_list.append('MainClass=' + classname)
+    if job_arguments:
+      arg_string = '[' + ','.join(job_arguments) + ']'
+      step_list.append('Args=' + arg_string)
+    return step_list
+
+  def _MakeSparkStep(self, jarfile, classname, job_arguments):
+    arg_list = ['--class', classname, jarfile]
+    if job_arguments:
+      arg_list += job_arguments
+    arg_string = '[' + ','.join(arg_list) + ']'
+    step_list = ['Type=Spark', 'Args=' + arg_string]
+    return step_list
+
+  def SubmitJob(self, jarfile, classname, job_poll_interval=JOB_WAIT_SLEEP,
+                job_arguments=None, job_stdout_file=None,
+                job_type=spark_service.SPARK_JOB_TYPE):
+    """Submit the job.
+
+    Submit the job and wait for it to complete. If job_stdout_file is not
+    None, also wait for the job's stdout to appear and put that in
+    job_stdout_file.
+
+    Args:
+      jarfile: Jar file containing the class to submit.
+      classname: Name of the class.
+      job_poll_interval: Submit job will poll until the job is done; this is
+        the time between checks.
+      job_arguments: Arguments to pass to the job.
+      job_stdout_file: Name of a file in which to put the job's standard out.
+        If there is data here already, it will be overwritten.
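+      job_type: Type of job to submit; spark_service.SPARK_JOB_TYPE or
+        spark_service.HADOOP_JOB_TYPE.
+
+    Returns:
+      A dict of metrics for the step (waiting time, runtime, and success).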
+ + """ + + @vm_util.Retry(poll_interval=job_poll_interval, fuzz=0) + def WaitForFile(filename): + if not self._CheckForFile(filename): + raise Exception('File not found yet') + + @vm_util.Retry(timeout=FLAGS.aws_emr_job_wait_time, + poll_interval=job_poll_interval, fuzz=0) + def WaitForStep(step_id): + result = self._IsStepDone(step_id) + if result is None: + raise Exception('Step {0} not complete.'.format(step_id)) + return result + + if job_type == spark_service.SPARK_JOB_TYPE: + step_list = self._MakeSparkStep(jarfile, classname, job_arguments) + elif job_type == spark_service.HADOOP_JOB_TYPE: + step_list = self._MakeHadoopStep(jarfile, classname, job_arguments) + else: + raise Exception('Job type %s unsupported for EMR' % job_type) + step_string = ','.join(step_list) + cmd = self.cmd_prefix + ['emr', 'add-steps', '--cluster-id', + self.cluster_id, '--steps', step_string] + stdout, _, _ = vm_util.IssueCommand(cmd) + result = json.loads(stdout) + step_id = result['StepIds'][0] + metrics = {} + + result = WaitForStep(step_id) + pending_time = result['Step']['Status']['Timeline']['CreationDateTime'] + start_time = result['Step']['Status']['Timeline']['StartDateTime'] + end_time = result['Step']['Status']['Timeline']['EndDateTime'] + metrics[spark_service.WAITING] = start_time - pending_time + metrics[spark_service.RUNTIME] = end_time - start_time + step_state = result['Step']['Status']['State'] + metrics[spark_service.SUCCESS] = step_state == 'COMPLETED' + + # Now we need to take the standard out and put it in the designated path, + # if appropriate. + if job_stdout_file: + log_base = self._GetLogBase() + if log_base is None: + logging.warning('SubmitJob requested output, but EMR cluster was not ' + 'created with logging') + return metrics + + # log_base ends in a slash. + s3_stdout = '{0}{1}/steps/{2}/stdout.gz'.format(log_base, + self.cluster_id, + step_id) + WaitForFile(s3_stdout) + dest_file = '{0}.gz'.format(job_stdout_file) + cp_cmd = ['aws', 's3', 'cp', s3_stdout, dest_file] + _, _, retcode = vm_util.IssueCommand(cp_cmd, raise_on_failure=False) + if retcode == 0: + uncompress_cmd = ['gunzip', '-f', dest_file] + vm_util.IssueCommand(uncompress_cmd) + return metrics + + def SetClusterProperty(self): + pass + + def ExecuteOnMaster(self, script_path, script_args): + raise NotImplementedError() + + def CopyFromMaster(self, remote_path, local_path): + raise NotImplementedError() diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_glue_crawler.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_glue_crawler.py new file mode 100644 index 0000000..f4052c4 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_glue_crawler.py @@ -0,0 +1,203 @@ +"""Module containing class for AWS's Glue Crawler.""" + +import json +from typing import Any, Dict, Optional, Tuple + +from absl import flags +from perfkitbenchmarker import data_discovery_service +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.aws import util + +FLAGS = flags.FLAGS + + +class CrawlNotCompletedError(Exception): + """Used to signal a crawl is still running.""" + + +class CrawlFailedError(Exception): + """Used to signal a crawl has failed.""" + + +class AwsGlueCrawler(data_discovery_service.BaseDataDiscoveryService): + """AWS Glue Crawler Resource Class. + + Attributes: + db_name: Name of the Glue database that will be provisioned. + crawler_name: Name of the crawler that will be provisioned. 
+ role: Role the crawler will use. Refer to aws_glue_crawler_role flag for + more info. + sample_size: How many files will be crawled in each leaf directory. Refer to + aws_glue_crawler_sample_size flag for more info. + """ + + CLOUD = providers.AWS + SERVICE_TYPE = 'glue' + READY = 'READY' + FAILED = 'FAILED' + CRAWL_TIMEOUT = 21600 + CRAWL_POLL_INTERVAL = 5 + + def __init__(self): + super().__init__() + self.db_name = f'pkb-db-{FLAGS.run_uri}' + self.crawler_name = f'pkb-crawler-{FLAGS.run_uri}' + self.role = FLAGS.aws_glue_crawler_role + self.sample_size = FLAGS.aws_glue_crawler_sample_size + + def _Create(self) -> None: + # creating database + database_input = { + 'Name': self.db_name, + 'Description': '\n'.join( + f'{k}={v}' for k, v in util.MakeDefaultTags().items()), + } + cmd = util.AWS_PREFIX + [ + 'glue', + 'create-database', + '--database-input', json.dumps(database_input), + f'--region={self.region}', + ] + vm_util.IssueCommand(cmd) + + targets = {'S3Targets': [{'Path': self.data_discovery_path}]} + if self.sample_size is not None: + targets['S3Targets'][0]['SampleSize'] = self.sample_size + + # creating crawler + cmd = util.AWS_PREFIX + [ + 'glue', + 'create-crawler', + '--name', self.crawler_name, + '--role', self.role, + '--database-name', self.db_name, + '--targets', json.dumps(targets), + '--region', self.region, + '--tags', ','.join( + f'{k}={v}' for k, v in util.MakeDefaultTags().items()), + ] + vm_util.IssueCommand(cmd) + + def _Exists(self) -> bool: + return self._DbExists() and self._CrawlerExists() + + def _IsReady(self, raise_on_crawl_failure=False) -> bool: + stdout, _, _ = self._GetCrawler() + data = json.loads(stdout) + if (data['Crawler'].get('LastCrawl', {}).get('Status') == self.FAILED and + raise_on_crawl_failure): + raise CrawlFailedError( + data['Crawler'].get('LastCrawl', {}).get('ErrorMessage', '')) + return data['Crawler']['State'] == self.READY + + def _Delete(self) -> None: + # deleting database + cmd = util.AWS_PREFIX + [ + 'glue', + 'delete-database', + '--name', self.db_name, + '--region', self.region, + ] + vm_util.IssueCommand(cmd, raise_on_failure=False) + + # deleting crawler + cmd = util.AWS_PREFIX + [ + 'glue', + 'delete-crawler', + '--name', self.crawler_name, + '--region', self.region, + ] + vm_util.IssueCommand(cmd, raise_on_failure=False) + + def _IsDeleting(self) -> bool: + crawler_exists = self._CrawlerExists() + db_exists = self._DbExists() + if db_exists is None or crawler_exists is None: + return True + return self._DbExists() or self._CrawlerExists() + + def DiscoverData(self) -> float: + """Runs the AWS Glue Crawler. 
Returns the time elapsed in secs.""" + + cmd = util.AWS_PREFIX + [ + 'glue', + 'start-crawler', + '--name', self.crawler_name, + '--region', self.region, + ] + vm_util.IssueCommand(cmd) + self._WaitUntilCrawlerReady() + cmd = util.AWS_PREFIX + [ + 'glue', + 'get-crawler-metrics', + '--crawler-name-list', self.crawler_name, + '--region', self.region, + ] + output, _, _ = vm_util.IssueCommand(cmd) + data = json.loads(output) + assert (isinstance(data['CrawlerMetricsList'], list) and + len(data['CrawlerMetricsList']) == 1) + return data['CrawlerMetricsList'][0]['LastRuntimeSeconds'] + + def GetMetadata(self) -> Dict[str, Any]: + """Return a dictionary of the metadata for this service.""" + metadata = super().GetMetadata() + metadata.update( + aws_glue_crawler_sample_size=self.sample_size, + aws_glue_db_name=self.db_name, + aws_glue_crawler_name=self.crawler_name, + ) + return metadata + + @vm_util.Retry( + timeout=CRAWL_TIMEOUT, + poll_interval=CRAWL_POLL_INTERVAL, + fuzz=0, + retryable_exceptions=CrawlNotCompletedError,) + def _WaitUntilCrawlerReady(self): + if not self._IsReady(raise_on_crawl_failure=True): + raise CrawlNotCompletedError( + f'Crawler {self.crawler_name} still running.') + + def _DbExists(self) -> Optional[bool]: + """Whether the database exists or not. + + It might return None if the API call failed with an unknown error. + + Returns: + A bool or None. + """ + cmd = util.AWS_PREFIX + [ + 'glue', + 'get-database', + '--name', self.db_name, + '--region', self.region, + ] + _, stderr, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + if not retcode: + return True + return False if 'EntityNotFoundException' in stderr else None + + def _CrawlerExists(self) -> Optional[bool]: + """Whether the crawler exists or not. + + It might return None if the API call failed with an unknown error. + + Returns: + A bool or None. + """ + _, stderr, retcode = self._GetCrawler(raise_on_failure=False) + if not retcode: + return True + return False if 'EntityNotFoundException' in stderr else None + + def _GetCrawler(self, raise_on_failure=True) -> Tuple[str, str, int]: + """Calls the AWS CLI to retrieve a crawler.""" + cmd = util.AWS_PREFIX + [ + 'glue', + 'get-crawler', + '--name', self.crawler_name, + '--region', self.region, + ] + return vm_util.IssueCommand(cmd, raise_on_failure=raise_on_failure) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_iam_role.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_iam_role.py new file mode 100644 index 0000000..7161320 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_iam_role.py @@ -0,0 +1,184 @@ +# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for AWS' dynamodb tables. + +Tables can be created and deleted. 
+""" + + +import json +import logging +import time + +from perfkitbenchmarker import resource +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.aws import util + +# https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_policies_elements_version.html +_POLICY_VERSION = '2012-10-17' + +_ROLE_ARN_TEMPLATE = 'arn:aws:iam::{account}:role/{role_name}' +_POLICY_ARN_TEMPLATE = 'arn:aws:iam::{account}:policy/{policy_name}' + +_TRUST_RELATIONSHIP_FILE = 'service-trust-relationship.json' +_ROLE_POLICY_FILE = 'service-role-policy.json' +_ROLE_CREATION_DELAY = 30 + +_TRUST_RELATIONSHIP_TEMPLATE = """{{ + "Version": "{version}", + "Statement": [ + {{ + "Effect": "Allow", + "Principal": {{ + "Service": "{service}" + }}, + "Action": "sts:AssumeRole" + }} + ] +}}""" + +_ROLE_POLICY_TEMPLATE = """{{ + "Version": "{version}", + "Statement": [ + {{ + "Action": [ + "{action}" + ], + "Effect": "Allow", + "Resource": [ + "{resource_arn}" + ] + }} + ] +}}""" + + +class AwsIamRole(resource.BaseResource): + """Class representing an AWS IAM role.""" + + def __init__(self, + account, + role_name, + policy_name, + service, + action, + resource_arn, + policy_version=None): + super(AwsIamRole, self).__init__() + self.account = account + self.role_name = role_name + self.policy_name = policy_name + self.service = service + self.action = action + self.resource_arn = resource_arn + self.policy_version = policy_version or _POLICY_VERSION + self.role_arn = _ROLE_ARN_TEMPLATE.format( + account=self.account, role_name=self.role_name) + self.policy_arn = _POLICY_ARN_TEMPLATE.format( + account=self.account, policy_name=self.policy_name) + + def _Create(self): + """See base class.""" + if not self._RoleExists(): + with open(_TRUST_RELATIONSHIP_FILE, 'w+') as relationship_file: + relationship_file.write( + _TRUST_RELATIONSHIP_TEMPLATE.format( + version=self.policy_version, service=self.service)) + + cmd = util.AWS_PREFIX + [ + 'iam', 'create-role', '--role-name', self.role_name, + '--assume-role-policy-document', + 'file://{}'.format(_TRUST_RELATIONSHIP_FILE) + ] + + _, stderror, retcode = vm_util.IssueCommand(cmd, raise_on_failure=True) + if retcode != 0: + logging.warning('Failed to create role! %s', stderror) + + if not self._PolicyExists(): + with open(_ROLE_POLICY_FILE, 'w+') as policy_file: + policy_file.write( + _ROLE_POLICY_TEMPLATE.format( + version=self.policy_version, + action=self.action, + resource_arn=self.resource_arn)) + cmd = util.AWS_PREFIX + [ + 'iam', 'create-policy', '--policy-name', 'PolicyFor' + self.role_name, + '--policy-document', 'file://{}'.format(_ROLE_POLICY_FILE) + ] + + _, stderror, retcode = vm_util.IssueCommand(cmd, raise_on_failure=True) + if retcode != 0: + logging.warning('Failed to create policy! %s', stderror) + + cmd = util.AWS_PREFIX + [ + 'iam', 'attach-role-policy', '--role-name', self.role_name, + '--policy-arn', self.policy_arn + ] + + _, stderror, retcode = vm_util.IssueCommand(cmd, raise_on_failure=True) + if retcode != 0: + logging.warning('Failed to attach role policy! %s', stderror) + + # Make sure the role is available for the downstream users (e.g., DAX). + # Without this, the step of creating DAX cluster may fail. + # TODO(user): use a more robust way to handle this. 
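+    # IAM changes propagate with eventual consistency, so the new role and
+    # policy may not yet be visible to other services right after creation;
+    # the fixed 30-second _ROLE_CREATION_DELAY below papers over that
+    # (see the TODO above for the longer-term fix).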
+ time.sleep(_ROLE_CREATION_DELAY) + + def _Delete(self): + """See base class.""" + cmd = util.AWS_PREFIX + [ + 'iam', 'detach-role-policy', '--role-name', self.role_name, + '--policy-arn', self.policy_arn + ] + + _, stderror, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + if retcode != 0: + logging.warning('Failed to delete role policy! %s', stderror) + + cmd = util.AWS_PREFIX + [ + 'iam', 'delete-policy', '--policy-arn', self.policy_arn + ] + + _, stderror, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + if retcode != 0: + logging.warning('Failed to delete policy! %s', stderror) + + cmd = util.AWS_PREFIX + [ + 'iam', 'delete-role', '--role-name', self.role_name + ] + + _, stderror, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + if retcode != 0: + logging.warning('Failed to delete role! %s', stderror) + + def GetRoleArn(self): + """Returns the role's Amazon Resource Name (ARN).""" + return self.role_arn + + def _RoleExists(self): + """Returns true if the IAM role exists.""" + cmd = util.AWS_PREFIX + ['iam', 'get-role', '--role-name', self.role_name] + stdout, _, retcode = vm_util.IssueCommand( + cmd, suppress_warning=True, raise_on_failure=False) + return retcode == 0 and stdout and json.loads(stdout)['Role'] + + def _PolicyExists(self): + """Returns true if the IAM policy used by the role exists.""" + cmd = util.AWS_PREFIX + [ + 'iam', 'get-policy', '--policy-arn', self.policy_arn + ] + stdout, _, retcode = vm_util.IssueCommand( + cmd, suppress_warning=True, raise_on_failure=False) + return retcode == 0 and stdout and json.loads(stdout)['Policy'] diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_load_balancer.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_load_balancer.py new file mode 100644 index 0000000..cdf33a2 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_load_balancer.py @@ -0,0 +1,150 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing classes related to AWS's load balancers.""" + +import json + +from absl import flags +from perfkitbenchmarker import resource +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.aws import util + +FLAGS = flags.FLAGS + + +class TargetGroup(resource.BaseResource): + """Class represeting an AWS target group.""" + + def __init__(self, vpc, port): + """Initializes the TargetGroup object. + + Args: + vpc: AwsVpc object which contains the targets for load balancing. + port: The internal port that the load balancer connects to. 
+ """ + super(TargetGroup, self).__init__() + self.arn = None + self.region = vpc.region + self.name = 'pkb-%s' % FLAGS.run_uri + self.protocol = 'TCP' + self.port = port + self.vpc_id = vpc.id + + def _Create(self): + """Create the target group.""" + create_cmd = util.AWS_PREFIX + [ + '--region', self.region, + 'elbv2', 'create-target-group', + '--target-type', 'ip', + '--name', self.name, + '--protocol', self.protocol, + '--port', str(self.port), + '--vpc-id', self.vpc_id + ] + stdout, _, _ = vm_util.IssueCommand(create_cmd) + response = json.loads(stdout) + self.arn = response['TargetGroups'][0]['TargetGroupArn'] + + def _Delete(self): + """Delete the target group.""" + if self.arn is None: + return + delete_cmd = util.AWS_PREFIX + [ + '--region', self.region, + 'elbv2', 'delete-target-group', + '--target-group-arn', self.arn + ] + vm_util.IssueCommand(delete_cmd, raise_on_failure=False) + + +class LoadBalancer(resource.BaseResource): + """Class representing an AWS load balancer.""" + + def __init__(self, subnets): + """Initializes the LoadBalancer object. + + Args: + subnets: List of AwsSubnet objects. + """ + super(LoadBalancer, self).__init__() + self.region = subnets[0].region + self.name = 'pkb-%s' % FLAGS.run_uri + self.subnet_ids = [subnet.id for subnet in subnets] + self.type = 'network' + self.arn = None + self.dns_name = None + + def _Create(self): + """Create the load balancer.""" + create_cmd = util.AWS_PREFIX + [ + '--region', self.region, + 'elbv2', 'create-load-balancer', + '--name', self.name, + '--type', self.type, + '--tags'] + util.MakeFormattedDefaultTags() + # Add --subnets argument to the command. + create_cmd.append('--subnets') + create_cmd.extend(self.subnet_ids) + + stdout, _, _ = vm_util.IssueCommand(create_cmd) + load_balancer = json.loads(stdout)['LoadBalancers'][0] + self.arn = load_balancer['LoadBalancerArn'] + self.dns_name = load_balancer['DNSName'] + + def _Delete(self): + """Delete the load balancer.""" + if self.arn is None: + return + delete_cmd = util.AWS_PREFIX + [ + '--region', self.region, + 'elbv2', 'delete-load-balancer', + '--load-balancer-arn', self.arn + ] + vm_util.IssueCommand(delete_cmd, raise_on_failure=False) + + +class Listener(resource.BaseResource): + """Class representing an AWS listener.""" + + def __init__(self, load_balancer, target_group, port): + super(Listener, self).__init__() + self.load_balancer_arn = load_balancer.arn + self.target_group_arn = target_group.arn + self.port = port + self.protocol = target_group.protocol + self.region = target_group.region + + def _GetDefaultActions(self): + """Returns a JSON representation of the default actions for the listener.""" + actions = [{ + 'Type': 'forward', + 'TargetGroupArn': self.target_group_arn + }] + return json.dumps(actions) + + def _Create(self): + """Create the listener.""" + create_cmd = util.AWS_PREFIX + [ + '--region', self.region, + 'elbv2', 'create-listener', + '--load-balancer-arn', self.load_balancer_arn, + '--protocol', self.protocol, + '--port', str(self.port), + '--default-actions', self._GetDefaultActions() + ] + vm_util.IssueCommand(create_cmd) + + def _Delete(self): + """Listeners will be deleted along with their associated load balancers.""" + pass diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_logs.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_logs.py new file mode 100644 index 0000000..cf9c8cd --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_logs.py @@ -0,0 +1,101 @@ +# Copyright 2018 
PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing classes related to AWS CloudWatch Logs.""" + +import json + +from perfkitbenchmarker import resource +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.aws import util + + +class LogGroup(resource.BaseResource): + """Class representing a CloudWatch log group.""" + + def __init__(self, region, name, retention_in_days=7): + super(LogGroup, self).__init__() + self.region = region + self.name = name + self.retention_in_days = retention_in_days + + def _Create(self): + """Create the log group.""" + create_cmd = util.AWS_PREFIX + [ + '--region', self.region, + 'logs', 'create-log-group', + '--log-group-name', self.name + ] + vm_util.IssueCommand(create_cmd) + + def _Delete(self): + """Delete the log group.""" + delete_cmd = util.AWS_PREFIX + [ + '--region', self.region, + 'logs', 'delete-log-group', + '--log-group-name', self.name + ] + vm_util.IssueCommand(delete_cmd, raise_on_failure=False) + + def Exists(self): + """Returns True if the log group exists.""" + describe_cmd = util.AWS_PREFIX + [ + '--region', self.region, + 'logs', 'describe-log-groups', + '--log-group-name-prefix', self.name, + '--no-paginate' + ] + stdout, _, _ = vm_util.IssueCommand(describe_cmd) + log_groups = json.loads(stdout)['logGroups'] + group = next((group for group in log_groups + if group['logGroupName'] == self.name), None) + return bool(group) + + def _PostCreate(self): + """Set the retention policy.""" + put_cmd = util.AWS_PREFIX + [ + '--region', self.region, + 'logs', 'put-retention-policy', + '--log-group-name', self.name, + '--retention-in-days', str(self.retention_in_days) + ] + vm_util.IssueCommand(put_cmd) + + +def GetLogs(region, stream_name, group_name, token=None): + """Fetches the JSON formatted log stream starting at the token.""" + get_cmd = util.AWS_PREFIX + [ + '--region', region, + 'logs', 'get-log-events', + '--start-from-head', + '--log-group-name', group_name, + '--log-stream-name', stream_name, + ] + if token: + get_cmd.extend(['--next-token', token]) + stdout, _, _ = vm_util.IssueCommand(get_cmd) + return json.loads(stdout) + + +def GetLogStreamAsString(region, stream_name, log_group): + """Returns the messages of the log stream as a string.""" + log_lines = [] + token = None + events = [] + while token is None or events: + response = GetLogs(region, stream_name, log_group, token) + events = response['events'] + token = response['nextForwardToken'] + for event in events: + log_lines.append(event['message']) + return '\n'.join(log_lines) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_network.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_network.py new file mode 100644 index 0000000..61574da --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_network.py @@ -0,0 +1,957 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module containing classes related to AWS VM networking. + +The Firewall class provides a way of opening VM ports. The Network class allows +VMs to communicate via internal ips and isolates PerfKitBenchmarker VMs from +others in +the same project. See https://aws.amazon.com/documentation/vpc/ +for more information about AWS Virtual Private Clouds. +""" + +import json +import logging +import threading + +from absl import flags +from perfkitbenchmarker import context +from perfkitbenchmarker import errors +from perfkitbenchmarker import network +from perfkitbenchmarker import placement_group +from perfkitbenchmarker import providers +from perfkitbenchmarker import resource +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.aws import aws_placement_group +from perfkitbenchmarker.providers.aws import aws_vpc_endpoint +from perfkitbenchmarker.providers.aws import util + +_AWS_VPC = flags.DEFINE_string( + 'aws_vpc', None, + 'The static AWS VPC id to use. If unset, creates a new VPC.') +_AWS_SUBNET = flags.DEFINE_string( + 'aws_subnet', None, + 'The static AWS subnet id to use. Set value to "default" to use ' + 'default subnet. If unset, creates a new subnet.') +flags.DEFINE_bool('aws_efa', False, 'Whether to use an Elastic Fiber Adapter.') +flags.DEFINE_string('aws_efa_version', '1.12.1', + 'Version of AWS EFA to use (must also pass in --aws_efa).') +flags.DEFINE_integer('aws_efa_count', 1, 'The number of EFAs per instance.') +flags.DEFINE_multi_enum('aws_endpoint', [], ['s3'], + 'List of AWS endpoints to create') + +FLAGS = flags.FLAGS + + +REGION = 'region' +ZONE = 'zone' + + +class AwsFirewall(network.BaseFirewall): + """An object representing the AWS Firewall.""" + + CLOUD = providers.AWS + + def __init__(self): + self.firewall_set = set() + self.firewall_icmp_set = set() + self._lock = threading.Lock() + + def AllowIcmp(self, vm): + """Opens the ICMP protocol on the firewall. + + Args: + vm: The BaseVirtualMachine object to open the ICMP protocol for. + """ + source = '0.0.0.0/0' + + # region, group_id, source + entry = (vm.region, vm.group_id, source) + with self._lock: + if entry in self.firewall_icmp_set: + return + # When defining ICMP firewall rules using the aws cli, + # port specifies the type of ICMP traffic allowed, + # with -1 meaning all ICMP types + # https://docs.aws.amazon.com/cli/latest/reference/ec2/authorize-security-group-ingress.html + authorize_cmd = util.AWS_PREFIX + [ + 'ec2', + 'authorize-security-group-ingress', + '--region=%s' % vm.region, + '--group-id=%s' % vm.group_id, + '--protocol=icmp', + '--port=-1', + '--cidr=%s' % source] + util.IssueRetryableCommand( + authorize_cmd) + self.firewall_icmp_set.add(entry) + + def AllowPort(self, vm, start_port, end_port=None, source_range=None): + """Opens a port on the firewall. + + Args: + vm: The BaseVirtualMachine object to open the port for. + start_port: The first local port to open in a range. + end_port: The last local port to open in a range. 
If None, only start_port + will be opened. + source_range: List of source CIDRs to allow for this port. If None, all + sources are allowed. i.e. ['0.0.0.0/0'] + """ + if vm.is_static or vm.network.is_static: + return + self.AllowPortInSecurityGroup(vm.region, vm.group_id, start_port, end_port, + source_range) + + def AllowPortInSecurityGroup(self, + region, + security_group, + start_port, + end_port=None, + source_range=None): + """Opens a port on the firewall for a security group. + + Args: + region: The region of the security group + security_group: The security group in which to open the ports + start_port: The first local port to open in a range. + end_port: The last local port to open in a range. If None, only start_port + will be opened. + source_range: List of source CIDRs to allow for this port. + """ + end_port = end_port or start_port + source_range = source_range or ['0.0.0.0/0'] + for source in source_range: + entry = (start_port, end_port, region, security_group, source) + if entry in self.firewall_set: + continue + if self._RuleExists(region, security_group, start_port, end_port, source): + self.firewall_set.add(entry) + continue + with self._lock: + if entry in self.firewall_set: + continue + authorize_cmd = util.AWS_PREFIX + [ + 'ec2', + 'authorize-security-group-ingress', + '--region=%s' % region, + '--group-id=%s' % security_group, + '--port=%s-%s' % (start_port, end_port), + '--cidr=%s' % source, + ] + util.IssueRetryableCommand(authorize_cmd + ['--protocol=tcp']) + util.IssueRetryableCommand(authorize_cmd + ['--protocol=udp']) + self.firewall_set.add(entry) + + def _RuleExists(self, region, security_group, start_port, end_port, source): + """Whether the firewall rule exists in the VPC.""" + query_cmd = util.AWS_PREFIX + [ + 'ec2', + 'describe-security-groups', + '--region=%s' % region, + '--group-ids=%s' % security_group, + '--filters', + 'Name=ip-permission.cidr,Values={}'.format(source), + 'Name=ip-permission.from-port,Values={}'.format(start_port), + 'Name=ip-permission.to-port,Values={}'.format(end_port), + ] + stdout, _ = util.IssueRetryableCommand(query_cmd) + # "groups" will be an array of all the matching firewall rules + groups = json.loads(stdout)['SecurityGroups'] + return bool(groups) + + def DisallowAllPorts(self): + """Closes all ports on the firewall.""" + pass + + +class AwsVpc(resource.BaseResource): + """An object representing an Aws VPC.""" + + def __init__(self, region, vpc_id=None, regional_network_index=0): + super(AwsVpc, self).__init__(vpc_id is not None) + self.region = region + self.regional_network_index = regional_network_index + self.cidr = network.GetCidrBlock(self.regional_network_index, 0, 16) + self.id = vpc_id + # Subnets are assigned per-AZ. + # _subnet_index tracks the next unused 10.x.y.0/24 block. 
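+    # Illustrative example: for a VPC whose CIDR above is 10.0.0.0/16,
+    # NextSubnetCidrBlock() hands out 10.0.0.0/24, 10.0.1.0/24, ... in order
+    # (actual values come from network.GetCidrBlock).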
+ self._subnet_index = 0 + # Lock protecting _subnet_index + self._subnet_index_lock = threading.Lock() + self.default_security_group_id = None + if self.id: + self._SetSecurityGroupId() + self._endpoints = [ + aws_vpc_endpoint.CreateEndpointService(service, self) + for service in set(FLAGS.aws_endpoint) + ] + + def _Create(self): + """Creates the VPC.""" + create_cmd = util.AWS_PREFIX + [ + 'ec2', + 'create-vpc', + '--region=%s' % self.region, + '--cidr-block=%s' % self.cidr] + stdout, stderr, retcode = vm_util.IssueCommand( + create_cmd, raise_on_failure=False) + if 'VpcLimitExceeded' in stderr: + raise errors.Benchmarks.QuotaFailure(stderr) + if retcode: + raise errors.Resource.CreationError( + 'Failed to create Vpc: %s return code: %s' % (retcode, stderr)) + + response = json.loads(stdout) + self.id = response['Vpc']['VpcId'] + self._EnableDnsHostnames() + util.AddDefaultTags(self.id, self.region) + + def _PostCreate(self): + self._SetSecurityGroupId() + for endpoint in self._endpoints: + endpoint.Create() + + def _SetSecurityGroupId(self): + """Looks up the VPC default security group.""" + groups = self.GetSecurityGroups('default') + if len(groups) != 1: + raise ValueError('Expected one security group, got {} in {}'.format( + len(groups), groups)) + self.default_security_group_id = groups[0]['GroupId'] + logging.info('Default security group ID: %s', + self.default_security_group_id) + if FLAGS.aws_efa: + self._AllowSelfOutBound() + + def GetSecurityGroups(self, group_name=None): + cmd = util.AWS_PREFIX + [ + 'ec2', + 'describe-security-groups', + '--region', self.region, + '--filters', + 'Name=vpc-id,Values=' + self.id] + if group_name: + cmd.append('Name=group-name,Values={}'.format(group_name)) + stdout, _, _ = vm_util.IssueCommand(cmd) + return json.loads(stdout)['SecurityGroups'] + + def _Exists(self): + """Returns true if the VPC exists.""" + describe_cmd = util.AWS_PREFIX + [ + 'ec2', + 'describe-vpcs', + '--region=%s' % self.region, + '--filter=Name=vpc-id,Values=%s' % self.id] + stdout, _ = util.IssueRetryableCommand(describe_cmd) + response = json.loads(stdout) + vpcs = response['Vpcs'] + assert len(vpcs) < 2, 'Too many VPCs.' + return len(vpcs) > 0 + + def _EnableDnsHostnames(self): + """Sets the enableDnsHostnames attribute of this VPC to True. + + By default, instances launched in non-default VPCs are assigned an + unresolvable hostname. This breaks the hadoop benchmark. Setting the + enableDnsHostnames attribute to 'true' on the VPC resolves this. See: + http://docs.aws.amazon.com/AmazonVPC/latest/UserGuide/VPC_DHCP_Options.html + """ + enable_hostnames_command = util.AWS_PREFIX + [ + 'ec2', + 'modify-vpc-attribute', + '--region=%s' % self.region, + '--vpc-id', self.id, + '--enable-dns-hostnames', + '{ "Value": true }'] + + util.IssueRetryableCommand(enable_hostnames_command) + + def _PreDelete(self): + """See base class. + + Deletes the AWS endpoints if created. + """ + for endpoint in self._endpoints: + endpoint.Delete() + + def _Delete(self): + """Deletes the VPC.""" + delete_cmd = util.AWS_PREFIX + [ + 'ec2', + 'delete-vpc', + '--region=%s' % self.region, + '--vpc-id=%s' % self.id] + vm_util.IssueCommand(delete_cmd, raise_on_failure=False) + + def NextSubnetCidrBlock(self): + """Returns the next available /24 CIDR block in this VPC. + + Each VPC has a 10.0.0.0/16 CIDR block. + Each subnet is assigned a /24 within this allocation. + Calls to this method return the next unused /24. + + Returns: + A string representing the next available /24 block, in CIDR notation. 
+ Raises: + ValueError: when no additional subnets can be created. + """ + with self._subnet_index_lock: + if self._subnet_index >= (1 << 8) - 1: + raise ValueError('Exceeded subnet limit ({0}).'.format( + self._subnet_index)) + cidr = network.GetCidrBlock(self.regional_network_index, + self._subnet_index) + self._subnet_index += 1 + return cidr + + @vm_util.Retry() + def _AllowSelfOutBound(self): + """Allow outbound connections on all ports in the default security group. + + Details: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html + """ + cmd = util.AWS_PREFIX + [ + 'ec2', 'authorize-security-group-egress', + '--region', self.region, '--group-id', self.default_security_group_id, + '--protocol', 'all', '--source-group', self.default_security_group_id + ] + try: + vm_util.IssueCommand(cmd) + except errors.VmUtil.IssueCommandError as ex: + # do not retry if this rule already exists + if ex.message.find('InvalidPermission.Duplicate') == -1: + raise ex + + def AllowVpcPeerInBound(self, peer_vpc): + """Allow inbound connections on all ports in the default security group from peer vpc. + + Args: + peer_vpc: AwsVpc. Peer vpc to allow inbound traffic from. + """ + cmd = util.AWS_PREFIX + [ + 'ec2', 'authorize-security-group-ingress', + '--region=%s' % self.region, + '--group-id=%s' % self.default_security_group_id, + '--protocol=%s' % 'all', + '--cidr=%s' % peer_vpc.cidr + ] + vm_util.IssueRetryableCommand(cmd) + + +class AwsSubnet(resource.BaseResource): + """An object representing an Aws subnet.""" + + def __init__(self, zone, vpc_id, cidr_block='10.0.0.0/24', subnet_id=None): + super(AwsSubnet, self).__init__(subnet_id is not None) + self.zone = zone + self.region = util.GetRegionFromZone(zone) + self.vpc_id = vpc_id + self.id = subnet_id + self.cidr_block = cidr_block + + def _Create(self): + """Creates the subnet.""" + + create_cmd = util.AWS_PREFIX + [ + 'ec2', + 'create-subnet', + '--region=%s' % self.region, + '--vpc-id=%s' % self.vpc_id, + '--cidr-block=%s' % self.cidr_block] + if not util.IsRegion(self.zone): + create_cmd.append('--availability-zone=%s' % self.zone) + + stdout, _, _ = vm_util.IssueCommand(create_cmd) + response = json.loads(stdout) + self.id = response['Subnet']['SubnetId'] + util.AddDefaultTags(self.id, self.region) + + def _Delete(self): + """Deletes the subnet.""" + logging.info('Deleting subnet %s. This may fail if all instances in the ' + 'subnet have not completed termination, but will be retried.', + self.id) + delete_cmd = util.AWS_PREFIX + [ + 'ec2', + 'delete-subnet', + '--region=%s' % self.region, + '--subnet-id=%s' % self.id] + vm_util.IssueCommand(delete_cmd, raise_on_failure=False) + + def _Exists(self): + """Returns true if the subnet exists.""" + return bool(self.GetDict()) + + def GetDict(self): + """The 'aws ec2 describe-subnets' for this VPC / subnet id. + + Returns: + A dict of the single subnet or an empty dict if there are no subnets. + + Raises: + AssertionError: If there is more than one subnet. + """ + describe_cmd = util.AWS_PREFIX + [ + 'ec2', 'describe-subnets', + '--region=%s' % self.region, + '--filter=Name=vpc-id,Values=%s' % self.vpc_id + ] + if self.id: + describe_cmd.append('--filter=Name=subnet-id,Values=%s' % self.id) + stdout, _ = util.IssueRetryableCommand(describe_cmd) + response = json.loads(stdout) + subnets = response['Subnets'] + assert len(subnets) < 2, 'Too many subnets.' 
+ return subnets[0] if subnets else {} + + +class AwsInternetGateway(resource.BaseResource): + """An object representing an Aws Internet Gateway.""" + + def __init__(self, region, vpc_id=None): + super(AwsInternetGateway, self).__init__(vpc_id is not None) + self.region = region + self.vpc_id = None + self.id = None + self.attached = False + if vpc_id: + self.vpc_id = vpc_id + self.id = self.GetDict().get('InternetGatewayId') + # if a gateway was found then it is attached to this VPC + self.attached = bool(self.id) + + def _Create(self): + """Creates the internet gateway.""" + create_cmd = util.AWS_PREFIX + [ + 'ec2', + 'create-internet-gateway', + '--region=%s' % self.region] + stdout, _, _ = vm_util.IssueCommand(create_cmd) + response = json.loads(stdout) + self.id = response['InternetGateway']['InternetGatewayId'] + util.AddDefaultTags(self.id, self.region) + + def _Delete(self): + """Deletes the internet gateway.""" + delete_cmd = util.AWS_PREFIX + [ + 'ec2', + 'delete-internet-gateway', + '--region=%s' % self.region, + '--internet-gateway-id=%s' % self.id] + vm_util.IssueCommand(delete_cmd, raise_on_failure=False) + + def _Exists(self): + """Returns true if the internet gateway exists.""" + return bool(self.GetDict()) + + def GetDict(self): + """The 'aws ec2 describe-internet-gateways' for this VPC / gateway id. + + Returns: + A dict of the single gateway or an empty dict if there are no gateways. + + Raises: + AssertionError: If there is more than one internet gateway. + """ + describe_cmd = util.AWS_PREFIX + [ + 'ec2', + 'describe-internet-gateways', + '--region=%s' % self.region, + ] + if self.id: + describe_cmd.append('--filter=Name=internet-gateway-id,Values=%s' % + self.id) + elif self.vpc_id: + # Only query with self.vpc_id if the self.id is NOT set -- after calling + # Detach() this object will set still have a vpc_id but will be filtered + # out in a query if using attachment.vpc-id. + # Using self.vpc_id instead of self.attached as the init phase always + # sets it to False. + describe_cmd.append('--filter=Name=attachment.vpc-id,Values=%s' % + self.vpc_id) + else: + raise errors.Error('Must have a VPC id or a gateway id') + stdout, _ = util.IssueRetryableCommand(describe_cmd) + response = json.loads(stdout) + internet_gateways = response['InternetGateways'] + assert len(internet_gateways) < 2, 'Too many internet gateways.' 
+ return internet_gateways[0] if internet_gateways else {} + + def Attach(self, vpc_id): + """Attaches the internet gateway to the VPC.""" + if not self.attached: + self.vpc_id = vpc_id + attach_cmd = util.AWS_PREFIX + [ + 'ec2', + 'attach-internet-gateway', + '--region=%s' % self.region, + '--internet-gateway-id=%s' % self.id, + '--vpc-id=%s' % self.vpc_id] + util.IssueRetryableCommand(attach_cmd) + self.attached = True + + def Detach(self): + """Detaches the internet gateway from the VPC.""" + + def _suppress_failure(stdout, stderr, retcode): + """Suppresses Detach failure when internet gateway is in a bad state.""" + del stdout # unused + if retcode and ('InvalidInternetGatewayID.NotFound' in stderr or + 'Gateway.NotAttached' in stderr): + return True + return False + + if self.attached and not self.user_managed: + detach_cmd = util.AWS_PREFIX + [ + 'ec2', + 'detach-internet-gateway', + '--region=%s' % self.region, + '--internet-gateway-id=%s' % self.id, + '--vpc-id=%s' % self.vpc_id] + util.IssueRetryableCommand(detach_cmd, suppress_failure=_suppress_failure) + self.attached = False + + +class AwsRouteTable(resource.BaseResource): + """An object representing a route table.""" + + def __init__(self, region, vpc_id): + super(AwsRouteTable, self).__init__() + self.region = region + self.vpc_id = vpc_id + self.id: str = None # set by _PostCreate + + def _Create(self): + """Creates the route table. + + This is a no-op since every VPC has a default route table. + """ + pass + + def _Delete(self): + """Deletes the route table. + + This is a no-op since the default route table gets deleted with the VPC. + """ + pass + + @vm_util.Retry() + def _PostCreate(self): + """Gets data about the route table.""" + self.id = self.GetDict()[0]['RouteTableId'] + + def GetDict(self): + """Returns an array of the currently existing routes for this VPC.""" + describe_cmd = util.AWS_PREFIX + [ + 'ec2', + 'describe-route-tables', + '--region=%s' % self.region, + '--filters=Name=vpc-id,Values=%s' % self.vpc_id] + stdout, _ = util.IssueRetryableCommand(describe_cmd) + return json.loads(stdout)['RouteTables'] + + def RouteExists(self): + """Returns true if the 0.0.0.0/0 route already exists.""" + route_tables = self.GetDict() + if not route_tables: + return False + for route in route_tables[0].get('Routes', []): + if route.get('DestinationCidrBlock') == '0.0.0.0/0': + return True + return False + + def CreateRoute(self, internet_gateway_id): + """Adds a route to the internet gateway.""" + if self.RouteExists(): + logging.info('Internet route already exists.') + return + create_cmd = util.AWS_PREFIX + [ + 'ec2', + 'create-route', + '--region=%s' % self.region, + '--route-table-id=%s' % self.id, + '--gateway-id=%s' % internet_gateway_id, + '--destination-cidr-block=0.0.0.0/0'] + util.IssueRetryableCommand(create_cmd) + + def CreateVpcPeeringRoute(self, vpc_peering_id, destination_cidr): + """Adds a route to peer VPC.""" + create_cmd = util.AWS_PREFIX + [ + 'ec2', + 'create-route', + '--region=%s' % self.region, + '--route-table-id=%s' % self.id, + '--vpc-peering-connection-id=%s' % vpc_peering_id, + '--destination-cidr-block=%s' % destination_cidr] + util.IssueRetryableCommand(create_cmd) + + +class _AwsRegionalNetwork(network.BaseNetwork): + """Object representing regional components of an AWS network. + + The benchmark spec contains one instance of this class per region, which an + AwsNetwork may retrieve or create via _AwsRegionalNetwork.GetForRegion. + + Attributes: + region: string. The AWS region. 
+ vpc: an AwsVpc instance. + internet_gateway: an AwsInternetGateway instance. + route_table: an AwsRouteTable instance. The default route table. + """ + + _regional_network_count = 0 + _regional_network_lock = threading.Lock() + + CLOUD = providers.AWS + + def __repr__(self): + return '%s(%r)' % (self.__class__, self.__dict__) + + def __init__(self, region, vpc_id=None): + self.region = region + self.internet_gateway = AwsInternetGateway(region, vpc_id) + self.route_table = None + self.created = False + + # Locks to ensure that a single thread creates / deletes the instance. + self._create_lock = threading.Lock() + + # Tracks the number of AwsNetworks using this _AwsRegionalNetwork. + # Incremented by Create(); decremented by Delete(); + # When a Delete() call decrements _reference_count to 0, the RegionalNetwork + # is destroyed. + self._reference_count = 0 + self._reference_count_lock = threading.Lock() + + # Each regional network needs unique cidr_block for VPC peering. + with _AwsRegionalNetwork._regional_network_lock: + self.vpc = AwsVpc(self.region, vpc_id, + _AwsRegionalNetwork._regional_network_count) + self.cidr_block = network.GetCidrBlock( + _AwsRegionalNetwork._regional_network_count) + _AwsRegionalNetwork._regional_network_count += 1 + + @classmethod + def GetForRegion(cls, region, vpc_id=None): + """Retrieves or creates an _AwsRegionalNetwork. + + Args: + region: string. AWS region name. + vpc_id: string. AWS VPC id. + + Returns: + _AwsRegionalNetwork. If an _AwsRegionalNetwork for the same region already + exists in the benchmark spec, that instance is returned. Otherwise, a new + _AwsRegionalNetwork is created and returned. + """ + benchmark_spec = context.GetThreadBenchmarkSpec() + if benchmark_spec is None: + raise errors.Error('GetNetwork called in a thread without a ' + 'BenchmarkSpec.') + key = cls.CLOUD, REGION, region + # Because this method is only called from the AwsNetwork constructor, which + # is only called from AwsNetwork.GetNetwork, we already hold the + # benchmark_spec.networks_lock. + if key not in benchmark_spec.regional_networks: + benchmark_spec.regional_networks[key] = cls(region, vpc_id) + return benchmark_spec.regional_networks[key] + + def Create(self): + """Creates the network.""" + with self._reference_count_lock: + assert self._reference_count >= 0, self._reference_count + self._reference_count += 1 + + # Access here must be synchronized. The first time the block is executed, + # the network will be created. Subsequent attempts to create the + # network block until the initial attempt completes, then return. + with self._create_lock: + if self.created: + return + + self.vpc.Create() + + self.internet_gateway.Create() + self.internet_gateway.Attach(self.vpc.id) + + if self.route_table is None: + self.route_table = AwsRouteTable(self.region, self.vpc.id) + self.route_table.Create() + self.route_table.CreateRoute(self.internet_gateway.id) + + self.created = True + + def Delete(self): + """Deletes the network.""" + # Only actually delete if there are no more references. 
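+    # For example, if three AwsNetwork instances share this regional network,
+    # only the third call to Delete() (reference count dropping to 0) tears
+    # down the internet gateway and VPC below.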
+ with self._reference_count_lock: + assert self._reference_count >= 1, self._reference_count + self._reference_count -= 1 + if self._reference_count: + return + + if self.created: + self.internet_gateway.Detach() + self.internet_gateway.Delete() + self.vpc.Delete() + + +class AwsNetworkSpec(network.BaseNetworkSpec): + """Configuration for creating an AWS network.""" + + def __init__(self, zone, vpc_id=None, subnet_id=None): + super(AwsNetworkSpec, self).__init__(zone) + if vpc_id or subnet_id: + logging.info('Confirming vpc (%s) and subnet (%s) selections', vpc_id, + subnet_id) + my_subnet = AwsSubnet(self.zone, vpc_id, subnet_id=subnet_id).GetDict() + self.vpc_id = my_subnet['VpcId'] + self.subnet_id = my_subnet['SubnetId'] + self.cidr_block = my_subnet['CidrBlock'] + logging.info('Using vpc %s subnet %s cidr %s', self.vpc_id, + self.subnet_id, self.cidr_block) + else: + self.vpc_id = None + self.subnet_id = None + self.cidr_block = None + + +def _get_default_vpc_id(region: str) -> str: + """Returns the default VPC ID for the region. + + Creates a default VPC if one did not exist previously + + + Args: + region: Region of the default VPC. + + Returns: Default VPC ID + + Raises: + UnsupportedConfigError: When default VPC does not exist and cannot be + created. + """ + vpc_cmd = util.AWS_PREFIX + [ + 'ec2', 'describe-vpcs', + '--region', region, + '--filters', 'Name=isDefault,Values=true' + ] + stdout, _ = vm_util.IssueRetryableCommand(vpc_cmd) + vpcs = json.loads(stdout)['Vpcs'] + if vpcs: + return vpcs[0]['VpcId'] + create_cmd = util.AWS_PREFIX + [ + 'ec2', 'create-default-vpc', '--region', region, + ] + stdout, _, ret = vm_util.IssueCommand(create_cmd, raise_on_failure=False) + if ret: + raise errors.Benchmarks.UnsupportedConfigError( + f'AWS default VPC does not exist for region {region}.') + return json.loads(stdout)['Vpc']['VpcId'] + + +def _get_default_subnet_id(zone: str) -> str: + """Returns the default subnet ID for the zone. + + Creates a default subnet if one did not exist previously + + + Args: + zone: Zone of the default subnet. + + Returns: Default Subnet ID + + Raises: + UnsupportedConfigError: When default subnet does not exist and cannot be + created. + """ + region = util.GetRegionFromZone(zone) + subnet_cmd = util.AWS_PREFIX + [ + 'ec2', 'describe-subnets', + '--region', region, '--filter', + f'Name=availabilityZone,Values={zone}', + 'Name=defaultForAz,Values=true' + ] + stdout, _ = vm_util.IssueRetryableCommand(subnet_cmd) + subnets = json.loads(stdout)['Subnets'] + if subnets: + return subnets[0]['SubnetId'] + create_cmd = util.AWS_PREFIX + [ + 'ec2', 'create-default-subnet', + '--region', region, + '--availability-zone', zone + ] + stdout, _, ret = vm_util.IssueCommand(create_cmd, raise_on_failure=False) + if ret: + raise errors.Benchmarks.UnsupportedConfigError( + f'AWS default subnet does not exist for zone {zone}.') + return json.loads(stdout)['Subnet']['SubnetId'] + + +class AwsNetwork(network.BaseNetwork): + """Object representing an AWS Network. + + Attributes: + region: The AWS region the Network is in. + regional_network: The AwsRegionalNetwork for 'region'. + subnet: the AwsSubnet for this zone. + placement_group: An AwsPlacementGroup instance. + """ + + CLOUD = providers.AWS + + def __repr__(self): + return '%s(%r)' % (self.__class__, self.__dict__) + + def __init__(self, spec): + """Initializes AwsNetwork instances. + + Args: + spec: An AwsNetworkSpec object. 
+ """ + super(AwsNetwork, self).__init__(spec) + self.region = util.GetRegionFromZone(spec.zone) + self.regional_network = _AwsRegionalNetwork.GetForRegion( + self.region, spec.vpc_id) + self.subnet = None + self.vpc_peering = None + if (FLAGS.placement_group_style == + placement_group.PLACEMENT_GROUP_NONE): + self.placement_group = None + else: + placement_group_spec = aws_placement_group.AwsPlacementGroupSpec( + 'AwsPlacementGroupSpec', flag_values=FLAGS, zone=spec.zone) + self.placement_group = aws_placement_group.AwsPlacementGroup( + placement_group_spec) + self.is_static = False + if spec.vpc_id: + self.is_static = True + self.subnet = AwsSubnet( + self.zone, + spec.vpc_id, + cidr_block=self.regional_network.cidr_block, + subnet_id=spec.subnet_id) + + @staticmethod + def _GetNetworkSpecFromVm(vm): + """Returns an AwsNetworkSpec created from VM attributes and flags.""" + if _AWS_SUBNET.value == 'default': + vpc_id = _get_default_vpc_id(vm.region) + subnet_id = _get_default_subnet_id(vm.zone) + else: + vpc_id = _AWS_VPC.value + subnet_id = _AWS_SUBNET.value + return AwsNetworkSpec(vm.zone, vpc_id, subnet_id) + + def Create(self): + """Creates the network.""" + self.regional_network.Create() + + if self.subnet is None: + cidr = self.regional_network.vpc.NextSubnetCidrBlock() + self.subnet = AwsSubnet(self.zone, self.regional_network.vpc.id, + cidr_block=cidr) + self.subnet.Create() + if self.placement_group: + self.placement_group.Create() + + def Delete(self): + """Deletes the network.""" + if self.subnet: + self.subnet.Delete() + if self.placement_group: + self.placement_group.Delete() + if hasattr(self, 'vpc_peering') and self.vpc_peering: + self.vpc_peering.Delete() + self.regional_network.Delete() + + def Peer(self, peering_network): + """Peers the network with the peering_network. + + This method is used for VPC peering. It will connect 2 VPCs together. + + Args: + peering_network: BaseNetwork. The network to peer with. + """ + + # Skip Peering if the networks are the same + if self.regional_network is peering_network.regional_network: + return + + spec = network.BaseVPCPeeringSpec(self.regional_network, + peering_network.regional_network) + self.vpc_peering = AwsVpcPeering(spec) + peering_network.vpc_peering = self.vpc_peering + self.vpc_peering.Create() + + @classmethod + def _GetKeyFromNetworkSpec(cls, spec): + """Returns a key used to register Network instances.""" + return (cls.CLOUD, ZONE, spec.zone) + + +class AwsVpcPeering(network.BaseVPCPeering): + """Object containing all information needed to create a VPC Peering Object.""" + + def _Create(self): + """Creates the peering object. 
+ + Documentation on creating a vpc object: + https://docs.aws.amazon.com/vpc/latest/peering/vpc-pg.pdf + """ + # Creates Peering Connection + create_cmd = util.AWS_PREFIX + [ + 'ec2', + 'create-vpc-peering-connection', + '--region=%s' % self.network_a.region, + '--peer-region=%s' % self.network_b.region, + '--vpc-id=%s' % self.network_a.vpc.id, + '--peer-vpc-id=%s' % self.network_b.vpc.id] + + stdout, _ = vm_util.IssueRetryableCommand(create_cmd) + response = json.loads(stdout) + + self.id = response['VpcPeeringConnection'][ + 'VpcPeeringConnectionId'] + + # Accepts Peering Connection + accept_cmd = util.AWS_PREFIX + [ + 'ec2', + 'accept-vpc-peering-connection', + '--region=%s' % self.network_b.region, + '--vpc-peering-connection-id=%s' % self.id] + vm_util.IssueRetryableCommand(accept_cmd) + + util.AddDefaultTags(self.id, self.network_a.region) + logging.info('Creating VPC peering between %s and %s', + self.network_a.vpc.cidr, self.network_b.vpc.cidr) + + # Adds VPC peering to both networks' route tables + self.network_a.route_table.CreateVpcPeeringRoute(self.id, + self.network_b.vpc.cidr) + self.network_b.route_table.CreateVpcPeeringRoute(self.id, + self.network_a.vpc.cidr) + + # Updates security group to allow inbound traffic from peering networks + self.network_a.vpc.AllowVpcPeerInBound(self.network_b.vpc) + self.network_b.vpc.AllowVpcPeerInBound(self.network_a.vpc) + + def _Delete(self): + """Creates the deletes the peering object.""" + delete_cmd = util.AWS_PREFIX + [ + 'ec2', + 'delete-vpc-peering-connection', + '--region=%s' % self.network_a.region, + '--vpc-peering-connection-id=%s' % self.id] + vm_util.IssueCommand(delete_cmd) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_nfs_service.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_nfs_service.py new file mode 100644 index 0000000..001733d --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_nfs_service.py @@ -0,0 +1,256 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""AWS NFS implementation. + +See https://aws.amazon.com/efs/ + +This launches an EFS instance and creates a mount point. Individual AwsDisks +will then mount the share. + +The AfsNfsService object is a resource.BaseResource that has two resources +underneath it: +1. A resource to connect to the filer. +2. A resource to connect to the mount point on the filer. + +Lifecycle: +1. EFS service created and blocks until it is available, as it is needed to + make the mount point. +2. Issues a non-blocking call to create the mount point. Does not block as the + NfsDisk will block on it being available. +3. The NfsDisk then mounts the mount point and uses the disk like normal. +4. On teardown the mount point is first deleted. Blocks on that returning. +5. The EFS service is then deleted. Does not block as can take some time. 
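Editor's note: the lifecycle described above maps onto a handful of EFS API calls. A rough CLI-level sketch of steps 1 and 2, using placeholder region, subnet, and security-group IDs rather than anything PKB computes:

```python
import json
import subprocess
import time

REGION = 'us-west-2'          # placeholder
SUBNET_ID = 'subnet-0123'     # placeholder
SECURITY_GROUP = 'sg-0123'    # placeholder


def efs(*args):
    """Runs an 'aws efs' subcommand and returns its parsed JSON output."""
    cmd = ['aws', '--region', REGION, 'efs', *args]
    return json.loads(subprocess.check_output(cmd))


# Step 1: create the file system (filer) with an idempotency token and block
# until it is available, since the mount target needs it.
fs_id = efs('create-file-system', '--creation-token', 'nfs-demo')['FileSystemId']
while efs('describe-file-systems', '--file-system-id', fs_id)[
        'FileSystems'][0]['LifeCycleState'] != 'available':
    time.sleep(5)

# Step 2: create the mount target in one subnet; clients later mount
# <fs_id>.efs.<region>.amazonaws.com over NFS 4.1 (step 3 above).
mt_id = efs('create-mount-target', '--file-system-id', fs_id,
            '--subnet-id', SUBNET_ID,
            '--security-groups', SECURITY_GROUP)['MountTargetId']
print('mount target', mt_id, 'starting up')
```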
+""" + +import json +import logging + +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import nfs_service +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.aws import aws_network +from perfkitbenchmarker.providers.aws import util + +FLAGS = flags.FLAGS + + +class AwsNfsService(nfs_service.BaseNfsService): + """An AWS NFS resource. + + Creates the AWS EFS file system and mount point for use with NFS clients. + + See https://aws.amazon.com/efs/ + """ + + CLOUD = providers.AWS + NFS_TIERS = ('generalPurpose', 'maxIO') + DEFAULT_NFS_VERSION = '4.1' + DEFAULT_TIER = 'generalPurpose' + + def __init__(self, disk_spec, zone): + super(AwsNfsService, self).__init__(disk_spec, zone) + self.region = util.GetRegionFromZone(self.zone) + self.aws_commands = AwsEfsCommands(self.region) + self.disk_spec.disk_size = 0 + self.filer_id = None + self.mount_id = None + self.throughput_mode = FLAGS.efs_throughput_mode + self.provisioned_throughput = FLAGS.efs_provisioned_throughput + + @property + def network(self): + network_spec = aws_network.AwsNetworkSpec(self.zone) + return aws_network.AwsNetwork.GetNetworkFromNetworkSpec(network_spec) + + @property + def subnet_id(self): + if hasattr(self.network, 'subnet'): + return self.network.subnet.id + else: + raise errors.Config.InvalidValue('No subnet in network %s' % self.network) + + @property + def security_group(self): + if hasattr(self.network, 'vpc'): + return self.network.vpc.default_security_group_id + # not required when making the mount target + return None + + def _Create(self): + logging.info('Creating NFS resource, subnet: %s, security group: %s', + self.subnet_id, self.security_group) + self._CreateFiler() + logging.info('Waiting for filer to start up') + self.aws_commands.WaitUntilFilerAvailable(self.filer_id) + # create the mount point but do not wait for it, superclass will call the + # _IsReady() method. 
+ self._CreateMount() + + def _Delete(self): + # deletes on the file-system and mount-target are immediate + self._DeleteMount() + if not FLAGS.aws_delete_file_system: + return + self._DeleteFiler() + + def GetRemoteAddress(self): + if self.filer_id is None: + raise errors.Resource.RetryableGetError('Filer not created') + return '{name}.efs.{region}.amazonaws.com'.format( + name=self.filer_id, region=self.region) + + def _IsReady(self): + return self.aws_commands.IsMountAvailable(self.mount_id) + + def _CreateFiler(self): + """Creates the AWS EFS service.""" + if self.filer_id: + logging.warning('_CreateFiler() already called for %s', self.filer_id) + return + if FLAGS.aws_efs_token: + filer = self.aws_commands.GetFiler(FLAGS.aws_efs_token) + if filer: + self.nfs_tier = filer['PerformanceMode'] + self.filer_id = filer['FileSystemId'] + self.disk_spec.disk_size = int( + round(filer['SizeInBytes']['Value'] / 10.0 ** 9)) + return + token = FLAGS.aws_efs_token or 'nfs-token-%s' % FLAGS.run_uri + self.filer_id = self.aws_commands.CreateFiler( + token, self.nfs_tier, self.throughput_mode, self.provisioned_throughput) + self.aws_commands.AddTagsToFiler(self.filer_id) + logging.info('Created filer %s with address %s', self.filer_id, + self.GetRemoteAddress()) + + def _CreateMount(self): + """Creates an NFS mount point on an EFS service.""" + if self.mount_id: + logging.warning('_CreateMount() already called for %s', self.mount_id) + return + if not self.filer_id: + raise errors.Resource.CreationError('Did not create a filer first') + logging.info('Creating NFS mount point') + self.mount_id = self.aws_commands.CreateMount( + self.filer_id, self.subnet_id, self.security_group) + logging.info('Mount target %s starting up', self.mount_id) + + def _DeleteMount(self): + """Deletes the EFS mount point. + """ + if not self.mount_id: + return + logging.info('Deleting NFS mount mount %s', self.mount_id) + self.aws_commands.DeleteMount(self.mount_id) + self.mount_id = None + + def _DeleteFiler(self): + """Deletes the EFS service. + + Raises: + RetryableDeletionError: If the mount point exists. + """ + if not self.filer_id: + return + if self.mount_id: + # this isn't retryable as the mount point wasn't deleted + raise errors.Resource.RetryableDeletionError( + 'Did not delete mount point first') + logging.info('Deleting NFS filer %s', self.filer_id) + self.aws_commands.DeleteFiler(self.filer_id) + self.filer_id = None + + +class AwsEfsCommands(object): + """Commands for interacting with AWS EFS. + + Args: + region: AWS region for the NFS service. + """ + + def __init__(self, region): + self.efs_prefix = util.AWS_PREFIX + ['--region', region, 'efs'] + + def GetFiler(self, token): + """Returns the filer using the creation token or None.""" + args = ['describe-file-systems', '--creation-token', token] + response = self._IssueAwsCommand(args) + file_systems = response['FileSystems'] + if not file_systems: + return None + assert len(file_systems) < 2, 'Too many file systems.' 
+ return file_systems[0] + + def CreateFiler(self, token, nfs_tier, throughput_mode, + provisioned_throughput): + args = ['create-file-system', '--creation-token', token] + if nfs_tier is not None: + args += ['--performance-mode', nfs_tier] + args += ['--throughput-mode', throughput_mode] + if throughput_mode == 'provisioned': + args += ['--provisioned-throughput-in-mibps', provisioned_throughput] + return self._IssueAwsCommand(args)['FileSystemId'] + + def AddTagsToFiler(self, filer_id): + tags = util.MakeFormattedDefaultTags() + args = ['create-tags', '--file-system-id', filer_id, '--tags'] + tags + self._IssueAwsCommand(args, False) + + @vm_util.Retry() + def WaitUntilFilerAvailable(self, filer_id): + if not self._IsAvailable('describe-file-systems', '--file-system-id', + 'FileSystems', filer_id): + raise errors.Resource.RetryableCreationError( + '{} not ready'.format(filer_id)) + + @vm_util.Retry() + def DeleteFiler(self, file_system_id): + args = self.efs_prefix + [ + 'delete-file-system', '--file-system-id', file_system_id] + _, stderr, retcode = vm_util.IssueCommand(args, raise_on_failure=False) + if retcode and 'FileSystemInUse' in stderr: + raise Exception('Mount Point hasn\'t finished deleting.') + + def CreateMount(self, file_system_id, subnet_id, security_group=None): + args = [ + 'create-mount-target', '--file-system-id', file_system_id, + '--subnet-id', subnet_id + ] + if security_group: + args += ['--security-groups', security_group] + return self._IssueAwsCommand(args)['MountTargetId'] + + def IsMountAvailable(self, mount_target_id): + if mount_target_id is None: + # caller called _IsReady() before the mount point was created + return False + return self._IsAvailable('describe-mount-targets', '--mount-target-id', + 'MountTargets', mount_target_id) + + def DeleteMount(self, mount_target_id): + self._IssueAwsCommand( + ['delete-mount-target', '--mount-target-id', mount_target_id], False) + + def _IsAvailable(self, describe_cmd, id_attr, response_attribute, id_value): + describe = self._IssueAwsCommand([describe_cmd, id_attr, id_value]) + status = describe[response_attribute][0].get('LifeCycleState') + return status == 'available' + + def _IssueAwsCommand(self, args, return_json=True): + args = self.efs_prefix + [str(arg) for arg in args] + stdout, _, retcode = vm_util.IssueCommand(args, raise_on_failure=False) + if retcode: + return None + return json.loads(stdout) if return_json else stdout diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_placement_group.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_placement_group.py new file mode 100644 index 0000000..08a00a2 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_placement_group.py @@ -0,0 +1,117 @@ +# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Class to represent an AWS Placement Group object. + +Cloud specific implementations of Placement Group. 
+""" + +import json +import uuid + +from absl import flags +from perfkitbenchmarker import placement_group +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.configs import option_decoders +from perfkitbenchmarker.providers.aws import util + + +FLAGS = flags.FLAGS + + +class AwsPlacementGroupSpec(placement_group.BasePlacementGroupSpec): + """Object containing the information needed to create an AwsPlacementGroup. + + Attributes: + zone: The AWS zone the Placement Group is in. + """ + + CLOUD = providers.AWS + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword + arguments to construct in order to decode the named option. + """ + result = super(AwsPlacementGroupSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'placement_group_style': (option_decoders.EnumDecoder, { + 'valid_values': placement_group.PLACEMENT_GROUP_OPTIONS, + 'default': placement_group.PLACEMENT_GROUP_CLUSTER, + }) + }) + return result + + +class AwsPlacementGroup(placement_group.BasePlacementGroup): + """Object representing an AWS Placement Group.""" + + CLOUD = providers.AWS + + def __init__(self, aws_placement_group_spec): + """Init method for AwsPlacementGroup. + + Args: + aws_placement_group_spec: Object containing the + information needed to create an AwsPlacementGroup. + """ + super(AwsPlacementGroup, self).__init__(aws_placement_group_spec) + self.name = ( + 'perfkit-%s-%s' % (FLAGS.run_uri, str(uuid.uuid4())[-12:])) + self.region = util.GetRegionFromZone(self.zone) + self.strategy = aws_placement_group_spec.placement_group_style + + def _Create(self): + """Creates the Placement Group.""" + formatted_tags = util.FormatTagSpecifications('placement-group', + util.MakeDefaultTags()) + + create_cmd = util.AWS_PREFIX + [ + 'ec2', + 'create-placement-group', + '--region=%s' % self.region, + '--group-name=%s' % self.name, + '--strategy=%s' % self.strategy, + '--tag-specifications=%s' % formatted_tags + ] + + vm_util.IssueCommand(create_cmd) + + def _Delete(self): + """Deletes the Placement Group.""" + delete_cmd = util.AWS_PREFIX + [ + 'ec2', + 'delete-placement-group', + '--region=%s' % self.region, + '--group-name=%s' % self.name] + # Failed deletes are ignorable (probably already deleted). + vm_util.IssueCommand(delete_cmd, raise_on_failure=False) + + def _Exists(self): + """Returns true if the Placement Group exists.""" + describe_cmd = util.AWS_PREFIX + [ + 'ec2', + 'describe-placement-groups', + '--region=%s' % self.region, + '--filter=Name=group-name,Values=%s' % self.name] + stdout, _ = util.IssueRetryableCommand(describe_cmd) + response = json.loads(stdout) + placement_groups = response['PlacementGroups'] + assert len(placement_groups) < 2, 'Too many placement groups.' + return bool(placement_groups) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_relational_db.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_relational_db.py new file mode 100644 index 0000000..2697785 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_relational_db.py @@ -0,0 +1,847 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Managed relational database provisioning and teardown for AWS RDS.""" + + +import datetime +import json +import logging +import time + +from absl import flags +from perfkitbenchmarker import providers +from perfkitbenchmarker import relational_db +from perfkitbenchmarker import sql_engine_utils +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.aws import aws_disk +from perfkitbenchmarker.providers.aws import aws_network +from perfkitbenchmarker.providers.aws import util + +FLAGS = flags.FLAGS + + +DEFAULT_MYSQL_VERSION = '5.7.16' +DEFAULT_POSTGRES_VERSION = '9.6.9' + +DEFAULT_MYSQL_AURORA_VERSION = '5.7.12' +DEFAULT_MYSQL56_AURORA_VERSION = '5.6.10a' +DEFAULT_POSTGRES_AURORA_VERSION = '9.6.9' +DEFAULT_SQLSERVER_VERSION = '14.00.3223.3.v1' + +IS_READY_TIMEOUT = 60 * 60 * 1 # 1 hour (RDS HA takes a long time to prepare) + +_MAP_ENGINE_TO_DEFAULT_VERSION = { + sql_engine_utils.MYSQL: DEFAULT_MYSQL_VERSION, + sql_engine_utils.AURORA_MYSQL: DEFAULT_MYSQL_AURORA_VERSION, + sql_engine_utils.AURORA_MYSQL56: DEFAULT_MYSQL56_AURORA_VERSION, + sql_engine_utils.POSTGRES: DEFAULT_POSTGRES_VERSION, + sql_engine_utils.AURORA_POSTGRES: DEFAULT_POSTGRES_AURORA_VERSION, + sql_engine_utils.SQLSERVER_EXPRESS: DEFAULT_SQLSERVER_VERSION, + sql_engine_utils.SQLSERVER_STANDARD: DEFAULT_SQLSERVER_VERSION, + sql_engine_utils.SQLSERVER_ENTERPRISE: DEFAULT_SQLSERVER_VERSION, +} + +_AURORA_ENGINES = ( + sql_engine_utils.AURORA_MYSQL56, sql_engine_utils.AURORA_MYSQL, + sql_engine_utils.AURORA_POSTGRES) + +_SQL_SERVER_ENGINES = ( + sql_engine_utils.SQLSERVER_EXPRESS, + sql_engine_utils.SQLSERVER_STANDARD, + sql_engine_utils.SQLSERVER_ENTERPRISE) + +_RDS_ENGINES = ( + sql_engine_utils.MYSQL, + sql_engine_utils.POSTGRES, + sql_engine_utils.SQLSERVER_EXPRESS, + sql_engine_utils.SQLSERVER_STANDARD, + sql_engine_utils.SQLSERVER_ENTERPRISE) + +MYSQL_SUPPORTED_MAJOR_VERSIONS = ['5.7', '8.0'] +POSTGRES_SUPPORTED_MAJOR_VERSIONS = ['9.6', '10', '11', '12', '13'] + + + +class AwsRelationalDbCrossRegionError(Exception): + pass + + +class AwsRelationalDbParameterError(Exception): + """Exceptions for invalid Db parameters.""" + pass + + +class AwsRelationalDb(relational_db.BaseRelationalDb): + """An object representing an AWS RDS managed relational database. + + Currenty MySQL and Postgres are supported. This class requires that a + client vm be available as an attribute on the instance before Create() is + called, which is the current behavior of PKB. This is necessary to setup the + networking correctly. The following steps are performed to provision the + database: + 1. get the client's VPC + 2. get the client's zone + 3. create a new subnet in the VPC's region that is different from the + client's zone + 4. create a new db subnet group using the client's zone, and the newly + created zone + 5. authorize Postgres traffic on the VPC's default security group + 6. create the RDS instance in the requested region using the new db + subnet group and VPC security group. + + On teardown, all resources are deleted. 
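Editor's note: a condensed CLI-level sketch of steps 4 through 6 in the list above, with placeholder identifiers (the class below issues the equivalent commands using values derived from the client VM's network):

```python
import subprocess

REGION = 'us-west-2'                      # placeholder
SUBNETS = ['subnet-aaa', 'subnet-bbb']    # one per availability zone, placeholders
SECURITY_GROUP = 'sg-0123'                # VPC default security group, placeholder

# Step 4: a DB subnet group spanning two availability zones.
subprocess.check_call([
    'aws', 'rds', 'create-db-subnet-group',
    '--region', REGION,
    '--db-subnet-group-name', 'demo-db-subnet-group',
    '--db-subnet-group-description', 'demo',
    '--subnet-ids', *SUBNETS,
])

# Step 5: allow DB traffic (Postgres port shown) within the default security group.
subprocess.check_call([
    'aws', 'ec2', 'authorize-security-group-ingress',
    '--region', REGION,
    '--group-id', SECURITY_GROUP,
    '--source-group', SECURITY_GROUP,
    '--protocol', 'tcp', '--port', '5432',
])

# Step 6: the RDS instance itself, attached to both resources above.
subprocess.check_call([
    'aws', 'rds', 'create-db-instance',
    '--region', REGION,
    '--db-instance-identifier', 'demo-postgres',
    '--engine', 'postgres',
    '--db-instance-class', 'db.m5.large',
    '--allocated-storage', '100',
    '--master-username', 'demo',
    '--master-user-password', 'change-me-please',
    '--db-subnet-group-name', 'demo-db-subnet-group',
    '--vpc-security-group-ids', SECURITY_GROUP,
])
```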
+ + Note that the client VM's region and the region requested for the database + must be the same. + + At the moment there is no way to specify the primary zone when creating a + high availability instance, which means that the client and server may + be launched in different zones, which hurts network performance. + In other words, the 'zone' attribute on the relational_db db_spec + has no effect, and is only used to specify the region. + + To filter out runs that cross zones, be sure to check the sample metadata for + 'zone' (client's zone), 'relational_db_zone' (primary RDS zone), + and 'relational_db_secondary_zone' (secondary RDS zone). + + If the instance was NOT launched in the high availability configuration, the + server will be launched in the zone requested, and + relational_db_secondary_zone will not exist in the metadata. + """ + CLOUD = providers.AWS + + def __init__(self, relational_db_spec): + super(AwsRelationalDb, self).__init__(relational_db_spec) + self.cluster_id = None + self.all_instance_ids = [] + self.primary_zone = None + self.secondary_zone = None + self.parameter_group = None + + if hasattr(self.spec, 'zones') and self.spec.zones is not None: + self.zones = self.spec.zones + else: + self.zones = [self.spec.db_spec.zone] + + self.region = util.GetRegionFromZones(self.zones) + self.subnets_owned_by_db = [] + self.subnets_used_by_db = [] + + self.unmanaged_db_exists = None if self.is_managed_db else False + + # dependencies which will be created + self.db_subnet_group_name: str = None + self.security_group_id: str = None + + def GetResourceMetadata(self): + """Returns the metadata associated with the resource. + + All keys will be prefaced with relational_db before + being published (done in publisher.py). + + Returns: + metadata: dict of AWS Managed DB metadata. + """ + metadata = super(AwsRelationalDb, self).GetResourceMetadata() + metadata.update({ + 'zone': self.primary_zone, + }) + + if self.spec.high_availability: + metadata.update({ + 'secondary_zone': self.secondary_zone, + }) + + if hasattr(self.spec.db_disk_spec, 'iops'): + metadata.update({ + 'disk_iops': self.spec.db_disk_spec.iops, + }) + + return metadata + + @staticmethod + def GetDefaultEngineVersion(engine): + """Returns the default version of a given database engine. + + Args: + engine (string): type of database (my_sql or postgres). + Returns: + (string): Default engine version. + Raises: + Exception: If unrecognized engine is specified. + """ + if engine not in _MAP_ENGINE_TO_DEFAULT_VERSION: + raise Exception('Unspecified default version for {0}'.format(engine)) + return _MAP_ENGINE_TO_DEFAULT_VERSION[engine] + + def _GetNewZones(self): + """Returns a list of zones, excluding the one that the client VM is in.""" + all_zones = util.GetZonesInRegion(self.region) + for zone in self.zones: + all_zones.remove(zone) + return all_zones + + def _CreateSubnetInZone(self, new_subnet_zone): + """Creates a new subnet in the same region as the client VM. + + Args: + new_subnet_zone: The zone for the subnet to be created. 
+ Must be in the same region as the client + + Returns: + the new subnet resource + """ + cidr = self.client_vm.network.regional_network.vpc.NextSubnetCidrBlock() + logging.info('Attempting to create a subnet in zone %s', new_subnet_zone) + new_subnet = ( + aws_network.AwsSubnet( + new_subnet_zone, + self.client_vm.network.regional_network.vpc.id, + cidr)) + new_subnet.Create() + logging.info('Successfully created a new subnet, subnet id is: %s', + new_subnet.id) + + # save for cleanup + self.subnets_used_by_db.append(new_subnet) + self.subnets_owned_by_db.append(new_subnet) + return new_subnet + + def _CreateSubnetInAllZonesAssumeClientZoneExists(self): + client_zone = self.client_vm.network.subnet.zone + for zone in self.zones: + if zone != client_zone: + self._CreateSubnetInZone(zone) + else: + self.subnets_used_by_db.append(self.client_vm.network.subnet) + + def _CreateSubnetInAdditionalZone(self): + """Creates a new subnet in the same region as the client VM. + + The zone will be different from the client's zone (but in the same region). + + Returns: + the new subnet resource + + Raises: + Exception: if unable to create a subnet in any zones in the region. + """ + new_subnet_zones = self._GetNewZones() + while len(new_subnet_zones) >= 1: + new_subnet_zone = new_subnet_zones.pop() + try: + new_subnet = self._CreateSubnetInZone(new_subnet_zone) + return new_subnet + except: + logging.info('Unable to create subnet in zone %s', new_subnet_zone) + raise Exception('Unable to create subnet in any availability zones') + + def _CreateDbSubnetGroup(self, subnets): + """Creates a new db subnet group. + + Args: + subnets: a list of strings. + The db subnet group will consit of all subnets in this list. + """ + db_subnet_group_name = 'pkb-db-subnet-group-{0}'.format(FLAGS.run_uri) + + create_db_subnet_group_cmd = util.AWS_PREFIX + ( + ['rds', + 'create-db-subnet-group', + '--db-subnet-group-name', db_subnet_group_name, + '--db-subnet-group-description', 'pkb_subnet_group_for_db', + '--region', self.region, + '--subnet-ids'] + [subnet.id for subnet in subnets] + + ['--tags'] + util.MakeFormattedDefaultTags()) + + vm_util.IssueCommand(create_db_subnet_group_cmd) + + # save for cleanup + self.db_subnet_group_name = db_subnet_group_name + self.security_group_id = (self.client_vm.network.regional_network. 
+ vpc.default_security_group_id) + + def _SetupNetworking(self): + """Sets up the networking required for the RDS database.""" + if self.spec.engine in _RDS_ENGINES: + self.subnets_used_by_db.append(self.client_vm.network.subnet) + self._CreateSubnetInAdditionalZone() + elif self.spec.engine in _AURORA_ENGINES: + self._CreateSubnetInAllZonesAssumeClientZoneExists() + else: + raise Exception('Unknown how to create network for {0}'.format( + self.spec.engine)) + + self._CreateDbSubnetGroup(self.subnets_used_by_db) + + open_port_cmd = util.AWS_PREFIX + [ + 'ec2', + 'authorize-security-group-ingress', + '--group-id', self.security_group_id, + '--source-group', self.security_group_id, + '--protocol', 'tcp', + '--port={0}'.format(self.port), + '--region', self.region] + stdout, stderr, _ = vm_util.IssueCommand(open_port_cmd) + logging.info('Granted DB port ingress, stdout is:\n%s\nstderr is:\n%s', + stdout, stderr) + + def _TeardownNetworking(self): + """Tears down all network resources that were created for the database.""" + if hasattr(self, 'db_subnet_group_name'): + delete_db_subnet_group_cmd = util.AWS_PREFIX + [ + 'rds', + 'delete-db-subnet-group', + '--db-subnet-group-name', self.db_subnet_group_name, + '--region', self.region] + vm_util.IssueCommand(delete_db_subnet_group_cmd, raise_on_failure=False) + + for subnet_for_db in self.subnets_owned_by_db: + subnet_for_db.Delete() + + def _TeardownParameterGroup(self): + """Tears down all parameter group that were created for the database.""" + if self.parameter_group: + delete_db_parameter_group_cmd = util.AWS_PREFIX + [ + 'rds', 'delete-db-parameter-group', '--db-parameter-group-name', + self.parameter_group, '--region', self.region + ] + vm_util.IssueCommand( + delete_db_parameter_group_cmd, raise_on_failure=False) + + def _CreateAwsSqlInstance(self): + if self.spec.engine in _RDS_ENGINES: + instance_identifier = self.instance_id + self.all_instance_ids.append(instance_identifier) + cmd = util.AWS_PREFIX + [ + 'rds', 'create-db-instance', + '--db-instance-identifier=%s' % instance_identifier, + '--engine=%s' % self.spec.engine, + '--master-username=%s' % self.spec.database_username, + '--master-user-password=%s' % self.spec.database_password, + '--allocated-storage=%s' % self.spec.db_disk_spec.disk_size, + '--storage-type=%s' % self.spec.db_disk_spec.disk_type, + '--db-instance-class=%s' % self.spec.db_spec.machine_type, + '--no-auto-minor-version-upgrade', + '--region=%s' % self.region, + '--engine-version=%s' % self.spec.engine_version, + '--db-subnet-group-name=%s' % self.db_subnet_group_name, + '--vpc-security-group-ids=%s' % self.security_group_id, + '--availability-zone=%s' % self.spec.db_spec.zone, '--tags' + ] + util.MakeFormattedDefaultTags() + + if self.spec.engine in _SQL_SERVER_ENGINES: + cmd = cmd + ['--license-model=license-included'] + + if self.spec.db_disk_spec.disk_type == aws_disk.IO1: + cmd.append('--iops=%s' % self.spec.db_disk_spec.iops) + # TODO(ferneyhough): add backup_enabled and backup_window + + vm_util.IssueCommand(cmd) + + elif self.spec.engine in _AURORA_ENGINES: + zones_needed_for_high_availability = len(self.zones) > 1 + if zones_needed_for_high_availability != self.spec.high_availability: + raise Exception('When db_high_availability is true, multiple ' + 'zones must be specified. When ' + 'db_high_availability is false, one zone ' + 'should be specified. 
' + 'db_high_availability: {0} ' + 'zone count: {1} '.format( + zones_needed_for_high_availability, + len(self.zones))) + + cluster_identifier = 'pkb-db-cluster-' + FLAGS.run_uri + # Create the cluster. + cmd = util.AWS_PREFIX + [ + 'rds', 'create-db-cluster', + '--db-cluster-identifier=%s' % cluster_identifier, + '--engine=%s' % self.spec.engine, + '--engine-version=%s' % self.spec.engine_version, + '--master-username=%s' % self.spec.database_username, + '--master-user-password=%s' % self.spec.database_password, + '--region=%s' % self.region, + '--db-subnet-group-name=%s' % self.db_subnet_group_name, + '--vpc-security-group-ids=%s' % self.security_group_id, + '--availability-zones=%s' % self.spec.zones[0], + '--tags'] + util.MakeFormattedDefaultTags() + + self.cluster_id = cluster_identifier + vm_util.IssueCommand(cmd) + + for zone in self.zones: + + # The first instance is assumed to be writer - + # and so use the instance_id for that id. + if zone == self.zones[0]: + instance_identifier = self.instance_id + else: + instance_identifier = self.instance_id + '-' + zone + + self.all_instance_ids.append(instance_identifier) + + cmd = util.AWS_PREFIX + [ + 'rds', 'create-db-instance', + '--db-instance-identifier=%s' % instance_identifier, + '--db-cluster-identifier=%s' % cluster_identifier, + '--engine=%s' % self.spec.engine, + '--engine-version=%s' % self.spec.engine_version, + '--no-auto-minor-version-upgrade', + '--db-instance-class=%s' % self.spec.db_spec.machine_type, + '--region=%s' % self.region, + '--availability-zone=%s' % zone, '--tags' + ] + util.MakeFormattedDefaultTags() + vm_util.IssueCommand(cmd) + + else: + raise Exception('Unknown how to create AWS data base engine {0}'.format( + self.spec.engine)) + + def _Create(self): + """Creates the AWS RDS instance. + + Raises: + Exception: if unknown how to create self.spec.engine. + + """ + if self.is_managed_db: + self._CreateAwsSqlInstance() + else: + self.endpoint = self.server_vm.ip_address + self._SetupUnmanagedDatabase() + self.firewall = aws_network.AwsFirewall() + self.firewall.AllowPortInSecurityGroup( + self.server_vm.region, + self.server_vm.network.regional_network.vpc.default_security_group_id, + self.port, + self.port, + ['%s/32' % self.client_vm.ip_address]) + self.unmanaged_db_exists = True + + def _IsDeleting(self): + """See Base class BaseResource in perfkitbenchmarker.resource.py.""" + + for instance_id in self.all_instance_ids: + json_output = self._DescribeInstance(instance_id) + if json_output: + state = json_output['DBInstances'][0]['DBInstanceStatus'] + if state == 'deleting': + return True + + return False + + def _Delete(self): + """Deletes the underlying resource. + + Implementations of this method should be idempotent since it may + be called multiple times, even if the resource has already been + deleted. 
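Editor's note: in the Aurora branch above, the cluster and its member instances are separate API objects: the cluster is created once, then one instance per zone is attached to it, with the first acting as the writer. A trimmed sketch of that two-step flow with placeholder identifiers:

```python
import subprocess

REGION = 'us-east-1'                  # placeholder
ZONES = ['us-east-1a', 'us-east-1b']  # first zone hosts the writer

subprocess.check_call([
    'aws', 'rds', 'create-db-cluster',
    '--region', REGION,
    '--db-cluster-identifier', 'demo-aurora-cluster',
    '--engine', 'aurora-mysql',
    '--master-username', 'demo',
    '--master-user-password', 'change-me-please',
    '--availability-zones', ZONES[0],
])

# One DB instance per zone; the first is the writer, the rest are replicas.
for zone in ZONES:
    subprocess.check_call([
        'aws', 'rds', 'create-db-instance',
        '--region', REGION,
        '--db-cluster-identifier', 'demo-aurora-cluster',
        '--db-instance-identifier', f'demo-aurora-{zone}',
        '--db-instance-class', 'db.r5.large',
        '--engine', 'aurora-mysql',
        '--availability-zone', zone,
    ])
```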
+ """ + if not self.is_managed_db: + if hasattr(self, 'firewall'): + self.firewall.DisallowAllPorts() + self.unmanaged_db_exists = False + self.PrintUnmanagedDbStats() + return + + for current_instance_id in self.all_instance_ids: + cmd = util.AWS_PREFIX + [ + 'rds', + 'delete-db-instance', + '--db-instance-identifier=%s' % current_instance_id, + '--skip-final-snapshot', + '--region', self.region, + ] + vm_util.IssueCommand(cmd, raise_on_failure=False) + + if self.cluster_id is not None: + cmd = util.AWS_PREFIX + [ + 'rds', + 'delete-db-cluster', + '--db-cluster-identifier=%s' % self.cluster_id, + '--skip-final-snapshot', + '--region', self.region, + ] + vm_util.IssueCommand(cmd, raise_on_failure=False) + + def _Exists(self): + """Returns true if the underlying resource exists. + + Supplying this method is optional. If it is not implemented then the + default is to assume success when _Create and _Delete do not raise + exceptions. + """ + if not self.is_managed_db: + return self.unmanaged_db_exists + for current_instance_id in self.all_instance_ids: + json_output = self._DescribeInstance(current_instance_id) + if not json_output: + return False + + return True + + def _ParseEndpointFromInstance(self, describe_instance_json): + """Parses the json output from the CLI and returns the endpoint. + + Args: + describe_instance_json: output in json format from calling + 'aws rds describe-db-instances' + + Returns: + endpoint of the server as a string + """ + return describe_instance_json['DBInstances'][0]['Endpoint']['Address'] + + def _ParsePortFromInstance(self, describe_instance_json): + """Parses the json output from the CLI and returns the port. + + Args: + describe_instance_json: output in json format from calling + 'aws rds describe-db-instances' + + Returns: + port on which the server is listening, as an int + """ + if describe_instance_json is None: + return None + return int(describe_instance_json['DBInstances'][0]['Endpoint']['Port']) + + def _ParseEndpointFromCluster(self, describe_cluster_json): + """Parses the json output from the CLI and returns the endpoint. + + Args: + describe_cluster_json: output in json format from calling + 'aws rds describe-db-clusters' + + Returns: + endpoint of the server as a string + """ + return describe_cluster_json['DBClusters'][0]['Endpoint'] + + def _SavePrimaryAndSecondaryZones(self, describe_instance_json): + """Saves the primary, and secondary (only if HA) zone of the server. + + Args: + describe_instance_json: output in json format from calling + 'aws rds describe-db-instances' + """ + + if self.spec.engine in _AURORA_ENGINES: + self.primary_zone = self.zones[0] + if len(self.zones) > 1: + self.secondary_zone = ','.join(self.zones[1:]) + else: + db_instance = describe_instance_json['DBInstances'][0] + self.primary_zone = ( + db_instance['AvailabilityZone']) + if self.spec.high_availability: + if 'SecondaryAvailabilityZone' in db_instance: + self.secondary_zone = db_instance['SecondaryAvailabilityZone'] + else: + # the secondary DB for RDS is in the second subnet. + self.secondary_zone = self.subnets_used_by_db[1].zone + + def _IsReady(self, timeout=IS_READY_TIMEOUT): + """Return true if the underlying resource is ready. + + This method will query all of the instance every 5 seconds until + its instance state is 'available', or until a timeout occurs. + + Args: + timeout: timeout in seconds + + Returns: + True if the resource was ready in time, False if the wait timed out + or an Exception occurred. 
+ """ + if not self.is_managed_db: + return self._IsReadyUnmanaged() + + if not self.all_instance_ids: + return False + + for instance_id in self.all_instance_ids: + if not self._IsInstanceReady(instance_id, timeout): + return False + + return True + + def _PostCreate(self): + """Perform general post create operations on the cluster. + + Raises: + Exception: If could not ready the instance after modification to + multi-az. + """ + super()._PostCreate() + + if not self.is_managed_db: + self.client_vm_query_tools.InstallPackages() + else: + need_ha_modification = self.spec.engine in _RDS_ENGINES + + if self.spec.high_availability and need_ha_modification: + # When extending the database to be multi-az, the second region + # is picked by where the second subnet has been created. + cmd = util.AWS_PREFIX + [ + 'rds', + 'modify-db-instance', + '--db-instance-identifier=%s' % self.instance_id, + '--multi-az', + '--apply-immediately', + '--region=%s' % self.region + ] + vm_util.IssueCommand(cmd) + + if not self._IsInstanceReady( + self.instance_id, timeout=IS_READY_TIMEOUT): + raise Exception('Instance could not be set to ready after ' + 'modification for high availability') + + json_output = self._DescribeInstance(self.instance_id) + self._SavePrimaryAndSecondaryZones(json_output) + if self.cluster_id: + self._GetPortsForClusterInstance(self.cluster_id) + else: + self._GetPortsForWriterInstance(self.all_instance_ids[0]) + + self.client_vm_query_tools.InstallPackages() + + def _IsInstanceReady(self, instance_id, timeout=IS_READY_TIMEOUT): + """Return true if the instance is ready. + + This method will query the instance every 5 seconds until + its instance state is 'available', or until a timeout occurs. + + Args: + instance_id: string of the instance to check is ready + timeout: timeout in seconds + + Returns: + True if the resource was ready in time, False if the wait timed out + or an Exception occurred. + """ + start_time = datetime.datetime.now() + + while True: + if (datetime.datetime.now() - start_time).seconds >= timeout: + logging.exception('Timeout waiting for sql instance to be ready') + return False + json_output = self._DescribeInstance(instance_id) + if json_output: + try: + state = json_output['DBInstances'][0]['DBInstanceStatus'] + pending_values = ( + json_output['DBInstances'][0]['PendingModifiedValues']) + waiting_param = json_output['DBInstances'][0]['DBParameterGroups'][0][ + 'ParameterApplyStatus'] == 'applying' + logging.info('Instance state: %s', state) + if pending_values: + logging.info('Pending values: %s', (str(pending_values))) + + if waiting_param: + logging.info('Applying parameter') + + if state == 'available' and not pending_values and not waiting_param: + break + except: + logging.exception( + 'Error attempting to read stdout. 
Creation failure.') + return False + time.sleep(5) + + return True + + def _DescribeInstance(self, instance_id): + cmd = util.AWS_PREFIX + [ + 'rds', + 'describe-db-instances', + '--db-instance-identifier=%s' % instance_id, + '--region=%s' % self.region + ] + stdout, _, retcode = vm_util.IssueCommand(cmd, suppress_warning=True, + raise_on_failure=False) + if retcode != 0: + return None + json_output = json.loads(stdout) + return json_output + + def _DescribeCluster(self, cluster_id): + cmd = util.AWS_PREFIX + [ + 'rds', + 'describe-db-clusters', + '--db-cluster-identifier=%s' % cluster_id, + '--region=%s' % self.region + ] + stdout, _, _ = vm_util.IssueCommand(cmd, suppress_warning=True) + json_output = json.loads(stdout) + return json_output + + def _Reboot(self): + """Reboot the database and wait until the database is in ready state.""" + # Can only reboot when the instance is in ready state + if not self._IsInstanceReady(self.instance_id, timeout=IS_READY_TIMEOUT): + raise Exception('Instance is not in a state that can reboot') + + cmd = util.AWS_PREFIX + [ + 'rds', 'reboot-db-instance', + '--db-instance-identifier=%s' % self.instance_id, + '--region=%s' % self.region + ] + + vm_util.IssueCommand(cmd, suppress_warning=True) + + if not self._IsInstanceReady(self.instance_id, timeout=IS_READY_TIMEOUT): + raise Exception('Instance could not be set to ready after ' + 'reboot') + + def _ApplyManagedDbFlags(self): + """Apply managed flags on RDS.""" + if self.spec.db_flags: + if self.spec.engine == 'postgres': + cumulus_default_paramter_group = 'cumulus-default-postgres' + elif self.spec.engine == 'mysql': + cumulus_default_paramter_group = 'cumulus-default-mysql' + + self.parameter_group = 'pkb-parameter-group-' + FLAGS.run_uri + cmd = util.AWS_PREFIX + [ + 'rds', 'copy-db-parameter-group', + '--source-db-parameter-group-identifier=%s' % cumulus_default_paramter_group, + '--target-db-parameter-group-identifier=%s' % self.parameter_group, + '--target-db-parameter-group-description="Configuration options chosen by user"', + '--region=%s' % self.region + ] + + vm_util.IssueCommand(cmd, suppress_warning=True) + + cmd = util.AWS_PREFIX + [ + 'rds', 'modify-db-instance', + '--db-instance-identifier=%s' % self.instance_id, + '--db-parameter-group-name=%s' % self.parameter_group, + '--region=%s' % self.region, '--apply-immediately' + ] + + vm_util.IssueCommand(cmd, suppress_warning=True, raise_on_failure=False) + + for flag in self.spec.db_flags: + key_value_pair = flag.split('=') + if len(key_value_pair) != 2: + raise AwsRelationalDbParameterError('Malformed parameter %s' % flag) + cmd = util.AWS_PREFIX + [ + 'rds', 'modify-db-parameter-group', + '--db-parameter-group-name=%s' % self.parameter_group, + '--parameters=ParameterName=%s,ParameterValue=%s,ApplyMethod=pending-reboot' + % (key_value_pair[0], key_value_pair[1]), + '--region=%s' % self.region + ] + + vm_util.IssueCommand(cmd, suppress_warning=True) + + self._Reboot() + + + def _GetParameterGroupFamily(self): + """Get the parameter group family string. + + Parameter group family is formatted as engine type plus version. + + Returns: + ParameterGroupFamiliy name of rds resources. + + Raises: + NotImplementedError: If there is no supported ParameterGroupFamiliy. 
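Editor's note: the flag-application path above copies a pre-existing parameter group. Without such a source group, a similar effect can be approximated by creating one for the engine family (the engine-plus-major-version string that _GetParameterGroupFamily derives). A sketch with placeholder names and one example static parameter:

```python
import subprocess

REGION = 'us-west-2'            # placeholder
INSTANCE_ID = 'demo-postgres'   # placeholder
GROUP = 'demo-parameter-group'  # placeholder

# Family is engine name plus major version, e.g. 'postgres13' or 'mysql8.0'.
subprocess.check_call([
    'aws', 'rds', 'create-db-parameter-group',
    '--region', REGION,
    '--db-parameter-group-name', GROUP,
    '--db-parameter-group-family', 'postgres13',
    '--description', 'demo settings',
])

# Set one parameter; pending-reboot means it takes effect after a reboot.
subprocess.check_call([
    'aws', 'rds', 'modify-db-parameter-group',
    '--region', REGION,
    '--db-parameter-group-name', GROUP,
    '--parameters',
    'ParameterName=max_connections,ParameterValue=500,ApplyMethod=pending-reboot',
])

# Attach the group to the instance, then reboot so the value is picked up.
subprocess.check_call([
    'aws', 'rds', 'modify-db-instance',
    '--region', REGION,
    '--db-instance-identifier', INSTANCE_ID,
    '--db-parameter-group-name', GROUP,
    '--apply-immediately',
])
subprocess.check_call([
    'aws', 'rds', 'reboot-db-instance',
    '--region', REGION,
    '--db-instance-identifier', INSTANCE_ID,
])
```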
+ """ + all_supported_versions = ( + MYSQL_SUPPORTED_MAJOR_VERSIONS + POSTGRES_SUPPORTED_MAJOR_VERSIONS) + for version in all_supported_versions: + if self.spec.engine_version.startswith(version): + return self.spec.engine + version + + raise NotImplementedError('The parameter group of engine %s,' + ' version %s is not supported' % + (self.spec.engine, self.spec.engine_version)) + + def _GetPortsForWriterInstance(self, instance_id): + """Assigns the ports and endpoints from the instance_id to self. + + These will be used to communicate with the data base. + """ + json_output = self._DescribeInstance(instance_id) + self.endpoint = self._ParseEndpointFromInstance(json_output) + + def _GetPortsForClusterInstance(self, cluster_id): + """Assigns the ports and endpoints from the cluster_id to self. + + These will be used to communicate with the data base. + """ + json_output = self._DescribeCluster(cluster_id) + self.endpoint = self._ParseEndpointFromCluster(json_output) + + def _AssertClientAndDbInSameRegion(self): + """Asserts that the client vm is in the same region requested by the server. + + Raises: + AwsRelationalDbCrossRegionError: if the client vm is in a + different region that is requested by the server. + """ + if self.client_vm.region != self.region: + raise AwsRelationalDbCrossRegionError( + ('client_vm and relational_db server ' + 'must be in the same region')) + + def _CreateDependencies(self): + """Method that will be called once before _CreateResource() is called. + + Supplying this method is optional. It is intended to allow additional + flexibility in creating resource dependencies separately from _Create(). + """ + if self.is_managed_db: + self._AssertClientAndDbInSameRegion() + self._SetupNetworking() + + def _DeleteDependencies(self): + """Method that will be called once after _DeleteResource() is called. + + Supplying this method is optional. It is intended to allow additional + flexibility in deleting resource dependencies separately from _Delete(). + """ + if self.is_managed_db: + self._TeardownNetworking() + self._TeardownParameterGroup() + + def _FailoverHA(self): + """Fail over from master to replica.""" + + if self.spec.engine in _RDS_ENGINES: + cmd = util.AWS_PREFIX + [ + 'rds', + 'reboot-db-instance', + '--db-instance-identifier=%s' % self.instance_id, + '--force-failover', + '--region=%s' % self.region + ] + vm_util.IssueCommand(cmd) + elif self.spec.engine in _AURORA_ENGINES: + new_primary_id = self.all_instance_ids[1] + cmd = util.AWS_PREFIX + [ + 'rds', + 'failover-db-cluster', + '--db-cluster-identifier=%s' % self.cluster_id, + '--target-db-instance-identifier=%s' % new_primary_id, + '--region=%s' % self.region + ] + vm_util.IssueCommand(cmd) + else: + raise Exception('Unknown how to failover {0}'.format( + self.spec.engine)) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_sqs.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_sqs.py new file mode 100644 index 0000000..939724c --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_sqs.py @@ -0,0 +1,116 @@ +"""AWS SQS interface for resources. + +This class handles resource creation/cleanup for SQS benchmark on AWS. 
+""" + +import json +import os + +from absl import flags +from perfkitbenchmarker import messaging_service as msgsvc +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.aws import util + +FLAGS = flags.FLAGS +MESSAGING_SERVICE_SCRIPTS_VM_AWS_DIR = os.path.join( + msgsvc.MESSAGING_SERVICE_SCRIPTS_VM_LIB_DIR, 'aws') +MESSAGING_SERVICE_SCRIPTS_AWS_PREFIX = 'messaging_service_scripts/aws' +MESSAGING_SERVICE_SCRIPTS_AWS_FILES = ['__init__.py', 'aws_sqs_client.py'] +MESSAGING_SERVICE_SCRIPTS_AWS_BIN = 'messaging_service_scripts/aws_benchmark.py' + + +class AwsSqs(msgsvc.BaseMessagingService): + """AWS SQS Interface Class.""" + + CLOUD = providers.AWS + + def __init__(self): + super().__init__() + self.queue_name = 'pkb-queue-{0}'.format(FLAGS.run_uri) + + def _Create(self): + """Handles AWS resources provision. + + It creates an AWS SQS queue. + """ + cmd = util.AWS_PREFIX + [ + 'sqs', + 'create-queue', + '--queue-name', self.queue_name, + '--region', self.region + ] + vm_util.IssueCommand(cmd) + + def _Exists(self) -> bool: + """Checks whether SQS queue already exists.""" + cmd = util.AWS_PREFIX + [ + 'sqs', + 'get-queue-url', + '--queue-name', self.queue_name, + '--region', self.region + ] + _, _, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + return retcode == 0 + + def _Delete(self): + """Handle SQS queue deletion.""" + cmd = util.AWS_PREFIX + [ + 'sqs', + 'delete-queue', + '--queue-url', self._GetQueue(), + '--region', self.region + ] + vm_util.IssueCommand(cmd, raise_on_failure=False) + + def _IsDeleting(self): + """Overrides BaseResource._IsDeleting. + + Used internally while deleting to check if the deletion is still in + progress. + + Returns: + A bool. True if the resource is not yet deleted, else False. + """ + return self._Exists() + + def _InstallCloudClients(self): + self.client_vm.RemoteCommand( + 'sudo pip3 install boto3', ignore_failure=False) + + self._CopyFiles( + MESSAGING_SERVICE_SCRIPTS_AWS_PREFIX, + MESSAGING_SERVICE_SCRIPTS_AWS_FILES, + MESSAGING_SERVICE_SCRIPTS_VM_AWS_DIR) + self.client_vm.PushDataFile(MESSAGING_SERVICE_SCRIPTS_AWS_BIN) + + # copy AWS creds + self.client_vm.Install('aws_credentials') + + def Run(self, benchmark_scenario: str, number_of_messages: str, + message_size: str): + """Runs remote commands on client VM - benchmark's run phase.""" + command = (f'python3 -m aws_benchmark ' + f'--queue_name={self.queue_name} ' + f'--region={self.region} ' + f'--benchmark_scenario={benchmark_scenario} ' + f'--number_of_messages={number_of_messages} ' + f'--message_size={message_size}') + stdout, _ = self.client_vm.RemoteCommand(command) + results = json.loads(stdout) + return results + + @property + def region(self): + return util.GetRegionFromZone(self.client_vm.zone) + + def _GetQueue(self) -> str: + """Get SQS queue URL from AWS.""" + cmd = util.AWS_PREFIX + [ + 'sqs', + 'get-queue-url', + '--queue-name', self.queue_name, + '--region', self.region + ] + stdout, _, _ = vm_util.IssueCommand(cmd) + return json.loads(stdout)['QueueUrl'] diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_virtual_machine.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_virtual_machine.py new file mode 100644 index 0000000..1a6032a --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_virtual_machine.py @@ -0,0 +1,1655 @@ +# Copyright 2016 PerfKitBenchmarker Authors. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Class to represent an AWS Virtual Machine object. + +Images: aws ec2 describe-images --owners self amazon +All VM specifics are self-contained and the class provides methods to +operate on the VM: boot, shutdown, etc. +""" + + +import base64 +import collections +import json +import logging +import posixpath +import re +import threading +import time +import uuid +# Added by Intel +import ipaddress +# End Added by Intel + +from absl import flags +from perfkitbenchmarker import disk +from perfkitbenchmarker import errors +from perfkitbenchmarker import linux_virtual_machine +from perfkitbenchmarker import placement_group +from perfkitbenchmarker import providers +from perfkitbenchmarker import resource +from perfkitbenchmarker import virtual_machine +from perfkitbenchmarker import vm_util +from perfkitbenchmarker import windows_virtual_machine +from perfkitbenchmarker.configs import option_decoders +from perfkitbenchmarker.providers.aws import aws_disk +from perfkitbenchmarker.providers.aws import aws_network +from perfkitbenchmarker.providers.aws import util +from six.moves import range + +# Added by Intel +try: + unicode +except NameError: + unicode = str +# End added by Intel + +FLAGS = flags.FLAGS + +HVM = 'hvm' +PV = 'paravirtual' +NON_HVM_PREFIXES = ['m1', 'c1', 't1', 'm2'] +NON_PLACEMENT_GROUP_PREFIXES = frozenset(['t2', 'm3', 't3', 't3a']) +DRIVE_START_LETTER = 'b' +TERMINATED = 'terminated' +SHUTTING_DOWN = 'shutting-down' +INSTANCE_EXISTS_STATUSES = frozenset(['running', 'stopping', 'stopped']) +INSTANCE_DELETED_STATUSES = frozenset([SHUTTING_DOWN, TERMINATED]) +INSTANCE_TRANSITIONAL_STATUSES = frozenset(['pending']) +INSTANCE_KNOWN_STATUSES = (INSTANCE_EXISTS_STATUSES | INSTANCE_DELETED_STATUSES + | INSTANCE_TRANSITIONAL_STATUSES) +HOST_EXISTS_STATES = frozenset( + ['available', 'under-assessment', 'permanent-failure']) +HOST_RELEASED_STATES = frozenset(['released', 'released-permanent-failure']) +KNOWN_HOST_STATES = HOST_EXISTS_STATES | HOST_RELEASED_STATES + +AWS_INITIATED_SPOT_TERMINATING_TRANSITION_STATUSES = frozenset( + ['marked-for-termination', 'marked-for-stop']) + +AWS_INITIATED_SPOT_TERMINAL_STATUSES = frozenset( + ['instance-terminated-by-price', 'instance-terminated-by-service', + 'instance-terminated-no-capacity', + 'instance-terminated-capacity-oversubscribed', + 'instance-terminated-launch-group-constraint']) + +USER_INITIATED_SPOT_TERMINAL_STATUSES = frozenset( + ['request-canceled-and-instance-running', 'instance-terminated-by-user']) + +# These are the project numbers of projects owning common images. +# Some numbers have corresponding owner aliases, but they are not used here. 
+AMAZON_LINUX_IMAGE_PROJECT = [ + '137112412989', # alias amazon most regions + '210953353124', # alias amazon for af-south-1 + '910595266909', # alias amazon for ap-east-1 + '071630900071', # alias amazon for eu-south-1 +] +# From https://wiki.debian.org/Cloud/AmazonEC2Image/Stretch +# Marketplace AMI exists, but not in all regions +DEBIAN_9_IMAGE_PROJECT = ['379101102735'] +# From https://wiki.debian.org/Cloud/AmazonEC2Image/Buster +# From https://wiki.debian.org/Cloud/AmazonEC2Image/Bullseye +DEBIAN_IMAGE_PROJECT = ['136693071363'] +# Owns AMIs lists here: +# https://wiki.centos.org/Cloud/AWS#Official_CentOS_Linux_:_Public_Images +# Also owns the AMIS listed in +# https://builds.coreos.fedoraproject.org/streams/stable.json +CENTOS_IMAGE_PROJECT = ['125523088429'] +MARKETPLACE_IMAGE_PROJECT = ['679593333241'] # alias aws-marketplace +# https://access.redhat.com/articles/2962171 +RHEL_IMAGE_PROJECT = ['309956199498'] +# https://help.ubuntu.com/community/EC2StartersGuide#Official_Ubuntu_Cloud_Guest_Amazon_Machine_Images_.28AMIs.29 +UBUNTU_IMAGE_PROJECT = ['099720109477'] # Owned by canonical +# Some Windows images are also available in marketplace project, but this is the +# one selected by the AWS console. +WINDOWS_IMAGE_PROJECT = ['801119661308'] # alias amazon +UBUNTU_EFA_IMAGE_PROJECT = ['898082745236'] + +# Processor architectures +ARM = 'arm64' +X86 = 'x86_64' + +# Machine type to ARM architecture. +_MACHINE_TYPE_PREFIX_TO_ARM_ARCH = { + 'a1': 'cortex-a72', + 'c6g': 'graviton2', + 'c7g': 'graviton3', + 'g5g': 'graviton2', + 'm6g': 'graviton2', + 'r6g': 'graviton2', + 't4g': 'graviton2', + 'im4g': 'graviton2', + 'is4ge': 'graviton2', + 'x2g': 'graviton2', +} + +# Parameters for use with Elastic Fiber Adapter +_EFA_PARAMS = { + 'InterfaceType': 'efa', + 'DeviceIndex': 0, + 'NetworkCardIndex': 0, + 'Groups': '', + 'SubnetId': '' +} +# Location of EFA installer +_EFA_URL = ('https://s3-us-west-2.amazonaws.com/aws-efa-installer/' + 'aws-efa-installer-{version}.tar.gz') + + +class AwsTransitionalVmRetryableError(Exception): + """Error for retrying _Exists when an AWS VM is in a transitional state.""" + + +class AwsDriverDoesntSupportFeatureError(Exception): + """Raised if there is an attempt to set a feature not supported.""" + + +class AwsUnexpectedWindowsAdapterOutputError(Exception): + """Raised when querying the status of a windows adapter failed.""" + + +class AwsUnknownStatusError(Exception): + """Error indicating an unknown status was encountered.""" + + +class AwsImageNotFoundError(Exception): + """Error indicating no appropriate AMI could be found.""" + + +def GetRootBlockDeviceSpecForImage(image_id, region): + """Queries the CLI and returns the root block device specification as a dict. + + Args: + image_id: The EC2 image id to query + region: The EC2 region in which the image resides + + Returns: + The root block device specification as returned by the AWS cli, + as a Python dict. If the image is not found, or if the response + is malformed, an exception will be raised. 
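Editor's note: the owner account IDs listed above are what AMI lookups get filtered on. As an illustration, the newest Ubuntu 20.04 image in a region can be located as follows; the name pattern is a commonly used Canonical naming convention and an assumption here, not something taken from this module:

```python
import json
import subprocess

UBUNTU_OWNER = '099720109477'   # Canonical, as listed above
REGION = 'us-east-1'            # placeholder

cmd = [
    'aws', 'ec2', 'describe-images',
    '--region', REGION,
    '--owners', UBUNTU_OWNER,
    '--filters',
    'Name=name,Values=ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-*',
    'Name=state,Values=available',
    '--query', 'Images[]',
]
images = json.loads(subprocess.check_output(cmd))

# Pick the most recently created image matching the name filter.
latest = max(images, key=lambda image: image['CreationDate'])
print(latest['ImageId'], latest['Name'])
```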
+ """ + command = util.AWS_PREFIX + [ + 'ec2', + 'describe-images', + '--region=%s' % region, + '--image-ids=%s' % image_id, + '--query', 'Images[]'] + stdout, _ = util.IssueRetryableCommand(command) + images = json.loads(stdout) + assert images + assert len(images) == 1, ( + 'Expected to receive only one image description for %s' % image_id) + image_spec = images[0] + root_device_name = image_spec['RootDeviceName'] + block_device_mappings = image_spec['BlockDeviceMappings'] + root_block_device_dict = next((x for x in block_device_mappings if + x['DeviceName'] == root_device_name)) + return root_block_device_dict + + +def GetBlockDeviceMap(machine_type, root_volume_size_gb=None, + image_id=None, region=None): + """Returns the block device map to expose all devices for a given machine. + + Args: + machine_type: The machine type to create a block device map for. + root_volume_size_gb: The desired size of the root volume, in GiB, + or None to the default provided by AWS. + image_id: The image id (AMI) to use in order to lookup the default + root device specs. This is only required if root_volume_size + is specified. + region: The region which contains the specified image. This is only + required if image_id is specified. + + Returns: + The json representation of the block device map for a machine compatible + with the AWS CLI, or if the machine type has no local disks, it will + return None. If root_volume_size_gb and image_id are provided, the block + device map will include the specification for the root volume. + + Raises: + ValueError: If required parameters are not passed. + """ + mappings = [] + if root_volume_size_gb is not None: + if image_id is None: + raise ValueError( + 'image_id must be provided if root_volume_size_gb is specified') + if region is None: + raise ValueError( + 'region must be provided if image_id is specified') + root_block_device = GetRootBlockDeviceSpecForImage(image_id, region) + root_block_device['Ebs']['VolumeSize'] = root_volume_size_gb + root_block_device['Ebs']['DeleteOnTermination'] = True + # The 'Encrypted' key must be removed or the CLI will complain + if not FLAGS.aws_vm_hibernate: + root_block_device['Ebs'].pop('Encrypted') + else: + root_block_device['Ebs']['Encrypted'] = True + mappings.append(root_block_device) + + if (machine_type in aws_disk.NUM_LOCAL_VOLUMES and + not aws_disk.LocalDriveIsNvme(machine_type)): + for i in range(aws_disk.NUM_LOCAL_VOLUMES[machine_type]): + od = collections.OrderedDict() + od['VirtualName'] = 'ephemeral%s' % i + od['DeviceName'] = '/dev/xvd%s' % chr(ord(DRIVE_START_LETTER) + i) + mappings.append(od) + if mappings: + return json.dumps(mappings) + return None + + +def IsPlacementGroupCompatible(machine_type): + """Returns True if VMs of 'machine_type' can be put in a placement group.""" + prefix = machine_type.split('.')[0] + return prefix not in NON_PLACEMENT_GROUP_PREFIXES + + +def GetArmArchitecture(machine_type): + """Returns the specific ARM processor architecture of the VM.""" + # c6g.medium -> c6g, m6gd.large -> m6g, c5n.18xlarge -> c5 + prefix = re.split(r'[dn]?\.', machine_type)[0] + return _MACHINE_TYPE_PREFIX_TO_ARM_ARCH.get(prefix) + + +def GetProcessorArchitecture(machine_type): + """Returns the processor architecture of the VM.""" + if GetArmArchitecture(machine_type): + return ARM + else: + return X86 + + +class AwsDedicatedHost(resource.BaseResource): + """Object representing an AWS host. + + Attributes: + region: The AWS region of the host. + zone: The AWS availability zone of the host. 
+ machine_type: The machine type of VMs that may be created on the host. + client_token: A uuid that makes the creation request idempotent. + id: The host_id of the host. + """ + + def __init__(self, machine_type, zone): + super(AwsDedicatedHost, self).__init__() + self.machine_type = machine_type + self.zone = zone + self.region = util.GetRegionFromZone(self.zone) + self.client_token = str(uuid.uuid4()) + self.id = None + self.fill_fraction = 0.0 + + def _Create(self): + create_cmd = util.AWS_PREFIX + [ + 'ec2', + 'allocate-hosts', + '--region=%s' % self.region, + '--client-token=%s' % self.client_token, + '--instance-type=%s' % self.machine_type, + '--availability-zone=%s' % self.zone, + '--auto-placement=off', + '--quantity=1'] + vm_util.IssueCommand(create_cmd) + + def _Delete(self): + if self.id: + delete_cmd = util.AWS_PREFIX + [ + 'ec2', + 'release-hosts', + '--region=%s' % self.region, + '--host-ids=%s' % self.id] + vm_util.IssueCommand(delete_cmd, raise_on_failure=False) + + @vm_util.Retry() + def _Exists(self): + describe_cmd = util.AWS_PREFIX + [ + 'ec2', + 'describe-hosts', + '--region=%s' % self.region, + '--filter=Name=client-token,Values=%s' % self.client_token] + stdout, _, _ = vm_util.IssueCommand(describe_cmd) + response = json.loads(stdout) + hosts = response['Hosts'] + assert len(hosts) < 2, 'Too many hosts.' + if not hosts: + return False + host = hosts[0] + self.id = host['HostId'] + state = host['State'] + assert state in KNOWN_HOST_STATES, state + return state in HOST_EXISTS_STATES + + +class AwsVmSpec(virtual_machine.BaseVmSpec): + """Object containing the information needed to create an AwsVirtualMachine. + + Attributes: + use_dedicated_host: bool. Whether to create this VM on a dedicated host. + """ + + CLOUD = providers.AWS + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Modifies config options based on runtime flag values. + + Can be overridden by derived classes to add support for specific flags. + + Args: + config_values: dict mapping config option names to provided values. May + be modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. + """ + super(AwsVmSpec, cls)._ApplyFlags(config_values, flag_values) + if flag_values['aws_boot_disk_size'].present: + config_values['boot_disk_size'] = flag_values.aws_boot_disk_size + if flag_values['aws_spot_instances'].present: + config_values['use_spot_instance'] = flag_values.aws_spot_instances + if flag_values['aws_spot_price'].present: + config_values['spot_price'] = flag_values.aws_spot_price + if flag_values['aws_spot_block_duration_minutes'].present: + config_values['spot_block_duration_minutes'] = int( + flag_values.aws_spot_block_duration_minutes) + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword + arguments to construct in order to decode the named option. 
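+      For example, this spec maps 'use_spot_instance' to
+      (option_decoders.BooleanDecoder, {'default': False}), as constructed in
+      the method body below.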
+ """ + result = super(AwsVmSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'use_spot_instance': (option_decoders.BooleanDecoder, { + 'default': False + }), + 'spot_price': (option_decoders.FloatDecoder, { + 'default': None + }), + 'spot_block_duration_minutes': (option_decoders.IntDecoder, { + 'default': None + }), + 'boot_disk_size': (option_decoders.IntDecoder, { + 'default': None + }) + }) + + return result + + +def _GetKeyfileSetKey(region): + """Returns a key to use for the keyfile set. + + This prevents other runs in the same process from reusing the key. + + Args: + region: The region the keyfile is in. + """ + return (region, FLAGS.run_uri) + + +class AwsKeyFileManager(object): + """Object for managing AWS Keyfiles.""" + _lock = threading.Lock() + imported_keyfile_set = set() + deleted_keyfile_set = set() + + @classmethod + def ImportKeyfile(cls, region): + """Imports the public keyfile to AWS.""" + with cls._lock: + if _GetKeyfileSetKey(region) in cls.imported_keyfile_set: + return + cat_cmd = ['cat', + vm_util.GetPublicKeyPath()] + keyfile, _ = vm_util.IssueRetryableCommand(cat_cmd) + formatted_tags = util.FormatTagSpecifications('key-pair', + util.MakeDefaultTags()) + import_cmd = util.AWS_PREFIX + [ + 'ec2', '--region=%s' % region, + 'import-key-pair', + '--key-name=%s' % cls.GetKeyNameForRun(), + '--public-key-material=%s' % keyfile, + '--tag-specifications=%s' % formatted_tags, + ] + _, stderr, retcode = vm_util.IssueCommand( + import_cmd, raise_on_failure=False) + if retcode: + if 'KeyPairLimitExceeded' in stderr: + raise errors.Benchmarks.QuotaFailure( + 'KeyPairLimitExceeded in %s: %s' % (region, stderr)) + else: + raise errors.Benchmarks.PrepareException(stderr) + + cls.imported_keyfile_set.add(_GetKeyfileSetKey(region)) + if _GetKeyfileSetKey(region) in cls.deleted_keyfile_set: + cls.deleted_keyfile_set.remove(_GetKeyfileSetKey(region)) + + @classmethod + def DeleteKeyfile(cls, region): + """Deletes the imported keyfile for a region.""" + with cls._lock: + if _GetKeyfileSetKey(region) in cls.deleted_keyfile_set: + return + delete_cmd = util.AWS_PREFIX + [ + 'ec2', '--region=%s' % region, + 'delete-key-pair', + '--key-name=%s' % cls.GetKeyNameForRun()] + util.IssueRetryableCommand(delete_cmd) + cls.deleted_keyfile_set.add(_GetKeyfileSetKey(region)) + if _GetKeyfileSetKey(region) in cls.imported_keyfile_set: + cls.imported_keyfile_set.remove(_GetKeyfileSetKey(region)) + + @classmethod + def GetKeyNameForRun(cls): + return 'perfkit-key-{0}'.format(FLAGS.run_uri) + + +class AwsVirtualMachine(virtual_machine.BaseVirtualMachine): + """Object representing an AWS Virtual Machine.""" + + CLOUD = providers.AWS + + # The IMAGE_NAME_FILTER is passed to the AWS CLI describe-images command to + # filter images by name. This must be set by subclasses, but may be overridden + # by the aws_image_name_filter flag. + IMAGE_NAME_FILTER = None + + # The IMAGE_NAME_REGEX can be used to further filter images by name. It + # applies after the IMAGE_NAME_FILTER above. Note that before this regex is + # applied, Python's string formatting is used to replace {virt_type} and + # {disk_type} by the respective virtualization type and root disk type of the + # VM, allowing the regex to contain these strings. This regex supports + # arbitrary Python regular expressions to further narrow down the set of + # images considered. + IMAGE_NAME_REGEX = None + + # List of projects that own the AMIs of this OS type. Default to + # AWS Marketplace official image project. 
Note that opt-in regions may have a + # different image owner than default regions. + IMAGE_OWNER = MARKETPLACE_IMAGE_PROJECT + + # Some AMIs use a project code to find the latest (in addition to owner, and + # filter) + IMAGE_PRODUCT_CODE_FILTER = None + + # CoreOS only distinguishes between stable and testing images in the + # description + IMAGE_DESCRIPTION_FILTER = None + + DEFAULT_ROOT_DISK_TYPE = 'gp2' + DEFAULT_ROOT_DISK_SIZE_GB = 16 + DEFAULT_USER_NAME = 'ec2-user' + + _lock = threading.Lock() + deleted_hosts = set() + host_map = collections.defaultdict(list) + + def __init__(self, vm_spec): + """Initialize a AWS virtual machine. + + Args: + vm_spec: virtual_machine.BaseVirtualMachineSpec object of the vm. + + Raises: + ValueError: If an incompatible vm_spec is passed. + """ + super(AwsVirtualMachine, self).__init__(vm_spec) + self.region = util.GetRegionFromZone(self.zone) + self.user_name = FLAGS.aws_user_name or self.DEFAULT_USER_NAME + if self.machine_type in aws_disk.NUM_LOCAL_VOLUMES: + self.max_local_disks = aws_disk.NUM_LOCAL_VOLUMES[self.machine_type] + self.user_data = None + self.network = aws_network.AwsNetwork.GetNetwork(self) + self.placement_group = getattr(vm_spec, 'placement_group', + self.network.placement_group) + self.firewall = aws_network.AwsFirewall.GetFirewall() + self.use_dedicated_host = vm_spec.use_dedicated_host + self.num_vms_per_host = vm_spec.num_vms_per_host + self.use_spot_instance = vm_spec.use_spot_instance + self.spot_price = vm_spec.spot_price + self.spot_block_duration_minutes = vm_spec.spot_block_duration_minutes + self.boot_disk_size = vm_spec.boot_disk_size + self.client_token = str(uuid.uuid4()) + self.host = None + self.id = None + self.metadata.update({ + 'spot_instance': + self.use_spot_instance, + 'spot_price': + self.spot_price, + 'spot_block_duration_minutes': + self.spot_block_duration_minutes, + 'placement_group_strategy': + self.placement_group.strategy + if self.placement_group else placement_group.PLACEMENT_GROUP_NONE, + 'aws_credit_specification': + FLAGS.aws_credit_specification + if FLAGS.aws_credit_specification else 'none' + }) + self.spot_early_termination = False + self.spot_status_code = None + # See: + # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/enhanced-networking-os.html + self._smp_affinity_script = 'smp_affinity.sh' + + if self.use_dedicated_host and util.IsRegion(self.zone): + raise ValueError( + 'In order to use dedicated hosts, you must specify an availability ' + 'zone, not a region ("zone" was %s).' % self.zone) + + if self.use_dedicated_host and self.use_spot_instance: + raise ValueError( + 'Tenancy=host is not supported for Spot Instances') + self.allocation_id = None + self.association_id = None + self.aws_tags = {} + + @property + def host_list(self): + """Returns the list of hosts that are compatible with this VM.""" + return self.host_map[(self.machine_type, self.zone)] + + @property + def group_id(self): + """Returns the security group ID of this VM.""" + return self.network.regional_network.vpc.default_security_group_id + + @classmethod + def GetDefaultImage(cls, machine_type, region): + """Returns the default image given the machine type and region. + + If specified, the aws_image_name_filter and aws_image_name_regex flags will + override os_type defaults. + + Args: + machine_type: The machine_type of the VM, used to determine virtualization + type. + region: The region of the VM, as images are region specific. + + Raises: + AwsImageNotFoundError: If a default image cannot be found. 
+
+    Returns:
+      The ID of the latest image, or None if no default image is configured or
+      none can be found.
+    """
+
+    # These cannot be REQUIRED_ATTRS, because nesting REQUIRED_ATTRS breaks.
+    if not cls.IMAGE_OWNER:
+      raise NotImplementedError('AWS OSMixins require IMAGE_OWNER')
+    if not cls.IMAGE_NAME_FILTER:
+      raise NotImplementedError('AWS OSMixins require IMAGE_NAME_FILTER')
+
+    if FLAGS.aws_image_name_filter:
+      cls.IMAGE_NAME_FILTER = FLAGS.aws_image_name_filter
+
+    if FLAGS.aws_image_name_regex:
+      cls.IMAGE_NAME_REGEX = FLAGS.aws_image_name_regex
+
+    prefix = machine_type.split('.')[0]
+    virt_type = PV if prefix in NON_HVM_PREFIXES else HVM
+    processor_architecture = GetProcessorArchitecture(machine_type)
+
+    describe_cmd = util.AWS_PREFIX + [
+        '--region=%s' % region,
+        'ec2',
+        'describe-images',
+        '--query', ('Images[*].{Name:Name,ImageId:ImageId,'
+                    'CreationDate:CreationDate}'),
+        '--filters',
+        'Name=name,Values=%s' % cls.IMAGE_NAME_FILTER,
+        'Name=block-device-mapping.volume-type,Values=%s' %
+        cls.DEFAULT_ROOT_DISK_TYPE,
+        'Name=virtualization-type,Values=%s' % virt_type,
+        'Name=architecture,Values=%s' % processor_architecture]
+    if cls.IMAGE_PRODUCT_CODE_FILTER:
+      describe_cmd.extend(['Name=product-code,Values=%s' %
+                           cls.IMAGE_PRODUCT_CODE_FILTER])
+    if cls.IMAGE_DESCRIPTION_FILTER:
+      describe_cmd.extend(['Name=description,Values=%s' %
+                           cls.IMAGE_DESCRIPTION_FILTER])
+    describe_cmd.extend(['--owners'] + cls.IMAGE_OWNER)
+    stdout, _ = util.IssueRetryableCommand(describe_cmd)
+
+    if not stdout:
+      raise AwsImageNotFoundError('aws describe-images did not produce valid '
+                                  'output.')
+
+    if cls.IMAGE_NAME_REGEX:
+      # Further filter images by the IMAGE_NAME_REGEX filter.
+      image_name_regex = cls.IMAGE_NAME_REGEX.format(
+          virt_type=virt_type, disk_type=cls.DEFAULT_ROOT_DISK_TYPE,
+          architecture=processor_architecture)
+      images = []
+      excluded_images = []
+      for image in json.loads(stdout):
+        if re.search(image_name_regex, image['Name']):
+          images.append(image)
+        else:
+          excluded_images.append(image)
+
+      if excluded_images:
+        logging.debug('Excluded the following images with regex "%s": %s',
+                      image_name_regex,
+                      sorted(image['Name'] for image in excluded_images))
+    else:
+      images = json.loads(stdout)
+
+    if not images:
+      raise AwsImageNotFoundError('No AMIs with given filters found.')
+
+    return max(images, key=lambda image: image['CreationDate'])['ImageId']
+
+  @vm_util.Retry(max_retries=2)
+  def _PostCreate(self):
+    """Get the instance's data and tag it."""
+    describe_cmd = util.AWS_PREFIX + [
+        'ec2',
+        'describe-instances',
+        '--region=%s' % self.region,
+        '--instance-ids=%s' % self.id]
+    logging.info('Getting instance %s public IP. This will fail until '
+                 'a public IP is available, but will be retried.', self.id)
+    stdout, _ = util.IssueRetryableCommand(describe_cmd)
+    response = json.loads(stdout)
+    instance = response['Reservations'][0]['Instances'][0]
+    self.internal_ip = instance['PrivateIpAddress']
+    if util.IsRegion(self.zone):
+      self.zone = str(instance['Placement']['AvailabilityZone'])
+
+    assert self.group_id == instance['SecurityGroups'][0]['GroupId'], (
+        self.group_id, instance['SecurityGroups'][0]['GroupId'])
+    if FLAGS.aws_efa:
+      self._ConfigureEfa(instance)
+    elif 'PublicIpAddress' in instance:
+      self.ip_address = instance['PublicIpAddress']
+    else:
+      raise errors.Resource.RetryableCreationError('Public IP not ready.')
+
+  def _ConfigureEfa(self, instance):
+    """Configure EFA and associate an Elastic IP.
+ + Args: + instance: dict which contains instance info. + """ + if FLAGS.aws_efa_count > 1: + self._ConfigureElasticIp(instance) + else: + self.ip_address = instance['PublicIpAddress'] + if FLAGS.aws_efa_version: + # Download EFA then call InstallEfa method so that subclass can override + self.InstallPackages('curl') + url = _EFA_URL.format(version=FLAGS.aws_efa_version) + tarfile = posixpath.basename(url) + self.RemoteCommand(f'curl -O {url}; tar -xzf {tarfile}') + self._InstallEfa() + # Run test program to confirm EFA working + self.RemoteCommand('cd aws-efa-installer; ' + 'PATH=${PATH}:/opt/amazon/efa/bin ./efa_test.sh') + + def _ConfigureElasticIp(self, instance): + """Create and associate Elastic IP. + + Args: + instance: dict which contains instance info. + """ + network_interface_id = None + for network_interface in instance['NetworkInterfaces']: + # The primary network interface (eth0) for the instance. + if network_interface['Attachment']['DeviceIndex'] == 0: + network_interface_id = network_interface['NetworkInterfaceId'] + break + assert network_interface_id is not None + + stdout, _, _ = vm_util.IssueCommand(util.AWS_PREFIX + + ['ec2', 'allocate-address', + f'--region={self.region}', + '--domain=vpc']) + response = json.loads(stdout) + self.ip_address = response['PublicIp'] + self.allocation_id = response['AllocationId'] + + util.AddDefaultTags(self.allocation_id, self.region) + + stdout, _, _ = vm_util.IssueCommand( + util.AWS_PREFIX + ['ec2', 'associate-address', + f'--region={self.region}', + f'--allocation-id={self.allocation_id}', + f'--network-interface-id={network_interface_id}']) + response = json.loads(stdout) + self.association_id = response['AssociationId'] + + def _InstallEfa(self): + """Installs AWS EFA packages. + + See https://aws.amazon.com/hpc/efa/ + """ + if not self.TryRemoteCommand('ulimit -l | grep unlimited'): + self.RemoteCommand(f'echo "{self.user_name} - memlock unlimited" | ' + 'sudo tee -a /etc/security/limits.conf') + self.RemoteCommand('cd aws-efa-installer; sudo ./efa_installer.sh -y') + if not self.TryRemoteCommand('ulimit -l | grep unlimited'): + # efa_installer.sh should reboot enabling this change, reboot if necessary + self.Reboot() + + def _CreateDependencies(self): + """Create VM dependencies.""" + AwsKeyFileManager.ImportKeyfile(self.region) + # GetDefaultImage calls the AWS CLI. 
+ self.image = self.image or self.GetDefaultImage(self.machine_type, + self.region) + self.AllowRemoteAccessPorts() + + if self.use_dedicated_host: + with self._lock: + if (not self.host_list or (self.num_vms_per_host and + self.host_list[-1].fill_fraction + + 1.0 / self.num_vms_per_host > 1.0)): + host = AwsDedicatedHost(self.machine_type, self.zone) + self.host_list.append(host) + host.Create() + self.host = self.host_list[-1] + if self.num_vms_per_host: + self.host.fill_fraction += 1.0 / self.num_vms_per_host + + def _DeleteDependencies(self): + """Delete VM dependencies.""" + AwsKeyFileManager.DeleteKeyfile(self.region) + if self.host: + with self._lock: + if self.host in self.host_list: + self.host_list.remove(self.host) + if self.host not in self.deleted_hosts: + self.host.Delete() + self.deleted_hosts.add(self.host) + + def _Create(self): + """Create a VM instance.""" + placement = [] + if not util.IsRegion(self.zone): + placement.append('AvailabilityZone=%s' % self.zone) + if self.use_dedicated_host: + placement.append('Tenancy=host,HostId=%s' % self.host.id) + num_hosts = len(self.host_list) + elif self.placement_group: + if IsPlacementGroupCompatible(self.machine_type): + placement.append('GroupName=%s' % self.placement_group.name) + else: + logging.warning( + 'VM not placed in Placement Group. VM Type %s not supported', + self.machine_type) + placement = ','.join(placement) + block_device_map = GetBlockDeviceMap(self.machine_type, + self.boot_disk_size, + self.image, + self.region) + if not self.aws_tags: + # Set tags for the AWS VM. If we are retrying the create, we have to use + # the same tags from the previous call. + self.aws_tags.update(self.vm_metadata) + self.aws_tags.update(util.MakeDefaultTags()) + create_cmd = util.AWS_PREFIX + [ + 'ec2', + 'run-instances', + '--region=%s' % self.region, + '--client-token=%s' % self.client_token, + '--image-id=%s' % self.image, + '--instance-type=%s' % self.machine_type, + '--key-name=%s' % AwsKeyFileManager.GetKeyNameForRun(), + '--tag-specifications=%s' % + util.FormatTagSpecifications('instance', self.aws_tags)] + + if FLAGS.aws_vm_hibernate: + create_cmd.extend([ + '--hibernation-options=Configured=true', + ]) + + if FLAGS.disable_smt: + query_cmd = util.AWS_PREFIX + [ + 'ec2', + 'describe-instance-types', + '--instance-types', + self.machine_type, + '--query', + 'InstanceTypes[0].VCpuInfo.DefaultCores' + ] + stdout, _, retcode = vm_util.IssueCommand(query_cmd) + cores = int(json.loads(stdout)) + create_cmd.append(f'--cpu-options=CoreCount={cores},ThreadsPerCore=1') + if FLAGS.aws_efa: + efas = ['--network-interfaces'] + for device_index in range(FLAGS.aws_efa_count): + efa_params = _EFA_PARAMS.copy() + efa_params.update({ + 'NetworkCardIndex': device_index, + 'DeviceIndex': device_index, + 'Groups': self.group_id, + 'SubnetId': self.network.subnet.id + }) + if FLAGS.aws_efa_count == 1: + efa_params['AssociatePublicIpAddress'] = True + efas.append(','.join(f'{key}={value}' for key, value in + sorted(efa_params.items()))) + create_cmd.extend(efas) + else: + create_cmd.append('--associate-public-ip-address') + create_cmd.append(f'--subnet-id={self.network.subnet.id}') + if block_device_map: + create_cmd.append('--block-device-mappings=%s' % block_device_map) + if placement: + create_cmd.append('--placement=%s' % placement) + if FLAGS.aws_credit_specification: + create_cmd.append('--credit-specification=%s' % + FLAGS.aws_credit_specification) + if self.user_data: + create_cmd.append('--user-data=%s' % self.user_data) + if 
self.capacity_reservation_id: + create_cmd.append( + '--capacity-reservation-specification=CapacityReservationTarget=' + '{CapacityReservationId=%s}' % self.capacity_reservation_id) + if self.use_spot_instance: + instance_market_options = collections.OrderedDict() + spot_options = collections.OrderedDict() + spot_options['SpotInstanceType'] = 'one-time' + spot_options['InstanceInterruptionBehavior'] = 'terminate' + if self.spot_price: + spot_options['MaxPrice'] = str(self.spot_price) + if self.spot_block_duration_minutes: + spot_options['BlockDurationMinutes'] = self.spot_block_duration_minutes + instance_market_options['MarketType'] = 'spot' + instance_market_options['SpotOptions'] = spot_options + create_cmd.append( + '--instance-market-options=%s' % json.dumps(instance_market_options)) + _, stderr, retcode = vm_util.IssueCommand(create_cmd, + raise_on_failure=False) + + arm_arch = GetArmArchitecture(self.machine_type) + if arm_arch: + self.host_arch = arm_arch + + if self.use_dedicated_host and 'InsufficientCapacityOnHost' in stderr: + if self.num_vms_per_host: + raise errors.Resource.CreationError( + 'Failed to create host: %d vms of type %s per host exceeds ' + 'memory capacity limits of the host' % + (self.num_vms_per_host, self.machine_type)) + else: + logging.warning( + 'Creation failed due to insufficient host capacity. A new host will ' + 'be created and instance creation will be retried.') + with self._lock: + if num_hosts == len(self.host_list): + host = AwsDedicatedHost(self.machine_type, self.zone) + self.host_list.append(host) + host.Create() + self.host = self.host_list[-1] + self.client_token = str(uuid.uuid4()) + raise errors.Resource.RetryableCreationError() + if 'InsufficientInstanceCapacity' in stderr: + if self.use_spot_instance: + self.spot_status_code = 'InsufficientSpotInstanceCapacity' + self.spot_early_termination = True + raise errors.Benchmarks.InsufficientCapacityCloudFailure(stderr) + if 'SpotMaxPriceTooLow' in stderr: + self.spot_status_code = 'SpotMaxPriceTooLow' + self.spot_early_termination = True + raise errors.Resource.CreationError(stderr) + if 'InstanceLimitExceeded' in stderr or 'VcpuLimitExceeded' in stderr: + raise errors.Benchmarks.QuotaFailure(stderr) + if 'RequestLimitExceeded' in stderr: + if FLAGS.retry_on_rate_limited: + raise errors.Resource.RetryableCreationError(stderr) + else: + raise errors.Benchmarks.QuotaFailure(stderr) + + # When launching more than 1 VM into the same placement group, there is an + # occasional error that the placement group has already been used in a + # separate zone. Retrying fixes this error. + if 'InvalidPlacementGroup.InUse' in stderr: + raise errors.Resource.RetryableCreationError(stderr) + if 'Unsupported' in stderr: + raise errors.Benchmarks.UnsupportedConfigError(stderr) + if retcode: + raise errors.Resource.CreationError( + 'Failed to create VM: %s return code: %s' % (retcode, stderr)) + + @vm_util.Retry( + poll_interval=0.5, + log_errors=True, + retryable_exceptions=(AwsTransitionalVmRetryableError,)) + def _WaitForStoppedStatus(self): + """Returns the status of the VM. + + Returns: + Whether the VM is suspended i.e. in a stopped status. If not, raises an + error + + Raises: + AwsUnknownStatusError: If an unknown status is returned from AWS. + AwsTransitionalVmRetryableError: If the VM is pending. This is retried. 
+ """ + describe_cmd = util.AWS_PREFIX + [ + 'ec2', + 'describe-instance-status', + '--region=%s' % self.region, + '--instance-ids=%s' % self.id, + '--include-all-instances', + ] + + stdout, _ = util.IssueRetryableCommand(describe_cmd) + response = json.loads(stdout) + status = response['InstanceStatuses'][0]['InstanceState']['Name'] + if status.lower() != 'stopped': + logging.info('VM has status %s.', status) + + raise AwsTransitionalVmRetryableError() + + def _BeforeSuspend(self): + """Prepares the instance for suspend by having the VM sleep for a given duration. + + This ensures the VM is ready for hibernation + """ + # Add a timer that waits for a given duration after vm instance is + # created before calling suspend on the vm to ensure that the vm is + # ready for hibernation in aws. + time.sleep(600) + + def _PostSuspend(self): + self._WaitForStoppedStatus() + + def _Suspend(self): + """Suspends a VM instance.""" + suspend_cmd = util.AWS_PREFIX + [ + 'ec2', + 'stop-instances', + '--region=%s' % self.region, + '--instance-ids=%s' % self.id, + '--hibernate', + ] + try: + vm_util.IssueCommand(suspend_cmd) + except: + raise errors.Benchmarks.KnownIntermittentError( + 'Instance is still not ready to hibernate') + + self._PostSuspend() + + @vm_util.Retry( + poll_interval=0.5, + retryable_exceptions=(AwsTransitionalVmRetryableError,)) + def _WaitForNewIP(self): + """Checks for a new IP address, waiting if the VM is still pending. + + Raises: + AwsTransitionalVmRetryableError: If VM is pending. This is retried. + """ + status_cmd = util.AWS_PREFIX + [ + 'ec2', 'describe-instances', f'--region={self.region}', + f'--instance-ids={self.id}' + ] + stdout, _, _ = vm_util.IssueCommand(status_cmd) + response = json.loads(stdout) + instance = response['Reservations'][0]['Instances'][0] + if 'PublicIpAddress' in instance: + self.ip_address = instance['PublicIpAddress'] + else: + logging.info('VM is pending.') + raise AwsTransitionalVmRetryableError() + + def _PostResume(self): + self._WaitForNewIP() + + def _Resume(self): + """Resumes a VM instance.""" + resume_cmd = util.AWS_PREFIX + [ + 'ec2', + 'start-instances', + '--region=%s' % self.region, + '--instance-ids=%s' % self.id, + ] + vm_util.IssueCommand(resume_cmd) + self._PostResume() + + def _Delete(self): + """Delete a VM instance.""" + if self.id: + delete_cmd = util.AWS_PREFIX + [ + 'ec2', + 'terminate-instances', + '--region=%s' % self.region, + '--instance-ids=%s' % self.id] + vm_util.IssueCommand(delete_cmd, raise_on_failure=False) + if hasattr(self, 'spot_instance_request_id'): + cancel_cmd = util.AWS_PREFIX + [ + '--region=%s' % self.region, + 'ec2', + 'cancel-spot-instance-requests', + '--spot-instance-request-ids=%s' % self.spot_instance_request_id] + vm_util.IssueCommand(cancel_cmd, raise_on_failure=False) + + if FLAGS.aws_efa: + if self.association_id: + vm_util.IssueCommand(util.AWS_PREFIX + + ['ec2', 'disassociate-address', + f'--region={self.region}', + f'--association-id={self.association_id}']) + + if self.allocation_id: + vm_util.IssueCommand(util.AWS_PREFIX + + ['ec2', 'release-address', + f'--region={self.region}', + f'--allocation-id={self.allocation_id}']) + + # _Start or _Stop not yet implemented for AWS + def _Start(self): + """Starts the VM.""" + if not self.id: + raise errors.Benchmarks.RunError( + 'Expected VM id to be non-null. 
Please make sure the VM exists.') + start_cmd = util.AWS_PREFIX + [ + 'ec2', 'start-instances', + f'--region={self.region}', + f'--instance-ids={self.id}' + ] + vm_util.IssueCommand(start_cmd) + + def _PostStart(self): + self._WaitForNewIP() + + def _Stop(self): + """Stops the VM.""" + if not self.id: + raise errors.Benchmarks.RunError( + 'Expected VM id to be non-null. Please make sure the VM exists.') + stop_cmd = util.AWS_PREFIX + [ + 'ec2', 'stop-instances', + f'--region={self.region}', + f'--instance-ids={self.id}' + ] + vm_util.IssueCommand(stop_cmd) + + def _PostStop(self): + self._WaitForStoppedStatus() + + def _UpdateInterruptibleVmStatusThroughApi(self): + if hasattr(self, 'spot_instance_request_id'): + describe_cmd = util.AWS_PREFIX + [ + '--region=%s' % self.region, + 'ec2', + 'describe-spot-instance-requests', + '--spot-instance-request-ids=%s' % self.spot_instance_request_id] + stdout, _, _ = vm_util.IssueCommand(describe_cmd) + sir_response = json.loads(stdout)['SpotInstanceRequests'] + self.spot_status_code = sir_response[0]['Status']['Code'] + self.spot_early_termination = ( + self.spot_status_code in AWS_INITIATED_SPOT_TERMINAL_STATUSES) + + @vm_util.Retry( + poll_interval=1, + log_errors=False, + retryable_exceptions=(AwsTransitionalVmRetryableError,)) + def _Exists(self): + """Returns whether the VM exists. + + This method waits until the VM is no longer pending. + + Returns: + Whether the VM exists. + + Raises: + AwsUnknownStatusError: If an unknown status is returned from AWS. + AwsTransitionalVmRetryableError: If the VM is pending. This is retried. + """ + describe_cmd = util.AWS_PREFIX + [ + 'ec2', + 'describe-instances', + '--region=%s' % self.region, + '--filter=Name=client-token,Values=%s' % self.client_token] + + stdout, _ = util.IssueRetryableCommand(describe_cmd) + response = json.loads(stdout) + reservations = response['Reservations'] + assert len(reservations) < 2, 'Too many reservations.' + if not reservations: + if not self.create_start_time: + return False + if self.delete_start_time and not self.created: + return False + logging.info('No reservation returned by describe-instances. This ' + 'sometimes shows up immediately after a successful ' + 'run-instances command. Retrying describe-instances ' + 'command.') + raise AwsTransitionalVmRetryableError() + instances = reservations[0]['Instances'] + assert len(instances) == 1, 'Wrong number of instances.' + status = instances[0]['State']['Name'] + self.id = instances[0]['InstanceId'] + if self.use_spot_instance: + self.spot_instance_request_id = instances[0]['SpotInstanceRequestId'] + + if status not in INSTANCE_KNOWN_STATUSES: + raise AwsUnknownStatusError('Unknown status %s' % status) + if status in INSTANCE_TRANSITIONAL_STATUSES: + logging.info('VM has status %s; retrying describe-instances command.', + status) + raise AwsTransitionalVmRetryableError() + # In this path run-instances succeeded, a pending instance was created, but + # not fulfilled so it moved to terminated. + if (status == TERMINATED and + instances[0]['StateReason']['Code'] == + 'Server.InsufficientInstanceCapacity'): + raise errors.Benchmarks.InsufficientCapacityCloudFailure( + instances[0]['StateReason']['Message']) + # In this path run-instances succeeded, a pending instance was created, but + # instance is shutting down due to internal server error. This is a + # retryable command for run-instance. + # Client token needs to be refreshed for idempotency. 
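+    # (EC2 run-instances is idempotent per client token, so retrying with the
+    # old token would keep resolving to the same failed request; the fresh
+    # uuid below lets the retried call create a new instance.)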
+ if (status == SHUTTING_DOWN and + instances[0]['StateReason']['Code'] == 'Server.InternalError'): + self.client_token = str(uuid.uuid4()) + return status in INSTANCE_EXISTS_STATUSES + + def _GetNvmeBootIndex(self): + if aws_disk.LocalDriveIsNvme(self.machine_type) and \ + aws_disk.EbsDriveIsNvme(self.machine_type): + self.Install('storage_tools') + # identify boot drive + # If this command ever fails consider 'findmnt -nM / -o source' + cmd = ('realpath /dev/disk/by-label/cloudimg-rootfs ' + '| grep --only-matching "nvme[0-9]*"') + boot_drive = self.RemoteCommand(cmd, ignore_failure=True)[0].strip() + if boot_drive: + # get the boot drive index by dropping the nvme prefix + boot_idx = int(boot_drive[4:]) + logging.info('found boot drive at nvme index %d', boot_idx) + return boot_idx + else: + logging.warning('Failed to identify NVME boot drive index. Assuming 0.') + return 0 + + def CreateScratchDisk(self, disk_spec): + """Create a VM's scratch disk. + + Args: + disk_spec: virtual_machine.BaseDiskSpec object of the disk. + + Raises: + CreationError: If an NFS disk is listed but the NFS service not created. + """ + # Instantiate the disk(s) that we want to create. + disks = [] + nvme_boot_drive_index = self._GetNvmeBootIndex() + for _ in range(disk_spec.num_striped_disks): + if disk_spec.disk_type == disk.NFS: + data_disk = self._GetNfsService().CreateNfsDisk() + else: + data_disk = aws_disk.AwsDisk(disk_spec, self.zone, self.machine_type) + if disk_spec.disk_type == disk.LOCAL: + device_letter = chr(ord(DRIVE_START_LETTER) + self.local_disk_counter) + data_disk.AssignDeviceLetter(device_letter, nvme_boot_drive_index) + # Local disk numbers start at 1 (0 is the system disk). + data_disk.disk_number = self.local_disk_counter + 1 + self.local_disk_counter += 1 + if self.local_disk_counter > self.max_local_disks: + raise errors.Error('Not enough local disks.') + elif disk_spec.disk_type == disk.NFS: + pass + else: + # Remote disk numbers start at 1 + max_local disks (0 is the system disk + # and local disks occupy [1, max_local_disks]). + data_disk.disk_number = (self.remote_disk_counter + + 1 + self.max_local_disks) + self.remote_disk_counter += 1 + disks.append(data_disk) + + self._CreateScratchDiskFromDisks(disk_spec, disks) + + def AddMetadata(self, **kwargs): + """Adds metadata to the VM.""" + util.AddTags(self.id, self.region, **kwargs) + if self.use_spot_instance: + util.AddDefaultTags(self.spot_instance_request_id, self.region) + + def InstallCli(self): + """Installs the AWS cli and credentials on this AWS vm.""" + self.Install('awscli') + self.Install('aws_credentials') + + def DownloadPreprovisionedData(self, install_path, module_name, filename): + """Downloads a data file from an AWS S3 bucket with pre-provisioned data. + + Use --aws_preprovisioned_data_bucket to specify the name of the bucket. + + Args: + install_path: The install path on this VM. + module_name: Name of the module associated with this data file. + filename: The name of the file that was downloaded. + """ + self.InstallCli() + # TODO(deitz): Add retry logic. + self.RemoteCommand(GenerateDownloadPreprovisionedDataCommand( + install_path, module_name, filename)) + + def ShouldDownloadPreprovisionedData(self, module_name, filename): + """Returns whether or not preprovisioned data is available.""" + # MSW - There is something wrong with this function, it fails miserably when + # FLAGS.aws_preprovisioned_data_bucket is not set. 
It appears that when we + # attempt to install epel, it looks for it in an S3 bucket instead of d/l + # directly. This might be okay in some cases, but is not the typical use + # case. This if statement is a quick and dirty workaround until I can + # think of a better solution in the context of the framework. + if (FLAGS.aws_preprovisioned_data_bucket): + self.Install('aws_credentials') + self.Install('awscli') + return FLAGS.aws_preprovisioned_data_bucket and self.TryRemoteCommand( + GenerateStatPreprovisionedDataCommand(module_name, filename)) + else: + return False + + def IsInterruptible(self): + """Returns whether this vm is an interruptible vm (spot vm). + + Returns: True if this vm is an interruptible vm (spot vm). + """ + return self.use_spot_instance + + def WasInterrupted(self): + """Returns whether this spot vm was terminated early by AWS. + + Returns: True if this vm was terminated early by AWS. + """ + return self.spot_early_termination + + def GetVmStatusCode(self): + """Returns the early termination code if any. + + Returns: Early termination code. + """ + return self.spot_status_code + + def GetResourceMetadata(self): + """Returns a dict containing metadata about the VM. + + Returns: + dict mapping string property key to value. + """ + result = super(AwsVirtualMachine, self).GetResourceMetadata() + result['boot_disk_type'] = self.DEFAULT_ROOT_DISK_TYPE + result['boot_disk_size'] = self.boot_disk_size + if self.use_dedicated_host: + result['num_vms_per_host'] = self.num_vms_per_host + result['efa'] = FLAGS.aws_efa + if FLAGS.aws_efa: + result['efa_version'] = FLAGS.aws_efa_version + result['efa_count'] = FLAGS.aws_efa_count + result['preemptible'] = self.use_spot_instance + return result + + +class ClearBasedAwsVirtualMachine(AwsVirtualMachine, + linux_virtual_machine.ClearMixin): + IMAGE_NAME_FILTER = 'clear/images/*/clear-*' + DEFAULT_USER_NAME = 'clear' + + +class CoreOsBasedAwsVirtualMachine(AwsVirtualMachine, + linux_virtual_machine.CoreOsMixin): + IMAGE_NAME_FILTER = 'fedora-coreos-*-hvm' + # CoreOS only distinguishes between stable and testing in the description + IMAGE_DESCRIPTION_FILTER = 'Fedora CoreOS stable *' + IMAGE_OWNER = CENTOS_IMAGE_PROJECT + DEFAULT_USER_NAME = 'core' + + +class Debian9BasedAwsVirtualMachine(AwsVirtualMachine, + linux_virtual_machine.Debian9Mixin): + # From https://wiki.debian.org/Cloud/AmazonEC2Image/Stretch + IMAGE_NAME_FILTER = 'debian-stretch-*64-*' + IMAGE_OWNER = DEBIAN_9_IMAGE_PROJECT + DEFAULT_USER_NAME = 'admin' + + def _BeforeSuspend(self): + """Prepares the aws vm for hibernation.""" + raise NotImplementedError() + + +class Debian10BasedAwsVirtualMachine(AwsVirtualMachine, + linux_virtual_machine.Debian10Mixin): + # From https://wiki.debian.org/Cloud/AmazonEC2Image/Buster + IMAGE_NAME_FILTER = 'debian-10-*64*' + IMAGE_OWNER = DEBIAN_IMAGE_PROJECT + DEFAULT_USER_NAME = 'admin' + + +class Debian11BasedAwsVirtualMachine(AwsVirtualMachine, + linux_virtual_machine.Debian11Mixin): + # From https://wiki.debian.org/Cloud/AmazonEC2Image/Buster + IMAGE_NAME_FILTER = 'debian-11-*64*' + IMAGE_OWNER = DEBIAN_IMAGE_PROJECT + DEFAULT_USER_NAME = 'admin' + + +class UbuntuBasedAwsVirtualMachine(AwsVirtualMachine): + IMAGE_OWNER = UBUNTU_IMAGE_PROJECT + DEFAULT_USER_NAME = 'ubuntu' + + +class Ubuntu1604BasedAwsVirtualMachine(UbuntuBasedAwsVirtualMachine, + linux_virtual_machine.Ubuntu1604Mixin): + IMAGE_NAME_FILTER = 'ubuntu/images/*/ubuntu-xenial-16.04-*64-server-20*' + + def _InstallEfa(self): + super(Ubuntu1604BasedAwsVirtualMachine, 
self)._InstallEfa() + self.Reboot() + self.WaitForBootCompletion() + + +class Ubuntu1804BasedAwsVirtualMachine(UbuntuBasedAwsVirtualMachine, + linux_virtual_machine.Ubuntu1804Mixin): + IMAGE_NAME_FILTER = 'ubuntu/images/*/ubuntu-bionic-18.04-*64-server-20*' + + +class Ubuntu1804EfaBasedAwsVirtualMachine( + UbuntuBasedAwsVirtualMachine, linux_virtual_machine.Ubuntu1804EfaMixin): + IMAGE_OWNER = UBUNTU_EFA_IMAGE_PROJECT + IMAGE_NAME_FILTER = 'Deep Learning AMI (Ubuntu 18.04) Version *' + + +class Ubuntu2004BasedAwsVirtualMachine(UbuntuBasedAwsVirtualMachine, + linux_virtual_machine.Ubuntu2004Mixin): + IMAGE_NAME_FILTER = 'ubuntu/images/*/ubuntu-focal-20.04-*64-server-20*' + + +class Ubuntu2204BasedAwsVirtualMachine(UbuntuBasedAwsVirtualMachine, + linux_virtual_machine.Ubuntu2204Mixin): + IMAGE_NAME_FILTER = 'ubuntu/images/*/ubuntu-jammy-22.04-*64-server-20*' + + +class JujuBasedAwsVirtualMachine(UbuntuBasedAwsVirtualMachine, + linux_virtual_machine.JujuMixin): + """Class with configuration for AWS Juju virtual machines.""" + IMAGE_NAME_FILTER = 'ubuntu/images/*/ubuntu-trusty-14.04-*64-server-20*' + + +class AmazonLinux2BasedAwsVirtualMachine( + AwsVirtualMachine, linux_virtual_machine.AmazonLinux2Mixin): + """Class with configuration for AWS Amazon Linux 2 virtual machines.""" + IMAGE_NAME_FILTER = 'amzn2-ami-*-*-*' + IMAGE_OWNER = AMAZON_LINUX_IMAGE_PROJECT + + +class Rhel7BasedAwsVirtualMachine(AwsVirtualMachine, + linux_virtual_machine.Rhel7Mixin): + """Class with configuration for AWS RHEL 7 virtual machines.""" + # Documentation on finding RHEL images: + # https://access.redhat.com/articles/2962171 + IMAGE_NAME_FILTER = 'RHEL-7*_GA*' + IMAGE_OWNER = RHEL_IMAGE_PROJECT + + +class Rhel8BasedAwsVirtualMachine(AwsVirtualMachine, + linux_virtual_machine.Rhel8Mixin): + """Class with configuration for AWS RHEL 8 virtual machines.""" + # Documentation on finding RHEL images: + # https://access.redhat.com/articles/2962181 + # All RHEL AMIs are HVM. HVM- blocks HVM_BETA. 
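+  # Illustrative example (assumed name format, not taken from this repo): the
+  # filter below is expected to match AMI names such as
+  # 'RHEL-8.6.0_HVM-<date>-x86_64-...', and GetDefaultImage() then picks the
+  # newest match by CreationDate.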
+ IMAGE_NAME_FILTER = 'RHEL-8*_HVM-*' + IMAGE_OWNER = RHEL_IMAGE_PROJECT + + +class CentOs7BasedAwsVirtualMachine(AwsVirtualMachine, + linux_virtual_machine.CentOs7Mixin): + """Class with configuration for AWS CentOS 7 virtual machines.""" + # Documentation on finding the CentOS 7 image: + # https://wiki.centos.org/Cloud/AWS#x86_64 + IMAGE_NAME_FILTER = 'CentOS 7*' + IMAGE_OWNER = CENTOS_IMAGE_PROJECT + DEFAULT_USER_NAME = 'centos' + + def _InstallEfa(self): + logging.info('Upgrading Centos7 kernel, installing kernel headers and ' + 'rebooting before installing EFA.') + self.RemoteCommand('sudo yum upgrade -y kernel') + self.InstallPackages('kernel-devel') + self.Reboot() + self.WaitForBootCompletion() + super(CentOs7BasedAwsVirtualMachine, self)._InstallEfa() + + +class CentOs8BasedAwsVirtualMachine(AwsVirtualMachine, + linux_virtual_machine.CentOs8Mixin): + """Class with configuration for AWS CentOS 8 virtual machines.""" + # This describes the official AMIs listed here: + # https://wiki.centos.org/Cloud/AWS#Official_CentOS_Linux_:_Public_Images + IMAGE_OWNER = CENTOS_IMAGE_PROJECT + IMAGE_NAME_FILTER = 'CentOS 8*' + DEFAULT_USER_NAME = 'centos' + + +class CentOsStream8BasedAwsVirtualMachine( + AwsVirtualMachine, linux_virtual_machine.CentOsStream8Mixin): + """Class with configuration for AWS CentOS Stream 8 virtual machines.""" + # This describes the official AMIs listed here: + # https://wiki.centos.org/Cloud/AWS#Official_CentOS_Linux_:_Public_Images + IMAGE_OWNER = CENTOS_IMAGE_PROJECT + IMAGE_NAME_FILTER = 'CentOS Stream 8*' + DEFAULT_USER_NAME = 'centos' + + +class RockyLinux8BasedAwsVirtualMachine(AwsVirtualMachine, + linux_virtual_machine.RockyLinux8Mixin): + """Class with configuration for AWS Rocky Linux 8 virtual machines.""" + IMAGE_OWNER = MARKETPLACE_IMAGE_PROJECT + IMAGE_PRODUCT_CODE_FILTER = 'cotnnspjrsi38lfn8qo4ibnnm' + IMAGE_NAME_FILTER = 'Rocky-8-*' + DEFAULT_USER_NAME = 'rocky' + + +class CentOsStream9BasedAwsVirtualMachine( + AwsVirtualMachine, linux_virtual_machine.CentOsStream9Mixin): + """Class with configuration for AWS CentOS Stream 9 virtual machines.""" + # This describes the official AMIs listed here: + # https://wiki.centos.org/Cloud/AWS#Official_CentOS_Linux_:_Public_Images + IMAGE_OWNER = CENTOS_IMAGE_PROJECT + IMAGE_NAME_FILTER = 'CentOS Stream 9*' + + +class BaseWindowsAwsVirtualMachine(AwsVirtualMachine, + windows_virtual_machine.BaseWindowsMixin): + """Support for Windows machines on AWS.""" + DEFAULT_USER_NAME = 'Administrator' + IMAGE_OWNER = WINDOWS_IMAGE_PROJECT + + def __init__(self, vm_spec): + super(BaseWindowsAwsVirtualMachine, self).__init__(vm_spec) + self.user_data = ('%s' % + windows_virtual_machine.STARTUP_SCRIPT) + + @vm_util.Retry() + def _GetDecodedPasswordData(self): + # Retrieve a base64 encoded, encrypted password for the VM. + get_password_cmd = util.AWS_PREFIX + [ + 'ec2', + 'get-password-data', + '--region=%s' % self.region, + '--instance-id=%s' % self.id] + stdout, _ = util.IssueRetryableCommand(get_password_cmd) + response = json.loads(stdout) + password_data = response['PasswordData'] + + # AWS may not populate the password data until some time after + # the VM shows as running. Simply retry until the data shows up. + if not password_data: + raise ValueError('No PasswordData in response.') + + # Decode the password data. 
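+    # (The base64-decoded bytes are still encrypted with the run's key pair;
+    # _PostCreate below decrypts them with the private key via openssl.)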
+ return base64.b64decode(password_data) + + def _PostCreate(self): + """Retrieve generic VM info and then retrieve the VM's password.""" + super(BaseWindowsAwsVirtualMachine, self)._PostCreate() + + # Get the decoded password data. + decoded_password_data = self._GetDecodedPasswordData() + + # Write the encrypted data to a file, and use openssl to + # decrypt the password. + with vm_util.NamedTemporaryFile() as tf: + tf.write(decoded_password_data) + tf.close() + decrypt_cmd = ['openssl', + 'rsautl', + '-decrypt', + '-in', + tf.name, + '-inkey', + vm_util.GetPrivateKeyPath()] + password, _ = vm_util.IssueRetryableCommand(decrypt_cmd) + self.password = password + + def GetResourceMetadata(self): + """Returns a dict containing metadata about the VM. + + Returns: + dict mapping metadata key to value. + """ + result = super(BaseWindowsAwsVirtualMachine, self).GetResourceMetadata() + result['disable_interrupt_moderation'] = self.disable_interrupt_moderation + return result + + @vm_util.Retry( + max_retries=10, + retryable_exceptions=(AwsUnexpectedWindowsAdapterOutputError, + errors.VirtualMachine.RemoteCommandError)) + def DisableInterruptModeration(self): + """Disable the networking feature 'Interrupt Moderation'.""" + + # First ensure that the driver supports interrupt moderation + net_adapters, _ = self.RemoteCommand('Get-NetAdapter') + if 'Intel(R) 82599 Virtual Function' not in net_adapters: + raise AwsDriverDoesntSupportFeatureError( + 'Driver not tested with Interrupt Moderation in PKB.') + aws_int_dis_path = ('HKLM\\SYSTEM\\ControlSet001\\Control\\Class\\' + '{4d36e972-e325-11ce-bfc1-08002be10318}\\0011') + command = 'reg add "%s" /v *InterruptModeration /d 0 /f' % aws_int_dis_path + self.RemoteCommand(command) + try: + self.RemoteCommand('Restart-NetAdapter -Name "Ethernet 2"') + except IOError: + # Restarting the network adapter will always fail because + # the winrm connection used to issue the command will be + # broken. 
+ pass + int_dis_value, _ = self.RemoteCommand( + 'reg query "%s" /v *InterruptModeration' % aws_int_dis_path) + # The second line should look like: + # *InterruptModeration REG_SZ 0 + registry_query_lines = int_dis_value.splitlines() + if len(registry_query_lines) < 3: + raise AwsUnexpectedWindowsAdapterOutputError( + 'registry query failed: %s ' % int_dis_value) + registry_query_result = registry_query_lines[2].split() + if len(registry_query_result) < 3: + raise AwsUnexpectedWindowsAdapterOutputError( + 'unexpected registry query response: %s' % int_dis_value) + if registry_query_result[2] != '0': + raise AwsUnexpectedWindowsAdapterOutputError( + 'InterruptModeration failed to disable') + + +class Windows2012CoreAwsVirtualMachine( + BaseWindowsAwsVirtualMachine, windows_virtual_machine.Windows2012CoreMixin): + IMAGE_NAME_FILTER = 'Windows_Server-2012-R2_RTM-English-64Bit-Core-*' + + +class Windows2016CoreAwsVirtualMachine( + BaseWindowsAwsVirtualMachine, windows_virtual_machine.Windows2016CoreMixin): + IMAGE_NAME_FILTER = 'Windows_Server-2016-English-Core-Base-*' + + +class Windows2019CoreAwsVirtualMachine( + BaseWindowsAwsVirtualMachine, windows_virtual_machine.Windows2019CoreMixin): + IMAGE_NAME_FILTER = 'Windows_Server-2019-English-Core-Base-*' + + +class Windows2022CoreAwsVirtualMachine( + BaseWindowsAwsVirtualMachine, windows_virtual_machine.Windows2022CoreMixin): + IMAGE_NAME_FILTER = 'Windows_Server-2022-English-Core-Base-*' + + +class Windows2012DesktopAwsVirtualMachine( + BaseWindowsAwsVirtualMachine, + windows_virtual_machine.Windows2012DesktopMixin): + IMAGE_NAME_FILTER = 'Windows_Server-2012-R2_RTM-English-64Bit-Base-*' + + +class Windows2016DesktopAwsVirtualMachine( + BaseWindowsAwsVirtualMachine, + windows_virtual_machine.Windows2016DesktopMixin): + IMAGE_NAME_FILTER = 'Windows_Server-2016-English-Full-Base-*' + + +class Windows2019DesktopAwsVirtualMachine( + BaseWindowsAwsVirtualMachine, + windows_virtual_machine.Windows2019DesktopMixin): + IMAGE_NAME_FILTER = 'Windows_Server-2019-English-Full-Base-*' + + +class Windows2022DesktopAwsVirtualMachine( + BaseWindowsAwsVirtualMachine, + windows_virtual_machine.Windows2022DesktopMixin): + IMAGE_NAME_FILTER = 'Windows_Server-2022-English-Full-Base-*' + + +class Windows2019DesktopSQLServer2019StandardAwsVirtualMachine( + BaseWindowsAwsVirtualMachine, + windows_virtual_machine.Windows2019SQLServer2019Standard): + IMAGE_NAME_FILTER = 'Windows_Server-2019-English-Full-SQL_2019_Standard-*' + + +class Windows2019DesktopSQLServer2019EnterpriseAwsVirtualMachine( + BaseWindowsAwsVirtualMachine, + windows_virtual_machine.Windows2019SQLServer2019Enterprise): + IMAGE_NAME_FILTER = 'Windows_Server-2019-English-Full-SQL_2019_Enterprise-*' + + +class Windows2022DesktopSQLServer2019StandardAwsVirtualMachine( + BaseWindowsAwsVirtualMachine, + windows_virtual_machine.Windows2022SQLServer2019Standard): + IMAGE_NAME_FILTER = 'Windows_Server-2022-English-Full-SQL_2019_Standard-*' + + +class Windows2022DesktopSQLServer2019EnterpriseAwsVirtualMachine( + BaseWindowsAwsVirtualMachine, + windows_virtual_machine.Windows2022SQLServer2019Enterprise): + IMAGE_NAME_FILTER = 'Windows_Server-2022-English-Full-SQL_2019_Enterprise-*' + + +def GenerateDownloadPreprovisionedDataCommand(install_path, module_name, + filename): + """Returns a string used to download preprovisioned data.""" + return 'aws s3 cp --only-show-errors s3://%s/%s/%s %s' % ( + FLAGS.aws_preprovisioned_data_bucket, module_name, filename, + posixpath.join(install_path, filename)) + + +def 
GenerateStatPreprovisionedDataCommand(module_name, filename): + """Returns a string used to download preprovisioned data.""" + return 'aws s3api head-object --bucket %s --key %s/%s' % ( + FLAGS.aws_preprovisioned_data_bucket, module_name, filename) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_vpc_endpoint.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_vpc_endpoint.py new file mode 100644 index 0000000..895356f --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/aws_vpc_endpoint.py @@ -0,0 +1,169 @@ +# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""An AWS VPC Endpoint. + +See https://docs.aws.amazon.com/vpc/latest/userguide/vpc-endpoints.html + +A VPC Endpoint provides routing between a VPC and an AWS service without using +the internet connection interface. + +For example when an S3 Endpoint is created for region us-west-1 the route table +for that VPC is updated so that requests for the IP addresses associated with +com.amazonaws.us-west-1.s3 now go through this new interface and not out through +the original internet gateway. +""" + +import json + +from perfkitbenchmarker import providers +from perfkitbenchmarker import resource +from perfkitbenchmarker.providers.aws import util + + +def GetAwsVpcEndpointClass(aws_service): + """Returns the AwsVpcEndpoint class for the given service.""" + return resource.GetResourceClass( + AwsVpcEndpoint, CLOUD=providers.AWS, AWS_SERVICE=aws_service) + + +def CreateEndpointService(aws_service, vpc): + """Creates the named VPC endpoint in the given VPC. + + Args: + aws_service: The AWS service to use. + vpc: The VPC to launch the endpoint service in. + + Returns: + The resource.BaseResource of the endpoint service. + """ + service_class = GetAwsVpcEndpointClass(aws_service) + return service_class(vpc) + + +class AwsVpcEndpoint(resource.BaseResource): + """An AWS Endpoint. + + Attributes: + region: The AWS region of the VPC + vpc: The aws_network.AwsVpc object to make the connection in. The VPC does + not initially need an ID but does when Create() is called. + """ + REQUIRED_ATTRS = ['CLOUD', 'AWS_SERVICE'] + RESOURCE_TYPE = 'AwsVpcEndpoint' + CLOUD = providers.AWS + AWS_SERVICE: str # must be set by derived classes + + def __init__(self, vpc): + super(AwsVpcEndpoint, self).__init__() + assert vpc, 'Must have a VPC object (does not require an id).' + self._vpc = vpc + self.region = self._vpc.region + assert self.region, 'VPC region must be set' + self._service_name = 'com.amazonaws.{}.{}'.format(self.region, + self.AWS_SERVICE) + # in the Create() method query to see if an endpoint already defined + self.id = None + + @property + def vpc_id(self): + """Returns the VPC id. 
Can be None.""" + return self._vpc.id + + @property + def endpoint_id(self): + """Returns the endpoint id for the defined VPC.""" + if not self.vpc_id: + # When creating an SDDC there will not be a VPC to have an endpoint + return None + ids = self._RunCommand(['describe-vpc-endpoints'] + util.AwsFilter({ + 'vpc-id': self.vpc_id, + 'service-name': self._service_name + }) + ['--query', 'VpcEndpoints[].VpcEndpointId']) + if not ids: + # There is a VPC but no endpoint + return None + assert len(ids) == 1, 'More than 1 VPC endpoint found: {}'.format(ids) + return ids[0] + + @property + def route_table_id(self): + """Returns the route table id for the VPC. + + Raises: + AssertionError: If no VPC is defined or if there are 0 or more than 1 + routing tables found. + """ + assert self.vpc_id, 'No defined VPC id.' + table_ids = self._RunCommand(['describe-route-tables'] + + util.AwsFilter({'vpc-id': self.vpc_id}) + + ['--query', 'RouteTables[].RouteTableId']) + assert len(table_ids) == 1, 'Only want 1 route table: {}'.format(table_ids) + return table_ids[0] + + def _Create(self): + """See base class. + + Raises: + AssertionError: If no VPC is defined. + """ + assert self.vpc_id, 'No defined VPC id.' + self.id = self.endpoint_id + if self.id: + # Endpoint already created + return + create_response = self._RunCommand([ + 'create-vpc-endpoint', '--vpc-endpoint-type', 'Gateway', '--vpc-id', + self.vpc_id, '--service-name', self._service_name, '--route-table-ids', + self.route_table_id + ]) + self.id = create_response['VpcEndpoint']['VpcEndpointId'] + + def _PostCreate(self): + """See base class.""" + util.AddDefaultTags(self.id, self.region) + + def _Exists(self): + """See base class.""" + return bool(self.endpoint_id) + + def _Delete(self): + """See base class.""" + endpoint_id = self.id or self.endpoint_id + if endpoint_id: + self._RunCommand( + ['delete-vpc-endpoints', '--vpc-endpoint-ids', endpoint_id]) + + def _RunCommand(self, cmds): + """Runs the AWS ec2 command in the defined region. + + Args: + cmds: List of AWS ec2 commands to run, example: ['describe-route-tables'] + + Returns: + Dict of the AWS response. + """ + cmd = util.AWS_PREFIX + ['ec2', '--region=%s' % self.region] + list(cmds) + stdout, _ = util.IssueRetryableCommand(cmd) + return json.loads(stdout) + + +class AwsVpcS3Endpoint(AwsVpcEndpoint): + """An AWS VPC S3 Endpoint. + + Attributes: + region: The AWS region of the VPC + vpc: The aws_network.AwsVpc object to make the connection in. The VPC does + not initially need an ID but does when Create() is called. + """ + AWS_SERVICE = 's3' diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py new file mode 100644 index 0000000..a1b2f63 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py @@ -0,0 +1,235 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Contains classes/functions related to EKS (Elastic Kubernetes Service). + +This requires that the eksServiceRole IAM role has already been created and +requires that the aws-iam-authenticator binary has been installed. +See https://docs.aws.amazon.com/eks/latest/userguide/getting-started.html for +instructions. +""" + +import json +import logging +import re +from typing import Any, Dict + +from absl import flags +from perfkitbenchmarker import container_service +from perfkitbenchmarker import data +from perfkitbenchmarker import errors +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers import aws +from perfkitbenchmarker.providers.aws import aws_disk +from perfkitbenchmarker.providers.aws import aws_virtual_machine +from perfkitbenchmarker.providers.aws import util + +FLAGS = flags.FLAGS + +PROXY_FILE = 'proxy_ip_list/proxy_ip_list.txt' + + +class EksCluster(container_service.KubernetesCluster): + """Class representing an Elastic Kubernetes Service cluster.""" + + CLOUD = providers.AWS + + def __init__(self, spec): + super(EksCluster, self).__init__(spec) + # EKS requires a region and optionally a list of one or zones. + # Interpret the zone as a comma separated list of zones or a region. + self.control_plane_zones = self.zone and self.zone.split(',') + if not self.control_plane_zones: + raise errors.Config.MissingOption( + 'container_cluster.vm_spec.AWS.zone is required.') + elif len(self.control_plane_zones) == 1 and util.IsRegion(self.zone): + self.region = self.zone + self.control_plane_zones = [] + logging.info("Interpreting zone '%s' as a region", self.zone) + else: + self.region = util.GetRegionFromZones(self.control_plane_zones) + # control_plane_zones must be a superset of the node zones + for nodepool in self.nodepools.values(): + if (nodepool.vm_config.zone and + nodepool.vm_config.zone not in self.control_plane_zones): + self.control_plane_zones.append(nodepool.vm_config.zone) + if len(self.control_plane_zones) == 1: + # eksctl essentially requires you pass --zones if you pass --node-zones + # and --zones must have at least 2 zones + # https://github.com/weaveworks/eksctl/issues/4735 + self.control_plane_zones.append(self.region + + ('b' if self.zone.endswith('a') else 'a')) + self.cluster_version = FLAGS.container_cluster_version + # TODO(user) support setting boot disk type if EKS does. + self.boot_disk_type = self.vm_config.DEFAULT_ROOT_DISK_TYPE + self.node_group_name = 'eks' + self.ssh_access = FLAGS.aws_eks_ssh_access + + def GetResourceMetadata(self): + """Returns a dict containing metadata about the cluster. + + Returns: + dict mapping string property key to value. + """ + result = super(EksCluster, self).GetResourceMetadata() + result['boot_disk_type'] = self.boot_disk_type + result['boot_disk_size'] = self.vm_config.boot_disk_size + return result + + def _CreateDependencies(self): + """Set up the ssh key.""" + aws_virtual_machine.AwsKeyFileManager.ImportKeyfile(self.region) + + def _DeleteDependencies(self): + """Delete the ssh key.""" + aws_virtual_machine.AwsKeyFileManager.DeleteKeyfile(self.region) + + def _Create(self): + """Creates the control plane and worker nodes.""" + eksctl_flags = { + 'kubeconfig': FLAGS.kubeconfig, + 'managed': True, + 'name': self.name, + 'nodegroup-name': container_service.DEFAULT_NODEPOOL, + 'version': self.cluster_version, + # NAT mode uses an EIP. + 'vpc-nat-mode': 'Disable', + } + # If multiple zones are passed use them for the control plane. 
+ # Otherwise EKS will auto-select control plane zones in the region. + eksctl_flags['zones'] = ','.join(self.control_plane_zones) + if self.min_nodes != self.max_nodes: + eksctl_flags.update({ + 'nodes-min': self.min_nodes, + 'nodes-max': self.max_nodes, + }) + eksctl_flags.update( + self._GetNodeFlags(container_service.DEFAULT_NODEPOOL, self.num_nodes, + self.vm_config)) + + cmd = [FLAGS.eksctl, 'create', 'cluster'] + sorted( + '--{}={}'.format(k, v) for k, v in eksctl_flags.items() if v) + stdout, _, retcode = vm_util.IssueCommand( + cmd, timeout=1800, raise_on_failure=False) + if retcode: + # TODO(pclay): add other quota errors + if 'The maximum number of VPCs has been reached' in stdout: + raise errors.Benchmarks.QuotaFailure(stdout) + else: + raise errors.Resource.CreationError(stdout) + + for name, node_group in self.nodepools.items(): + self._CreateNodeGroup(name, node_group) + + def _CreateNodeGroup(self, name: str, node_group): + """Creates a node group.""" + eksctl_flags = { + 'cluster': self.name, + 'name': name, + # Support ARM: https://github.com/weaveworks/eksctl/issues/3569 + 'skip-outdated-addons-check': True + } + eksctl_flags.update( + self._GetNodeFlags(name, node_group.num_nodes, node_group.vm_config)) + cmd = [FLAGS.eksctl, 'create', 'nodegroup'] + sorted( + '--{}={}'.format(k, v) for k, v in eksctl_flags.items() if v) + vm_util.IssueCommand(cmd, timeout=600) + + def _GetNodeFlags(self, node_group: str, num_nodes: int, + vm_config) -> Dict[str, Any]: + """Get common flags for creating clusters and node_groups.""" + tags = util.MakeDefaultTags() + return { + 'nodes': + num_nodes, + 'node-labels': + f'pkb_nodepool={node_group}', + 'node-type': + vm_config.machine_type, + 'node-volume-size': + vm_config.boot_disk_size, + # vm_config.zone may be split a comma separated list + 'node-zones': + vm_config.zone, + 'region': + self.region, + 'tags': + ','.join(f'{k}={v}' for k, v in tags.items()), + 'ssh-public-key': + aws_virtual_machine.AwsKeyFileManager.GetKeyNameForRun(), + } + + def _Delete(self): + """Deletes the control plane and worker nodes.""" + super()._Delete() + cmd = [FLAGS.eksctl, 'delete', 'cluster', + '--name', self.name, + '--region', self.region] + vm_util.IssueCommand(cmd, timeout=1800) + + def _IsReady(self): + """Returns True if the workers are ready, else False.""" + get_cmd = [ + FLAGS.kubectl, '--kubeconfig', FLAGS.kubeconfig, + 'get', 'nodes', + ] + stdout, _, _ = vm_util.IssueCommand(get_cmd) + ready_nodes = len(re.findall('Ready', stdout)) + return ready_nodes >= self.min_nodes + + def _PostCreate(self): + """Adds CIDR IP range of ingress SSH security group rules.""" + if self.ssh_access: + group_name = 'eksctl-{}-nodegroup-{}-remoteAccess'.format(self.name, self.node_group_name) + cmd = aws.util.AWS_PREFIX + ['ec2', 'describe-security-groups', '--filters', + 'Name=group-name,Values={}'.format(group_name), + '--query', 'SecurityGroups[*].{ID:GroupId}', + '--region', self.region] + raw_output, stderror, retcode = vm_util.IssueCommand(cmd) + if retcode != 0: + logging.warning('Failed to add CIDR IP range of ingress SSH security group rules! %s', stderror) + return + + json_output = json.loads(raw_output) + if len(json_output) != 1: + logging.warning("Failed to add CIDR IP range of ingress SSH security group rules! " + "Couldn't find {} security group!".format(group_name)) + return + + if not ('ID' in json_output[0]): + logging.warning("Failed to add CIDR IP range of ingress SSH security group rules! 
" + "Missing security group id!") + group_id = json_output[0]['ID'] + + CIDRs = vm_util.GetCIDRList(PROXY_FILE) + if CIDRs is None or len(CIDRs) == 0: + logging.warning('Failed to add CIDR IP range of ingress SSH security group rules! No rules in "{}"!', + data.ResourcePath(PROXY_FILE)) + return + + for cidr in CIDRs: + cmd = aws.util.AWS_PREFIX + ['ec2', 'authorize-security-group-ingress', + '--group-id', group_id, '--protocol', 'tcp', + '--port', '22', '--cidr', cidr, '--region', self.region] + + _, stderror, retcode = vm_util.IssueCommand(cmd) + if retcode != 0: + logging.warning('Failed to add %q CIDR IP range of ingress SSH security group rule! %s', cidr, stderror) + return + + def GetDefaultStorageClass(self) -> str: + """Get the default storage class for the provider.""" + # https://docs.aws.amazon.com/eks/latest/userguide/storage-classes.html + return aws_disk.GP2 diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/elasticache.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/elasticache.py new file mode 100644 index 0000000..af463f8 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/elasticache.py @@ -0,0 +1,140 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import logging + +from perfkitbenchmarker import errors +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.linux_packages import memcached_server +from perfkitbenchmarker.memcache_service import MemcacheService +from perfkitbenchmarker.providers.aws import aws_network +from perfkitbenchmarker.providers.aws import util + + +ELASTICACHE_PORT = 11211 + + +class ElastiCacheMemcacheService(MemcacheService): + """Class for AWS elasticache memcache service.""" + + CLOUD = providers.AWS + + def __init__(self, network, cluster_id, region, node_type, num_servers=1): + self.cluster_id = cluster_id + self.region = region + self.node_type = node_type + self.num_servers = num_servers + self.hosts = [] # [(ip, port)] + + self.vpc_id = network.subnet.vpc_id + self.security_group_id = ( + network.regional_network.vpc.default_security_group_id) + self.subnet_id = network.subnet.id + self.subnet_group_name = '%ssubnet' % cluster_id + + def Create(self): + # Open the port memcached needs + aws_network.AwsFirewall.GetFirewall().AllowPortInSecurityGroup( + self.region, self.security_group_id, ELASTICACHE_PORT) + + # Create a cache subnet group + cmd = ['aws', 'elasticache', 'create-cache-subnet-group', + '--region=%s' % self.region, + '--cache-subnet-group-name=%s' % self.subnet_group_name, + '--cache-subnet-group-description="PKB memcached_ycsb benchmark"', + '--subnet-ids=%s' % self.subnet_id] + vm_util.IssueCommand(cmd) + + # Create the cluster + cmd = ['aws', 'elasticache', 'create-cache-cluster', + '--engine=memcached', + '--cache-subnet-group-name=%s' % self.subnet_group_name, + '--cache-cluster-id=%s' % self.cluster_id, + '--num-cache-nodes=%s' % self.num_servers, + '--region=%s' % self.region, + '--cache-node-type=%s' % self.node_type, + '--tags'] + util.MakeFormattedDefaultTags() + vm_util.IssueCommand(cmd) + + # Wait for the cluster to come up + cluster_info = self._WaitForClusterUp() + + # Parse out the hosts + self.hosts = [(node['Endpoint']['Address'], node['Endpoint']['Port']) + for node in cluster_info['CacheNodes']] + assert len(self.hosts) == self.num_servers + + def Destroy(self): + # Delete the ElastiCache cluster + cmd = ['aws', 'elasticache', 'delete-cache-cluster', + '--cache-cluster-id=%s' % self.cluster_id, + '--region=%s' % self.region] + vm_util.IssueCommand(cmd, raise_on_failure=False) + # Don't have to delete the subnet group. It will be deleted with the subnet. + + def Flush(self): + vm_util.RunThreaded(memcached_server.FlushMemcachedServer, self.hosts) + + def GetHosts(self): + return ['%s:%s' % (ip, port) for ip, port in self.hosts] + + def GetMetadata(self): + return {'num_servers': self.num_servers, + 'elasticache_region': self.region, + 'elasticache_node_type': self.node_type} + + def _GetClusterInfo(self): + cmd = ['aws', 'elasticache', 'describe-cache-clusters'] + cmd += ['--cache-cluster-id=%s' % self.cluster_id] + cmd += ['--region=%s' % self.region] + cmd += ['--show-cache-node-info'] + out, _, _ = vm_util.IssueCommand(cmd) + return json.loads(out)['CacheClusters'][0] + + @vm_util.Retry(poll_interval=15, timeout=300, + retryable_exceptions=(errors.Resource.RetryableCreationError)) + def _WaitForClusterUp(self): + """Block until the ElastiCache memcached cluster is up. + + Will timeout after 5 minutes, and raise an exception. Before the timeout + expires any exceptions are caught and the status check is retried. + + We check the status of the cluster using the AWS CLI. 
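+    For example, the check is roughly:
+      aws elasticache describe-cache-clusters --cache-cluster-id=<cluster-id>
+        --region=<region> --show-cache-node-info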
+ + Returns: + The cluster info json as a dict + + Raises: + errors.Resource.RetryableCreationError when response is not as expected or + if there is an error connecting to the port or otherwise running the + remote check command. + """ + logging.info('Trying to get ElastiCache cluster info for %s', + self.cluster_id) + cluster_status = None + try: + cluster_info = self._GetClusterInfo() + cluster_status = cluster_info['CacheClusterStatus'] + if cluster_status == 'available': + logging.info('ElastiCache memcached cluster is up and running.') + return cluster_info + except errors.VirtualMachine.RemoteCommandError as e: + raise errors.Resource.RetryableCreationError( + 'ElastiCache memcached cluster not up yet: %s.' % str(e)) + else: + raise errors.Resource.RetryableCreationError( + 'ElastiCache memcached cluster not up yet. Status: %s' % + cluster_status) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/flags.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/flags.py new file mode 100644 index 0000000..231ff6d --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/flags.py @@ -0,0 +1,120 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing flags applicable across benchmark run on AWS.""" + +from absl import flags + +flags.DEFINE_string( + 'aws_user_name', '', 'This determines the user name that Perfkit will ' + 'attempt to use. Defaults are OS specific.') +flags.DEFINE_integer('aws_provisioned_iops', None, + 'IOPS for Provisioned IOPS (SSD) volumes in AWS.') +flags.DEFINE_integer('aws_provisioned_throughput', None, + 'Provisioned throughput (MB/s) for (SSD) volumes in AWS.') + +flags.DEFINE_string('aws_dax_node_type', 'dax.r4.large', + 'The node type used for creating AWS DAX cluster.') +flags.DEFINE_integer('aws_dax_replication_factor', 3, + 'The replication factor of AWS DAX cluster.') +flags.DEFINE_string('aws_emr_loguri', None, + 'The log-uri parameter to pass to AWS when creating a ' + 'cluster. If not set, a bucket will be created.') +flags.DEFINE_integer('aws_emr_job_wait_time', 18000, + 'The time to wait for an EMR job to finish, in seconds') +flags.DEFINE_boolean('aws_spot_instances', False, + 'Whether to use AWS spot instances for any AWS VMs.') +flags.DEFINE_float('aws_spot_price', None, + 'The spot price to bid for AWS spot instances. Defaults ' + 'to on-demand price when left as None.') +flags.DEFINE_enum('aws_spot_block_duration_minutes', None, + ['60', '120', '180', '240', '300', '360'], 'The required ' + 'duration for the Spot Instances (also known as Spot blocks),' + ' in minutes. This value must be a multiple of 60.') +flags.DEFINE_integer('aws_boot_disk_size', None, + 'The boot disk size in GiB for AWS VMs.') +flags.DEFINE_string('kops', 'kops', + 'The path to the kops binary.') +flags.DEFINE_string('aws_image_name_filter', None, + 'The filter to use when searching for an image for a VM. 
' + 'See usage details in aws_virtual_machine.py around ' + 'IMAGE_NAME_FILTER.') +flags.DEFINE_string('aws_image_name_regex', None, + 'The Python regex to use to further filter images for a ' + 'VM. This applies after the aws_image_name_filter. See ' + 'usage details in aws_virtual_machine.py around ' + 'IMAGE_NAME_REGEX.') +flags.DEFINE_string('aws_preprovisioned_data_bucket', None, + 'AWS bucket where pre-provisioned data has been copied.') +flags.DEFINE_string('cache_node_type', + 'cache.m4.large', + 'The AWS cache node type to use for elasticache clusters.') +flags.DEFINE_string('aws_elasticache_failover_zone', + None, + 'AWS elasticache failover zone') +flags.DEFINE_string('aws_efs_token', None, + 'The creation token used to create the EFS resource. ' + 'If the file system already exists, it will use that ' + 'instead of creating a new one.') +flags.DEFINE_boolean('aws_delete_file_system', True, + 'Whether to delete the EFS file system.') +flags.DEFINE_enum('efs_throughput_mode', 'provisioned', + ['provisioned', 'bursting'], + 'The throughput mode to use for EFS.') +flags.DEFINE_float('efs_provisioned_throughput', 1024.0, + 'The throughput limit of EFS (in MiB/s) when run in ' + 'provisioned mode.') +flags.DEFINE_boolean('provision_athena', False, + 'Whether to provision the Athena database.') +flags.DEFINE_boolean('teardown_athena', True, + 'Whether to teardown the Athena database.') +flags.DEFINE_string( + 'athena_output_location_prefix', 'athena-cli-results', + 'Prefix of the S3 bucket name for Athena Query Output. Suffix will be the ' + 'region and the run URI, and the bucket will be dynamically created and ' + 'deleted during the test.') +flags.DEFINE_string('eksctl', 'eksctl', 'Path to eksctl.') +flags.DEFINE_enum('redshift_client_interface', 'JDBC', ['JDBC'], + 'The Runtime Interface used when interacting with Redshift.') +flags.DEFINE_enum('athena_client_interface', 'JAVA', ['JAVA'], + 'The Runtime Interface used when interacting with Athena.') +flags.DEFINE_string('athena_query_timeout', '600', 'Query timeout in seconds.') +flags.DEFINE_string('athena_workgroup', '', + 'Use athena workgroup to separate applications and choose ' + 'execution configuration like the engine version.') +flags.DEFINE_boolean( + 'athena_metrics_collection', False, + 'Should the cloud watch metrics be collected for Athena query executions.') +flags.DEFINE_boolean( + 'athena_workgroup_delete', True, + 'Should the dedicated athena workgroups be deleted or kept alive for investigations.' +) +flags.DEFINE_boolean( + 'aws_eks_ssh_access', False, + 'Determine whether to have SSH access to the EKS nodes.') +flags.DEFINE_enum('aws_credit_specification', None, + ['CpuCredits=unlimited', 'CpuCredits=standard'], + 'Credit specification for burstable vms.') +flags.DEFINE_boolean('aws_vm_hibernate', False, + 'Whether to hibernate(suspend) an aws vm' + 'instance.') +flags.DEFINE_string( + 'aws_glue_crawler_role', None, + "Role's ARN to be used by the crawler. Must have policies that grant " + 'permission for using AWS Glue and read access to S3.') +flags.DEFINE_integer( + 'aws_glue_crawler_sample_size', None, + 'Sets how many files will be crawled in each leaf directory. If left ' + 'unset, all the files will be crawled. 
May range from 1 to 249.', + 1, 249 +) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/provider_info.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/provider_info.py new file mode 100644 index 0000000..8a15208 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/provider_info.py @@ -0,0 +1,22 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""AWS provider info.""" + +from perfkitbenchmarker import provider_info +from perfkitbenchmarker import providers + + +class AWSProviderInfo(provider_info.BaseProviderInfo): + + CLOUD = providers.AWS diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/redshift.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/redshift.py new file mode 100644 index 0000000..f8b88d8 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/redshift.py @@ -0,0 +1,536 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for AWS's Redshift EDW service. + +Clusters can be created (based on new configuration or restored from a snapshot) +and deleted. +""" + +import copy +import json +import os +from typing import Dict, List, Text, Tuple + +from absl import flags +from perfkitbenchmarker import benchmark_spec +from perfkitbenchmarker import data +from perfkitbenchmarker import edw_service +from perfkitbenchmarker import errors +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.aws import aws_cluster_parameter_group +from perfkitbenchmarker.providers.aws import aws_cluster_subnet_group +from perfkitbenchmarker.providers.aws import util + +FLAGS = flags.FLAGS + +VALID_EXIST_STATUSES = ['creating', 'available'] +DELETION_STATUSES = ['deleting'] +READY_STATUSES = ['available'] +ELIMINATE_AUTOMATED_SNAPSHOT_RETENTION = '--automated-snapshot-retention-period=0' +DEFAULT_DATABASE_NAME = 'dev' +BOOTSTRAP_DB = 'sample' +REDSHIFT_JDBC_JAR = 'redshift-jdbc-client-1.0.jar' + + +def AddTags(resource_arn, region): + """Adds tags to a Redshift cluster created by PerfKitBenchmarker. + + Args: + resource_arn: The arn of AWS resource to operate on. + region: The AWS region resource was created in. 
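+
+  With illustrative values, the resulting call is roughly:
+    aws --output json redshift create-tags --region=us-east-1
+      --resource-name <cluster-arn> --tags Key=owner,Value=<owner>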
+ """ + cmd_prefix = util.AWS_PREFIX + tag_cmd = cmd_prefix + ['redshift', 'create-tags', '--region=%s' % region, + '--resource-name', resource_arn, '--tags'] + tag_cmd += util.MakeFormattedDefaultTags() + vm_util.IssueCommand(tag_cmd) + + +def GetDefaultRegion(): + """Utility method to supply the default region.""" + cmd_prefix = util.AWS_PREFIX + default_region_cmd = cmd_prefix + ['configure', 'get', 'region'] + stdout, _, _ = vm_util.IssueCommand(default_region_cmd) + return stdout + + +def GetRedshiftClientInterface(database: str, user: str, + password: str) -> edw_service.EdwClientInterface: + """Builds and Returns the requested Redshift client Interface. + + Args: + database: Name of the database to run queries against. + user: Redshift username for authentication. + password: Redshift password for authentication. + + Returns: + A concrete Client Interface object. + + Raises: + RuntimeError: if an unsupported redshift_client_interface is requested + """ + if FLAGS.redshift_client_interface == 'CLI': + return CliClientInterface(database, user, password) + if FLAGS.redshift_client_interface == 'JDBC': + return JdbcClientInterface(database, user, password) + raise RuntimeError('Unknown Redshift Client Interface requested.') + + +class CliClientInterface(edw_service.EdwClientInterface): + """Command Line Client Interface class for Redshift. + + Uses the native Redshift client that ships with pgbench. + https://docs.aws.amazon.com/cli/latest/reference/redshift/index.html + + Attributes: + host: Host endpoint to be used for interacting with the cluster. + database: Name of the database to run queries against. + user: Redshift username for authentication. + password: Redshift password for authentication. + """ + + def __init__(self, database: str, user: str, password: str): + self.database = database + self.user = user + self.password = password + + # set by SetProvisionedAttributes() + self.host = None + + def SetProvisionedAttributes(self, bm_spec: benchmark_spec.BenchmarkSpec): + """Sets any attributes that were unknown during initialization.""" + super(CliClientInterface, self).SetProvisionedAttributes(bm_spec) + self.host = bm_spec.edw_service.endpoint + + def Prepare(self, package_name: str) -> None: + """Prepares the client vm to execute query. + + Installs the redshift tool dependencies. + + Args: + package_name: String name of the package defining the preprovisioned data + (certificates, etc.) to extract and use during client vm preparation. + """ + self.client_vm.Install('pip') + self.client_vm.RemoteCommand('sudo pip install absl-py') + self.client_vm.Install('pgbench') + + # Push the framework to execute a sql query and gather performance details + service_specific_dir = os.path.join('edw', Redshift.SERVICE_TYPE) + self.client_vm.PushFile( + data.ResourcePath( + os.path.join(service_specific_dir, 'script_runner.sh'))) + runner_permission_update_cmd = 'chmod 755 {}'.format('script_runner.sh') + self.client_vm.RemoteCommand(runner_permission_update_cmd) + self.client_vm.PushFile( + data.ResourcePath(os.path.join('edw', 'script_driver.py'))) + self.client_vm.PushFile( + data.ResourcePath( + os.path.join(service_specific_dir, + 'provider_specific_script_driver.py'))) + + def ExecuteQuery(self, query_name: Text) -> Tuple[float, Dict[str, str]]: + """Executes a query and returns performance details. 
+ + Args: + query_name: String name of the query to execute + + Returns: + A tuple of (execution_time, execution details) + execution_time: A Float variable set to the query's completion time in + secs. -1.0 is used as a sentinel value implying the query failed. For a + successful query the value is expected to be positive. + performance_details: A dictionary of query execution attributes eg. job_id + """ + query_command = ('python script_driver.py --script={} --host={} ' + '--database={} --user={} --password={}').format( + query_name, self.host, self.database, self.user, + self.password) + stdout, _ = self.client_vm.RemoteCommand(query_command) + performance = json.loads(stdout) + details = copy.copy(self.GetMetadata()) + details['job_id'] = performance[query_name]['job_id'] + return float(performance[query_name]['execution_time']), details + + def GetMetadata(self) -> Dict[str, str]: + """Gets the Metadata attributes for the Client Interface.""" + return {'client': FLAGS.redshift_client_interface} + + +class JdbcClientInterface(edw_service.EdwClientInterface): + """Native JDBC Client Interface class for Redshift. + + https://docs.aws.amazon.com/redshift/latest/mgmt/jdbc20-install.html + + Attributes: + host: Host endpoint to be used for interacting with the cluster. + database: Name of the database to run queries against. + user: Redshift username for authentication. + password: Redshift password for authentication. + """ + + def __init__(self, database: str, user: str, password: str): + self.database = database + # Use the default port. + self.port = '5439' + self.user = user + self.password = password + + # set in SetProvisionedAttributes() + self.host = None + + def SetProvisionedAttributes(self, bm_spec: benchmark_spec.BenchmarkSpec): + """Sets any attributes that were unknown during initialization.""" + super(JdbcClientInterface, self).SetProvisionedAttributes(bm_spec) + endpoint = bm_spec.edw_service.endpoint + self.host = f'jdbc:redshift://{endpoint}:{self.port}/{self.database}' + + def Prepare(self, package_name: str) -> None: + """Prepares the client vm to execute query. + + Installs the redshift tool dependencies. + + Args: + package_name: String name of the package defining the preprovisioned data + (certificates, etc.) to extract and use during client vm preparation. + """ + self.client_vm.Install('openjdk') + + # Push the executable jar to the working directory on client vm + self.client_vm.InstallPreprovisionedPackageData(package_name, + [REDSHIFT_JDBC_JAR], '') + + def ExecuteQuery(self, query_name: Text) -> Tuple[float, Dict[str, str]]: + """Executes a query and returns performance details. + + Args: + query_name: String name of the query to execute. + + Returns: + A tuple of (execution_time, execution details) + execution_time: A Float variable set to the query's completion time in + secs. -1.0 is used as a sentinel value implying the query failed. For a + successful query the value is expected to be positive. + performance_details: A dictionary of query execution attributes eg. 
job_id + """ + query_command = ('java -cp {} com.google.cloud.performance.edw.Single ' + '--endpoint {} --query_file {}').format( + REDSHIFT_JDBC_JAR, self.host, query_name) + stdout, _ = self.client_vm.RemoteCommand(query_command) + performance = json.loads(stdout) + details = copy.copy(self.GetMetadata()) + if 'failure_reason' in performance: + details.update({'failure_reason': performance['failure_reason']}) + else: + details.update(performance['details']) + return performance['query_wall_time_in_secs'], details + + def ExecuteSimultaneous(self, submission_interval: int, + queries: List[str]) -> str: + """Executes queries simultaneously on client and return performance details. + + Simultaneous app expects queries as white space separated query file names. + + Args: + submission_interval: Simultaneous query submission interval in + milliseconds. + queries: List of strings (names) of queries to execute. + + Returns: + A serialized dictionary of execution details. + """ + cmd = ('java -cp {} com.google.cloud.performance.edw.Simultaneous ' + '--endpoint {} --submission_interval {} --query_files {}'.format( + REDSHIFT_JDBC_JAR, self.host, submission_interval, + ' '.join(queries))) + stdout, _ = self.client_vm.RemoteCommand(cmd) + return stdout + + def ExecuteThroughput(self, concurrency_streams: List[List[str]]) -> str: + """Executes a throughput test and returns performance details. + + Args: + concurrency_streams: List of streams to execute simultaneously, each of + which is a list of string names of queries. + + Returns: + A serialized dictionary of execution details. + """ + cmd = ('java -cp {} com.google.cloud.performance.edw.Throughput ' + '--endpoint {} --query_streams {}'.format( + REDSHIFT_JDBC_JAR, self.host, + ' '.join([','.join(stream) for stream in concurrency_streams]))) + stdout, _ = self.client_vm.RemoteCommand(cmd) + return stdout + + def GetMetadata(self) -> Dict[str, str]: + """Gets the Metadata attributes for the Client Interface.""" + return {'client': FLAGS.redshift_client_interface} + + +class Redshift(edw_service.EdwService): + """Object representing a Redshift cluster. + + Attributes: + cluster_id: ID of the cluster. + project: ID of the project. 
+ """ + + CLOUD = providers.AWS + SERVICE_TYPE = 'redshift' + + READY_TIMEOUT = 7200 + + def __init__(self, edw_service_spec): + super(Redshift, self).__init__(edw_service_spec) + # pkb setup attribute + self.project = None + self.cmd_prefix = list(util.AWS_PREFIX) + if FLAGS.zones: + self.zone = FLAGS.zones[0] + self.region = util.GetRegionFromZone(self.zone) + else: + self.region = GetDefaultRegion() + self.cmd_prefix += ['--region', self.region] + + # Redshift specific attribute (see if they can be set) + self.cluster_subnet_group = None + self.cluster_parameter_group = None + self.arn = '' + self.cluster_subnet_group = aws_cluster_subnet_group.RedshiftClusterSubnetGroup( + self.cmd_prefix) + self.cluster_parameter_group = aws_cluster_parameter_group.RedshiftClusterParameterGroup( + self.cmd_prefix) + + if self.db is None: + self.db = DEFAULT_DATABASE_NAME + self.client_interface = GetRedshiftClientInterface(self.db, self.user, + self.password) + + def _CreateDependencies(self): + self.cluster_subnet_group.Create() + self.cluster_parameter_group.Create() + + def _Create(self): + """Create the redshift cluster resource.""" + if self.snapshot: + self.Restore(self.snapshot, self.cluster_identifier) + else: + self.Initialize(self.cluster_identifier, self.node_type, self.node_count, + self.user, self.password, self.cluster_parameter_group, + self.cluster_subnet_group) + + def Initialize(self, cluster_identifier, node_type, node_count, user, + password, cluster_parameter_group, cluster_subnet_group): + """Method to initialize a Redshift cluster from an configuration parameters. + + The cluster is initialized in the EC2-VPC platform, that runs it in a + virtual private cloud (VPC). This allows control access to the cluster by + associating one or more VPC security groups with the cluster. + + To create a cluster in a VPC, first create an Amazon Redshift cluster subnet + group by providing subnet information of the VPC, and then provide the + subnet group when launching the cluster. + + + Args: + cluster_identifier: A unique identifier for the cluster. + node_type: The node type to be provisioned for the cluster. + Valid Values: ds2.xlarge | ds2.8xlarge | ds2.xlarge | ds2.8xlarge | + dc1.large | dc1.8xlarge | dc2.large | dc2.8xlarge + node_count: The number of compute nodes in the cluster. + user: The user name associated with the master user account for the + cluster that is being created. + password: The password associated with the master user account for the + cluster that is being created. + cluster_parameter_group: Cluster Parameter Group associated with the + cluster. + cluster_subnet_group: Cluster Subnet Group associated with the cluster. + + Returns: + None + + + Raises: + MissingOption: If any of the required parameters is missing. 
+ """ + if not (cluster_identifier and node_type and user and password): + raise errors.Config.MissingOption('Need cluster_identifier, user and ' + 'password set for creating a cluster.') + + prefix = [ + 'redshift', 'create-cluster', '--cluster-identifier', cluster_identifier + ] + + if node_count == 1: + worker_count_cmd = ['--cluster-type', 'single-node'] + else: + worker_count_cmd = ['--number-of-nodes', str(node_count)] + + postfix = [ + '--node-type', node_type, '--master-username', user, + '--master-user-password', password, '--cluster-parameter-group-name', + cluster_parameter_group.name, '--cluster-subnet-group-name', + cluster_subnet_group.name, '--publicly-accessible', + ELIMINATE_AUTOMATED_SNAPSHOT_RETENTION + ] + + cmd = self.cmd_prefix + prefix + worker_count_cmd + postfix + stdout, stderr, _ = vm_util.IssueCommand(cmd, raise_on_failure=False) + if not stdout: + raise errors.Resource.CreationError('Cluster creation failure: ' + '{}'.format(stderr)) + + def _ValidateSnapshot(self, snapshot_identifier): + """Validate the presence of a cluster snapshot based on its metadata.""" + cmd = self.cmd_prefix + ['redshift', 'describe-cluster-snapshots', + '--snapshot-identifier', snapshot_identifier] + stdout, _, _ = vm_util.IssueCommand(cmd) + if not stdout: + raise errors.Config.InvalidValue('Cluster snapshot indicated by ' + 'edw_service_cluster_snapshot does not' + ' exist: {}.' + .format(snapshot_identifier)) + result = json.loads(stdout) + return result['Snapshots'][0]['Status'] == 'available' + + def _SnapshotDetails(self, snapshot_identifier): + """Delete a redshift cluster and disallow creation of a snapshot.""" + cmd = self.cmd_prefix + ['redshift', 'describe-cluster-snapshots', + '--snapshot-identifier', snapshot_identifier] + stdout, _, _ = vm_util.IssueCommand(cmd) + result = json.loads(stdout) + node_type = result['Snapshots'][0]['NodeType'] + node_count = result['Snapshots'][0]['NumberOfNodes'] + return node_type, node_count + + def Restore(self, snapshot_identifier, cluster_identifier): + """Method to restore a Redshift cluster from an existing snapshot. + + A snapshot of cluster in VPC can be restored only in VPC. Therefore, subnet + group name where the cluster is to be restored must be provided. + + vpc-security-group-ids are not specified at the time of restoration, and it + is expected that the default VPC security group which gets associated with + the cluster has appropriate ingress and egress rules. 
+ + Ref: http://docs.aws.amazon.com/cli/latest/reference/ + redshift/restore-from-cluster-snapshot.html + + Args: + snapshot_identifier: Identifier of the snapshot to restore + cluster_identifier: Identifier of the restored cluster + Returns: + None + """ + + if not (self.user and self.password and self.db): + raise errors.Config.MissingOption( + 'Need the db, user and password set for restoring a cluster') + + if self._ValidateSnapshot(snapshot_identifier): + node_type, node_count = self._SnapshotDetails(snapshot_identifier) + # For a restored cluster update the cluster shape and size based on the + # snapshot's configuration + self.node_type = node_type + self.node_count = node_count + cmd = self.cmd_prefix + ['redshift', 'restore-from-cluster-snapshot', + '--cluster-identifier', cluster_identifier, + '--snapshot-identifier', snapshot_identifier, + '--cluster-subnet-group-name', + self.cluster_subnet_group.name, + '--cluster-parameter-group-name', + self.cluster_parameter_group.name, + '--publicly-accessible', + '--automated-snapshot-retention-period=1'] + stdout, stderr, _ = vm_util.IssueCommand(cmd) + if not stdout: + raise errors.Resource.CreationError('Cluster creation failure: ' + '{}'.format(stderr)) + + def __DescribeCluster(self): + """Describe a redshift cluster.""" + cmd = self.cmd_prefix + ['redshift', 'describe-clusters', + '--cluster-identifier', self.cluster_identifier] + return vm_util.IssueCommand(cmd, raise_on_failure=False) + + def _Exists(self): + """Method to validate the existence of a redshift cluster. + + Provision pipeline: returns True during the provisioning (status in + 'creating', 'available') to prevent retry of creation + + Deletion pipeline: returns True, during the deletion (status in + 'deleting') which causes a retry of deletion, an idempotent operation. + TODO(saksena): handle the deletion step more cleanly, and spin till deletion + + Returns: + Boolean value indicating the existence of a cluster. + """ + stdout, _, _ = self.__DescribeCluster() + if (not stdout or (json.loads(stdout)['Clusters'][0]['ClusterStatus'] not in + VALID_EXIST_STATUSES)): + return False + else: + return True + + def _IsReady(self): + """Method to return if the cluster is ready to handle queries.""" + stdout, _, _ = self.__DescribeCluster() + return json.loads(stdout)['Clusters'][0]['ClusterStatus'] in READY_STATUSES + + def _PostCreate(self): + """Perform general post create operations on the cluster. + + Get the endpoint to be used for interacting with the cluster and apply + tags on the cluster. + """ + stdout, _, _ = self.__DescribeCluster() + self.endpoint = json.loads(stdout)['Clusters'][0]['Endpoint']['Address'] + account = util.GetAccount() + self.arn = 'arn:aws:redshift:{}:{}:cluster:{}'.format(self.region, account, + self. 
+ cluster_identifier) + AddTags(self.arn, self.region) + + def _Delete(self): + """Delete a redshift cluster and disallow creation of a snapshot.""" + cmd = self.cmd_prefix + ['redshift', 'delete-cluster', + '--cluster-identifier', self.cluster_identifier, + '--skip-final-cluster-snapshot'] + vm_util.IssueCommand(cmd, raise_on_failure=False) + + def _IsDeleting(self): + """Method to check if the cluster is being deleting.""" + stdout, _, _ = self.__DescribeCluster() + if not stdout: + return False + else: + return (json.loads(stdout)['Clusters'][0]['ClusterStatus'] in + DELETION_STATUSES) + + def _DeleteDependencies(self): + """Delete dependencies of a redshift cluster.""" + self.cluster_subnet_group.Delete() + self.cluster_parameter_group.Delete() + + def GetMetadata(self): + """Return a dictionary of the metadata for this cluster.""" + basic_data = super(Redshift, self).GetMetadata() + basic_data['region'] = self.region + if self.snapshot is not None: + basic_data['snapshot'] = self.snapshot + basic_data.update(self.client_interface.GetMetadata()) + return basic_data diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/requirements.txt b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/requirements.txt new file mode 100644 index 0000000..847fdfa --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/requirements.txt @@ -0,0 +1,17 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Requirements for running PerfKit Benchmarker on AWS. +awscli>=1.19.75 +colorama==0.3.7 diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/s3.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/s3.py new file mode 100644 index 0000000..144c678 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/s3.py @@ -0,0 +1,219 @@ +# Copyright 2016 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
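As a rough sketch of the two CLI calls behind S3Service.MakeBucket below (the bucket name, region, and tag values are placeholders; a configured AWS CLI is assumed):

```python
import subprocess


def make_tagged_bucket(bucket: str, region: str) -> None:
  """Creates an S3 bucket and tags it, mirroring S3Service.MakeBucket."""
  subprocess.check_call(
      ['aws', 's3', 'mb', 's3://%s' % bucket, '--region=%s' % region])
  # TagSet uses the AWS CLI shorthand that MakeBucket builds from default tags.
  tag_set = '{Key=owner,Value=perfkit},{Key=timeout_utc,Value=1700000000}'
  subprocess.check_call([
      'aws', 's3api', 'put-bucket-tagging',
      '--bucket', bucket,
      '--tagging', 'TagSet=[%s]' % tag_set,
      '--region=%s' % region,
  ])
```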
+ +"""Contains classes/functions related to S3.""" + +import json +import os +import posixpath +from typing import List + +from absl import flags +from absl import logging +from perfkitbenchmarker import errors +from perfkitbenchmarker import linux_packages +from perfkitbenchmarker import object_storage_service +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.aws import util + +FLAGS = flags.FLAGS + +AWS_CREDENTIAL_LOCATION = '.aws' +DEFAULT_AWS_REGION = 'us-east-1' +_READ = 's3:GetObject' +_WRITE = 's3:PutObject' + + +class S3Service(object_storage_service.ObjectStorageService): + """Interface to Amazon S3.""" + + STORAGE_NAME = providers.AWS + + region: str + + def PrepareService(self, location): + self.region = location or DEFAULT_AWS_REGION + + def MakeBucket(self, bucket_name, raise_on_failure=True): + command = [ + 'aws', 's3', 'mb', + 's3://%s' % bucket_name, + '--region=%s' % self.region + ] + _, stderr, ret_code = vm_util.IssueCommand(command, raise_on_failure=False) + if ret_code and raise_on_failure: + raise errors.Benchmarks.BucketCreationError(stderr) + + # Tag the bucket with the persistent timeout flag so that buckets can + # optionally stick around after PKB runs. + default_tags = util.MakeFormattedDefaultTags( + timeout_minutes=max(FLAGS.timeout_minutes, + FLAGS.persistent_timeout_minutes)) + tag_set = ','.join('{%s}' % tag for tag in default_tags) + vm_util.IssueRetryableCommand( + ['aws', 's3api', 'put-bucket-tagging', + '--bucket', bucket_name, + '--tagging', 'TagSet=[%s]' % tag_set, + '--region=%s' % self.region]) + + def Copy(self, src_url, dst_url, recursive=False): + """See base class.""" + cmd = ['aws', 's3', 'cp', '--region', self.region] + if recursive: + cmd.append('--recursive') + # Fix cp to mimic gsutil behavior + dst_url = os.path.join(dst_url, os.path.basename(src_url)) + cmd += [src_url, dst_url] + vm_util.IssueCommand(cmd) + + def CopyToBucket(self, src_path, bucket, object_path): + """See base class.""" + dst_url = self.MakeRemoteCliDownloadUrl(bucket, object_path) + vm_util.IssueCommand(['aws', 's3', 'cp', src_path, dst_url, + '--region', self.region]) + + def MakeRemoteCliDownloadUrl(self, bucket, object_path): + """See base class.""" + path = posixpath.join(bucket, object_path) + return 's3://' + path + + def GenerateCliDownloadFileCommand(self, src_url, local_path): + """See base class.""" + return 'aws s3 cp "%s" "%s" --region=%s' % ( + src_url, local_path, self.region) + + def List(self, bucket): + """See base class.""" + stdout, _, _ = vm_util.IssueCommand( + ['aws', 's3', 'ls', bucket, '--region', self.region]) + return stdout + + def ListTopLevelSubfolders(self, bucket): + """Lists the top level folders (not files) in a bucket. + + Each result that is a folder has "PRE" in front of the name (meaning + prefix), eg. "PRE customer/", so that part is removed from each line. When + there's more than one result, splitting on the newline returns a final blank + row, so blank values are skipped. + + Args: + bucket: Name of the bucket to list the top level subfolders of. + + Returns: + A list of top level subfolder names. Can be empty if there are no folders. 
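+      For example, listing output such as "PRE customer/" and "PRE vendor/"
+      yields ['customer', 'vendor'].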
+ """ + return [ + obj.split('PRE ')[1].strip().replace('/', '') + for obj in self.List(bucket).split('\n') + if obj and obj.endswith('/') + ] + + @vm_util.Retry() + def DeleteBucket(self, bucket): + """See base class.""" + + def _SuppressFailure(stdout, stderr, retcode): + """Suppresses failure when bucket does not exist.""" + del stdout # unused + if retcode and 'NoSuchBucket' in stderr: + return True + return False + + vm_util.IssueCommand( + ['aws', 's3', 'rb', + 's3://%s' % bucket, + '--region', self.region, + '--force'], # --force deletes even if bucket contains objects. + suppress_failure=_SuppressFailure) + + def EmptyBucket(self, bucket): + vm_util.IssueCommand( + ['aws', 's3', 'rm', + 's3://%s' % bucket, + '--region', self.region, + '--recursive']) + + def MakeBucketPubliclyReadable(self, bucket, also_make_writable=False): + """See base class.""" + actions = [_READ] + logging.warning('Making bucket %s publicly readable!', bucket) + if also_make_writable: + actions.append(_WRITE) + logging.warning('Making bucket %s publicly writable!', bucket) + vm_util.IssueCommand([ + 'aws', 's3api', 'put-bucket-policy', '--region', self.region, + '--bucket', bucket, '--policy', + _MakeS3BucketPolicy(bucket, actions) + ]) + + def GetDownloadUrl(self, bucket, object_name, use_https=True): + """See base class.""" + assert self.region + scheme = 'https' if use_https else 'http' + return f'{scheme}://{bucket}.s3.{self.region}.amazonaws.com/{object_name}' + + UPLOAD_HTTP_METHOD = 'PUT' + + def PrepareVM(self, vm): + vm.Install('awscli') + vm.Install('boto3') + + vm.PushFile( + object_storage_service.FindCredentialFile('~/' + + AWS_CREDENTIAL_LOCATION), + AWS_CREDENTIAL_LOCATION) + vm.PushFile(object_storage_service.FindBotoFile(), + object_storage_service.DEFAULT_BOTO_LOCATION_USER) + + def CleanupVM(self, vm): + vm.Uninstall('awscli') + + def CLIUploadDirectory(self, vm, directory, file_names, bucket): + return vm.RemoteCommand( + 'time aws s3 sync %s s3://%s/' % (directory, bucket)) + + def CLIDownloadBucket(self, vm, bucket, objects, dest): + return vm.RemoteCommand( + 'time aws s3 sync s3://%s/ %s' % (bucket, dest)) + + def Metadata(self, vm): + return { + object_storage_service.BOTO_LIB_VERSION: + linux_packages.GetPipPackageVersion(vm, 'boto3') + } + + def APIScriptArgs(self): + return ['--region=' + self.region] + + @classmethod + def APIScriptFiles(cls): + return ['s3.py'] + + +def _MakeS3BucketPolicy(bucket: str, + actions: List[str], + object_prefix='') -> str: + # https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_policies_examples_s3_rw-bucket.html + return json.dumps({ + 'Version': + '2012-10-17', + 'Statement': [{ + 'Principal': '*', + 'Sid': 'PkbAcl', + 'Effect': 'Allow', + 'Action': actions, + 'Resource': [f'arn:aws:s3:::{bucket}/{object_prefix}*'] + }] + }) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/snowflake.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/snowflake.py new file mode 100644 index 0000000..611d9ee --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/snowflake.py @@ -0,0 +1,207 @@ +# Copyright 2020 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for Snowflake EDW service resource hosted on AWS.""" + +import copy +import json +from typing import Dict, List, Text, Tuple +from absl import flags +from perfkitbenchmarker import edw_service +from perfkitbenchmarker import providers + + +FLAGS = flags.FLAGS + + +def GetSnowflakeClientInterface(warehouse: str, database: str, + schema: str) -> edw_service.EdwClientInterface: + """Builds and Returns the requested Snowflake client Interface. + + Args: + warehouse: String name of the Snowflake virtual warehouse to use during the + benchmark + database: String name of the Snowflake database to use during the benchmark + schema: String name of the Snowflake schema to use during the benchmark + + Returns: + A concrete Client Interface object (subclass of EdwClientInterface) + + Raises: + RuntimeError: if an unsupported snowflake_client_interface is requested + """ + if FLAGS.snowflake_client_interface == 'JDBC': + return JdbcClientInterface(warehouse, database, schema) + raise RuntimeError('Unknown Snowflake Client Interface requested.') + + +class JdbcClientInterface(edw_service.EdwClientInterface): + """Jdbc Client Interface class for Snowflake. + + Attributes: + warehouse: String name of the virtual warehouse used during benchmark + database: String name of the database to benchmark + schema: String name of the schema to benchmark + """ + + def __init__(self, warehouse: str, database: str, schema: str): + self.warehouse = warehouse + self.database = database + self.schema = schema + + def Prepare(self, package_name: str) -> None: + """Prepares the client vm to execute query. + + Installs a java client application that uses the JDBC driver for connecting + to a database server. + https://docs.snowflake.com/en/user-guide/jdbc.html + + Args: + package_name: String name of the package defining the preprovisioned data + (certificates, etc.) to extract and use during client vm preparation. + """ + self.client_vm.Install('openjdk') + + # Push the executable jar to the working directory on client vm + self.client_vm.InstallPreprovisionedPackageData( + package_name, ['snowflake-jdbc-client-2.0.jar'], '') + + def ExecuteQuery(self, query_name: Text) -> Tuple[float, Dict[str, str]]: + """Executes a query and returns performance details. + + Args: + query_name: String name of the query to execute + + Returns: + A tuple of (execution_time, execution details) + execution_time: A Float variable set to the query's completion time in + secs. -1.0 is used as a sentinel value implying the query failed. For a + successful query the value is expected to be positive. + performance_details: A dictionary of query execution attributes eg. 
job_id + """ + query_command = ('java -cp snowflake-jdbc-client-2.0.jar ' + 'com.google.cloud.performance.edw.Single --warehouse {} ' + '--database {} --schema {} --query_file {}').format( + self.warehouse, self.database, self.schema, query_name) + stdout, _ = self.client_vm.RemoteCommand(query_command) + details = copy.copy(self.GetMetadata()) # Copy the base metadata + details.update(json.loads(stdout)['details']) + return json.loads(stdout)['query_wall_time_in_secs'], details + + def ExecuteSimultaneous(self, submission_interval: int, + queries: List[str]) -> str: + """Executes queries simultaneously on client and return performance details. + + Simultaneous app expects queries as white space separated query file names. + + Args: + submission_interval: Simultaneous query submission interval in + milliseconds. + queries: List of strings (names) of queries to execute. + + Returns: + A serialized dictionary of execution details. + """ + query_command = ( + 'java -cp snowflake-jdbc-client-2.0.jar ' + 'com.google.cloud.performance.edw.Simultaneous --warehouse {} ' + '--database {} --schema {} --submission_interval {} --query_files {}' + ).format(self.warehouse, self.database, self.schema, submission_interval, + ' '.join(queries)) + stdout, _ = self.client_vm.RemoteCommand(query_command) + return stdout + + def ExecuteThroughput(self, concurrency_streams: List[List[str]]) -> str: + """Executes a throughput test and returns performance details. + + Args: + concurrency_streams: List of streams to execute simultaneously, each of + which is a list of string names of queries. + + Returns: + A serialized dictionary of execution details. + """ + query_command = ('java -cp snowflake-jdbc-client-2.0.jar ' + 'com.google.cloud.performance.edw.Throughput --warehouse' + ' {} --database {} --schema {} --query_streams {}').format( + self.warehouse, self.database, self.schema, ' '.join([ + ','.join(stream) for stream in concurrency_streams + ])) + stdout, _ = self.client_vm.RemoteCommand(query_command) + return stdout + + def GetMetadata(self) -> Dict[str, str]: + """Gets the Metadata attributes for the Client Interface.""" + return {'client': FLAGS.snowflake_client_interface} + + +class Snowflake(edw_service.EdwService): + """Object representing a Snowflake Data Warehouse Instance hosted on AWS.""" + CLOUD = providers.AWS + SERVICE_TYPE = 'snowflake_aws' + + def __init__(self, edw_service_spec): + super(Snowflake, self).__init__(edw_service_spec) + self.warehouse = FLAGS.snowflake_warehouse + self.database = FLAGS.snowflake_database + self.schema = FLAGS.snowflake_schema + self.client_interface = GetSnowflakeClientInterface(self.warehouse, + self.database, + self.schema) + + def IsUserManaged(self, edw_service_spec): + # TODO(saksena): Remove the assertion after implementing provisioning of + # virtual warehouses. + return True + + def _Create(self): + """Create a Snowflake cluster.""" + raise NotImplementedError + + def _Exists(self): + """Method to validate the existence of a Snowflake cluster. + + Returns: + Boolean value indicating the existence of a cluster. 
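+      Snowflake is treated as a user managed service here (see IsUserManaged),
+      so existence is assumed and True is always returned.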
+ """ + return True + + def _Delete(self): + """Delete a Snowflake cluster.""" + raise NotImplementedError + + def GetMetadata(self): + """Return a metadata dictionary of the benchmarked Snowflake cluster.""" + basic_data = super(Snowflake, self).GetMetadata() + basic_data['warehouse'] = self.warehouse + basic_data['database'] = self.database + basic_data['schema'] = self.schema + basic_data.update(self.client_interface.GetMetadata()) + return basic_data + + +class Snowflakeexternal(Snowflake): + """Class representing Snowflake External Warehouses.""" + + SERVICE_TYPE = 'snowflakeexternal_aws' + + def GetMetadata(self) -> Dict[str, str]: + """Return a dictionary of the metadata for the Snowflake External service. + + Returns: + A dictionary set to service details. + """ + basic_data = super(Snowflakeexternal, self).GetMetadata() + basic_data['edw_service_type'] = Snowflakeexternal.SERVICE_TYPE + basic_data.update(self.client_interface.GetMetadata()) + return basic_data diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/spectrum.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/spectrum.py new file mode 100644 index 0000000..3d70577 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/spectrum.py @@ -0,0 +1,137 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for AWS's Spectrum EDW service. + +Clusters can be created (based on new configuration or restored from a snapshot) +and deleted. +""" + +import json +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.aws import redshift +from perfkitbenchmarker.providers.aws import util + + +FLAGS = flags.FLAGS + + +READY_STATUSES = ['available'] +SNAPSHOT_READY_STATUSES = ['completed'] + + +def AddTags(resource_arn, region): + """Adds tags to a Redshift cluster created by PerfKitBenchmarker. + + Args: + resource_arn: The arn of AWS resource to operate on. + region: The AWS region resource was created in. + """ + cmd_prefix = util.AWS_PREFIX + tag_cmd = cmd_prefix + ['redshift', 'create-tags', '--region=%s' % region, + '--resource-name', resource_arn, '--tags'] + tag_cmd += util.MakeFormattedDefaultTags() + vm_util.IssueCommand(tag_cmd) + + +class AddingIAMRole(object): + """IAM Role to associate with the cluster. + + IAM Role can be associated with the cluster to access to other services such + as S3. 
+ + Attributes: + cluster_identifier: Identifier of the cluster + iam_role_name: Role name of the IAM + """ + + def __init__(self, cluster_identifier, iam_role_name, cmd_prefix): + self.cmd_prefix = cmd_prefix + self.cluster_identifier = cluster_identifier + self.iam_role_name = iam_role_name + cmd = self.cmd_prefix + ['redshift', + 'modify-cluster-iam-roles', + '--cluster-identifier', + self.cluster_identifier, + '--add-iam-roles', + self.iam_role_name] + vm_util.IssueCommand(cmd) + + +class Spectrum(redshift.Redshift): + """Object representing a Spectrum cluster. + + Attributes: + cluster_id: ID of the cluster. + project: ID of the project. + """ + + SERVICE_TYPE = 'spectrum' + + def __init__(self, edw_service_spec): + super(Spectrum, self).__init__(edw_service_spec) + # Cluster setup attributes + self.iam_role = edw_service_spec.iam_role + + def _IsReady(self): + """Method to return if the cluster is ready to handle queries.""" + return self._IsClusterReady() and self._IsSnapshotRestored() + + def _IsClusterReady(self): + """Method to return if the cluster is ready.""" + stdout, _, _ = self.__DescribeCluster() + return json.loads(stdout)['Clusters'][0]['ClusterStatus'] in READY_STATUSES + + def __DescribeCluster(self): + """Describe a spectrum cluster.""" + cmd = self.cmd_prefix + ['redshift', 'describe-clusters', + '--cluster-identifier', self.cluster_identifier] + return vm_util.IssueCommand(cmd) + + def _IsSnapshotRestored(self): + """Method to return if the cluster snapshot is completed restoring.""" + stdout, _, _, = self.__DescribeCluster() + return (json.loads(stdout)['Clusters'][0]['RestoreStatus']['Status'] in + SNAPSHOT_READY_STATUSES) + + def _PostCreate(self): + """Perform general post create operations on the cluster. + + Get the endpoint to be used for interacting with the cluster and apply + tags on the cluster. + """ + @vm_util.Retry(poll_interval=self.POLL_INTERVAL, fuzz=0, + timeout=self.READY_TIMEOUT, + retryable_exceptions=( + errors.Resource.RetryableCreationError,)) + def WaitUntilReady(): + if not self._IsReady(): + raise errors.Resource.RetryableCreationError('Adding IAM Role') + + stdout, _, _ = self.__DescribeCluster() + self.adding_iam_role = None + if self.iam_role is not None: + self.adding_iam_role = AddingIAMRole(self.cluster_identifier, + self.iam_role, + self.cmd_prefix) + WaitUntilReady() + + stdout, _, _ = self.__DescribeCluster() + self.endpoint = json.loads(stdout)['Clusters'][0]['Endpoint']['Address'] + account = util.GetAccount() + self.arn = 'arn:aws:redshift:{}:{}:cluster:{}'.format(self.region, account, + self. + cluster_identifier) + AddTags(self.arn, self.region) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/aws/util.py b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/util.py new file mode 100644 index 0000000..7be6659 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/aws/util.py @@ -0,0 +1,297 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
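For orientation, a short usage sketch of the tag-formatting helpers defined below, assuming the module is importable as perfkitbenchmarker.providers.aws.util:

```python
from perfkitbenchmarker.providers.aws import util

tags = {'owner': 'perfkit', 'timeout_utc': '1700000000'}

print(util.FormatTags(tags))
# ['Key=owner,Value=perfkit', 'Key=timeout_utc,Value=1700000000']

print(util.FormatTagSpecifications('instance', tags))
# ResourceType=instance,Tags=[{Key=owner,Value=perfkit},{Key=timeout_utc,Value=1700000000}]
```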
+ +"""Utilities for working with Amazon Web Services resources.""" + + +import collections +import json +import re +import string +from typing import Dict, Set +from absl import flags +from perfkitbenchmarker import context +from perfkitbenchmarker import errors +from perfkitbenchmarker import vm_util +import six + +AWS_PATH = 'aws' +AWS_PREFIX = [AWS_PATH, '--output', 'json'] +FLAGS = flags.FLAGS +STOCKOUT_MESSAGE = ('Creation failed due to insufficient capacity indicating a ' + 'potential stockout scenario.') + + +def IsRegion(zone_or_region): + """Returns whether "zone_or_region" is a region.""" + if not re.match(r'[a-z]{2}-[a-z]+-[0-9][a-z]?$', zone_or_region): + raise ValueError( + '%s is not a valid AWS zone or region name' % zone_or_region) + return zone_or_region[-1] in string.digits + + +def GetRegionFromZone(zone_or_region: str) -> str: + """Returns the region a zone is in (or "zone_or_region" if it's a region).""" + if IsRegion(zone_or_region): + return zone_or_region + return zone_or_region[:-1] + + +def GetRegionFromZones(zones): + """Returns the region a set of zones are in. + + Args: + zones: A set of zones. + Raises: + Exception: if the zones are in different regions. + """ + region = None + for zone in zones: + current_region = GetRegionFromZone(zone) + if region is None: + region = current_region + else: + if region != current_region: + raise Exception('Not All zones are in the same region %s not same as ' + '%s. zones: %s' % + (region, current_region, ','.join(zones))) + return region + + +def GetZonesInRegion(region: str) -> Set[str]: + """Returns all available zones in a given region.""" + get_zones_cmd = AWS_PREFIX + [ + 'ec2', + 'describe-availability-zones', + '--region={0}'.format(region) + ] + stdout, _, _ = vm_util.IssueCommand(get_zones_cmd) + response = json.loads(stdout) + return { + item['ZoneName'] + for item in response['AvailabilityZones'] + if item['State'] == 'available' + } + + +def GetZonesFromMachineType() -> Set[str]: + """Returns all available zones for a given machine type.""" + zones = set() + for region in GetAllRegions(): + get_zones_cmd = AWS_PREFIX + [ + 'ec2', 'describe-instance-type-offerings', + '--location-type=availability-zone', f'--region={region}' + ] + AwsFilter({'instance-type': FLAGS.machine_type}) + stdout, _, _ = vm_util.IssueCommand(get_zones_cmd) + response = json.loads(stdout) + for item in response['InstanceTypeOfferings']: + zones.add(item['Location']) + return zones + + +def GetAllRegions() -> Set[str]: + """Returns all enabled AWS regions.""" + get_regions_cmd = AWS_PREFIX + [ + 'ec2', + 'describe-regions', + ] + stdout, _, _ = vm_util.IssueCommand(get_regions_cmd) + response = json.loads(stdout) + return { + item['RegionName'] + for item in response['Regions'] + if item['OptInStatus'] in ('opt-in-not-required', 'opted-in') + } + + +def GetGeoFromRegion(region: str) -> str: + """Gets valid geo from the region, i.e. 
region us-west-1 returns us.""" + return region.split('-')[0] + + +def GetRegionsInGeo(geo: str) -> Set[str]: + """Gets valid regions in the geo.""" + return {region for region in GetAllRegions() if region.startswith(geo)} + + +def GetAllZones() -> Set[str]: + """Returns all available AWS zones.""" + results = set() + for region in GetAllRegions(): + results.update(GetZonesInRegion(region)) + return results + + +def GroupZonesIntoRegions(zones): + """Returns a map of regions to zones.""" + regions_to_zones_map = collections.defaultdict(set) + for zone in zones: + region = GetRegionFromZone(zone) + regions_to_zones_map[region].add(zone) + return regions_to_zones_map + + +def FormatTags(tags_dict): + """Format a dict of tags into arguments for 'tag' parameter. + + Args: + tags_dict: Tags to be formatted. + + Returns: + A list of tags formatted as arguments for 'tag' parameter. + """ + return [ + 'Key=%s,Value=%s' % (k, v) for k, v in sorted(six.iteritems(tags_dict)) + ] + + +def FormatTagSpecifications(resource_type, tags_dict): + """Format a dict of tags into arguments for 'tag-specifications' parameter. + + Args: + resource_type: resource type to be tagged. + tags_dict: Tags to be formatted. + + Returns: + A list of tags formatted as arguments for 'tag-specifications' parameter. + """ + tags = ','.join('{Key=%s,Value=%s}' % + (k, v) for k, v in six.iteritems(tags_dict)) + return 'ResourceType=%s,Tags=[%s]' % (resource_type, tags) + + +def AddTags(resource_id, region, **kwargs): + """Adds tags to an AWS resource created by PerfKitBenchmarker. + + Args: + resource_id: An extant AWS resource to operate on. + region: The AWS region 'resource_id' was created in. + **kwargs: dict. Key-value pairs to set on the instance. + """ + if not kwargs: + return + + tag_cmd = AWS_PREFIX + [ + 'ec2', + 'create-tags', + '--region=%s' % region, + '--resources', resource_id, + '--tags'] + FormatTags(kwargs) + IssueRetryableCommand(tag_cmd) + + +def MakeDefaultTags(timeout_minutes=None): + """Default tags for an AWS resource created by PerfKitBenchmarker. + + Args: + timeout_minutes: Timeout used for setting the timeout_utc tag. + + Returns: + Dict of default tags, contributed from the benchmark spec. + """ + benchmark_spec = context.GetThreadBenchmarkSpec() + if not benchmark_spec: + return {} + return benchmark_spec.GetResourceTags(timeout_minutes=timeout_minutes) + + +def MakeFormattedDefaultTags(timeout_minutes=None): + """Get the default tags formatted correctly for --tags parameter.""" + return FormatTags(MakeDefaultTags(timeout_minutes=timeout_minutes)) + + +def AddDefaultTags(resource_id, region): + """Adds tags to an AWS resource created by PerfKitBenchmarker. + + By default, resources are tagged with "owner" and "perfkitbenchmarker-run" + key-value + pairs. + + Args: + resource_id: An extant AWS resource to operate on. + region: The AWS region 'resource_id' was created in. + """ + tags = MakeDefaultTags() + AddTags(resource_id, region, **tags) + + +def _GetCallerId() -> Dict[str, str]: + cmd = AWS_PREFIX + ['sts', 'get-caller-identity'] + stdout, _, _ = vm_util.IssueCommand(cmd) + return json.loads(stdout) + + +def GetAccount() -> str: + """Retrieve details about the current IAM identity. + + http://docs.aws.amazon.com/cli/latest/reference/sts/get-caller-identity.html + + Returns: + A string of the AWS account ID number of the account that owns or contains + the calling entity. 
+ """ + return _GetCallerId()['Account'] + + +def GetCallerArn() -> str: + """Retrieve the ARN of the AWS credentials used.""" + return _GetCallerId()['Arn'] + + +@vm_util.Retry() +def IssueRetryableCommand(cmd, env=None, suppress_failure=None): + """Tries running the provided command until it succeeds or times out. + + On Windows, the AWS CLI doesn't correctly set the return code when it + has an error (at least on version 1.7.28). By retrying the command if + we get output on stderr, we can work around this issue. + + Args: + cmd: A list of strings such as is given to the subprocess.Popen() + constructor. + env: An alternate environment to pass to the Popen command. + suppress_failure: A function to pass to vm_util.IssueCommand() + + Returns: + A tuple of stdout and stderr from running the provided command. + """ + stdout, stderr, retcode = vm_util.IssueCommand( + cmd, env=env, raise_on_failure=False, suppress_failure=suppress_failure) + if retcode: + raise errors.VmUtil.CalledProcessException( + 'Command returned a non-zero exit code.\n') + if stderr: + raise errors.VmUtil.CalledProcessException( + 'The command had output on stderr:\n%s' % stderr) + return stdout, stderr + + +def AwsFilter(filter_keys_and_values): + """Returns a list suitable for an AWS command line filter. + + Example: + AwsFilter({'a': 'b', 'c': 'd'}) returns a three element array: + ['--filters', 'Name=a,Values=b', 'Name=c,Values=d'] + + For an example see + https://docs.aws.amazon.com/cli/latest/reference/ec2/describe-instances.html#options + + Args: + filter_keys_and_values: A dict with the key as the name of the AWS attribute + and the value is the value of that attribute + """ + filters = ['--filters'] + for name, value in sorted(filter_keys_and_values.items()): + filters.append('Name={},Values={}'.format(name, value)) + return filters diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/azure/__init__.py b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/__init__.py new file mode 100644 index 0000000..bf3b09c --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Provider for Azure.""" + +AZURE_PATH = 'az' diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_blob_storage.py b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_blob_storage.py new file mode 100644 index 0000000..81806ac --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_blob_storage.py @@ -0,0 +1,263 @@ +# Copyright 2016 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains classes/functions related to Azure Blob Storage.""" + +import datetime +import json +import logging + +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import linux_packages +from perfkitbenchmarker import object_storage_service +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers import azure +from perfkitbenchmarker.providers.azure import azure_network + +FLAGS = flags.FLAGS + +DEFAULT_AZURE_REGION = 'eastus2' + + +class AzureBlobStorageService(object_storage_service.ObjectStorageService): + """Interface to Azure Blob Storage. + + Relevant documentation: + http://azure.microsoft.com/en-us/documentation/articles/xplat-cli/ + """ + + def __init__(self): + self.storage_account = None + self.resource_group = None + + STORAGE_NAME = providers.AZURE + + def PrepareService(self, + region, + existing_storage_account_and_resource_group=None, + try_to_create_storage_account_and_resource_group=False): + """See base class (without additional args). + + TODO(deitz): We should use the same interface across the clouds without + additional arguments. + + Args: + region: where to place our data. + existing_storage_account_and_resource_group: An existing storage account + and resource group for reading objects that may have already been + created. + try_to_create_storage_account_and_resource_group: Whether to try to create + the storage account and resource group in case it does not exist yet. + This supports invoking the object_storage_service_benchmark multiple + times on the same bucket name and creating the resource group the first + time. While this defaults to False, if there is no existing storage + account and resource group passed to this function via + existing_storage_account_and_resource_group, then one will be created. + """ + # abs is "Azure Blob Storage" + prefix = 'pkb%sabs' % FLAGS.run_uri + + # Maybe extract existing storage account and resource group names + existing_storage_account, existing_resource_group = None, None + if existing_storage_account_and_resource_group: + existing_storage_account, existing_resource_group = ( + existing_storage_account_and_resource_group) + assert existing_storage_account is not None + assert existing_resource_group is not None + else: + # We don't have an existing storage account or resource group so we better + # create one. + try_to_create_storage_account_and_resource_group = True + storage_account_name = existing_storage_account or prefix + 'storage' + resource_group_name = existing_resource_group or prefix + '-resource-group' + + # If we have an existing storage account and resource, we typically would + # not try to create it. If try_to_create_storage_account_and_resource_group + # is True, however, then we do try to create it. In this case, we shouldn't + # raise on a failure since it may already exist. 
+ raise_on_create_failure = not ( + existing_storage_account_and_resource_group and + try_to_create_storage_account_and_resource_group) + + # We use a separate resource group so that our buckets can optionally stick + # around after PKB runs. This is useful for things like cold reads tests + self.resource_group = azure_network.AzureResourceGroup( + resource_group_name, + use_existing=not try_to_create_storage_account_and_resource_group, + timeout_minutes=max(FLAGS.timeout_minutes, + FLAGS.persistent_timeout_minutes), + raise_on_create_failure=raise_on_create_failure) + self.resource_group.Create() + + # We use a different Azure storage account than the VM account + # because a) we need to be able to set the storage class + # separately, including using a blob-specific storage account and + # b) this account might be in a different location than any + # VM-related account. + self.storage_account = azure_network.AzureStorageAccount( + FLAGS.azure_storage_type, + region or DEFAULT_AZURE_REGION, + storage_account_name, + kind=FLAGS.azure_blob_account_kind, + resource_group=self.resource_group, + use_existing=not try_to_create_storage_account_and_resource_group, + raise_on_create_failure=raise_on_create_failure) + self.storage_account.Create() + + def CleanupService(self): + if hasattr(self, 'storage_account') and self.storage_account: + self.storage_account.Delete() + if hasattr(self, 'resource_group') and self.resource_group: + self.resource_group.Delete() + + def MakeBucket(self, bucket, raise_on_failure=True): + _, stderr, ret_code = vm_util.IssueCommand( + [azure.AZURE_PATH, 'storage', 'container', 'create', '--name', bucket] + + self.storage_account.connection_args, + raise_on_failure=False) + if ret_code and raise_on_failure: + raise errors.Benchmarks.BucketCreationError(stderr) + + def DeleteBucket(self, bucket): + if (not hasattr(self, 'storage_account') or + not self.storage_account or + not hasattr(self.storage_account, 'connection_args') or + not self.storage_account.connection_args): + logging.warning( + 'storage_account not properly configured. 
Skipping DeleteBucket %s', + bucket) + return + + vm_util.IssueCommand( + [azure.AZURE_PATH, 'storage', 'container', 'delete', '--name', bucket] + + self.storage_account.connection_args, + raise_on_failure=False) + + def Copy(self, src_url, dst_url, recursive=False): + """See base class.""" + raise NotImplementedError() + + def CopyToBucket(self, src_path, bucket, object_path): + vm_util.IssueCommand(['az', 'storage', 'blob', 'upload', + '--account-name', self.storage_account.name, + '--file', src_path, + '--container', bucket, + '--name', object_path] + + self.storage_account.connection_args) + + def _GenerateDownloadToken(self, bucket, object_path): + blob_store_expiry = datetime.datetime.utcnow() + datetime.timedelta( + days=365) + stdout, _, _ = vm_util.IssueCommand([ + 'az', 'storage', 'blob', 'generate-sas', + '--account-name', self.storage_account.name, + '--container-name', bucket, + '--name', object_path, + '--expiry', blob_store_expiry.strftime('%Y-%m-%dT%H:%M:%SZ'), + '--permissions', 'r' + ] + self.storage_account.connection_args) + token = stdout.strip('\n').strip('"') + return token + + def MakeRemoteCliDownloadUrl(self, bucket, object_path): + """See base class.""" + token = self._GenerateDownloadToken(bucket, object_path) + url = 'https://{acc}.blob.core.windows.net/{con}/{src}?{tkn}'.format( + acc=self.storage_account.name, + con=bucket, + src=object_path, + tkn=token) + return url + + def GenerateCliDownloadFileCommand(self, src_url, dst_url): + """See base class.""" + return 'wget -O {dst_url} "{src_url}"'.format(src_url=src_url, + dst_url=dst_url) + + def List(self, bucket): + """See base class.""" + stdout, _, _ = vm_util.IssueCommand([ + 'az', 'storage', 'blob', 'list', '--container-name', bucket, + '--account-name', self.storage_account.name + ]) + return [metadata['name'] for metadata in json.loads(str(stdout))] + + def ListTopLevelSubfolders(self, bucket): + """Lists the top level folders (not files) in a bucket. + + Each listed item is a full file name, eg. "supplier/supplier.csv", so just + the high level folder name is extracted, and repetitions are eliminated for + when there's multiple files in a folder. + + Args: + bucket: Name of the bucket to list the top level subfolders of. + + Returns: + A list of top level subfolder names. Can be empty if there are no folders. + """ + unique_folders = set([ + obj.split('/')[0].strip() + for obj in self.List(bucket) + if obj and obj.contains('/') + ]) + return list(unique_folders) + + def EmptyBucket(self, bucket): + # Emptying buckets on Azure is hard. We pass for now - this will + # increase our use of storage space, but should not affect the + # benchmark results. 
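+ # Leaving objects behind is safe here: the blobs live in the storage account
+ # created by PrepareService, so they are reclaimed when CleanupService deletes
+ # that account and its resource group.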
+ pass + + def PrepareVM(self, vm): + vm.Install('azure_cli') + vm.Install('azure_sdk') + vm.Install('azure_credentials') + + def CLIUploadDirectory(self, vm, directory, file_names, bucket): + return vm.RemoteCommand( + ('time for file in {files}; ' + 'do azure storage blob upload -q {directory}/$file {bucket} ' + '--connection-string {connection_string}; ' + 'done').format( + files=' '.join(file_names), + directory=directory, + bucket=bucket, + connection_string=self.storage_account.connection_string)) + + def CLIDownloadBucket(self, vm, bucket, objects, dest): + return vm.RemoteCommand( + ('time for object in {objects}; ' + 'do azure storage blob download {bucket} $object {dest} ' + '--connection-string {connection_string}; ' + 'done').format( + objects=' '.join(objects), + bucket=bucket, + dest=dest, + connection_string=self.storage_account.connection_string)) + + def Metadata(self, vm): + return { + 'azure_lib_version': linux_packages.GetPipPackageVersion(vm, 'azure') + } + + def APIScriptArgs(self): + return [ + '--azure_account=%s' % self.storage_account.name, + '--azure_key=%s' % self.storage_account.key + ] + + @classmethod + def APIScriptFiles(cls): + return ['azure_service.py'] diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_container_instances.py b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_container_instances.py new file mode 100644 index 0000000..79a691f --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_container_instances.py @@ -0,0 +1,171 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains classes/functions related to Azure Container Instances. + +For now, these only support benchmarks that don't make use of the container's +private ip since they can't be connected to vnets. 
+""" + +import json + +from absl import flags +from perfkitbenchmarker import container_service +from perfkitbenchmarker import context +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers import azure +from perfkitbenchmarker.providers.azure import azure_network +from perfkitbenchmarker.providers.azure import util + +FLAGS = flags.FLAGS + + +class AciContainer(container_service.BaseContainer): + """Class representing an ACI container.""" + + def __init__(self, container_spec, name, resource_group): + super(AciContainer, self).__init__(container_spec) + self.name = name + self.resource_group = resource_group + benchmark_spec = context.GetThreadBenchmarkSpec() + self.registry = benchmark_spec.container_registry + + def _Create(self): + """Creates the container.""" + create_cmd = [ + azure.AZURE_PATH, + 'container', + 'create', + '--name', + self.name, + '--image', + self.image, + '--restart-policy', + 'Never', + '--cpu', + str(int(self.cpus)), + '--memory', + '%0.1f' % (self.memory / 1024.0), + ] + self.resource_group.args + if self.registry and self.registry.CLOUD == providers.AZURE: + create_cmd.extend([ + '--registry-login-server', + self.registry.login_server, + '--registry-username', + self.registry.service_principal.app_id, + '--registry-password', + self.registry.service_principal.password, + ]) + if self.command: + # Note that this is inconsistent with other containers which use lists + # of command/args. This creates some differences mostly when + # the command contains quotes. + create_cmd.extend(['--command-line', ' '.join(self.command)]) + vm_util.IssueCommand(create_cmd) + + def _Delete(self): + """Deletes the container.""" + delete_cmd = [ + azure.AZURE_PATH, + 'container', + 'delete', + '--name', + self.name, + '--yes', + ] + self.resource_group.args + vm_util.IssueCommand(delete_cmd, raise_on_failure=False) + + @property + def ip_address(self): + """Container instances don't have private ips yet.""" + raise NotImplementedError('ACI containers don\'t have private ips.') + + @ip_address.setter + def ip_address(self, value): + """Sets the containers ip_address.""" + self.__ip_address = value + + def _GetContainerInstance(self): + """Gets a representation of the container and returns it.""" + show_cmd = [azure.AZURE_PATH, 'container', 'show', '--name', self.name + ] + self.resource_group.args + stdout, _, _ = vm_util.IssueCommand(show_cmd) + return json.loads(stdout) + + def _IsReady(self): + """Returns true if the container has stopped pending.""" + state = self._GetContainerInstance()['instanceView']['state'] + return state != 'Pending' + + def WaitForExit(self, timeout=None): + """Waits until the container has finished running.""" + + @vm_util.Retry( + timeout=timeout, + retryable_exceptions=(container_service.RetriableContainerException,)) + def _WaitForExit(): + container = self._GetContainerInstance()['containers'][0] + state = container['instanceView']['currentState']['state'] + if state != 'Terminated': + raise container_service.RetriableContainerException( + f'Container in ({state}). 
Not yet in expected state Terminated.') + return container + + return _WaitForExit() + + def GetLogs(self): + """Returns the logs from the container.""" + logs_cmd = [azure.AZURE_PATH, 'container', 'logs', '--name', self.name + ] + self.resource_group.args + stdout, _, _ = vm_util.IssueCommand(logs_cmd) + return stdout + + +class AciCluster(container_service.BaseContainerCluster): + """Class that can deploy ACI containers.""" + + CLOUD = providers.AZURE + CLUSTER_TYPE = 'aci' + + def __init__(self, cluster_spec): + super(AciCluster, self).__init__(cluster_spec) + self.region = util.GetRegionFromZone(self.zone) + self.resource_group = azure_network.GetResourceGroup(self.region) + + def _Create(self): + """ACI has no cluster.""" + pass + + def _Delete(self): + """ACI has no cluster.""" + pass + + def _CreateDependencies(self): + """Creates the resource group.""" + self.resource_group.Create() + + def _DeleteDependencies(self): + """Deletes the resource group.""" + self.resource_group.Delete() + + def DeployContainer(self, base_name, container_spec): + """Deploys Containers according to the ContainerSpec.""" + name = base_name + str(len(self.containers[base_name])) + container = AciContainer(container_spec, name, self.resource_group) + self.containers[base_name].append(container) + container.Create() + + def DeployContainerService(self, name, container_spec): + """Deploys a ContainerSerivice according to the ContainerSpec.""" + raise NotImplementedError() diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_disk.py b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_disk.py new file mode 100644 index 0000000..9ae8945 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_disk.py @@ -0,0 +1,278 @@ +# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing classes related to Azure disks. + +Disks can be created, deleted, attached to VMs, and detached from VMs. +At this time, Azure only supports one disk type, so the disk spec's disk type +is ignored. +See http://msdn.microsoft.com/en-us/library/azure/dn790303.aspx for more +information about azure disks. +""" + + +import itertools +import json +import re +import threading + +from absl import flags +from perfkitbenchmarker import disk +from perfkitbenchmarker import errors +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers import azure +from perfkitbenchmarker.providers.azure import azure_network +from perfkitbenchmarker.providers.azure import flags as azure_flags +from perfkitbenchmarker.providers.azure import util +from six.moves import range + +FLAGS = flags.FLAGS + +MAX_DRIVE_SUFFIX_LENGTH = 2 # Last allowable device is /dev/sdzz. 
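`MAX_DRIVE_SUFFIX_LENGTH` above bounds the `/dev/sd*` suffix scheme that `_GenerateDrivePathSuffixes` produces and that `AzureDisk.GetDevicePath` indexes into later in this file. A condensed, self-contained sketch of that mapping for the remote, non-NVMe case (function names here are illustrative, not the module's own):

```python
import itertools
import string

MAX_DRIVE_SUFFIX_LENGTH = 2  # last allowable device is /dev/sdzz


def drive_suffixes(max_length=MAX_DRIVE_SUFFIX_LENGTH):
    """Yields 'a'..'z', then 'aa'..'zz', matching the kernel's sd naming."""
    for length in range(1, max_length + 1):
        for combo in itertools.product(string.ascii_lowercase, repeat=length):
            yield ''.join(combo)


def remote_device_path(lun, has_temp_drive):
    """Maps an Azure data-disk LUN to the expected /dev/sdX device path."""
    suffixes = list(drive_suffixes())
    start_index = 1          # /dev/sda is always the OS disk
    if has_temp_drive:
        start_index += 1     # /dev/sdb is the temporary disk, when present
    return '/dev/sd' + suffixes[start_index + lun]


assert remote_device_path(lun=0, has_temp_drive=True) == '/dev/sdc'
assert remote_device_path(lun=0, has_temp_drive=False) == '/dev/sdb'
assert remote_device_path(lun=25, has_temp_drive=True) == '/dev/sdab'
```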
+ +PREMIUM_STORAGE = 'Premium_LRS' +STANDARD_DISK = 'Standard_LRS' +ULTRA_STORAGE = 'UltraSSD_LRS' + +DISK_TYPE = {disk.STANDARD: STANDARD_DISK, disk.REMOTE_SSD: PREMIUM_STORAGE} + +HOST_CACHING = 'host_caching' + +AZURE = 'Azure' +disk.RegisterDiskTypeMap(AZURE, DISK_TYPE) + +AZURE_REPLICATION_MAP = { + azure_flags.LRS: disk.ZONE, + azure_flags.ZRS: disk.REGION, + # Deliberately omitting PLRS, because that is set explicty in __init__, + # and (RA)GRS, because those are asynchronously replicated. +} + +LOCAL_SSD_PREFIXES = {'Standard_D', 'Standard_G', 'Standard_L'} + +AZURE_NVME_TYPES = [ + r'(Standard_L[0-9]+s_v2)', +] + +# https://docs.microsoft.com/en-us/azure/virtual-machines/azure-vms-no-temp-disk +# D/Ev4 and D/E(i)sv4 VMs do not have tmp/OS disk; Dv3, Dsv3, and Ddv4 VMs do. +# Same for *v5, including Milan machines. +AZURE_NO_TMP_DISK_TYPES = [ + r'(Standard_D[0-9]+s?_v4)', + r'(Standard_E[0-9]+i?s?_v4)', + r'(Standard_D[0-9]+s?_v5)', + r'(Standard_E[0-9]+i?s?_v5)', + r'(Standard_D[0-9]+as?_v5)', + r'(Standard_E[0-9]+as?_v5)', + + r'(Standard_D[0-9]+ps_v5)', + r'(Standard_D[0-9]+pls_v5)', + r'(Standard_E[0-9]+ps_v5)', +] + + +def _ProductWithIncreasingLength(iterable, max_length): + """Yields increasing length cartesian products of iterable.""" + for length in range(1, max_length + 1): + for p in itertools.product(iterable, repeat=length): + yield p + + +def _GenerateDrivePathSuffixes(machine_type): + """Yields drive path suffix strings. + + Drive path suffixes in the form 'a', 'b', 'c', 'd', ..., 'z', 'aa', 'ab', etc. + Note: the os-disk will be /dev/sda, and the temporary disk will be /dev/sdb: + https://docs.microsoft.com/en-us/azure/virtual-machines/linux/faq#can-i-use-the-temporary-disk-devsdb1-to-store-data + Some newer VMs (e.g. Dsv4 VMs) do not have temporary disks. + + The linux kernel code that determines this naming can be found here: + https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/scsi/sd.c?h=v2.6.37#n2262 + + Quoting the link from above: + SCSI disk names starts at sda. The 26th device is sdz and the 27th is sdaa. + The last one for two lettered suffix is sdzz which is followed by sdaaa. 
+ """ + character_range = range(ord('a'), ord('z') + 1) + products = _ProductWithIncreasingLength( + character_range, MAX_DRIVE_SUFFIX_LENGTH) + + for p in products: + yield ''.join(chr(c) for c in p) + + +class TooManyAzureDisksError(Exception): + """Exception raised when too many disks are attached.""" + pass + + +def LocalDiskIsSSD(machine_type): + """Check whether the local disk is an SSD drive.""" + + return any((machine_type.startswith(prefix) for prefix in LOCAL_SSD_PREFIXES)) + + +def LocalDriveIsNvme(machine_type): + """Check if the machine type uses NVMe driver.""" + return any( + re.search(machine_series, machine_type) + for machine_series in AZURE_NVME_TYPES) + + +def HasTempDrive(machine_type): + """Check if the machine type has the temp drive (sdb).""" + return not any( + re.search(machine_series, machine_type) + for machine_series in AZURE_NO_TMP_DISK_TYPES) + + +class AzureDisk(disk.BaseDisk): + """Object representing an Azure Disk.""" + + _lock = threading.Lock() + + def __init__(self, + disk_spec, + vm, + lun, + is_image=False): + super(AzureDisk, self).__init__(disk_spec) + self.host_caching = FLAGS.azure_host_caching + self.vm = vm + self.vm_name = vm.name + self.name = self.vm_name + str(lun) + self.resource_group = azure_network.GetResourceGroup() + self.storage_account = vm.storage_account + # lun is Azure's abbreviation for "logical unit number" + self.lun = lun + self.is_image = is_image + self._deleted = False + self.machine_type = vm.machine_type + if self.disk_type == PREMIUM_STORAGE: + self.metadata.update({ + disk.MEDIA: disk.SSD, + disk.REPLICATION: disk.ZONE, + HOST_CACHING: self.host_caching, + }) + elif self.disk_type == STANDARD_DISK: + self.metadata.update({ + disk.MEDIA: disk.HDD, + disk.REPLICATION: AZURE_REPLICATION_MAP[FLAGS.azure_storage_type], + HOST_CACHING: self.host_caching, + }) + elif self.disk_type == disk.LOCAL: + media = disk.SSD if LocalDiskIsSSD(self.machine_type) else disk.HDD + + self.metadata.update({ + disk.MEDIA: media, + disk.REPLICATION: disk.NONE, + }) + + def _Create(self): + """Creates the disk.""" + assert not self.is_image + + if self.disk_type == ULTRA_STORAGE and not self.vm.availability_zone: + raise Exception(f'Azure Ultradisk is being created in zone "{self.zone}"' + 'which was not specified to have an availability zone. ' + 'Availability zones are specified with zone-\\d e.g. 
' + 'eastus1-2 for availability zone 2 in zone eastus1') + with self._lock: + _, _, retcode = vm_util.IssueCommand([ + azure.AZURE_PATH, 'vm', 'disk', 'attach', '--new', '--caching', + self.host_caching, '--name', self.name, '--lun', + str(self.lun), '--sku', self.disk_type, '--vm-name', self.vm_name, + '--size-gb', + str(self.disk_size) + ] + self.resource_group.args, raise_on_failure=False, timeout=600) + + if retcode: + raise errors.Resource.RetryableCreationError( + 'Error creating Azure disk.') + + _, _, retcode = vm_util.IssueCommand([ + azure.AZURE_PATH, 'disk', 'update', '--name', self.name, '--set', + util.GetTagsJson(self.resource_group.timeout_minutes) + ] + self.resource_group.args, raise_on_failure=False) + + if retcode: + raise errors.Resource.RetryableCreationError( + 'Error tagging Azure disk.') + + if (self.disk_type == ULTRA_STORAGE and ( + FLAGS.azure_provisioned_iops or FLAGS.azure_provisioned_throughput)): + args = ([azure.AZURE_PATH, 'disk', 'update', '--name', self.name] + + self.resource_group.args) + + if FLAGS.azure_provisioned_iops: + args = args + ['--disk-iops-read-write', + str(FLAGS.azure_provisioned_iops)] + if FLAGS.azure_provisioned_throughput: + args = args + ['--disk-mbps-read-write', + str(FLAGS.azure_provisioned_throughput)] + + _, _, _ = vm_util.IssueCommand(args, raise_on_failure=True) + + def _Delete(self): + """Deletes the disk.""" + assert not self.is_image + self._deleted = True + + def _Exists(self): + """Returns true if the disk exists.""" + assert not self.is_image + if self._deleted: + return False + + stdout, _, _ = vm_util.IssueCommand([ + azure.AZURE_PATH, 'disk', 'show', '--output', 'json', '--name', + self.name + ] + self.resource_group.args, raise_on_failure=False) + try: + json.loads(stdout) + return True + except: + return False + + def Attach(self, vm): + """Attaches the disk to a VM. + + Args: + vm: The AzureVirtualMachine instance to which the disk will be attached. + """ + pass # TODO(user): Implement Attach() + # (not critical because disks are attached to VMs when created) + + def Detach(self): + """Detaches the disk from a VM.""" + # Not needed since the resource group can be deleted + # without detaching disks. + pass + + def GetDevicePath(self): + """Returns the path to the device inside the VM.""" + REMOTE_DRIVE_PATH_SUFFIXES = list(_GenerateDrivePathSuffixes(self.machine_type)) + if self.disk_type == disk.LOCAL: + if LocalDriveIsNvme(self.machine_type): + return '/dev/nvme%sn1' % str(self.lun) + # Temp disk naming isn't always /dev/sdb: + # https://github.com/MicrosoftDocs/azure-docs/issues/54055 + return '/dev/disk/cloud/azure_resource' + else: + try: + start_index = 1 # the os drive is always at index 0; skip the OS drive. + if HasTempDrive(self.machine_type): + start_index += 1 + return '/dev/sd%s' % REMOTE_DRIVE_PATH_SUFFIXES[start_index + self.lun] + except IndexError: + raise TooManyAzureDisksError() diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_kubernetes_service.py b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_kubernetes_service.py new file mode 100644 index 0000000..999d812 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_kubernetes_service.py @@ -0,0 +1,272 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Contains classes/functions related to Azure Kubernetes Service.""" + +import json +from typing import List + +from absl import flags +from perfkitbenchmarker import container_service +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers import azure +from perfkitbenchmarker.providers.azure import azure_network +from perfkitbenchmarker.providers.azure import service_principal +from perfkitbenchmarker.providers.azure import util + +FLAGS = flags.FLAGS + + +class AzureContainerRegistry(container_service.BaseContainerRegistry): + """Class for building and storing container images on Azure.""" + + CLOUD = providers.AZURE + + def __init__(self, registry_spec): + super(AzureContainerRegistry, self).__init__(registry_spec) + self.region = util.GetRegionFromZone(self.zone) + self.resource_group = azure_network.GetResourceGroup(self.region) + self.login_server = None + self.sku = 'Basic' + self._deleted = False + self.acr_id = None + self.service_principal = service_principal.ServicePrincipal.GetInstance() + + def _Exists(self): + """Returns True if the registry exists.""" + if self._deleted: + return False + stdout, _, _ = vm_util.IssueCommand([ + azure.AZURE_PATH, 'acr', 'show', '--name', self.name, + ], suppress_warning=True, raise_on_failure=False) + try: + registry = json.loads(stdout) + self.login_server = registry['loginServer'] + self.acr_id = registry['id'] + return True + except ValueError: + return False + + def _Create(self): + """Creates the registry.""" + if self._Exists(): + return + vm_util.IssueCommand([ + azure.AZURE_PATH, 'acr', 'create', + '--name', self.name, + '--sku', self.sku + ] + self.resource_group.args) + + def _Delete(self): + """Deletes the registry.""" + # This will be deleted along with the resource group + self._deleted = True + + def _PostCreate(self): + """Allow the service principle to read from the repository.""" + # If we bootstrapped our own credentials into the AKS cluster it already, + # has read permission, because it created the repo. 
+ if not FLAGS.bootstrap_azure_service_principal: + create_role_assignment_cmd = [ + azure.AZURE_PATH, 'role', 'assignment', 'create', + '--assignee', self.service_principal.app_id, + '--role', 'Reader', + '--scope', self.acr_id, + ] + vm_util.IssueRetryableCommand(create_role_assignment_cmd) + + def _CreateDependencies(self): + """Creates the resource group.""" + self.resource_group.Create() + self.service_principal.Create() + + def _DeleteDependencies(self): + """Deletes the resource group.""" + self.service_principal.Delete() + + def Login(self): + """Logs in to the registry.""" + vm_util.IssueCommand([ + azure.AZURE_PATH, 'acr', 'login', + '--name', self.name, + ]) + + def GetFullRegistryTag(self, image): + """Gets the full tag of the image.""" + full_tag = '{login_server}/{name}'.format( + login_server=self.login_server, name=image) + return full_tag + + +class AksCluster(container_service.KubernetesCluster): + """Class representing an Azure Kubernetes Service cluster.""" + + CLOUD = providers.AZURE + + def __init__(self, spec): + """Initializes the cluster.""" + super(AksCluster, self).__init__(spec) + self.region = util.GetRegionFromZone(self.zone) + self.resource_group = azure_network.GetResourceGroup(self.region) + self.node_resource_group = None + self.name = 'pkbcluster%s' % FLAGS.run_uri + # TODO(pclay): replace with built in service principal once I figure out how + # to make it work with ACR + self.service_principal = service_principal.ServicePrincipal.GetInstance() + self.cluster_version = FLAGS.container_cluster_version + self._deleted = False + + def GetResourceMetadata(self): + """Returns a dict containing metadata about the cluster. + + Returns: + dict mapping string property key to value. + """ + result = super(AksCluster, self).GetResourceMetadata() + result['boot_disk_type'] = self.vm_config.os_disk.disk_type + result['boot_disk_size'] = self.vm_config.os_disk.disk_size + return result + + def _Create(self): + """Creates the AKS cluster.""" + cmd = [ + azure.AZURE_PATH, 'aks', 'create', + '--name', self.name, + '--location', self.region, + '--ssh-key-value', vm_util.GetPublicKeyPath(), + '--service-principal', self.service_principal.app_id, + # TODO(pclay): avoid logging client secret + '--client-secret', + self.service_principal.password, + '--nodepool-name', + container_service.DEFAULT_NODEPOOL, + '--nodepool-labels', + f'pkb_nodepool={container_service.DEFAULT_NODEPOOL}', + ] + self._GetNodeFlags(self.num_nodes, self.vm_config) + + # TODO(pclay): expose quota and capacity errors + # Creating an AKS cluster with a fresh service principal usually fails due + # to a race condition. Active Directory knows the service principal exists, + # but AKS does not. (https://github.com/Azure/azure-cli/issues/9585) + # Use 5 min timeout on service principle retry. cmd will fail fast. + vm_util.Retry(timeout=300)(vm_util.IssueCommand)( + cmd, + # Half hour timeout on creating the cluster. 
+ timeout=1800) + + for name, node_pool in self.nodepools.items(): + self._CreateNodePool(name, node_pool) + + def _CreateNodePool(self, name: str, node_pool): + """Creates a node pool.""" + cmd = [ + azure.AZURE_PATH, 'aks', 'nodepool', 'add', + '--cluster-name', self.name, + '--name', name, + '--labels', f'pkb_nodepool={name}', + ] + self._GetNodeFlags(node_pool.num_nodes, node_pool.vm_config) + vm_util.IssueCommand(cmd, timeout=600) + + def _GetNodeFlags(self, num_nodes: int, vm_config) -> List[str]: + """Common flags for create and nodepools add.""" + args = [ + '--node-vm-size', vm_config.machine_type, + '--node-count', str(num_nodes), + ] + self.resource_group.args + if self.vm_config.zone and self.vm_config.zone != self.region: + zones = ' '.join(zone[-1] for zone in self.vm_config.zone.split(',')) + args += ['--zones', zones] + if self.vm_config.os_disk and self.vm_config.os_disk.disk_size: + args += ['--node-osdisk-size', str(self.vm_config.os_disk.disk_size)] + if self.cluster_version: + args += ['--kubernetes-version', self.cluster_version] + return args + + def _Exists(self): + """Returns True if the cluster exists.""" + if self._deleted: + return False + stdout, _, _ = vm_util.IssueCommand([ + azure.AZURE_PATH, 'aks', 'show', '--name', self.name, + ] + self.resource_group.args, raise_on_failure=False) + try: + cluster = json.loads(stdout) + self.node_resource_group = cluster['nodeResourceGroup'] + return True + except ValueError: + return False + + def _Delete(self): + """Deletes the AKS cluster.""" + # Do not call super._Delete() as it will try to delete containers and the + # cluster may have already been deleted by deleting a corresponding + # AzureContainerRegistry. The registry deletes the shared resource group. + # + # Normally only azure networks manage resource groups because, + # PKB assumes all benchmarks use VMs and VMs always depend on networks. + # However container benchmarks do not provision networks and + # directly manage their own resource groups. However ContainerClusters and + # ContainerRegistries can be used independently so they must both directly + # mangage the undlerlying resource group. This is indempotent, but can cause + # AKS clusters to have been deleted before calling _Delete(). + # + # If it has not yet been deleted it will be deleted along with the resource + # group. + self._deleted = True + + def _PostCreate(self): + """Tags the cluster resource group.""" + super(AksCluster, self)._PostCreate() + set_tags_cmd = [ + azure.AZURE_PATH, 'group', 'update', '-g', self.node_resource_group, + '--set', util.GetTagsJson(self.resource_group.timeout_minutes) + ] + vm_util.IssueCommand(set_tags_cmd) + + def _IsReady(self): + """Returns True if the cluster is ready.""" + vm_util.IssueCommand([ + azure.AZURE_PATH, 'aks', 'get-credentials', + '--admin', + '--name', self.name, + '--file', FLAGS.kubeconfig, + ] + self.resource_group.args, suppress_warning=True) + version_cmd = [FLAGS.kubectl, '--kubeconfig', FLAGS.kubeconfig, 'version'] + _, _, retcode = vm_util.IssueCommand(version_cmd, suppress_warning=True, + raise_on_failure=False) + if retcode: + return False + # POD creation will fail until the default service account is created. 
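+ # Readiness is therefore detected by listing service accounts and checking
+ # that 'default' is present.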
+ get_cmd = [ + FLAGS.kubectl, '--kubeconfig', FLAGS.kubeconfig, + 'get', 'serviceAccounts' + ] + stdout, _, _ = vm_util.IssueCommand(get_cmd) + return 'default' in stdout + + def _CreateDependencies(self): + """Creates the resource group.""" + self.resource_group.Create() + self.service_principal.Create() + + def _DeleteDependencies(self): + """Deletes the resource group.""" + self.service_principal.Delete() + + def GetDefaultStorageClass(self) -> str: + """Get the default storage class for the provider.""" + # https://docs.microsoft.com/en-us/azure/aks/csi-storage-drivers + # Premium_LRS + return 'managed-csi-premium' diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_network.py b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_network.py new file mode 100644 index 0000000..59b8082 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_network.py @@ -0,0 +1,707 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing classes related to Azure VM networking. + +The Firewall class provides a way of opening VM ports. The Network class allows +VMs to communicate via internal ips and isolates PerfKitBenchmarker VMs from +others in +the same project. See http://msdn.microsoft.com/library/azure/jj156007.aspx +for more information about Azure Virtual Networks. +""" + +import json +import logging +import threading + +from absl import flags +from perfkitbenchmarker import context +from perfkitbenchmarker import errors +from perfkitbenchmarker import network +from perfkitbenchmarker import placement_group +from perfkitbenchmarker import providers +from perfkitbenchmarker import resource +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers import azure +from perfkitbenchmarker.providers.azure import azure_placement_group +from perfkitbenchmarker.providers.azure import util + +FLAGS = flags.FLAGS +CIDR_LIST_THRESHOLD = 14 +# sending more than 14 IP addresses or cidr blocks will cause API call failure +SSH_PORT = 22 + +flags.DEFINE_boolean('azure_infiniband', False, + 'Install Mellanox OpenFabrics drivers') + +DEFAULT_REGION = 'eastus2' + +REGION = 'region' +ZONE = 'zone' + + +def GetResourceGroup(zone=None): + """Get the resource group for the current benchmark.""" + spec = context.GetThreadBenchmarkSpec() + # This is protected by spec.networks_lock, so there's no race + # condition with checking for the attribute and then creating a + # resource group. 
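+ # The first caller creates a group named 'pkb<run_uri>-<spec uid>' and caches
+ # it on the benchmark spec; later callers reuse that cached instance.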
+ try: + return spec.azure_resource_group + except AttributeError: + group = AzureResourceGroup( + 'pkb%s-%s' % (FLAGS.run_uri, spec.uid), zone=zone) + spec.azure_resource_group = group + return group + + +class AzureResourceGroup(resource.BaseResource): + """A Resource Group, the basic unit of Azure provisioning.""" + + def __init__(self, + name, + zone=None, + use_existing=False, + timeout_minutes=None, + raise_on_create_failure=True): + super(AzureResourceGroup, self).__init__() + self.name = name + self.use_existing = use_existing + self.timeout_minutes = timeout_minutes + self.raise_on_create_failure = raise_on_create_failure + # A resource group's region doesn't affect the region of + # actual resources, but we need to choose *some* region for every + # benchmark, even if the user doesn't specify one. + self.region = util.GetRegionFromZone( + FLAGS.zones[0] if FLAGS.zones else zone or DEFAULT_REGION) + # Whenever an Azure CLI command needs a resource group, it's + # always specified the same way. + self.args = ['--resource-group', self.name] + + def _Create(self): + if not self.use_existing: + # A resource group can own resources in multiple zones, but the + # group itself needs to have a region. Therefore, + # FLAGS.zones[0]. + _, _, retcode = vm_util.IssueCommand( + [ + azure.AZURE_PATH, 'group', 'create', '--name', self.name, + '--location', self.region, '--tags' + ] + util.GetTags(self.timeout_minutes), + raise_on_failure=False) + + if retcode and self.raise_on_create_failure: + raise errors.Resource.RetryableCreationError( + 'Error creating Azure resource group') + + def _Exists(self): + stdout, _, _ = vm_util.IssueCommand( + [azure.AZURE_PATH, 'group', 'show', '--name', self.name], + suppress_warning=True, + raise_on_failure=False) + try: + json.loads(stdout) + return True + except ValueError: + return False + + def _Delete(self): + # Ignore delete failures (potentially already deleted) and timeouts as + # delete should complete even if we stop waiting for the response. + vm_util.IssueCommand( + [azure.AZURE_PATH, 'group', 'delete', '--yes', '--name', self.name], + timeout=600, + raise_on_failure=False, + raise_on_timeout=False) + + def AddTag(self, key, value): + """Add a single tag to an existing Resource Group. + + Args: + key: tag key + value: tag value + + Raises: + errors.resource.CreationError on failure. + """ + tag_cmd = [ + azure.AZURE_PATH, 'group', 'update', '--name', self.name, '--set', + 'tags.' + util.FormatTag(key, value) + ] + _, _, retcode = vm_util.IssueCommand(tag_cmd, raise_on_failure=False) + if retcode: + raise errors.Resource.CreationError('Error tagging Azure resource group.') + + +class AzureStorageAccount(resource.BaseResource): + """Object representing an Azure Storage Account.""" + + total_storage_accounts = 0 + + def __init__(self, + storage_type, + region, + name, + kind=None, + access_tier=None, + resource_group=None, + use_existing=False, + raise_on_create_failure=True): + super(AzureStorageAccount, self).__init__() + self.storage_type = storage_type + self.name = name + self.resource_group = resource_group or GetResourceGroup() + self.region = region + self.kind = kind or 'Storage' + self.use_existing = use_existing + self.raise_on_create_failure = raise_on_create_failure + + AzureStorageAccount.total_storage_accounts += 1 + + if kind == 'BlobStorage': + self.access_tier = access_tier or 'Hot' + else: + # Access tiers are only valid for blob storage accounts. 
+ assert access_tier is None + self.access_tier = access_tier + + def _Create(self): + """Creates the storage account.""" + if not self.use_existing: + create_cmd = [ + azure.AZURE_PATH, 'storage', 'account', 'create', '--kind', self.kind, + '--sku', self.storage_type, '--name', self.name, + '--min-tls-version', 'TLS1_2', + '--allow-blob-public-access', 'false', '--tags' + ] + util.GetTags( + self.resource_group.timeout_minutes) + self.resource_group.args + if self.region: + create_cmd.extend(['--location', self.region]) + if self.kind == 'BlobStorage': + create_cmd.extend(['--access-tier', self.access_tier]) + if FLAGS.min_tls_version: + create_cmd.extend(['--min-tls-version', FLAGS.min_tls_version]) + vm_util.IssueCommand( + create_cmd, raise_on_failure=self.raise_on_create_failure) + + def _PostCreate(self): + """Get our connection string and our keys.""" + self.connection_string = util.GetAzureStorageConnectionString( + self.name, self.resource_group.args) + self.connection_args = ['--connection-string', self.connection_string] + self.key = util.GetAzureStorageAccountKey(self.name, + self.resource_group.args) + + def _Delete(self): + """Deletes the storage account.""" + delete_cmd = [ + azure.AZURE_PATH, 'storage', 'account', 'delete', '--name', self.name, + '--yes' + ] + self.resource_group.args + vm_util.IssueCommand(delete_cmd, raise_on_failure=False) + + def _Exists(self): + """Returns true if the storage account exists.""" + stdout, _, _ = vm_util.IssueCommand( + [ + azure.AZURE_PATH, 'storage', 'account', 'show', '--output', 'json', + '--name', self.name + ] + self.resource_group.args, + suppress_warning=True, + raise_on_failure=False) + + try: + json.loads(stdout) + return True + except ValueError: + return False + + +class AzureVirtualNetwork(network.BaseNetwork): + """Object representing an Azure Virtual Network. + + The benchmark spec contains one instance of this class per region, which an + AzureNetwork may retrieve or create via AzureVirtualNetwork.GetForRegion. + + Attributes: + name: string. Name of the virtual network. + resource_group: Resource Group instance that network belongs to. + region: string. Azure region of the network. + """ + + # Initializes an address space for a new AzureVirtualNetwork + _regional_network_count = 0 + vnet_lock = threading.Lock() + + CLOUD = providers.AZURE + + def __init__(self, spec, region, name, number_subnets): + super(AzureVirtualNetwork, self).__init__(spec) + self.name = name + self.resource_group = GetResourceGroup() + self.region = region + self.args = ['--vnet-name', self.name] + self.address_index = 0 + self.regional_index = AzureVirtualNetwork._regional_network_count + self.address_spaces = [] + for zone_num in range(number_subnets): + self.address_spaces.append( + network.GetCidrBlock(self.regional_index, zone_num)) + self.is_created = False + + @classmethod + def GetForRegion(cls, spec, region, name, number_subnets=1): + """Retrieves or creates an AzureVirtualNetwork. + + Args: + spec: BaseNetworkSpec. Spec for Azure Network. + region: string. Azure region name. + name: string. Azure Network Name. + number_subnets: int. Optional. Number of subnets that network will + contain. + + Returns: + AzureVirtualNetwork. If an AzureVirtualNetwork for the same region already + exists in the benchmark spec, that instance is returned. Otherwise, a new + AzureVirtualNetwork is created and returned. 
+ """ + benchmark_spec = context.GetThreadBenchmarkSpec() + if benchmark_spec is None: + raise errors.Error('GetNetwork called in a thread without a ' + 'BenchmarkSpec.') + key = cls.CLOUD, REGION, region + # Because this method is only called from the AzureNetwork constructor, + # which is only called from AzureNetwork.GetNetwork, we already hold the + # benchmark_spec.networks_lock. + number_subnets = max(number_subnets, len(FLAGS.zones)) + if key not in benchmark_spec.regional_networks: + benchmark_spec.regional_networks[key] = cls(spec, region, name, + number_subnets) + AzureVirtualNetwork._regional_network_count += 1 + return benchmark_spec.regional_networks[key] + + def GetNextAddressSpace(self): + """Returns the next available address space for next subnet.""" + with self.vnet_lock: + assert self.address_index < len( + self.address_spaces), 'Only allocated {} addresses'.format( + len(self.address_spaces)) + next_address_space = self.address_spaces[self.address_index] + self.address_index += 1 + return next_address_space + + def Create(self): + """Creates the virtual network.""" + with self.vnet_lock: + if self.is_created: + return + + logging.info('Creating %d Azure subnets in %s', len(self.address_spaces), + self.region) + vm_util.IssueRetryableCommand([ + azure.AZURE_PATH, 'network', 'vnet', 'create', '--location', self + .region, '--name', self.name, '--address-prefixes' + ] + self.address_spaces + self.resource_group.args) + + self.is_created = True + + def Delete(self): + """Deletes the virtual network.""" + pass + + @vm_util.Retry() + def Exists(self): + """Returns true if the virtual network exists.""" + stdout, _, _ = vm_util.IssueCommand( + [ + azure.AZURE_PATH, 'network', 'vnet', 'show', '--output', 'json', + '--name', self.name + ] + self.resource_group.args, + suppress_warning=True, + raise_on_failure=False) + + return bool(json.loads(stdout)) + + +class AzureSubnet(resource.BaseResource): + """Object representing an Azure Subnet.""" + + def __init__(self, vnet, name): + super(AzureSubnet, self).__init__() + self.resource_group = GetResourceGroup() + self.vnet = vnet + self.name = name + self.args = ['--subnet', self.name] + self.address_space = None + + def _Create(self): + # Avoids getting additional address space when create retries. + if not self.address_space: + self.address_space = self.vnet.GetNextAddressSpace() + + vm_util.IssueCommand([ + azure.AZURE_PATH, 'network', 'vnet', 'subnet', 'create', '--vnet-name', + self.vnet.name, '--address-prefix', self.address_space, '--name', + self.name + ] + self.resource_group.args) + + @vm_util.Retry() + def _Exists(self): + stdout, _, _ = vm_util.IssueCommand( + [ + azure.AZURE_PATH, 'network', 'vnet', 'subnet', 'show', + '--vnet-name', self.vnet.name, '--output', 'json', '--name', + self.name + ] + self.resource_group.args, + raise_on_failure=False) + + return bool(json.loads(stdout)) + + def _Delete(self): + pass + + +class AzureNetworkSecurityGroup(resource.BaseResource): + """Object representing an Azure Network Security Group.""" + + def __init__(self, region, subnet, name): + super(AzureNetworkSecurityGroup, self).__init__() + + self.region = region + self.subnet = subnet + self.name = name + self.resource_group = GetResourceGroup() + self.args = ['--nsg', self.name] + + self.rules_lock = threading.Lock() + # Mapping of (start_port, end_port, source) -> rule name, used to + # deduplicate rules. 
We expect duplicate rules because PKB will + # call AllowPort() for each VM on a subnet, but the rules are + # actually applied to the entire subnet. + self.rules = {} + # True if the special 'DenyAll' rule is present. + self.have_deny_all_rule = False + + def _Create(self): + vm_util.IssueCommand([ + azure.AZURE_PATH, 'network', 'nsg', 'create', '--location', + self.region, '--name', self.name + ] + self.resource_group.args) + + @vm_util.Retry() + def _Exists(self): + stdout, _, _ = vm_util.IssueCommand( + [ + azure.AZURE_PATH, 'network', 'nsg', 'show', '--output', 'json', + '--name', self.name + ] + self.resource_group.args, + raise_on_failure=False) + + return bool(json.loads(stdout)) + + def _Delete(self): + pass + + def _GetRulePriority(self, rule, rule_name): + # Azure priorities are between 100 and 4096, but we reserve 4095 + # for the special DenyAll rule created by DisallowAllPorts. + rule_priority = 100 + len(self.rules) + if rule_priority >= 4095: + raise ValueError('Too many firewall rules!') + self.rules[rule] = rule_name + return rule_priority + + def AttachToSubnet(self): + vm_util.IssueRetryableCommand([ + azure.AZURE_PATH, 'network', 'vnet', 'subnet', 'update', '--name', + self.subnet.name, '--network-security-group', self.name + ] + self.resource_group.args + self.subnet.vnet.args) + + def _ReturnSubString(self, str_ori, list_bound): + list_ori = str_ori.split(",") + list_sub, list_remain = self._ReturnSubList(list_ori, list_bound) + + # convert them back to string + separator = "," + str_sub = separator.join(list_sub) + str_remain = separator.join(list_remain) + return str_sub, str_remain + + def _ReturnSubList(self, list_orig, list_bound): + list_sublist = list_orig[0:list_bound] + list_remain = list_orig[list_bound:] + return list_sublist, list_remain + + def _CreateAndIssueCommand(self, start_port, end_port, source_range, source_range_str, iteration): + rule = (start_port, end_port, source_range_str) + with self.rules_lock: + if rule in self.rules: + return + port_range = '%s-%s' % (start_port, end_port) + rule_name = 'allow-%s-intel-public-cidrs-%s' % (port_range, iteration) + rule_priority = self._GetRulePriority(rule, rule_name) + + network_cmd = [ + azure.AZURE_PATH, 'network', 'nsg', 'rule', 'create', '--name', + rule_name, '--destination-port-range', port_range, '--access', 'Allow', + '--priority', + str(rule_priority) + ] + ['--source-address-prefixes'] + source_range + network_cmd.extend(self.resource_group.args + self.args) + vm_util.IssueRetryableCommand(network_cmd) + + def AllowPort(self, vm, start_port, end_port=None, source_range=None): + """Open a port or port range. + + Args: + vm: the virtual machine to open the port for. + start_port: either a single port or the start of a range. + end_port: if given, the end of the port range. + source_range: List of source CIDRs to allow for this port. If None, all + sources are allowed. i.e. ['0.0.0.0/0'] + + Raises: + ValueError: when there are too many firewall rules. + """ + source_range = source_range or ['0.0.0.0/0'] + end_port = end_port or start_port + + source_range.sort() + # Replace slashes as they are not allowed in an azure rule name. 
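+ # Azure also limits how many source prefixes a single rule-create call
+ # accepts, so source lists longer than CIDR_LIST_THRESHOLD are split into
+ # multiple rules below, each suffixed with its iteration number.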
+ source_range_str = ','.join(source_range).replace('/', '_') + + list_bound = CIDR_LIST_THRESHOLD + iteration = 1 + while len(source_range) > list_bound: + source_range, source_range_remain = self._ReturnSubList(source_range, list_bound) + source_range_str, source_range_str_remain = self._ReturnSubString(source_range_str, list_bound) + self._CreateAndIssueCommand(start_port, end_port, source_range, source_range_str, iteration) + source_range = source_range_remain + source_range_str = source_range_str_remain + iteration += 1 + + if len(source_range) > 0: + # do some remaining work on source_range and source_range_str + self._CreateAndIssueCommand(start_port, end_port, source_range, source_range_str, iteration) + + def AllowIcmp(self): + source_address = '0.0.0.0/0' + # '*' in Azure represents all ports + rule = ('*', source_address) + rule_name = 'allow-icmp' + with self.rules_lock: + if rule in self.rules: + return + rule_priority = self._GetRulePriority(rule, rule_name) + network_cmd = [ + azure.AZURE_PATH, 'network', 'nsg', 'rule', 'create', '--name', + rule_name, '--access', 'Allow', '--source-address-prefixes', + source_address, '--source-port-ranges', '*', + '--destination-port-ranges', '*', '--priority', + str(rule_priority), '--protocol', 'Icmp' + ] + network_cmd.extend(self.resource_group.args + self.args) + vm_util.IssueRetryableCommand(network_cmd) + + +class AzureFirewall(network.BaseFirewall): + """A fireall on Azure is a Network Security Group. + + NSGs are per-subnet, but this class is per-provider, so we just + proxy methods through to the right NSG instance. + """ + + CLOUD = providers.AZURE + + def AllowPort(self, vm, start_port, end_port=None, source_range=None): + """Opens a port on the firewall. + + Args: + vm: The BaseVirtualMachine object to open the port for. + start_port: The local port to open. + end_port: if given, open the range [start_port, end_port]. + source_range: unsupported at present. + """ + + vm.network.nsg.AllowPort( + vm, start_port, end_port=end_port, source_range=source_range) + + def DisallowAllPorts(self): + """Closes all ports on the firewall.""" + pass + + def AllowIcmp(self, vm): + """Opens the ICMP protocol on the firewall. + + Args: + vm: The BaseVirtualMachine object to open the ICMP protocol for. + """ + + vm.network.nsg.AllowIcmp() + + +class AzureNetwork(network.BaseNetwork): + """Locational network components. + + A container object holding all of the network-related objects that + we need for an Azure zone (aka region). 
+ """ + + CLOUD = providers.AZURE + + def __init__(self, spec): + super(AzureNetwork, self).__init__(spec) + self.resource_group = GetResourceGroup() + self.region = util.GetRegionFromZone(self.zone) + self.availability_zone = util.GetAvailabilityZoneFromZone(self.zone) + + placement_group_spec = azure_placement_group.AzurePlacementGroupSpec( + 'AzurePlacementGroupSpec', + flag_values=FLAGS, + zone=self.zone, + resource_group=self.resource_group.name) + + is_dedicated_host = bool(FLAGS.dedicated_hosts) + in_availability_zone = bool(self.availability_zone) + cluster_placement_group = ( + FLAGS.placement_group_style == placement_group.PLACEMENT_GROUP_CLUSTER) + spread_placement_group = ( + FLAGS.placement_group_style == placement_group.PLACEMENT_GROUP_SPREAD) + + if cluster_placement_group: + self.placement_group = azure_placement_group.AzureProximityGroup( + placement_group_spec) + # With dedicated hosting and/or an availability zone, an availability set + # cannot be created + elif spread_placement_group and not (is_dedicated_host or + in_availability_zone): + self.placement_group = azure_placement_group.AzureAvailSet( + placement_group_spec) + else: + self.placement_group = None + + # Storage account names can't include separator characters :(. + storage_account_prefix = 'pkb%s' % FLAGS.run_uri + + # Storage account names must be 3-24 characters long and use + # numbers and lower-case letters only, which leads us to this + # awful naming scheme. + suffix = 'storage%d' % AzureStorageAccount.total_storage_accounts + self.storage_account = AzureStorageAccount( + FLAGS.azure_storage_type, self.region, + storage_account_prefix[:24 - len(suffix)] + suffix) + + # Length restriction from https://docs.microsoft.com/en-us/azure/azure-resource-manager/management/resource-name-rules#microsoftnetwork pylint: disable=line-too-long + prefix = '%s-%s' % (self.resource_group.name, self.region) + vnet_name = prefix + '-vnet' + if len(vnet_name) > 64: + vnet_name = prefix[:59] + '-vnet' + self.vnet = AzureVirtualNetwork.GetForRegion(spec, self.region, vnet_name) + subnet_name = self.vnet.name + if self.availability_zone: + subnet_name += '-' + self.availability_zone + subnet_name += '-subnet' + self.subnet = AzureSubnet(self.vnet, subnet_name) + self.nsg = AzureNetworkSecurityGroup(self.region, self.subnet, + self.subnet.name + '-nsg') + + @vm_util.Retry() + def Create(self): + """Creates the network.""" + # If the benchmark includes multiple zones, + # self.resource_group.Create() will be called more than once. But + # BaseResource will prevent us from running the underlying Azure + # commands more than once, so that is fine. + self.resource_group.Create() + + if self.placement_group: + self.placement_group.Create() + + self.storage_account.Create() + + self.vnet.Create() + + self.subnet.Create() + + self.nsg.Create() + self.nsg.AttachToSubnet() + + def Delete(self): + """Deletes the network.""" + # If the benchmark includes multiple zones, this will be called + # multiple times, but there will be no bad effects from multiple + # deletes. + self.resource_group.Delete() + + def Peer(self, peering_network): + """Peers the network with the peering_network. + + This method is used for VPC peering. It will connect 2 VPCs together. + + Args: + peering_network: BaseNetwork. The network to peer with. 
+ """ + + # Skip Peering if the networks are the same + if self.vnet is peering_network.vnet: + return + + spec = network.BaseVPCPeeringSpec(self.vnet, + peering_network.vnet) + self.vpc_peering = AzureVpcPeering(spec) + peering_network.vpc_peering = self.vpc_peering + self.vpc_peering.Create() + + @classmethod + def _GetKeyFromNetworkSpec(cls, spec): + """Returns a key used to register Network instances.""" + return (cls.CLOUD, ZONE, spec.zone) + + +class AzureVpcPeering(network.BaseVPCPeering): + """Object containing all information needed to create a VPC Peering Object.""" + + def _Create(self): + """Creates the peering object.""" + self.name = '%s-%s-%s' % (self.network_a.resource_group.name, + self.network_a.region, self.network_b.region) + + # Creates Peering Connection + create_cmd = [ + azure.AZURE_PATH, 'network', 'vnet', 'peering', 'create', + '--name', self.name, + '--vnet-name', self.network_a.name, + '--remote-vnet', self.network_b.name, + '--allow-vnet-access' + ] + self.network_a.resource_group.args + + vm_util.IssueRetryableCommand(create_cmd) + + # Accepts Peering Connection + accept_cmd = [ + azure.AZURE_PATH, 'network', 'vnet', 'peering', 'create', + '--name', self.name, + '--vnet-name', self.network_b.name, + '--remote-vnet', self.network_a.name, + '--allow-vnet-access' + ] + self.network_b.resource_group.args + vm_util.IssueRetryableCommand(accept_cmd) + + logging.info('Created VPC peering between %s and %s', + self.network_a.address_spaces[0], + self.network_b.address_spaces[0]) + + def _Delete(self): + """Deletes the peering connection.""" + # Gets Deleted with resource group deletion + pass diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_placement_group.py b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_placement_group.py new file mode 100644 index 0000000..1203c4d --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_placement_group.py @@ -0,0 +1,148 @@ +# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Class to represent an Azure Placement Group object. + +Cloud specific implementations of Placement Group. +""" + +import abc +import json + +from absl import flags +from perfkitbenchmarker import placement_group +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.configs import option_decoders +from perfkitbenchmarker.providers import azure +from perfkitbenchmarker.providers.azure import util + + +FLAGS = flags.FLAGS + + +class AzurePlacementGroupSpec(placement_group.BasePlacementGroupSpec): + """Object containing the information needed to create an AzurePlacementGroup. + + Attributes: + zone: The Azure zone the Placement Group is in. + """ + + CLOUD = providers.AZURE + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. 
Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword + arguments to construct in order to decode the named option. + """ + result = super(AzurePlacementGroupSpec, + cls)._GetOptionDecoderConstructions() + result.update({ + 'resource_group': (option_decoders.StringDecoder, {'none_ok': False}), + 'placement_group_style': (option_decoders.EnumDecoder, { + 'valid_values': placement_group.PLACEMENT_GROUP_OPTIONS, + 'default': placement_group.PLACEMENT_GROUP_NONE, + }) + }) + return result + + +class AzurePlacementGroup(placement_group.BasePlacementGroup): + """Object representing an Azure Placement Group.""" + + CLOUD = providers.AZURE + + def __init__(self, azure_placement_group_spec): + """Init method for AzurePlacementGroup. + + Args: + azure_placement_group_spec: Object containing the + information needed to create an AzurePlacementGroup. + """ + super(AzurePlacementGroup, self).__init__(azure_placement_group_spec) + self.resource_group = azure_placement_group_spec.resource_group + self.name = '%s-%s' % (self.resource_group, self.zone) + self.region = util.GetRegionFromZone(self.zone) + self.strategy = azure_placement_group_spec.placement_group_style + + @abc.abstractmethod + def AddVmArgs(self): + """List of arguments to add to vm creation.""" + raise NotImplementedError() + + +class AzureAvailSet(AzurePlacementGroup): + """Object representing an Azure Availability Set.""" + + def _Create(self): + """Create the availability set.""" + create_cmd = [ + azure.AZURE_PATH, 'vm', 'availability-set', 'create', + '--resource-group', self.resource_group, '--name', self.name + ] + if self.region: + create_cmd.extend(['--location', self.region]) + vm_util.IssueCommand(create_cmd) + + def _Delete(self): + pass + + @vm_util.Retry() + def _Exists(self): + """Returns True if the availability set exists.""" + show_cmd = [ + azure.AZURE_PATH, 'vm', 'availability-set', 'show', '--output', 'json', + '--resource-group', self.resource_group, '--name', self.name + ] + stdout, _, _ = vm_util.IssueCommand(show_cmd, raise_on_failure=False) + return bool(json.loads(stdout)) + + def AddVmArgs(self): + """Returns Azure command to add VM to availability set.""" + return ['--availability-set', self.name] + + +class AzureProximityGroup(AzurePlacementGroup): + """Object representing an Azure Proximity Placement Group.""" + + def _Create(self): + """Create the Proximity Placement Group.""" + create_cmd = [ + azure.AZURE_PATH, 'ppg', 'create', + '--resource-group', self.resource_group, '--name', self.name + ] + if self.region: + create_cmd.extend(['--location', self.region]) + vm_util.IssueCommand(create_cmd) + + def _Delete(self): + pass + + @vm_util.Retry() + def _Exists(self): + """Returns True if the Proximity Placement Group exists.""" + show_cmd = [ + azure.AZURE_PATH, 'ppg', 'show', '--output', 'json', + '--resource-group', self.resource_group, '--name', self.name + ] + stdout, _, _ = vm_util.IssueCommand(show_cmd, raise_on_failure=False) + return bool(json.loads(stdout)) + + def AddVmArgs(self): + """Returns Azure command to add VM to placement group.""" + return ['--ppg', self.name] diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_redis_cache.py b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_redis_cache.py new file mode 100644 index 0000000..10cfb2e --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_redis_cache.py @@ -0,0 +1,190 @@ +# Copyright 2018 PerfKitBenchmarker 
Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module containing classes for Azure Redis Cache. +""" + +import json +import time + +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import managed_memory_store +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers import azure +from perfkitbenchmarker.providers.azure import azure_network + +FLAGS = flags.FLAGS +# 15min timeout for issuing az redis delete command. +TIMEOUT = 900 +EXISTS_RETRY_TIMES = 3 +EXISTS_RETRY_POLL = 30 + + +class AzureRedisCache(managed_memory_store.BaseManagedMemoryStore): + """Object representing an Azure Redis Cache.""" + + CLOUD = providers.AZURE + MEMORY_STORE = managed_memory_store.REDIS + + # Azure redis could take up to an hour to create + READY_TIMEOUT = 60 * 60 # 60 minutes + + def __init__(self, spec): + super(AzureRedisCache, self).__init__(spec) + self.redis_region = FLAGS.cloud_redis_region + self.resource_group = azure_network.GetResourceGroup(self.redis_region) + self.azure_redis_size = FLAGS.azure_redis_size + self.failover_style = FLAGS.redis_failover_style + if self.failover_style == managed_memory_store.Failover.FAILOVER_SAME_REGION: + self.azure_tier = 'Premium' + else: + self.azure_tier = 'Basic' + self.redis_version = None + + def GetResourceMetadata(self): + """Returns a dict containing metadata about the cache. + + Returns: + dict mapping string property key to value. + """ + result = { + 'cloud_redis_failover_style': + self.failover_style, + 'cloud_redis_region': + self.redis_region, + 'cloud_redis_azure_tier': + self.azure_tier, + 'cloud_redis_azure_redis_size': + self.azure_redis_size, + 'cloud_redis_version': + managed_memory_store.ParseReadableVersion(self.redis_version), + } + return result + + @staticmethod + def CheckPrerequisites(benchmark_config): + """Check benchmark prerequisites on the input flag parameters. + + Args: + benchmark_config: Unused. + + Raises: + errors.Config.InvalidValue: Input flag parameters are invalid. + """ + if FLAGS.managed_memory_store_version: + raise errors.Config.InvalidValue( + 'Custom Redis version not supported on Azure Redis. 
') + if FLAGS.redis_failover_style in [ + managed_memory_store.Failover.FAILOVER_SAME_ZONE]: + raise errors.Config.InvalidValue( + 'Azure redis with failover in the same zone is not supported.') + + def _Create(self): + """Creates the cache.""" + cmd = [ + azure.AZURE_PATH, 'redis', 'create', + '--resource-group', self.resource_group.name, + '--location', self.redis_region, + '--name', self.name, + '--sku', self.azure_tier, + '--vm-size', self.azure_redis_size, + '--enable-non-ssl-port', + ] + vm_util.IssueCommand(cmd, timeout=TIMEOUT) + + def _Delete(self): + """Deletes the cache.""" + cmd = [ + azure.AZURE_PATH, 'redis', 'delete', + '--resource-group', self.resource_group.name, + '--name', self.name, + '--yes', + ] + vm_util.IssueCommand(cmd, timeout=TIMEOUT) + + def DescribeCache(self): + """Calls show on the cache to get information about it. + + Returns: + stdout, stderr and retcode. + """ + stdout, stderr, retcode = vm_util.IssueCommand([ + azure.AZURE_PATH, 'redis', 'show', + '--resource-group', self.resource_group.name, + '--name', self.name, + ], raise_on_failure=False) + return stdout, stderr, retcode + + def _Exists(self): + """Returns True if the cache exists. + + Returns: + True if cache exists and false otherwise. + """ + # Retry to ensure there is no transient error in describe cache + for _ in range(EXISTS_RETRY_TIMES): + _, _, retcode = self.DescribeCache() + + if retcode == 0: + return True + time.sleep(EXISTS_RETRY_POLL) + return retcode == 0 + + def _IsReady(self): + """Returns True if the cache is ready. + + Returns: + True if cache is ready and false otherwise. + """ + stdout, _, retcode = self.DescribeCache() + if (retcode == 0 and + json.loads(stdout).get('provisioningState', None) == 'Succeeded'): + self.redis_version = json.loads(stdout).get('redisVersion', 'unspecified') + return True + return False + + def GetMemoryStorePassword(self): + """See base class.""" + if not self._password: + self._PopulateEndpoint() + return self._password + + @vm_util.Retry(max_retries=5) + def _PopulateEndpoint(self): + """Populates endpoint information for the instance. + + Raises: + errors.Resource.RetryableGetError: + Failed to retrieve information on cache. + """ + stdout, _, retcode = self.DescribeCache() + if retcode != 0: + raise errors.Resource.RetryableGetError( + 'Failed to retrieve information on %s.', self.name) + response = json.loads(stdout) + self._ip = response['hostName'] + self._port = response['port'] + + stdout, _, retcode = vm_util.IssueCommand([ + azure.AZURE_PATH, 'redis', 'list-keys', + '--resource-group', self.resource_group.name, + '--name', self.name, + ], raise_on_failure=False) + if retcode != 0: + raise errors.Resource.RetryableGetError( + 'Failed to retrieve information on %s.', self.name) + response = json.loads(stdout) + self._password = response['primaryKey'] diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_relational_db.py b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_relational_db.py new file mode 100644 index 0000000..b60d07b --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_relational_db.py @@ -0,0 +1,573 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Relational database provisioning and teardown for Azure RDS.""" + +import datetime +import json +import logging +import time + +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import providers +from perfkitbenchmarker import relational_db +from perfkitbenchmarker import sql_engine_utils +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers import azure +from perfkitbenchmarker.providers.azure import azure_network +from perfkitbenchmarker.providers.azure import util + +DEFAULT_DATABASE_NAME = 'database' + +FLAGS = flags.FLAGS + +DEFAULT_MYSQL_VERSION = '5.7' +DEFAULT_POSTGRES_VERSION = '9.6' +DEFALUT_SQLSERVER_VERSION = 'DEFAULT' + +# Disk size configurations details at +# https://docs.microsoft.com/en-us/cli/azure/mysql/server?view=azure-cli-latest#az_mysql_server_create +AZURE_MIN_DB_DISK_SIZE_MB = 5120 # Minimum db disk size supported by Azure +AZURE_MAX_DB_DISK_SIZE_MB = 16777216 # Maximum db disk size supported by Azure + +IS_READY_TIMEOUT = 60 * 60 * 1 # 1 hour (might take some time to prepare) + +# Longest time recorded is 20 minutes when +# creating STANDARD_D64_V3 - 12/02/2020 +# The Azure command timeout with the following error message: +# +# Deployment failed. Correlation ID: fcdc3c76-33cc-4eb1-986c-fbc30ce7d820. +# The operation timed out and automatically rolled back. +# Please retry the operation. +CREATE_AZURE_DB_TIMEOUT = 60 * 30 + + +class AzureRelationalDb(relational_db.BaseRelationalDb): + """An object representing an Azure RDS relational database. + + Currently Postgres is supported. This class requires that a + client vm be available as an attribute on the instance before Create() is + called, which is the current behavior of PKB. This is necessary to setup the + networking correctly. The following steps are performed to provision the + database: + 1. create the RDS instance in the requested region. + + Instructions from: + https://docs.microsoft.com/en-us/azure/postgresql/quickstart-create-server-database-azure-cli + + On teardown, all resources are deleted. + + Note that the client VM's region and the region requested for the database + must be the same. + + """ + CLOUD = providers.AZURE + + database_name: str + + def __init__(self, relational_db_spec): + super(AzureRelationalDb, self).__init__(relational_db_spec) + if util.IsZone(self.spec.db_spec.zone): + raise errors.Config.InvalidValue( + 'Availability zones are currently not supported by Azure DBs') + self.region = util.GetRegionFromZone(self.spec.db_spec.zone) + self.resource_group = azure_network.GetResourceGroup(self.region) + + self.unmanaged_db_exists = None if self.is_managed_db else False + + def GetResourceMetadata(self): + """Returns the metadata associated with the resource. + + All keys will be prefaced with relational_db before + being published (done in publisher.py). + + Returns: + metadata: dict of Azure DB metadata. 
+ + """ + metadata = super(AzureRelationalDb, self).GetResourceMetadata() + metadata.update({ + 'zone': self.spec.db_spec.zone, + }) + + if hasattr(self.spec.db_disk_spec, 'iops'): + metadata.update({ + 'disk_iops': self.spec.db_disk_spec.iops, + }) + + return metadata + + @staticmethod + def GetDefaultEngineVersion(engine): + """Returns the default version of a given database engine. + + Args: + engine (string): type of database (my_sql or postgres). + Returns: + (string): Default engine version. + Raises: + RelationalDbEngineNotFoundError: if an unknown engine is + requested. + """ + if engine == sql_engine_utils.POSTGRES: + return DEFAULT_POSTGRES_VERSION + elif engine == sql_engine_utils.MYSQL: + return DEFAULT_MYSQL_VERSION + elif engine == sql_engine_utils.SQLSERVER: + return DEFALUT_SQLSERVER_VERSION + else: + raise relational_db.RelationalDbEngineNotFoundError( + 'Unsupported engine {0}'.format(engine)) + + def GetAzCommandForEngine(self): + engine = self.spec.engine + if engine == sql_engine_utils.POSTGRES: + return 'postgres' + elif engine == sql_engine_utils.MYSQL: + return 'mysql' + elif engine == sql_engine_utils.SQLSERVER: + return 'sql' + raise relational_db.RelationalDbEngineNotFoundError( + 'Unsupported engine {0}'.format(engine)) + + def GetConfigFromMachineType(self, machine_type): + """Returns a tuple of (edition, family, vcore) from Azure machine type. + + Args: + machine_type (string): Azure machine type i.e GP_Gen5_4 + Returns: + (string, string, string): edition, family, vcore + Raises: + UnsupportedError: if the machine type is not supported. + """ + machine_type = machine_type.split('_') + if len(machine_type) != 3: + raise relational_db.UnsupportedError( + 'Unsupported machine type {0},' + ' sample machine type GP_Gen5_2'.format(machine_type)) + edition = machine_type[0] + if edition == 'BC': + edition = 'BusinessCritical' + elif edition == 'GP': + edition = 'GeneralPurpose' + else: + raise relational_db.UnsupportedError( + 'Unsupported edition {}. Only supports BC or GP'.format(machine_type)) + + family = machine_type[1] + vcore = machine_type[2] + return (edition, family, vcore) + + def SetDbConfiguration(self, name, value): + """Set configuration for the database instance. 
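+    Implemented by issuing `az <engine> server configuration set` against the
+    managed server identified by self.instance_id.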
+ + Args: + name: string, the name of the settings to change + value: value, string the value to set + """ + cmd = [ + azure.AZURE_PATH, + self.GetAzCommandForEngine(), + 'server', + 'configuration', + 'set', + '--name', + name, + '--value', + value, + '--resource-group', + self.resource_group.name, + '--server', + self.instance_id + ] + vm_util.IssueCommand(cmd) + + def RenameDatabase(self, new_name): + """Renames an the database instace.""" + engine = self.spec.engine + if engine == sql_engine_utils.SQLSERVER: + cmd = [ + azure.AZURE_PATH, + self.GetAzCommandForEngine(), + 'db', + 'rename', + '--resource-group', + self.resource_group.name, + '--server', + self.instance_id, + '--name', + self.database_name, + '--new-name', + new_name + ] + vm_util.IssueCommand(cmd) + self.database_name = new_name + else: + raise relational_db.RelationalDbEngineNotFoundError( + 'Unsupported engine {0}'.format(engine)) + + def _ApplyManagedDbFlags(self): + """Applies the MySqlFlags to a managed instance.""" + for flag in FLAGS.db_flags: + name_and_value = flag.split('=') + cmd = [ + azure.AZURE_PATH, + self.GetAzCommandForEngine(), 'server', 'configuration', 'set', + '--name', name_and_value[0], '--resource-group', + self.resource_group.name, '--server', self.instance_id, '--value', + name_and_value[1] + ] + _, stderr, _ = vm_util.IssueCommand(cmd, raise_on_failure=False) + if stderr: + raise Exception('Invalid MySQL flags: {0}. Error {1}'.format( + name_and_value, stderr)) + + self._Reboot() + + def _CreateMySqlOrPostgresInstance(self): + """Creates a managed MySql or Postgres instance.""" + if not self.spec.high_availability: + raise Exception('Azure databases can only be used in high ' + 'availability. Please rerurn with flag ' + '--managed_db_high_availability=True') + + # Valid storage sizes range from minimum of 5120 MB + # and additional increments of 1024 MB up to maximum of 16777216 MB. 
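+    # Worked example (hypothetical spec): a 128 GB db_disk_spec becomes
+    # 128 * 1024 = 131072 MB, which sits inside the allowed
+    # [5120, 16777216] MB window; a 4 GB (4096 MB) or 17 TB request would be
+    # rejected by the checks below.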
+ azure_disk_size_mb = self.spec.db_disk_spec.disk_size * 1024 + if azure_disk_size_mb > AZURE_MAX_DB_DISK_SIZE_MB: + error_msg = ('Azure disk size was specified as in the disk spec as %s,' + 'got rounded to %s which is greater than the ' + 'maximum of 16777216 MB' % ( + self.spec.db_disk_spec.disk_size, azure_disk_size_mb)) + raise errors.Config.InvalidValue(error_msg) + + elif azure_disk_size_mb < AZURE_MIN_DB_DISK_SIZE_MB: + error_msg = ('Azure disk size was specified ' + 'as in the disk spec as %s, got rounded to %s ' + 'which is smaller than the minimum of 5120 MB' % ( + self.spec.db_disk_spec.disk_size, azure_disk_size_mb)) + raise errors.Config.InvalidValue(error_msg) + + cmd = [ + azure.AZURE_PATH, + self.GetAzCommandForEngine(), + 'server', + 'create', + '--resource-group', + self.resource_group.name, + '--name', + self.instance_id, + '--location', + self.region, + '--admin-user', + self.spec.database_username, + '--admin-password', + self.spec.database_password, + '--storage-size', + str(azure_disk_size_mb), + '--sku-name', + self.spec.db_spec.machine_type, + '--version', + self.spec.engine_version, + ] + + vm_util.IssueCommand(cmd, timeout=CREATE_AZURE_DB_TIMEOUT) + + def _CreateSqlServerInstance(self): + """Creates a managed sql server instance.""" + cmd = [ + azure.AZURE_PATH, + self.GetAzCommandForEngine(), + 'server', + 'create', + '--resource-group', + self.resource_group.name, + '--name', + self.instance_id, + '--location', + self.region, + '--admin-user', + self.spec.database_username, + '--admin-password', + self.spec.database_password + ] + vm_util.IssueCommand(cmd) + + # Azure support two ways of specifying machine type DTU or with vcores + # if compute units is specified we will use the DTU model + if self.spec.db_spec.compute_units is not None: + # Supported families & capacities for 'Standard' are: + # [(None, 10), (None, 20), (None, 50), (None, 100), (None, 200), + # (None, 400), (None, 800), (None, 1600), (None, 3000)] + + # Supported families & capacities for 'Premium' are: + # [(None, 125), (None, 250), (None, 500), (None, 1000), (None, 1750), + # (None, 4000)]. 
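+      # With the DTU model the database is created with, e.g.,
+      # '--edition Standard --capacity 100'; otherwise the vCore branch below
+      # parses a machine type such as GP_Gen5_2 into edition/family/capacity.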
+ + cmd = [ + azure.AZURE_PATH, + self.GetAzCommandForEngine(), + 'db', + 'create', + '--resource-group', + self.resource_group.name, + '--server', + self.instance_id, + '--name', + DEFAULT_DATABASE_NAME, + '--edition', + self.spec.db_spec.tier, + '--capacity', + str(self.spec.db_spec.compute_units), + '--zone-redundant', + 'true' if self.spec.high_availability else 'false' + ] + else: + # Sample machine_type: GP_Gen5_2 + edition, family, vcore = ( + self.GetConfigFromMachineType(self.spec.db_spec.machine_type)) + cmd = [ + azure.AZURE_PATH, + self.GetAzCommandForEngine(), + 'db', + 'create', + '--resource-group', + self.resource_group.name, + '--server', + self.instance_id, + '--name', + DEFAULT_DATABASE_NAME, + '--edition', + edition, + '--family', + family, + '--capacity', + vcore, + '--zone-redundant', + 'true' if self.spec.high_availability else 'false' + ] + vm_util.IssueCommand(cmd) + self.database_name = DEFAULT_DATABASE_NAME + + def _CreateAzureManagedSqlInstance(self): + """Creates an Azure Sql Instance from a managed service.""" + if self.spec.engine == sql_engine_utils.POSTGRES: + self._CreateMySqlOrPostgresInstance() + elif self.spec.engine == sql_engine_utils.MYSQL: + self._CreateMySqlOrPostgresInstance() + elif self.spec.engine == sql_engine_utils.SQLSERVER: + self._CreateSqlServerInstance() + else: + raise NotImplementedError('Unknown how to create Azure data base ' + 'engine {0}'.format(self.spec.engine)) + + def _CreateAzureUnmanagedSqlInstance(self): + """Creates an Azure Sql Instance hosted inside of a VM.""" + self.endpoint = self.server_vm.ip_address + self._SetupUnmanagedDatabase() + self.firewall = azure_network.AzureFirewall() + self.firewall.AllowPort( + self.server_vm, + self.port, + source_range=['%s/32' % self.client_vm.ip_address]) + + def _Create(self): + """Creates the Azure RDS instance. + + Raises: + NotImplementedError: if unknown how to create self.spec.engine. + Exception: if attempting to create a non high availability database. + + """ + if self.is_managed_db: + self._CreateAzureManagedSqlInstance() + else: + self.unmanaged_db_exists = True + self._CreateAzureUnmanagedSqlInstance() + + def _Delete(self): + """Deletes the underlying resource. + + Implementations of this method should be idempotent since it may + be called multiple times, even if the resource has already been + deleted. + """ + if not self.is_managed_db: + if hasattr(self, 'firewall'): + self.firewall.DisallowAllPorts() + self.unmanaged_db_exists = False + self.PrintUnmanagedDbStats() + return + + cmd = [ + azure.AZURE_PATH, + self.GetAzCommandForEngine(), + 'server', + 'delete', + '--resource-group', self.resource_group.name, + '--name', self.instance_id, + '--yes' + ] + vm_util.IssueCommand(cmd, raise_on_failure=False) + + def _Exists(self): + """Returns true if the underlying resource exists. + + Supplying this method is optional. If it is not implemented then the + default is to assume success when _Create and _Delete do not raise + exceptions. + """ + if not self.is_managed_db: + return self.unmanaged_db_exists + + json_server_show = self._AzServerShow() + if json_server_show is None: + return False + return True + + def _IsReady(self, timeout=IS_READY_TIMEOUT): + """Return true if the underlying resource is ready. + + This method will query the instance every 5 seconds until + its instance state is 'available', or until a timeout occurs. 
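+    (For a managed Azure database this delegates to _IsInstanceReady, which
+    polls _AzServerShow until the reported state is 'Ready'.)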
+ + Args: + timeout: timeout in seconds + + Returns: + True if the resource was ready in time, False if the wait timed out + or an Exception occurred. + """ + + return self._IsInstanceReady(timeout) + + def _PostCreate(self): + """Perform general post create operations on the cluster. + + """ + super()._PostCreate() + + if self.is_managed_db: + cmd = [ + azure.AZURE_PATH, + self.GetAzCommandForEngine(), + 'server', + 'firewall-rule', + 'create', + '--resource-group', self.resource_group.name, + '--server', self.instance_id, + '--name', 'AllowAllIps', + '--start-ip-address', '0.0.0.0', + '--end-ip-address', '255.255.255.255' + ] + vm_util.IssueCommand(cmd) + self._AssignEndpointForWriterInstance() + + if self.spec.engine == 'mysql' or self.spec.engine == 'postgres': + # Azure will add @domainname after the database username + self.spec.database_username = (self.spec.database_username + '@' + + self.endpoint.split('.')[0]) + + self.client_vm_query_tools.InstallPackages() + + def _Reboot(self): + """Reboot the managed db.""" + cmd = [ + azure.AZURE_PATH, + self.GetAzCommandForEngine(), + 'server', + 'restart', + '--resource-group', self.resource_group.name, + '--name', self.instance_id + ] + vm_util.IssueCommand(cmd) + + if not self._IsInstanceReady(): + raise Exception('Instance could not be set to ready after ' + 'reboot') + + def _IsInstanceReady(self, timeout=IS_READY_TIMEOUT): + """Return true if the instance is ready. + + This method will query the instance every 5 seconds until + its instance state is 'Ready', or until a timeout occurs. + + Args: + timeout: timeout in seconds + + Returns: + True if the resource was ready in time, False if the wait timed out + or an Exception occurred. + """ + if not self.is_managed_db: + return self._IsReadyUnmanaged() + + start_time = datetime.datetime.now() + + while True: + if (datetime.datetime.now() - start_time).seconds >= timeout: + logging.warning('Timeout waiting for sql instance to be ready') + return False + + server_show_json = self._AzServerShow() + if server_show_json is not None: + engine = self.spec.engine + if engine == sql_engine_utils.POSTGRES: + state = server_show_json['userVisibleState'] + elif engine == sql_engine_utils.MYSQL: + state = server_show_json['userVisibleState'] + elif engine == sql_engine_utils.SQLSERVER: + state = server_show_json['state'] + else: + raise relational_db.RelationalDbEngineNotFoundError( + 'The db engine does not contain a valid state') + + if state == 'Ready': + break + time.sleep(5) + + return True + + def _AzServerShow(self): + """Runs the azure command az server show. + + Returns: + json object representing holding the of the show command on success. + None for a non 0 retcode. A non 0 retcode can occur if queried + before the database has finished being created. + """ + cmd = [ + azure.AZURE_PATH, + self.GetAzCommandForEngine(), + 'server', + 'show', + '--resource-group', self.resource_group.name, + '--name', self.instance_id + ] + stdout, _, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + if retcode != 0: + return None + json_output = json.loads(stdout) + return json_output + + def _AssignEndpointForWriterInstance(self): + """Assigns the ports and endpoints from the instance_id to self. 
+ + These will be used to communicate with the data base + """ + server_show_json = self._AzServerShow() + self.endpoint = server_show_json['fullyQualifiedDomainName'] + + def _FailoverHA(self): + raise NotImplementedError() diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_service_bus.py b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_service_bus.py new file mode 100644 index 0000000..84f9b1d --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_service_bus.py @@ -0,0 +1,187 @@ +"""Azure Service Bus interface for resources. + +This class handles resource creation/cleanup for messaging service benchmark +on Azure Service Bus. +https://docs.microsoft.com/en-us/azure/service-bus-messaging/ +""" + +import json +import logging +import os +from typing import Any, Dict + +from absl import flags +from perfkitbenchmarker import messaging_service as msgsvc +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers import azure +from perfkitbenchmarker.providers.azure import azure_network + +FLAGS = flags.FLAGS +MESSAGING_SERVICE_SCRIPTS_VM_AZURE_DIR = os.path.join( + msgsvc.MESSAGING_SERVICE_SCRIPTS_VM_LIB_DIR, 'azure') +MESSAGING_SERVICE_SCRIPTS_AZURE_PREFIX = 'messaging_service_scripts/azure' +MESSAGING_SERVICE_SCRIPTS_AZURE_FILES = [ + '__init__.py', 'azure_service_bus_client.py' +] +MESSAGING_SERVICE_SCRIPTS_AZURE_BIN = 'messaging_service_scripts/azure_benchmark.py' + + +class AzureServiceBus(msgsvc.BaseMessagingService): + """Azure Service Bus Interface Class.""" + + CLOUD = providers.AZURE + + def __init__(self): + super().__init__() + self.topic_name = 'pkb-topic-{0}'.format(FLAGS.run_uri) + self.subscription_name = 'pkb-subscription-{0}'.format(FLAGS.run_uri) + self.namespace_name = 'pkb-namespace-{0}'.format(FLAGS.run_uri) + self.resource_group = azure_network.GetResourceGroup() + + def _Create(self): + """Handles provision of resources needed for Azure Service Bus benchmark.""" + self._CreateNamespace() + self._CreateTopic() + self._CreateSubscription() + + def _Exists(self): + return (self._NamespaceExists() and self._TopicExists() and + self._SubscriptionExists()) + + def _Delete(self): + self._DeleteSubscription() + self._DeleteTopic() + self._DeleteNamespace() + + def _IsDeleting(self): + """Overrides BaseResource._IsDeleting. + + Used internally while deleting to check if the deletion is still in + progress. + + Returns: + A bool. True if the resource is not yet deleted, else False. + """ + return (self._NamespaceExists() or self._TopicExists() or + self._SubscriptionExists()) + + def Run(self, benchmark_scenario: str, number_of_messages: str, + message_size: str) -> Dict[str, Any]: + connection_str = self._GetPrimaryConnectionString() + command = (f'python3 -m azure_benchmark ' + f'--topic_name={self.topic_name} ' + f'--subscription_name={self.subscription_name} ' + f'--benchmark_scenario={benchmark_scenario} ' + f'--number_of_messages={number_of_messages} ' + f'--message_size={message_size} ' + f'--connection_str="{connection_str}" ') + results = self.client_vm.RemoteCommand(command) + results = json.loads(results[0]) + return results + + def _InstallCloudClients(self): + # Install/uploads Azure specific modules/files. 
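+    # The client VM gets the azure-servicebus pip package plus the benchmark
+    # driver and helper scripts, so that Run() can invoke
+    # `python3 -m azure_benchmark` remotely against the namespace created by
+    # this resource.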
+ self.client_vm.RemoteCommand( + 'sudo pip3 install azure-servicebus', ignore_failure=False) + + self._CopyFiles(MESSAGING_SERVICE_SCRIPTS_AZURE_PREFIX, + MESSAGING_SERVICE_SCRIPTS_AZURE_FILES, + MESSAGING_SERVICE_SCRIPTS_VM_AZURE_DIR) + self.client_vm.PushDataFile(MESSAGING_SERVICE_SCRIPTS_AZURE_BIN) + + @property + def location(self): + return self.client_vm.zone + + def _CreateTopic(self): + """Creates Service Bus topic.""" + cmd = [ + azure.AZURE_PATH, 'servicebus', 'topic', 'create', '--name', + self.topic_name, '--namespace-name', self.namespace_name + ] + self.resource_group.args + vm_util.IssueCommand(cmd) + + def _TopicExists(self) -> bool: + """Checks whether Service Bus topic already exists.""" + cmd = [ + azure.AZURE_PATH, 'servicebus', 'topic', 'show', '--name', + self.topic_name, '--namespace-name', self.namespace_name + ] + self.resource_group.args + _, _, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + return retcode == 0 + + def _DeleteTopic(self): + """Handle Service Bus topic deletion.""" + cmd = [ + azure.AZURE_PATH, 'servicebus', 'topic', 'delete', '--name', + self.topic_name, '--namespace-name', self.namespace_name + ] + self.resource_group.args + vm_util.IssueCommand(cmd, raise_on_failure=False) + + def _CreateSubscription(self): + """Creates Service Bus subscription.""" + cmd = [ + azure.AZURE_PATH, 'servicebus', 'topic', 'subscription', 'create', + '--name', self.subscription_name, '--topic-name', self.topic_name, + '--namespace-name', self.namespace_name + ] + self.resource_group.args + vm_util.IssueCommand(cmd) + + def _SubscriptionExists(self) -> bool: + """Checks whether Service Bus subscription already exists.""" + cmd = [ + azure.AZURE_PATH, 'servicebus', 'topic', 'subscription', 'show', + '--name', self.subscription_name, '--topic-name', self.topic_name, + '--namespace-name', self.namespace_name + ] + self.resource_group.args + _, _, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + return retcode == 0 + + def _DeleteSubscription(self): + """Handle Service Bus subscription deletion.""" + cmd = [ + azure.AZURE_PATH, 'servicebus', 'topic', 'subscription', 'delete', + '--name', self.subscription_name, '--topic-name', self.topic_name, + '--namespace-name', self.namespace_name + ] + self.resource_group.args + vm_util.IssueCommand(cmd, raise_on_failure=False) + + def _CreateNamespace(self): + """Creates an Azure Service Bus Namespace.""" + cmd = [ + azure.AZURE_PATH, 'servicebus', 'namespace', 'create', '--name', + self.namespace_name, '--location', self.location + ] + self.resource_group.args + vm_util.IssueCommand(cmd) + + def _NamespaceExists(self) -> bool: + """Checks if our Service Bus Namespace exists.""" + cmd = [ + azure.AZURE_PATH, 'servicebus', 'namespace', 'show', '--name', + self.namespace_name + ] + self.resource_group.args + _, _, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + return retcode == 0 + + def _DeleteNamespace(self): + """Deletes the Azure Service Bus namespace.""" + cmd = [ + azure.AZURE_PATH, 'servicebus', 'namespace', 'delete', '--name', + self.namespace_name + ] + self.resource_group.args + vm_util.IssueCommand(cmd, raise_on_failure=False) + + def _GetPrimaryConnectionString(self): + """Gets Azure Service Bus Namespace connection string.""" + cmd = [ + azure.AZURE_PATH, 'servicebus', 'namespace', 'authorization-rule', + 'keys', 'list', '--name=RootManageSharedAccessKey', '--namespace-name', + self.namespace_name, '--query=primaryConnectionString', '-o=tsv' + ] + self.resource_group.args + 
output, stderror, retcode = vm_util.IssueCommand( + cmd, raise_on_failure=False) + if retcode: + logging.warning( + 'Failed to get Service Bus Namespace connection string! %s', stderror) + return output.strip() diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_smb_service.py b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_smb_service.py new file mode 100644 index 0000000..f6013e2 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_smb_service.py @@ -0,0 +1,162 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Azure SMB implementation. + +See Azure Files +https://docs.microsoft.com/en-us/azure/storage/files/storage-files-introduction + +This launches an Azure Files instance and creates a mount point. Individual +AzureDisks will then mount the share. + +The AzureSmbService object is a resource.BaseResource that has two resources +underneath it: +1. A resource to connect to the filer. +2. A resource to connect to the mount point on the filer. + +Lifecycle: +1. Azure Files service created and blocks until it is available, as it is needed + to make the mount point. +2. Issues a non-blocking call to create the mount point. Does not block as the + SmbDisk will block on it being available. +3. The SmbDisk then mounts the mount point and uses the disk like normal. +4. On teardown the mount point is first deleted. Blocks on that returning. +5. The Azure Files service is then deleted. Does not block as can take some + time. + +""" + +import json +import logging +from typing import List + +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import network +from perfkitbenchmarker import providers +from perfkitbenchmarker import smb_service +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers import azure +from perfkitbenchmarker.providers.azure import azure_network +from perfkitbenchmarker.providers.azure import util + +FLAGS = flags.FLAGS + + +class AzureSmbService(smb_service.BaseSmbService): + """An Azure SMB resource. + + Creates the Azure Files file system and mount point for use with SMB clients. 
+ + See + https://docs.microsoft.com/en-us/azure/storage/files/storage-files-introduction + """ + + CLOUD = providers.AZURE + SMB_TIERS = ('Standard_LRS', 'Premium_LRS') + # TODO(spencerkim): Add smb tier and version to metadata + DEFAULT_SMB_VERSION = '3.0' + DEFAULT_TIER = 'Standard_LRS' + + def __init__(self, disk_spec, zone): + super(AzureSmbService, self).__init__(disk_spec, zone) + self.name = 'azure-smb-fs-%s' % FLAGS.run_uri + self.region = util.GetRegionFromZone(self.zone) + self.resource_group = azure_network.GetResourceGroup(self.region) + + # set during _Create() + self.connection_args: List[str] = None + self.storage_account_key: str = None + self.storage_account_name: str = None + + @property + def network(self): + network_spec = network.BaseNetworkSpec(self.zone) + return azure_network.AzureNetwork.GetNetworkFromNetworkSpec(network_spec) + + def GetRemoteAddress(self): + logging.debug('Calling GetRemoteAddress on SMB server %s', self.name) + if self.name is None: + raise errors.Resource.RetryableGetError('Filer not created') + return '//{storage}.file.core.windows.net/{name}'.format( + storage=self.storage_account_name, name=self.name) + + def GetStorageAccountAndKey(self): + logging.debug('Calling GetStorageAccountAndKey on SMB server %s', self.name) + if self.name is None: + raise errors.Resource.RetryableGetError('Filer not created') + return {'user': self.storage_account_name, 'pw': self.storage_account_key} + + def _Create(self): + """Creates an Azure Files share. + + For Standard Files, see + https://docs.microsoft.com/en-us/azure/storage/files/storage-how-to-create-file-share#create-file-share-through-command-line-interface-cli + and for Premium Files, see + https://docs.microsoft.com/en-us/azure/storage/files/storage-how-to-create-premium-fileshare#create-a-premium-file-share-using-azure-cli + """ + logging.info('Creating SMB server %s', self.name) + if FLAGS.smb_tier == 'Standard_LRS': + storage_account_number = azure_network.AzureStorageAccount.total_storage_accounts - 1 + self.storage_account_name = 'pkb%s' % FLAGS.run_uri + 'storage' + str( + storage_account_number) + elif FLAGS.smb_tier == 'Premium_LRS': + storage_account_number = ( + azure_network.AzureStorageAccount.total_storage_accounts) + self.storage_account_name = 'pkb%s' % FLAGS.run_uri + 'filestorage' + str( + storage_account_number) + # Premium Files uses a different storage account kind from Standard Files. + # See links in description for more details. 
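+      # The account below is created with kind='FileStorage'; the region
+      # falls back to 'westus2' when FLAGS.zone[0] is unset.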
+ self.storage_account = azure_network.AzureStorageAccount( + storage_type='Premium_LRS', + region=FLAGS.zone[0] or 'westus2', + name=self.storage_account_name, + kind='FileStorage', + resource_group=self.resource_group, + use_existing=False) + self.storage_account.Create() + + self.connection_args = util.GetAzureStorageConnectionArgs( + self.storage_account_name, self.resource_group.args) + self.storage_account_key = util.GetAzureStorageAccountKey( + self.storage_account_name, self.resource_group.args) + + self._AzureSmbCommand('create') + + def _Delete(self): + logging.info('Deleting SMB server %s', self.name) + self._AzureSmbCommand('delete') + + def _Exists(self): + logging.debug('Calling Exists on SMB server %s', self.name) + return self._AzureSmbCommand('exists')['exists'] + + def _IsReady(self): + logging.debug('Calling IsReady on SMB server %s', self.name) + return self._Exists() + + def _Describe(self): + logging.debug('Calling Describe on SMB server %s', self.name) + output = self._AzureSmbCommand('show') + return output + + def _AzureSmbCommand(self, verb): + cmd = [azure.AZURE_PATH, 'storage', 'share', verb, '--output', 'json'] + cmd += ['--name', self.name] + if verb == 'create': + cmd += ['--quota', str(FLAGS.data_disk_size)] + cmd += self.connection_args + stdout, stderr, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + if retcode: + raise errors.Error('Error running command %s : %s' % (verb, stderr)) + return json.loads(stdout) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_sql_data_warehouse.py b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_sql_data_warehouse.py new file mode 100644 index 0000000..187edb4 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_sql_data_warehouse.py @@ -0,0 +1,395 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for Azure's SQL data warehouse EDW service. + +Clusters can be paused and unpaused. +""" + +import copy +import json +import os +from typing import Dict, List, Text, Tuple + +from absl import flags +from perfkitbenchmarker import data +from perfkitbenchmarker import edw_service +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers import azure + + +FLAGS = flags.FLAGS + + +VALID_EXIST_STATUSES = ['Resuming', 'Online'] +READY_STATUSES = ['Online'] +PAUSING_STATUSES = ['Pausing'] +SYNAPSE_JDBC_JAR = 'synapse-jdbc-client-1.0.jar' + + +def GetSqlDataWarehouseClientInterface( + server_name: str, database: str, user: str, password: str, + resource_group: str) -> edw_service.EdwClientInterface: + """Builds and Returns the requested SqlDataWarehouse client Interface. + + Args: + server_name: Name of the SqlDataWarehouse server to use. + database: Name of the database to run queries against. + user: SqlDataWarehouse username for authentication. + password: SqlDataWarehouse password for authentication. 
+ resource_group: Azure resource group used to whitelist the VM's IP address. + + Returns: + A concrete Client Interface object. + + Raises: + RuntimeError: if an unsupported sqldatawarehouse_client_interface is + requested. + """ + if FLAGS.sqldatawarehouse_client_interface == 'CLI': + return CliClientInterface(server_name, database, user, password, + resource_group) + if FLAGS.sqldatawarehouse_client_interface == 'JDBC': + return JdbcClientInterface(server_name, database, user, password, + resource_group) + raise RuntimeError('Unknown SqlDataWarehouse Client Interface requested.') + + +class CliClientInterface(edw_service.EdwClientInterface): + """Command Line Client Interface class for Azure SqlDataWarehouse. + + Uses the native SqlDataWarehouse client that ships with the Azure CLI. + https://docs.microsoft.com/en-us/cli/azure/sql/server?view=azure-cli-latest + + Attributes: + server_name: Name of the SqlDataWarehouse server to use. + database: Name of the database to run queries against. + user: Redshift username for authentication. + password: Redshift password for authentication. + resource_group: Azure resource group used to whitelist the VM's IP address. + """ + + def __init__(self, server_name: str, database: str, user: str, password: str, + resource_group: str): + self.server_name = server_name + self.database = database + self.user = user + self.password = password + self.resource_group = resource_group + + def Prepare(self, package_name: str) -> None: + """Prepares the client vm to execute query. + + Installs the sql server tool dependencies. + + Args: + package_name: String name of the package defining the preprovisioned data + (certificates, etc.) to extract and use during client vm preparation. + """ + self.client_vm.Install('pip') + self.client_vm.RemoteCommand('sudo pip install absl-py') + self.client_vm.Install('mssql_tools') + self.whitelist_ip = self.client_vm.ip_address + + cmd = [ + azure.AZURE_PATH, 'sql', 'server', 'firewall-rule', 'create', '--name', + self.whitelist_ip, '--resource-group', self.resource_group, '--server', + self.server_name, '--end-ip-address', self.whitelist_ip, + '--start-ip-address', self.whitelist_ip + ] + vm_util.IssueCommand(cmd) + + # Push the framework to execute a sql query and gather performance details + service_specific_dir = os.path.join('edw', + Azuresqldatawarehouse.SERVICE_TYPE) + self.client_vm.PushFile( + data.ResourcePath( + os.path.join(service_specific_dir, 'script_runner.sh'))) + runner_permission_update_cmd = 'chmod 755 {}'.format('script_runner.sh') + self.client_vm.RemoteCommand(runner_permission_update_cmd) + self.client_vm.PushFile( + data.ResourcePath(os.path.join('edw', 'script_driver.py'))) + self.client_vm.PushFile( + data.ResourcePath( + os.path.join(service_specific_dir, + 'provider_specific_script_driver.py'))) + + def ExecuteQuery(self, query_name: Text) -> Tuple[float, Dict[str, str]]: + """Executes a query and returns performance details. + + Args: + query_name: String name of the query to execute + + Returns: + A tuple of (execution_time, execution details) + execution_time: A Float variable set to the query's completion time in + secs. -1.0 is used as a sentinel value implying the query failed. For a + successful query the value is expected to be positive. + performance_details: A dictionary of query execution attributes eg. 
job_id + """ + query_command = ( + 'python script_driver.py --script={} --server={} --database={} ' + '--user={} --password={} --query_timeout={}').format( + query_name, self.server_name, self.database, self.user, + self.password, FLAGS.query_timeout) + stdout, _ = self.client_vm.RemoteCommand(query_command) + performance = json.loads(stdout) + details = copy.copy(self.GetMetadata()) + details['job_id'] = performance[query_name]['job_id'] + return float(performance[query_name]['execution_time']), details + + def GetMetadata(self) -> Dict[str, str]: + """Gets the Metadata attributes for the Client Interface.""" + return {'client': FLAGS.sqldatawarehouse_client_interface} + + +class JdbcClientInterface(edw_service.EdwClientInterface): + """JDBC Client Interface class for Azure SqlDataWarehouse. + + Attributes: + server_name: Name of the SqlDataWarehouse server to use. + database: Name of the database to run queries against. + user: Redshift username for authentication. + password: Redshift password for authentication. + resource_group: Azure resource group used to whitelist the VM's IP address. + """ + + def __init__(self, server_name: str, database: str, user: str, password: str, + resource_group: str): + self.server_name = server_name + self.database = database + self.user = user + self.password = password + self.resource_group = resource_group + + def Prepare(self, package_name: str) -> None: + """Prepares the client vm to execute query. + + Installs the sql server tool dependencies. + + Args: + package_name: String name of the package defining the preprovisioned data + (certificates, etc.) to extract and use during client vm preparation. + """ + self.client_vm.Install('openjdk') + self.client_vm.Install('mssql_tools') + self.client_vm.Install('azure_cli') + self.whitelist_ip = self.client_vm.ip_address + + cmd = [ + azure.AZURE_PATH, 'sql', 'server', 'firewall-rule', 'create', '--name', + self.whitelist_ip, '--resource-group', self.resource_group, '--server', + self.server_name, '--end-ip-address', self.whitelist_ip, + '--start-ip-address', self.whitelist_ip + ] + vm_util.IssueCommand(cmd) + + # Push the executable jar to the working directory on client vm + self.client_vm.InstallPreprovisionedPackageData(package_name, + [SYNAPSE_JDBC_JAR], '') + + def ExecuteQuery(self, query_name: Text) -> Tuple[float, Dict[str, str]]: + """Executes a query and returns performance details. + + Args: + query_name: String name of the query to execute + + Returns: + A tuple of (execution_time, execution details) + execution_time: A Float variable set to the query's completion time in + secs. -1.0 is used as a sentinel value implying the query failed. For a + successful query the value is expected to be positive. + performance_details: A dictionary of query execution attributes eg. 
job_id + """ + query_command = (f'java -cp {SYNAPSE_JDBC_JAR} ' + f'com.google.cloud.performance.edw.Single ' + f'--server {self.server_name} --database {self.database} ' + f'--query_timeout {FLAGS.query_timeout} ' + f'--query_file {query_name}') + stdout, _ = self.client_vm.RemoteCommand(query_command) + performance = json.loads(stdout) + details = copy.copy(self.GetMetadata()) + if 'failure_reason' in performance: + details.update({'failure_reason': performance['failure_reason']}) + else: + details.update(performance['details']) + return performance['query_wall_time_in_secs'], details + + def ExecuteSimultaneous(self, submission_interval: int, + queries: List[str]) -> str: + """Executes queries simultaneously on client and return performance details. + + Simultaneous app expects queries as white space separated query file names. + + Args: + submission_interval: Simultaneous query submission interval in + milliseconds. + queries: List of strings (names) of queries to execute. + + Returns: + A serialized dictionary of execution details. + """ + query_list = ' '.join(queries) + cmd = (f'java -cp {SYNAPSE_JDBC_JAR} ' + f'com.google.cloud.performance.edw.Simultaneous ' + f'--server {self.server_name} --database {self.database} ' + f'--submission_interval {submission_interval} --query_timeout ' + f'{FLAGS.query_timeout} --query_files {query_list}') + stdout, _ = self.client_vm.RemoteCommand(cmd) + return stdout + + def ExecuteThroughput(self, concurrency_streams: List[List[str]]) -> str: + """Executes a throughput test and returns performance details. + + Args: + concurrency_streams: List of streams to execute simultaneously, each of + which is a list of string names of queries. + + Returns: + A serialized dictionary of execution details. + """ + query_list = ' '.join([','.join(stream) for stream in concurrency_streams]) + cmd = ( + f'java -cp {SYNAPSE_JDBC_JAR} ' + f'com.google.cloud.performance.edw.Throughput ' + f'--server {self.server_name} --database {self.database} ' + f'--query_timeout {FLAGS.query_timeout} --query_streams {query_list}') + stdout, _ = self.client_vm.RemoteCommand(cmd) + return stdout + + def GetMetadata(self) -> Dict[str, str]: + """Gets the Metadata attributes for the Client Interface.""" + return {'client': FLAGS.sqldatawarehouse_client_interface} + + +class Azuresqldatawarehouse(edw_service.EdwService): + """Object representing an Azure SQL data warehouse.""" + + CLOUD = providers.AZURE + SERVICE_TYPE = 'azuresqldatawarehouse' + + def __init__(self, edw_service_spec): + super(Azuresqldatawarehouse, self).__init__(edw_service_spec) + self.whitelist_ip = None + self.resource_group = edw_service_spec.resource_group + self.server_name = edw_service_spec.server_name + self.client_interface = GetSqlDataWarehouseClientInterface( + self.server_name, self.db, self.user, self.password, + self.resource_group) + + def WhitelistIPAddress(self, ip_address): + """To whitelist the IP address on the cluster.""" + self.whitelist_ip = ip_address + + cmd = [azure.AZURE_PATH, + 'sql', + 'server', + 'firewall-rule', + 'create', + '--name', + self.whitelist_ip, + '--resource-group', + self.resource_group, + '--server', + self.server_name, + '--end-ip-address', + self.whitelist_ip, + '--start-ip-address', + self.whitelist_ip] + vm_util.IssueCommand(cmd) + + def __DescribeCluster(self): + """Describe cluster.""" + cmd = [azure.AZURE_PATH, + 'sql', + 'dw', + 'show', + '--name', + self.db, + '--resource-group', + self.resource_group, + '--server', + self.server_name] + return 
vm_util.IssueCommand(cmd, raise_on_failure=False) + + def _Exists(self): + """Method to validate the existence of cluster. + + Returns: + Boolean value indicating the existence of a cluster. + """ + stdout, _, _ = self.__DescribeCluster() + if not stdout or (json.loads(stdout)['status'] not in VALID_EXIST_STATUSES): + return False + else: + return True + + def _IsReady(self): + """Method to return if the cluster is ready to handle queries.""" + stdout, _, _ = self.__DescribeCluster() + return json.loads(stdout)['status'] in READY_STATUSES + + def _Create(self): + """Resuming the cluster.""" + cmd = [azure.AZURE_PATH, + 'sql', + 'dw', + 'resume', + '--name', + self.db, + '--resource-group', + self.resource_group, + '--server', + self.server_name] + vm_util.IssueCommand(cmd, timeout=420) + + def _IsDeleting(self): + """Method to check if the cluster is pausing.""" + stdout, _, _ = self.__DescribeCluster() + if not stdout: + return False + else: + return json.loads(stdout)['status'] in PAUSING_STATUSES + + def _Delete(self): + """Pausing cluster.""" + cmd = [azure.AZURE_PATH, + 'sql', + 'dw', + 'pause', + '--name', + self.db, + '--resource-group', + self.resource_group, + '--server', + self.server_name] + vm_util.IssueCommand(cmd, raise_on_failure=False) + + def _DeleteDependencies(self): + """Delete dependencies of the cluster.""" + if self.client_interface.whitelist_ip is not None: + cmd = [ + azure.AZURE_PATH, 'sql', 'server', 'firewall-rule', 'delete', + '--name', self.client_interface.whitelist_ip, '--resource-group', + self.resource_group, '--server', self.server_name + ] + vm_util.IssueCommand(cmd, raise_on_failure=False) + + def GetMetadata(self): + """Return a dictionary of the metadata for this cluster.""" + basic_data = super(Azuresqldatawarehouse, self).GetMetadata() + basic_data['resource_group'] = self.resource_group + basic_data['server_name'] = self.server_name + basic_data.update(self.client_interface.GetMetadata()) + return basic_data diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_virtual_machine.py b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_virtual_machine.py new file mode 100644 index 0000000..6496be2 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/azure_virtual_machine.py @@ -0,0 +1,1165 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Class to represent an Azure Virtual Machine object. + +Zones: +run 'azure vm location list' +Machine Types: +http://msdn.microsoft.com/en-us/library/azure/dn197896.aspx +Images: +run 'azure vm image list' + +All VM specifics are self-contained and the class provides methods to +operate on the VM: boot, shutdown, etc. 
+""" + + +import abc +import collections +import itertools +import json +import logging +import posixpath +import re +import threading +import six + +from absl import flags +from perfkitbenchmarker import custom_virtual_machine_spec +from perfkitbenchmarker import disk +from perfkitbenchmarker import errors +from perfkitbenchmarker import linux_virtual_machine +from perfkitbenchmarker import placement_group +from perfkitbenchmarker import providers +from perfkitbenchmarker import resource +from perfkitbenchmarker import virtual_machine +from perfkitbenchmarker import vm_util +from perfkitbenchmarker import windows_virtual_machine +from perfkitbenchmarker.configs import option_decoders +from perfkitbenchmarker.providers import azure +from perfkitbenchmarker.providers.azure import azure_disk +from perfkitbenchmarker.providers.azure import azure_network +from perfkitbenchmarker.providers.azure import util +from six.moves import range +# Added by Intel +import yaml +import ipaddress + +try: + unicode +except NameError: + unicode = str +# End added by Intel + +FLAGS = flags.FLAGS +NUM_LOCAL_VOLUMES = { + 'Standard_L8s_v2': 1, + 'Standard_L16s_v2': 2, + 'Standard_L32s_v2': 4, + 'Standard_L64s_v2': 8, + 'Standard_L80s_v2': 10 +} + +_MACHINE_TYPES_ARM64 = ( + 'Standard_D2ps_v5', 'Standard_D4ps_v5', 'Standard_D8ps_v5', + 'Standard_D16ps_v5', 'Standard_D32ps_v5', 'Standard_D48ps_v5', + 'Standard_D64ps_v5', 'Standard_D2pds_v5', 'Standard_D4pds_v5', + 'Standard_D8pds_v5', 'Standard_D16pds_v5', 'Standard_D32pds_v5', + 'Standard_D48pds_v5', 'Standard_D64pds_v5', + + 'Standard_D2pls_v5', 'Standard_D4pls_v5', 'Standard_D8pls_v5', + 'Standard_D16pls_v5', 'Standard_D32pls_v5', 'Standard_D48pls_v5', + 'Standard_D64pls_v5', + + 'Standard_D2plds_v5', 'Standard_D4plds_v5', 'Standard_D8plds_v5', + 'Standard_D16plds_v5', 'Standard_D32plds_v5', 'Standard_D48plds_v5', + 'Standard_D64plds_v5', + + 'Standard_E2ps_v5', 'Standard_E4ps_v5', 'Standard_E8ps_v5', + 'Standard_E16ps_v5', 'Standard_E20ps_v5', 'Standard_E32ps_v5', + + 'Standard_E2pds_v5', 'Standard_E4pds_v5', 'Standard_E8pds_v5', + 'Standard_E16pds_v5', 'Standard_E20pds_v5', 'Standard_E32pds_v5', +) + +_MACHINE_TYPES_ONLY_SUPPORT_GEN2_IMAGES = ( + 'Standard_ND96asr_v4', 'Standard_ND96asr_A100_v4', + 'Standard_ND96amsr_A100_v4', 'Standard_M208ms_v2', 'Standard_M208s_v2', + 'Standard_M416ms_v2', 'Standard_M416s_v2', 'Standard_ND40rs_v2', + 'Standard_M32ms_v2', 'Standard_M64s_v2', 'Standard_M64ms_v2', + 'Standard_M128s_v2', 'Standard_M128ms_v2', 'Standard_M192is_v2', + 'Standard_M192ims_v2', 'Standard_M32dms_v2', 'Standard_M64ds_v2', + 'Standard_M128ds_v2', 'Standard_M128dms_v2', 'Standard_M192ids_v2', + 'Standard_M192idms_v2', 'Standard_DC2s_v2', 'Standard_DC2s_v3', + 'Standard_DC32ds_v3', 'Standard_DC32s_v3', 'Standard_DC48ds_v3', + 'Standard_DC48s_v3', 'Standard_DC4ds_v3', 'Standard_DC4s_v2', + 'Standard_DC4s_v3', 'Standard_DC8_v2', 'Standard_DC8ds_v3', + 'Standard_DC8s_v3', 'Standard_FX12mds', 'Standard_FX24mds', + 'Standard_FX36mds', 'Standard_FX48mds', 'Standard_FX4mds', + 'Standard_M64dms_v2', 'Standard_DC16ds_v3', 'Standard_DC16s_v3', + 'Standard_DC1ds_v3', 'Standard_DC1s_v3', 'Standard_DC24ds_v3', + 'Standard_DC24s_v3', 'Standard_DC2ds_v3', 'Standard_DC1s_v2', +) + _MACHINE_TYPES_ARM64 + + +# https://docs.microsoft.com/en-us/azure/virtual-machines/windows/scheduled-events +_SCHEDULED_EVENTS_CMD = ('curl -H Metadata:true http://169.254.169.254/metadata' + '/scheduledevents?api-version=2019-01-01') + +_SCHEDULED_EVENTS_CMD_WIN = ('Invoke-RestMethod 
-Headers @{"Metadata"="true"} ' + '-Uri http://169.254.169.254/metadata/' + 'scheduledevents?api-version=2019-01-01 | ' + 'ConvertTo-Json') + + +class AzureVmSpec(virtual_machine.BaseVmSpec): + """Object containing the information needed to create a AzureVirtualMachine. + + Attributes: + tier: None or string. performance tier of the machine. + compute_units: int. number of compute units for the machine. + accelerated_networking: boolean. True if supports accelerated_networking. + boot_disk_size: None or int. The size of the boot disk in GB. + boot_disk_type: string or None. The type of the boot disk. + low_priority: boolean. True if the VM should be low-priority, else False. + """ + + CLOUD = providers.AZURE + + def __init__(self, *args, **kwargs): + super(AzureVmSpec, self).__init__(*args, **kwargs) + if isinstance(self.machine_type, + custom_virtual_machine_spec.AzurePerformanceTierDecoder): + self.tier = self.machine_type.tier + self.compute_units = self.machine_type.compute_units + self.machine_type = None + else: + self.tier = None + self.compute_units = None + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Modifies config options based on runtime flag values. + + Can be overridden by derived classes to add support for specific flags. + + Args: + config_values: dict mapping config option names to provided values. May be + modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. + """ + super(AzureVmSpec, cls)._ApplyFlags(config_values, flag_values) + if flag_values['machine_type'].present: + config_values['machine_type'] = yaml.safe_load(flag_values.machine_type) + if flag_values['azure_accelerated_networking'].present: + config_values['accelerated_networking'] = ( + flag_values.azure_accelerated_networking) + if flag_values['azure_low_priority_vms'].present: + config_values['low_priority'] = flag_values.azure_low_priority_vms + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword + arguments to construct in order to decode the named option. + """ + result = super(AzureVmSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'machine_type': + (custom_virtual_machine_spec.AzureMachineTypeDecoder, {}), + 'accelerated_networking': (option_decoders.BooleanDecoder, { + 'default': False + }), + 'boot_disk_size': (option_decoders.IntDecoder, { + 'default': None + }), + 'boot_disk_type': (option_decoders.StringDecoder, { + 'default': None + }), + 'low_priority': (option_decoders.BooleanDecoder, { + 'default': False + }), + }) + return result + + +# Per-VM resources are defined here. +class AzurePublicIPAddress(resource.BaseResource): + """Class to represent an Azure Public IP Address.""" + + def __init__(self, region, availability_zone, name, dns_name=None): + super(AzurePublicIPAddress, self).__init__() + self.region = region + self.availability_zone = availability_zone + self.name = name + self._deleted = False + self.resource_group = azure_network.GetResourceGroup() + self.dns_name = dns_name + + def _Create(self): + cmd = [ + azure.AZURE_PATH, 'network', 'public-ip', 'create', '--location', + self.region, '--name', self.name + ] + self.resource_group.args + + if self.availability_zone: + # Availability Zones require Standard IPs. 
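+      # Sketch of the arguments added below (illustrative only, for zone "1"):
+      #   az network public-ip create ... --zone 1 --sku Standard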
+ # TODO(user): Consider setting this by default + cmd += ['--zone', self.availability_zone, '--sku', 'Standard'] + + if self.dns_name: + cmd += ['--dns-name', self.dns_name] + + _, stderr, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + + if retcode and re.search(r'Cannot create more than \d+ public IP addresses', + stderr): + raise errors.Benchmarks.QuotaFailure( + virtual_machine.QUOTA_EXCEEDED_MESSAGE + stderr) + + def _Exists(self): + if self._deleted: + return False + + stdout, _, _ = vm_util.IssueCommand( + [ + azure.AZURE_PATH, 'network', 'public-ip', 'show', '--output', + 'json', '--name', self.name + ] + self.resource_group.args, + raise_on_failure=False) + try: + json.loads(stdout) + return True + except ValueError: + return False + + def GetIPAddress(self): + stdout, _ = vm_util.IssueRetryableCommand([ + azure.AZURE_PATH, 'network', 'public-ip', 'show', '--output', 'json', + '--name', self.name + ] + self.resource_group.args) + + response = json.loads(stdout) + return response['ipAddress'] + + def _Delete(self): + self._deleted = True + + +class AzureNIC(resource.BaseResource): + """Class to represent an Azure NIC.""" + + def __init__(self, + subnet, + name, + public_ip, + accelerated_networking, + network_security_group=None, + private_ip=None): + super(AzureNIC, self).__init__() + self.subnet = subnet + self.name = name + self.public_ip = public_ip + self.private_ip = private_ip + self._deleted = False + self.resource_group = azure_network.GetResourceGroup() + self.region = self.subnet.vnet.region + self.args = ['--nics', self.name] + self.accelerated_networking = accelerated_networking + self.network_security_group = network_security_group + + def _Create(self): + cmd = [ + azure.AZURE_PATH, 'network', 'nic', 'create', '--location', + self.region, '--vnet-name', self.subnet.vnet.name, '--subnet', + self.subnet.name, '--public-ip-address', self.public_ip, '--name', + self.name + ] + self.resource_group.args + if self.private_ip: + cmd += ['--private-ip-address', self.private_ip] + if self.accelerated_networking: + cmd += ['--accelerated-networking', 'true'] + if self.network_security_group: + cmd += ['--network-security-group', self.network_security_group.name] + vm_util.IssueCommand(cmd) + + def _Exists(self): + if self._deleted: + return False + # Same deal as AzurePublicIPAddress. 'show' doesn't error out if + # the resource doesn't exist, but no-op 'set' does. + stdout, _, _ = vm_util.IssueCommand( + [ + azure.AZURE_PATH, 'network', 'nic', 'show', '--output', 'json', + '--name', self.name + ] + self.resource_group.args, + raise_on_failure=False) + try: + json.loads(stdout) + return True + except ValueError: + return False + + def GetInternalIP(self): + """Grab some data.""" + + stdout, _ = vm_util.IssueRetryableCommand([ + azure.AZURE_PATH, 'network', 'nic', 'show', '--output', 'json', + '--name', self.name + ] + self.resource_group.args) + + response = json.loads(stdout) + return response['ipConfigurations'][0]['privateIpAddress'] + + def _Delete(self): + self._deleted = True + + +class AzureDedicatedHostGroup(resource.BaseResource): + """Object representing an Azure host group (a collection of dedicated hosts). + + A host group is required for dedicated host creation. + Attributes: + name: The name of the vm - to be part of the host group name. + location: The region the host group will exist in. + resource_group: The group of resources for the host group. 
+ """ + + def __init__(self, name, region, resource_group, availability_zone): + super(AzureDedicatedHostGroup, self).__init__() + self.name = name + 'Group' + self.region = region + self.resource_group = resource_group + self.availability_zone = availability_zone + + def _Create(self): + """See base class.""" + create_cmd = ([ + azure.AZURE_PATH, + 'vm', + 'host', + 'group', + 'create', + '--name', + self.name, + '--location', + self.region, + # number of fault domains (physical racks) to span across + # TODO(buggay): add support for multiple fault domains + # https://docs.microsoft.com/en-us/azure/virtual-machines/windows/dedicated-hosts#high-availability-considerations + '--platform-fault-domain-count', + '1', + ] + self.resource_group.args) + + if self.availability_zone: + create_cmd.extend(['--zone', self.availability_zone]) + + vm_util.IssueCommand(create_cmd) + + def _Delete(self): + """See base class.""" + delete_cmd = ([ + azure.AZURE_PATH, + 'vm', + 'host', + 'group', + 'delete', + '--host-group', + self.name, + ] + self.resource_group.args) + vm_util.IssueCommand(delete_cmd) + + def _Exists(self): + """See base class.""" + show_cmd = [ + azure.AZURE_PATH, 'vm', 'host', 'group', 'show', '--output', 'json', + '--name', self.name + ] + self.resource_group.args + stdout, _, _ = vm_util.IssueCommand(show_cmd, raise_on_failure=False) + try: + json.loads(stdout) + return True + except ValueError: + return False + + +def _GetSkuType(machine_type): + """Returns the host SKU type derived from the VM machine type.""" + # TODO(buggay): add support for FSv2 machine types when no longer in preview + # https://docs.microsoft.com/en-us/azure/virtual-machines/windows/dedicated-hosts + sku = '' + if re.match('Standard_D[0-9]*s_v3', machine_type): + sku = 'DSv3-Type1' + elif re.match('Standard_E[0-9]*s_v3', machine_type): + sku = 'ESv3-Type1' + else: + raise ValueError('Dedicated hosting does not support machine type %s.' % + machine_type) + return sku + + +class AzureDedicatedHost(resource.BaseResource): + """Object representing an Azure host. + + Attributes: + host_group: The required host group to which the host will belong. + name: The name of the vm - to be part of the host name. + region: The region the host will exist in. + resource_group: The group of resources for the host. 
+ """ + _lock = threading.Lock() + # globals guarded by _lock + host_group_map = {} + + def __init__(self, name, region, resource_group, sku_type, + availability_zone): + super(AzureDedicatedHost, self).__init__() + self.name = name + '-Host' + self.region = region + self.resource_group = resource_group + self.sku_type = sku_type + self.availability_zone = availability_zone + self.host_group = None + self.fill_fraction = 0.0 + + def _CreateDependencies(self): + """See base class.""" + with self._lock: + if self.region not in self.host_group_map: + new_host_group = AzureDedicatedHostGroup(self.name, self.region, + self.resource_group, + self.availability_zone) + new_host_group.Create() + self.host_group_map[self.region] = new_host_group.name + self.host_group = self.host_group_map[self.region] + + def _Create(self): + """See base class.""" + create_cmd = ([ + azure.AZURE_PATH, + 'vm', + 'host', + 'create', + '--host-group', + self.host_group, + '--name', + self.name, + '--sku', + self.sku_type, + '--location', + self.region, + # the specific fault domain (physical rack) for the host dependent on + # the number (count) of fault domains of the host group + # TODO(buggay): add support for specifying multiple fault domains if + # benchmarks require + '--platform-fault-domain', + '0', + ] + self.resource_group.args) + vm_util.IssueCommand(create_cmd) + + def _Delete(self): + """See base class.""" + delete_cmd = ([ + azure.AZURE_PATH, + 'vm', + 'host', + 'delete', + '--host-group', + self.host_group, + '--name', + self.name, + '--yes', + ] + self.resource_group.args) + vm_util.IssueCommand(delete_cmd) + + def _Exists(self): + """See base class.""" + show_cmd = [ + azure.AZURE_PATH, + 'vm', + 'host', + 'show', + '--output', + 'json', + '--name', + self.name, + '--host-group', + self.host_group, + ] + self.resource_group.args + stdout, _, _ = vm_util.IssueCommand(show_cmd, raise_on_failure=False) + try: + json.loads(stdout) + return True + except ValueError: + return False + + +class AzureVirtualMachine(virtual_machine.BaseVirtualMachine): + """Object representing an Azure Virtual Machine.""" + CLOUD = providers.AZURE + + _lock = threading.Lock() + # TODO(buggay): remove host groups & hosts as globals -> create new spec + # globals guarded by _lock + host_map = collections.defaultdict(list) + + def __init__(self, vm_spec): + """Initialize an Azure virtual machine. + + Args: + vm_spec: virtual_machine.BaseVmSpec object of the vm. + """ + super(AzureVirtualMachine, self).__init__(vm_spec) + + # PKB zone can be either a region or a region with an availability zone. + # Format for Azure availability zone support is "region-availability_zone" + # Example: eastus2-1 is Azure region eastus2 with availability zone 1. 
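+    # Illustrative examples of the helpers used below (defined in util.py in
+    # this patch): GetRegionFromZone('eastus2-1') -> 'eastus2' and
+    # GetAvailabilityZoneFromZone('eastus2-1') -> '1'; for a bare region such
+    # as 'eastus2' the availability zone resolves to None.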
+ + self.region = util.GetRegionFromZone(self.zone) + self.availability_zone = util.GetAvailabilityZoneFromZone(self.zone) + self.use_dedicated_host = vm_spec.use_dedicated_host + self.num_vms_per_host = vm_spec.num_vms_per_host + self.network = azure_network.AzureNetwork.GetNetwork(self) + self.firewall = azure_network.AzureFirewall.GetFirewall() + self.max_local_disks = NUM_LOCAL_VOLUMES.get(self.machine_type) or 1 + self._lun_counter = itertools.count() + self._deleted = False + + self.resource_group = azure_network.GetResourceGroup() + self.public_ip = AzurePublicIPAddress(self.region, self.availability_zone, + self.name + '-public-ip') + self.nic = AzureNIC(self.network.subnet, self.name + '-nic', + self.public_ip.name, vm_spec.accelerated_networking, + self.network.nsg) + self.storage_account = self.network.storage_account + if vm_spec.image: + self.image = vm_spec.image + elif self.machine_type in _MACHINE_TYPES_ONLY_SUPPORT_GEN2_IMAGES: + if hasattr(type(self), 'GEN2_IMAGE_URN'): + if self.machine_type in _MACHINE_TYPES_ARM64: + self.image = type(self).IMAGE_ARM64_URN + else: + self.image = type(self).GEN2_IMAGE_URN + else: + raise errors.Benchmarks.UnsupportedConfigError('No Azure gen2 image.') + else: + self.image = type(self).IMAGE_URN + + self.host = None + if self.use_dedicated_host: + self.host_series_sku = _GetSkuType(self.machine_type) + self.host_list = None + self.low_priority = vm_spec.low_priority + self.low_priority_status_code = None + self.spot_early_termination = False + self.ultra_ssd_enabled = False + + disk_spec = disk.BaseDiskSpec('azure_os_disk') + disk_spec.disk_type = ( + vm_spec.boot_disk_type or self.storage_account.storage_type) + if vm_spec.boot_disk_size: + disk_spec.disk_size = vm_spec.boot_disk_size + self.os_disk = azure_disk.AzureDisk( + disk_spec, + self, + None, + is_image=True) + + @property + @classmethod + @abc.abstractmethod + def IMAGE_URN(cls): + raise NotImplementedError() + + def _CreateDependencies(self): + """Create VM dependencies.""" + self.public_ip.Create() + self.nic.Create() + + if self.use_dedicated_host: + with self._lock: + self.host_list = self.host_map[(self.host_series_sku, self.region)] + if (not self.host_list or (self.num_vms_per_host and + self.host_list[-1].fill_fraction + + 1.0 / self.num_vms_per_host > 1.0)): + new_host = AzureDedicatedHost(self.name, self.region, + self.resource_group, + self.host_series_sku, + self.availability_zone) + self.host_list.append(new_host) + new_host.Create() + self.host = self.host_list[-1] + if self.num_vms_per_host: + self.host.fill_fraction += 1.0 / self.num_vms_per_host + + def _RequiresUltraDisk(self): + return any(disk_spec.disk_type == azure_disk.ULTRA_STORAGE + for disk_spec in self.disk_specs) + + def _Create(self): + """See base class.""" + if self.os_disk.disk_size: + disk_size_args = ['--os-disk-size-gb', str(self.os_disk.disk_size)] + else: + disk_size_args = [] + + tags = {} + tags.update(self.vm_metadata) + tags.update(util.GetResourceTags(self.resource_group.timeout_minutes)) + tag_args = ['--tags'] + util.FormatTags(tags) + + create_cmd = ([ + azure.AZURE_PATH, 'vm', 'create', '--location', self.region, + '--image', self.image, '--size', self.machine_type, '--admin-username', + self.user_name, '--storage-sku', self.os_disk.disk_type, '--name', + self.name + ] + disk_size_args + self.resource_group.args + self.nic.args + tag_args) + + if self._RequiresUltraDisk(): + self.ultra_ssd_enabled = True + create_cmd.extend(['--ultra-ssd-enabled']) + + if self.availability_zone: + 
create_cmd.extend(['--zone', self.availability_zone]) + + # Resources in Availability Set are not allowed to be + # deployed to particular hosts. + if self.use_dedicated_host: + create_cmd.extend( + ['--host-group', self.host.host_group, '--host', self.host.name]) + num_hosts = len(self.host_list) + + if self.network.placement_group: + create_cmd.extend(self.network.placement_group.AddVmArgs()) + + if self.low_priority: + create_cmd.extend(['--priority', 'Spot']) + + if self.password: + create_cmd.extend(['--admin-password', self.password]) + else: + create_cmd.extend(['--ssh-key-value', self.ssh_public_key]) + + # Uses a custom default because create has a very long tail. + azure_vm_create_timeout = 1800 + _, stderr, retcode = vm_util.IssueCommand( + create_cmd, timeout=azure_vm_create_timeout, raise_on_failure=False) + if retcode: + if 'quota' in stderr.lower(): + raise errors.Benchmarks.QuotaFailure( + virtual_machine.QUOTA_EXCEEDED_MESSAGE + stderr) + elif re.search( + r'requested VM size \S+ is not available', stderr) or re.search( + r'not available in location .+ for subscription', stderr): + raise errors.Benchmarks.UnsupportedConfigError(stderr) + elif self.low_priority and 'OverconstrainedAllocationRequest' in stderr: + raise errors.Benchmarks.InsufficientCapacityCloudFailure(stderr) + # TODO(buggay) refactor to share code with gcp_virtual_machine.py + if (self.use_dedicated_host and retcode and + 'AllocationFailed' in stderr): + if self.num_vms_per_host: + raise errors.Resource.CreationError( + 'Failed to create host: %d vms of type %s per host exceeds ' + 'memory capacity limits of the host' % + (self.num_vms_per_host, self.machine_type)) + else: + logging.warning( + 'Creation failed due to insufficient host capacity. A new host will ' + 'be created and instance creation will be retried.') + with self._lock: + if num_hosts == len(self.host_list): + new_host = AzureDedicatedHost(self.name, self.region, + self.resource_group, + self.host_series_sku, + self.availability_zone) + self.host_list.append(new_host) + new_host.Create() + self.host = self.host_list[-1] + raise errors.Resource.RetryableCreationError() + if (not self.use_dedicated_host and retcode and + ('AllocationFailed' in stderr or + 'OverconstrainedZonalAllocationRequest' in stderr)): + raise errors.Benchmarks.InsufficientCapacityCloudFailure(stderr) + if retcode: + if "Virtual Machine Scale Set with '' security type." in stderr: + raise errors.Resource.CreationError( + f'Failed to create VM: {self.machine_type} is likely a confidential' + ' machine, which PKB does not support at this time.\n\n' + f' Full error: {stderr} return code: {retcode}') + if "cannot boot Hypervisor Generation '1'" in stderr: + raise errors.Resource.CreationError( + f'Failed to create VM: {self.machine_type} is unable to support V1 ' + 'Hypervision. Please update _MACHINE_TYPES_ONLY_SUPPORT_GEN2_IMAGES' + ' in azure_virtual_machine.py.\n\n' + f' Full error: {stderr} return code: {retcode}') + else: + raise errors.Resource.CreationError( + 'Failed to create VM: %s return code: %s' % (stderr, retcode)) + + def _Exists(self): + """Returns True if the VM exists.""" + if self._deleted: + return False + show_cmd = [ + azure.AZURE_PATH, 'vm', 'show', '--output', 'json', '--name', self.name + ] + self.resource_group.args + stdout, _, _ = vm_util.IssueCommand(show_cmd, raise_on_failure=False) + try: + json.loads(stdout) + return True + except ValueError: + return False + + def _Delete(self): + # The VM will be deleted when the resource group is. 
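+    # No per-VM `az vm delete` is issued: deleting the shared resource group
+    # (obtained via azure_network.GetResourceGroup() above) removes the VM
+    # along with its NIC, public IP and disks, so only local state is marked.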
+ self._deleted = True + + def _Start(self): + """Starts the VM.""" + start_cmd = ([azure.AZURE_PATH, 'vm', 'start', '--name', self.name] + + self.resource_group.args) + vm_util.IssueCommand(start_cmd) + self.ip_address = self.public_ip.GetIPAddress() + + def _Stop(self): + """Stops the VM.""" + stop_cmd = ([azure.AZURE_PATH, 'vm', 'stop', '--name', self.name] + + self.resource_group.args) + vm_util.IssueCommand(stop_cmd) + # remove resources, similar to GCE stop + deallocate_cmd = ( + [azure.AZURE_PATH, 'vm', 'deallocate', '--name', self.name] + + self.resource_group.args) + vm_util.IssueCommand(deallocate_cmd) + + def _Suspend(self): + """Suspends the VM.""" + raise NotImplementedError() + + def _Resume(self): + """Resumes the VM.""" + raise NotImplementedError() + + @vm_util.Retry() + def _PostCreate(self): + """Get VM data.""" + stdout, _ = vm_util.IssueRetryableCommand([ + azure.AZURE_PATH, 'vm', 'show', '--output', 'json', '--name', self.name + ] + self.resource_group.args) + response = json.loads(stdout) + self.os_disk.name = response['storageProfile']['osDisk']['name'] + self.os_disk.created = True + vm_util.IssueCommand([ + azure.AZURE_PATH, 'disk', 'update', '--name', self.os_disk.name, + '--set', + util.GetTagsJson(self.resource_group.timeout_minutes) + ] + self.resource_group.args) + self.internal_ip = self.nic.GetInternalIP() + self.ip_address = self.public_ip.GetIPAddress() + + def AddMetadata(self, **kwargs): + if not kwargs: + return + tag_list = ['tags.%s=%s' % (k, v) for k, v in six.iteritems(kwargs)] + vm_util.IssueRetryableCommand( + [azure.AZURE_PATH, 'vm', 'update', '--name', self.name] + + self.resource_group.args + ['--set'] + tag_list) + + def CreateScratchDisk(self, disk_spec): + """Create a VM's scratch disk. + + Args: + disk_spec: virtual_machine.BaseDiskSpec object of the disk. + + Raises: + CreationError: If an SMB disk is listed but the SMB service not created. + """ + disks = [] + + for _ in range(disk_spec.num_striped_disks): + if disk_spec.disk_type == disk.NFS: + data_disk = self._GetNfsService().CreateNfsDisk() + disks.append(data_disk) + continue + elif disk_spec.disk_type == disk.SMB: + data_disk = self._GetSmbService().CreateSmbDisk() + disks.append(data_disk) + continue + elif disk_spec.disk_type == disk.LOCAL: + # Local disk numbers start at 1 (0 is the system disk). + disk_number = self.local_disk_counter + 1 + self.local_disk_counter += 1 + if self.local_disk_counter > self.max_local_disks: + raise errors.Error('Not enough local disks.') + else: + # Remote disk numbers start at max_local disks, Azure does not separate + # local disk and system disk. + disk_number = self.remote_disk_counter + self.max_local_disks + self.remote_disk_counter += 1 + lun = next(self._lun_counter) + data_disk = azure_disk.AzureDisk(disk_spec, self, lun) + data_disk.disk_number = disk_number + disks.append(data_disk) + + self._CreateScratchDiskFromDisks(disk_spec, disks) + + def InstallCli(self): + """Installs the Azure cli and credentials on this Azure vm.""" + self.Install('azure_cli') + self.Install('azure_credentials') + + def DownloadPreprovisionedData(self, install_path, module_name, filename): + """Downloads a data file from Azure blob storage with pre-provisioned data. + + Use --azure_preprovisioned_data_bucket to specify the name of the account. + + Note: Azure blob storage does not allow underscores in the container name, + so this method replaces any underscores in module_name with dashes. 
+ Make sure that the same convention is used when uploading the data + to Azure blob storage. For example: when uploading data for + 'module_name' to Azure, create a container named 'benchmark-name'. + + Args: + install_path: The install path on this VM. + module_name: Name of the module associated with this data file. + filename: The name of the file that was downloaded. + """ + # N.B. Should already be installed by ShouldDownloadPreprovisionedData + self.Install('azure_cli') + self.RemoteCommand( + GenerateDownloadPreprovisionedDataCommand(install_path, module_name, + filename)) + + def ShouldDownloadPreprovisionedData(self, module_name, filename): + """Returns whether or not preprovisioned data is available.""" + # Do not install credentials. Data are fetched using locally generated + # connection strings and do not use credentials on the VM. + self.Install('azure_cli') + return FLAGS.azure_preprovisioned_data_bucket and self.TryRemoteCommand( + GenerateStatPreprovisionedDataCommand(module_name, filename)) + + def GetResourceMetadata(self): + result = super(AzureVirtualMachine, self).GetResourceMetadata() + result['accelerated_networking'] = self.nic.accelerated_networking + result['boot_disk_type'] = self.os_disk.disk_type + result['boot_disk_size'] = self.os_disk.disk_size + if self.network.placement_group: + result['placement_group_strategy'] = self.network.placement_group.strategy + else: + result['placement_group_strategy'] = placement_group.PLACEMENT_GROUP_NONE + result['preemptible'] = self.low_priority + if self.use_dedicated_host: + result['num_vms_per_host'] = self.num_vms_per_host + return result + + @vm_util.Retry(max_retries=5) + def UpdateInterruptibleVmStatus(self): + """Updates the interruptible status if the VM was preempted.""" + if self.spot_early_termination: + return + if self.low_priority and self._Exists(): + stdout, stderr, return_code = self.RemoteCommandWithReturnCode( + _SCHEDULED_EVENTS_CMD) + if return_code: + logging.error('Checking Interrupt Error: %s', stderr) + else: + events = json.loads(stdout).get('Events', []) + self.spot_early_termination = any( + event.get('EventType') == 'Preempt' for event in events) + if self.spot_early_termination: + logging.info('Spotted early termination on %s', self) + + def _UpdateInterruptibleVmStatusThroughMetadataService(self): + stdout, stderr, return_code = self.RemoteCommandWithReturnCode( + _SCHEDULED_EVENTS_CMD) + if return_code: + logging.error('Checking Interrupt Error: %s', stderr) + else: + events = json.loads(stdout).get('Events', []) + self.spot_early_termination = any( + event.get('EventType') == 'Preempt' for event in events) + if self.spot_early_termination: + logging.info('Spotted early termination on %s', self) + + def IsInterruptible(self): + """Returns whether this vm is a interruptible vm (e.g. spot, preemptible). + + Returns: + True if this vm is a interruptible vm. + """ + return self.low_priority + + def WasInterrupted(self): + """Returns whether this low-priority vm was terminated early by Azure. + + Returns: True if this vm was terminated early by Azure. + """ + return self.spot_early_termination + + def GetVmStatusCode(self): + """Returns the early termination code if any. + + Returns: Early termination code. 
+ """ + return self.low_priority_status_code + + +class Debian9BasedAzureVirtualMachine(AzureVirtualMachine, + linux_virtual_machine.Debian9Mixin): + # From https://wiki.debian.org/Cloud/MicrosoftAzure + IMAGE_URN = 'credativ:Debian:9:latest' + + +class Debian10BasedAzureVirtualMachine(AzureVirtualMachine, + linux_virtual_machine.Debian10Mixin): + # From https://wiki.debian.org/Cloud/MicrosoftAzure + GEN2_IMAGE_URN = 'Debian:debian-10:10-gen2:latest' + IMAGE_URN = 'Debian:debian-10:10:latest' + + +class Debian11BasedAzureVirtualMachine(AzureVirtualMachine, + linux_virtual_machine.Debian11Mixin): + # From https://wiki.debian.org/Cloud/MicrosoftAzure + GEN2_IMAGE_URN = 'Debian:debian-11:11-gen2:latest' + IMAGE_URN = 'Debian:debian-11:11:latest' + + +class Ubuntu1604BasedAzureVirtualMachine(AzureVirtualMachine, + linux_virtual_machine.Ubuntu1604Mixin): + GEN2_IMAGE_URN = 'Canonical:UbuntuServer:16_04-lts-gen2:latest' + IMAGE_URN = 'Canonical:UbuntuServer:16.04-LTS:latest' + + +class Ubuntu1804BasedAzureVirtualMachine(AzureVirtualMachine, + linux_virtual_machine.Ubuntu1804Mixin): + GEN2_IMAGE_URN = 'Canonical:UbuntuServer:18_04-lts-gen2:latest' + IMAGE_URN = 'Canonical:UbuntuServer:18.04-LTS:latest' + + +class Ubuntu2004BasedAzureVirtualMachine(AzureVirtualMachine, + linux_virtual_machine.Ubuntu2004Mixin): + IMAGE_ARM64_URN = 'Canonical:0001-com-ubuntu-server-focal:20_04-lts-arm64:latest' + GEN2_IMAGE_URN = 'Canonical:0001-com-ubuntu-server-focal:20_04-lts-gen2:latest' + IMAGE_URN = 'Canonical:0001-com-ubuntu-server-focal:20_04-lts:latest' + + +class Ubuntu2204BasedAzureVirtualMachine(AzureVirtualMachine, + linux_virtual_machine.Ubuntu2204Mixin): + IMAGE_ARM64_URN = 'Canonical:0001-com-ubuntu-server-jammy:22_04-lts-arm64:latest' + GEN2_IMAGE_URN = 'Canonical:0001-com-ubuntu-server-jammy:22_04-lts-gen2:latest' + IMAGE_URN = 'Canonical:0001-com-ubuntu-server-jammy:22_04-lts:latest' + + +class Rhel7BasedAzureVirtualMachine(AzureVirtualMachine, + linux_virtual_machine.Rhel7Mixin): + GEN2_IMAGE_URN = 'RedHat:RHEL:7lvm-gen2:latest' + IMAGE_URN = 'RedHat:RHEL:7-LVM:latest' + + +class Rhel8BasedAzureVirtualMachine(AzureVirtualMachine, + linux_virtual_machine.Rhel8Mixin): + GEN2_IMAGE_URN = 'RedHat:RHEL:8-lvm-gen2:latest' + IMAGE_URN = 'RedHat:RHEL:8-LVM:latest' + + +class CentOs7BasedAzureVirtualMachine(AzureVirtualMachine, + linux_virtual_machine.CentOs7Mixin): + GEN2_IMAGE_URN = 'OpenLogic:CentOS-LVM:7-lvm-gen2:latest' + IMAGE_URN = 'OpenLogic:CentOS-LVM:7-lvm:latest' + + +class CentOs8BasedAzureVirtualMachine(AzureVirtualMachine, + linux_virtual_machine.CentOs8Mixin): + GEN2_IMAGE_URN = 'OpenLogic:CentOS-LVM:8-lvm-gen2:latest' + IMAGE_URN = 'OpenLogic:CentOS-LVM:8-lvm:latest' + + +class CentOsStream8BasedAzureVirtualMachine(AzureVirtualMachine, + linux_virtual_machine.CentOsStream8Mixin): + # TODO: Change to Azure official centos 8 stream image when it is available + IMAGE_URN = 'cloudwhizsolutions:centos-8-stream-cw:centos-8-stream-cw:1.2019.0712' + + +# TODO(pclay): Add Fedora CoreOS when available: +# https://docs.fedoraproject.org/en-US/fedora-coreos/provisioning-azure/ + + +class BaseWindowsAzureVirtualMachine(AzureVirtualMachine, + windows_virtual_machine.BaseWindowsMixin): + """Class supporting Windows Azure virtual machines.""" + + # This ia a required attribute, but this is a base class. 
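+  # The placeholder URN below only satisfies the required attribute; the
+  # concrete Windows classes further down (e.g.
+  # Windows2019CoreAzureVirtualMachine) override IMAGE_URN with a real
+  # marketplace image URN.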
+ IMAGE_URN = 'non-existent' + + def __init__(self, vm_spec): + super(BaseWindowsAzureVirtualMachine, self).__init__(vm_spec) + # The names of Windows VMs on Azure are limited to 15 characters so let's + # drop the pkb prefix if necessary. + if len(self.name) > 15: + self.name = re.sub('^pkb-', '', self.name) + self.user_name = self.name + self.password = vm_util.GenerateRandomWindowsPassword() + + def _PostCreate(self): + super(BaseWindowsAzureVirtualMachine, self)._PostCreate() + config_dict = {'commandToExecute': windows_virtual_machine.STARTUP_SCRIPT} + config = json.dumps(config_dict) + vm_util.IssueRetryableCommand([ + azure.AZURE_PATH, 'vm', 'extension', 'set', '--vm-name', self.name, + '--name', 'CustomScriptExtension', '--publisher', 'Microsoft.Compute', + '--version', '1.4', + '--protected-settings=%s' % config + ] + self.resource_group.args) + + def _UpdateInterruptibleVmStatusThroughMetadataService(self): + stdout, _ = self.RemoteCommand(_SCHEDULED_EVENTS_CMD_WIN) + events = json.loads(stdout).get('Events', []) + self.spot_early_termination = any( + event.get('EventType') == 'Preempt' for event in events) + if self.spot_early_termination: + logging.info('Spotted early termination on %s', self) + + +# Azure seems to have dropped support for 2012 Server Core. It is neither here: +# https://docs.microsoft.com/en-us/azure/virtual-machines/windows/cli-ps-findimage#table-of-commonly-used-windows-images +# nor in `az vm image list -p MicrosoftWindowsServer -f WindowsServer -s 2012` +# Rather than exclude this just allow 2012 to refer to the 2012 Base image. +class Windows2012CoreAzureVirtualMachine( + BaseWindowsAzureVirtualMachine, + windows_virtual_machine.Windows2012CoreMixin): + GEN2_IMAGE_URN = 'MicrosoftWindowsServer:windowsserver-gen2preview:2012-r2-datacenter-gen2:latest' + IMAGE_URN = 'MicrosoftWindowsServer:WindowsServer:2012-R2-Datacenter:latest' + + +class Windows2016CoreAzureVirtualMachine( + BaseWindowsAzureVirtualMachine, + windows_virtual_machine.Windows2016CoreMixin): + GEN2_IMAGE_URN = 'MicrosoftWindowsServer:windowsserver-gen2preview:2016-datacenter-gen2:latest' + IMAGE_URN = 'MicrosoftWindowsServer:WindowsServer:2016-Datacenter-Server-Core:latest' + + +class Windows2019CoreAzureVirtualMachine( + BaseWindowsAzureVirtualMachine, + windows_virtual_machine.Windows2019CoreMixin): + GEN2_IMAGE_URN = 'MicrosoftWindowsServer:windowsserver-gen2preview:2019-datacenter-gen2:latest' + IMAGE_URN = 'MicrosoftWindowsServer:WindowsServer:2019-Datacenter-Core:latest' + + +class Windows2022CoreAzureVirtualMachine( + BaseWindowsAzureVirtualMachine, + windows_virtual_machine.Windows2022CoreMixin): + IMAGE_URN = 'MicrosoftWindowsServer:WindowsServer:2022-Datacenter-Core:latest' + + +class Windows2012DesktopAzureVirtualMachine( + BaseWindowsAzureVirtualMachine, + windows_virtual_machine.Windows2012DesktopMixin): + GEN2_IMAGE_URN = 'MicrosoftWindowsServer:windowsserver-gen2preview:2012-r2-datacenter-gen2:latest' + IMAGE_URN = 'MicrosoftWindowsServer:WindowsServer:2012-R2-Datacenter:latest' + + +class Windows2016DesktopAzureVirtualMachine( + BaseWindowsAzureVirtualMachine, + windows_virtual_machine.Windows2016DesktopMixin): + GEN2_IMAGE_URN = 'MicrosoftWindowsServer:windowsserver-gen2preview:2016-datacenter-gen2:latest' + IMAGE_URN = 'MicrosoftWindowsServer:WindowsServer:2016-Datacenter:latest' + + +class Windows2019DesktopAzureVirtualMachine( + BaseWindowsAzureVirtualMachine, + windows_virtual_machine.Windows2019DesktopMixin): + GEN2_IMAGE_URN = 
'MicrosoftWindowsServer:windowsserver-gen2preview:2019-datacenter-gen2:latest' + IMAGE_URN = 'MicrosoftWindowsServer:WindowsServer:2019-Datacenter:latest' + + +class Windows2022DesktopAzureVirtualMachine( + BaseWindowsAzureVirtualMachine, + windows_virtual_machine.Windows2022DesktopMixin): + IMAGE_URN = 'MicrosoftWindowsServer:WindowsServer:2022-Datacenter:latest' + + +class Windows2019DesktopSQLServer2019StandardAzureVirtualMachine( + BaseWindowsAzureVirtualMachine, + windows_virtual_machine.Windows2019SQLServer2019Standard): + GEN2_IMAGE_URN = 'MicrosoftSQLServer:sql2019-ws2019:standard-gen2:latest' + IMAGE_URN = 'MicrosoftSQLServer:sql2019-ws2019:standard:latest' + + +class Windows2019DesktopSQLServer2019EnterpriseAzureVirtualMachine( + BaseWindowsAzureVirtualMachine, + windows_virtual_machine.Windows2019SQLServer2019Enterprise): + GEN2_IMAGE_URN = 'MicrosoftSQLServer:sql2019-ws2019:enterprise-gen2:latest' + IMAGE_URN = 'MicrosoftSQLServer:sql2019-ws2019:enterprise:latest' + + +class Windows2022DesktopSQLServer2019StandardAzureVirtualMachine( + BaseWindowsAzureVirtualMachine, + windows_virtual_machine.Windows2022SQLServer2019Standard): + IMAGE_URN = 'MicrosoftSQLServer:sql2019-ws2022:standard:latest' + + +class Windows2022DesktopSQLServer2019EnterpriseAzureVirtualMachine( + BaseWindowsAzureVirtualMachine, + windows_virtual_machine.Windows2022SQLServer2019Enterprise): + IMAGE_URN = 'MicrosoftSQLServer:sql2019-ws2022:enterprise:latest' + + +def GenerateDownloadPreprovisionedDataCommand(install_path, module_name, + filename): + """Returns a string used to download preprovisioned data.""" + module_name_with_underscores_removed = module_name.replace('_', '-') + destpath = posixpath.join(install_path, filename) + if install_path: + # TODO(ferneyhough): Refactor this so that this mkdir command + # is run on all clouds, and is os-agnostic (this is linux specific). 
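+  # Rough shape of the returned command, using hypothetical values
+  # install_path='/opt/pkb', module_name='my_data', filename='input.bin':
+  #   mkdir -p /opt/pkb && az storage blob download --no-progress
+  #     --account-name <bucket> --container-name my-data --name input.bin
+  #     --file /opt/pkb/input.bin --connection-string "<connection string>"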
+ mkdir_command = 'mkdir -p %s' % posixpath.dirname(destpath) + + account_name = FLAGS.azure_preprovisioned_data_bucket + connection_string = util.GetAzureStorageConnectionString(account_name, []) + download_command = ( + 'az storage blob download ' + '--no-progress ' + '--account-name {account_name} ' + '--container-name {container_name} ' + '--name {name} ' + '--file {file} ' + '--connection-string "{connection_string}"'.format( + account_name=account_name, + container_name=module_name_with_underscores_removed, + name=filename, + file=destpath, + connection_string=connection_string)) + if install_path: + return '{0} && {1}'.format(mkdir_command, download_command) + return download_command + + +def GenerateStatPreprovisionedDataCommand(module_name, filename): + """Returns a string used to download preprovisioned data.""" + module_name_with_underscores_removed = module_name.replace('_', '-') + account_name = FLAGS.azure_preprovisioned_data_bucket + connection_string = util.GetAzureStorageConnectionString(account_name, []) + return ('az storage blob show ' + '--account-name {account_name} ' + '--container-name {container_name} ' + '--name {name} ' + '--connection-string "{connection_string}"'.format( + account_name=account_name, + container_name=module_name_with_underscores_removed, + name=filename, + connection_string=connection_string)) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/azure/flags.py b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/flags.py new file mode 100644 index 0000000..ec6201c --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/flags.py @@ -0,0 +1,104 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing flags applicable across benchmark run on Azure.""" + +from absl import flags + + +NONE = 'None' +READ_ONLY = 'ReadOnly' +READ_WRITE = 'ReadWrite' +flags.DEFINE_enum( + 'azure_host_caching', NONE, + [NONE, READ_ONLY, READ_WRITE], + 'The type of host caching to use on Azure data disks.') +# Azure Storage Account types. See +# http://azure.microsoft.com/en-us/pricing/details/storage/ for more information +# about the different types. +LRS = 'Standard_LRS' +ULRS = 'UltraSSD_LRS' +PLRS = 'Premium_LRS' +ZRS = 'Standard_ZRS' +GRS = 'Standard_GRS' +RAGRS = 'Standard_RAGRS' + +STORAGE = 'Storage' +BLOB_STORAGE = 'BlobStorage' +VALID_TIERS = ['Basic', 'Standard', 'Premium'] + +# Azure redis cache tiers. See +# https://docs.microsoft.com/en-us/azure/redis-cache/cache-faq for information. +VALID_CACHE_SIZES = ['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', + 'P1', 'P2', 'P3', 'P4', 'P5'] + +flags.DEFINE_enum( + 'azure_storage_type', LRS, + [LRS, PLRS, ULRS, ZRS, GRS, RAGRS], + 'The type of storage account to create. See ' + 'http://azure.microsoft.com/en-us/pricing/details/storage/ for more ' + 'information. To use remote ssd scratch disks, you must use Premium_LRS. 
'
+    'If you use Premium_LRS, you must use the DS series of machines, or else '
+    'VM creation will fail.')
+
+flags.DEFINE_enum(
+    'azure_blob_account_kind', BLOB_STORAGE,
+    [STORAGE, BLOB_STORAGE],
+    'The type of storage account to use for blob storage. Choosing Storage '
+    'will let you use ZRS storage. Choosing BlobStorage will give you access '
+    'to Hot and Cold storage tiers.')
+
+flags.DEFINE_integer('azure_provisioned_iops', None,
+                     'IOPS for Provisioned IOPS volumes in Azure.')
+flags.DEFINE_integer('azure_provisioned_throughput', None,
+                     'Provisioned throughput (MB/s) for volumes in Azure.')
+
+flags.DEFINE_string('azure_preprovisioned_data_bucket', None,
+                    'Azure blob storage account where pre-provisioned data '
+                    'has been copied.')
+
+flags.DEFINE_boolean('azure_accelerated_networking', False,
+                     'Enable Azure Accelerated Networking. See '
+                     'https://docs.microsoft.com/en-us/azure/virtual-network/'
+                     'create-vm-accelerated-networking-cli '
+                     'for more information.')
+
+flags.DEFINE_enum('azure_tier', 'Basic', VALID_TIERS,
+                  'Performance tier to use for the machine type. Defaults to '
+                  'Basic.')
+
+flags.DEFINE_integer(
+    'azure_compute_units', None,
+    'Number of compute units to allocate for the machine type')
+
+flags.DEFINE_enum('azure_redis_size',
+                  'C3', VALID_CACHE_SIZES,
+                  'Azure redis cache size to use.')
+
+flags.DEFINE_boolean('azure_low_priority_vms', False,
+                     'Whether to set the priority to low for Azure VMs')
+
+flags.DEFINE_boolean('bootstrap_azure_service_principal', True,
+                     'Whether to use the current service principal credentials '
+                     'when passing a service principal to a service. This has '
+                     'no effect if the logged in user is not a service '
+                     'principal. This is useful, because service principals '
+                     "usually lack the 'User Authentication Admin' role that "
+                     'allows creation of new service principals.')
+flags.DEFINE_enum('sqldatawarehouse_client_interface', 'CLI', ['CLI', 'JDBC'],
+                  'The Runtime Interface used when interacting with Synapse.')
+flags.DEFINE_string('query_timeout', '600', 'Query timeout in seconds.')
+flags.DEFINE_string('min_tls_version', 'TLS1_2',
+                    'The minimum TLS version to be permitted on requests to '
+                    'storage. Azure interprets an unset value as TLS 1.0. '
+                    'Accepted values: TLS1_0, TLS1_1, TLS1_2.')
diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/azure/provider_info.py b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/provider_info.py
new file mode 100644
index 0000000..12d62ec
--- /dev/null
+++ b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/provider_info.py
@@ -0,0 +1,24 @@
+# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +"""Provider info for Azure.""" + +from perfkitbenchmarker import provider_info +from perfkitbenchmarker import providers + + +class AzureProviderInfo(provider_info.BaseProviderInfo): + + UNSUPPORTED_BENCHMARKS = ['mysql_service'] + CLOUD = providers.AZURE diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/azure/service_principal.py b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/service_principal.py new file mode 100644 index 0000000..b84d1ee --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/service_principal.py @@ -0,0 +1,124 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Contains classes/functions related to Azure Service Principals.""" + +import json +import logging + +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import object_storage_service +from perfkitbenchmarker import resource +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.linux_packages import azure_credentials +from perfkitbenchmarker.providers import azure + +FLAGS = flags.FLAGS + + +class ServicePrincipal(resource.BaseResource): + """Class representing an Azure service principal.""" + + _instance = None + + @classmethod + def GetInstance(cls): + """Returns the service principal instance.""" + if cls._instance is None: + if FLAGS.bootstrap_azure_service_principal: + cls._instance = cls.LoadFromFile() + else: + cls._instance = cls() + return cls._instance + + @classmethod + def LoadFromFile(cls): + """Loads a service principal from a file.""" + with open( + object_storage_service.FindCredentialFile( + azure_credentials.PROFILE_FILE), + encoding='utf-8-sig') as profile_fp: + subscriptions = json.load(profile_fp)['subscriptions'] + subscription = [sub for sub in subscriptions if sub['isDefault']][0] + subscription_type = subscription['user']['type'] + if subscription_type != 'servicePrincipal': + # We are using user auth, and will probably have permission to create a + # service principal. + logging.info("Azure credentials are of type '%s'. " + 'Will try to create a new service principal.', + subscription_type) + return cls() + # name and id are backwards + name = subscription['id'] + app_id = subscription['user']['name'] + + # If subscription_type is servicePrincipal then service_principals.json + # will exist. + with open( + object_storage_service.FindCredentialFile( + azure_credentials.SERVICE_PRINCIPAL_FILE) + ) as service_principals_fp: + for sp in json.load(service_principals_fp): + if sp['client_id'] == app_id: + logging.info("Azure credentials are of type 'servicePrincipal'. 
" + 'Will reuse them for benchmarking.') + return cls( + name, app_id, password=sp['client_secret'], user_managed=True) + logging.warning('No access tokens found matching Azure defaultProfile ' + 'Will try to create a new service principal.') + return cls() + + def __init__(self, name=None, app_id=None, password=None, user_managed=False): + super(ServicePrincipal, self).__init__(user_managed) + # Service principals can be referred to by user provided name as long as + # they are prefixed by http:// or by a server generated appId. + # Prefer user provided ID for idempotence when talking to Active Directory. + # When talking to AKS or ACR, app_id is required. + self.name = 'http://' + (name or 'pkb-' + FLAGS.run_uri) + self.app_id = app_id + self.password = password + + def _Create(self): + """Creates the service principal.""" + cmd = [ + azure.AZURE_PATH, 'ad', 'sp', 'create-for-rbac', '--name', self.name, + '--skip-assignment' + ] + stdout, _, _ = vm_util.IssueCommand(cmd) + response = json.loads(stdout) + if response: + self.app_id = response['appId'] + self.password = response['password'] + if not self.app_id or not self.password: + raise errors.Resource.CreationError( + 'Invalid creation response when creating service principal. ' + 'Expected appId and password. Received:\n' + stdout) + return True + return False + + def _Exists(self): + """Returns True if the service principal exists.""" + # Use show rather than list, because list requires admin privileges. + cmd = [azure.AZURE_PATH, 'ad', 'sp', 'show', '--id', self.app_id] + try: + vm_util.IssueCommand(cmd, raise_on_failure=True) + return True + except errors.VmUtil.IssueCommandError: + return False + + def _Delete(self): + """Deletes the service principal.""" + cmd = [azure.AZURE_PATH, 'ad', 'sp', 'delete', '--id', self.app_id] + vm_util.IssueCommand(cmd) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/azure/util.py b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/util.py new file mode 100644 index 0000000..26b29e0 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/azure/util.py @@ -0,0 +1,223 @@ +# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Utilities for working with Azure resources.""" + + +import json +import re +from typing import Any, Dict, Set + +from absl import flags +from perfkitbenchmarker import context +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers import azure +import six + +AZURE_PATH = azure.AZURE_PATH +AZURE_SUFFIX = ['--output', 'json'] +FLAGS = flags.FLAGS + + +def GetAzureStorageConnectionString(storage_account_name, resource_group_args): + """Get connection string.""" + stdout, _ = vm_util.IssueRetryableCommand( + [AZURE_PATH, 'storage', 'account', 'show-connection-string', + '--name', storage_account_name] + resource_group_args + AZURE_SUFFIX) + response = json.loads(stdout) + return response['connectionString'] + + +def GetAzureStorageConnectionArgs(storage_account_name, resource_group_args): + """Get connection CLI arguments.""" + return ['--connection-string', + GetAzureStorageConnectionString(storage_account_name, + resource_group_args)] + + +def GetAzureStorageAccountKey(storage_account_name, resource_group_args): + """Get storage account key.""" + stdout, _ = vm_util.IssueRetryableCommand( + [AZURE_PATH, 'storage', 'account', 'keys', 'list', + '--account-name', storage_account_name] + + resource_group_args + AZURE_SUFFIX) + + response = json.loads(stdout) + # A new storage account comes with two keys, but we only need one. + assert response[0]['permissions'].lower() == 'full' + return response[0]['value'] + + +def FormatTag(key, value): + """Format an individual tag for use with the --tags param of Azure CLI.""" + return '{0}={1}'.format(key, value) + + +def FormatTags(tags_dict): + """Format a dict of tags into arguments for 'tag' parameter. + + Args: + tags_dict: Tags to be formatted. + + Returns: + A list of tags formatted as arguments for 'tag' parameter. + """ + return [FormatTag(k, v) for k, v in sorted(six.iteritems(tags_dict))] + + +def GetResourceTags(timeout_minutes): + """Gets a dict of tags. + + Args: + timeout_minutes: int, Timeout used for setting the timeout_utc tag. + + Returns: + A dict contains formatted tags. + """ + benchmark_spec = context.GetThreadBenchmarkSpec() + return benchmark_spec.GetResourceTags(timeout_minutes) + + +def GetTags(timeout_minutes): + """Gets a list of tags to be used with the --tags param of Azure CLI. + + Args: + timeout_minutes: int, Timeout used for setting the timeout_utc tag. + + Returns: + A string contains formatted tags. + """ + return FormatTags(GetResourceTags(timeout_minutes)) + + +def GetTagsJson(timeout_minutes): + """Gets a JSON string of tags to be used with the --set param of Azure CLI. + + Args: + timeout_minutes: int, Timeout used for setting the timeout_utc tag. + + Returns: + A string contains json formatted tags. + """ + return 'tags={}'.format(json.dumps(GetResourceTags(timeout_minutes))) + + +def _IsRegion(zone_or_region): + """Returns whether "zone_or_region" is a region.""" + return re.match(r'[a-z]+[0-9]?$', zone_or_region, re.IGNORECASE) + + +def _IsRecommendedRegion(json_object: Dict[str, Any]) -> bool: + return json_object['metadata']['regionCategory'] == 'Recommended' + + +def IsZone(zone_or_region): + """Returns whether "zone_or_region" is a zone. + + Args: + zone_or_region: string, Azure zone or region. Format for Azure + availability + zone support is "region-availability_zone". Example: eastus2-1 specifies + Azure region eastus2 with availability zone 1. 
+ """ + + return re.match(r'[a-z]+[0-9]?-[0-9]$', zone_or_region, re.IGNORECASE) + + +def GetRegionFromZone(zone_or_region: str) -> str: + """Returns the region a zone is in (or "zone_or_region" if it's a region).""" + if _IsRegion(zone_or_region): + return zone_or_region + if IsZone(zone_or_region): + return zone_or_region[:-2] + + raise ValueError('%s is not a valid Azure zone or region name' % + zone_or_region) + + +def GetZonesInRegion(region: str) -> Set[str]: + """Returns a set of zones in the region.""" + # As of 2021 all Azure AZs are numbered 1-3 for eligible regions. + return set([f'{region}-{i}' for i in range(1, 4)]) + + +def ShouldKeepZoneFromCLI(zone: str) -> bool: + """Filter out zones that we can't access.""" + if 'EUAP' in zone: + return False + return True + + +def GetZonesFromMachineType() -> Set[str]: + """Returns a set of zones for a machine type.""" + stdout, _ = vm_util.IssueRetryableCommand( + [AZURE_PATH, 'vm', 'list-skus', '--size', FLAGS.machine_type]) + zones = set() + for item in json.loads(stdout): + for location_info in item['locationInfo']: + region = location_info['location'] + for zone in location_info['zones']: + if ShouldKeepZoneFromCLI(f'{region}-{zone}'): + zones.add(f'{region}-{zone}') + return zones + + +def GetAllRegions() -> Set[str]: + """Returns all valid regions.""" + stdout, _ = vm_util.IssueRetryableCommand([ + AZURE_PATH, 'account', 'list-locations', '--output', 'json' + ]) + # Filter out staging regions from the output. + return set([ + item['name'] for item in json.loads(stdout) if _IsRecommendedRegion(item) + ]) + + +def GetAllZones() -> Set[str]: + """Returns all valid availability zones.""" + zones = set() + for region in GetAllRegions(): + zones.update(GetZonesInRegion(region)) + return zones + + +def GetGeoFromRegion(region: str) -> str: + """Gets valid geo from the region, i.e. region westus2 returns US.""" + stdout, _ = vm_util.IssueRetryableCommand([ + AZURE_PATH, 'account', 'list-locations', + '--output', 'json', + '--query', f"[?name == '{region}'].metadata.geographyGroup" + ]) + return stdout.splitlines()[1].strip('" ') + + +def GetRegionsInGeo(geo: str) -> Set[str]: + """Gets valid regions in the geo.""" + stdout, _ = vm_util.IssueRetryableCommand([ + AZURE_PATH, 'account', 'list-locations', + '--output', 'json', + '--query', f"[?metadata.geographyGroup == '{geo}']" + ]) + return set([ + item['name'] for item in json.loads(stdout) if _IsRecommendedRegion(item) + ]) + + +def GetAvailabilityZoneFromZone(zone_or_region): + """Returns the Availability Zone from a zone.""" + if IsZone(zone_or_region): + return zone_or_region[-1] + if _IsRegion(zone_or_region): + return None + raise ValueError('%s is not a valid Azure zone' % zone_or_region) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/__init__.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/__init__.py new file mode 100644 index 0000000..49b94a7 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Provider for GCP.""" diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/bigquery.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/bigquery.py new file mode 100644 index 0000000..2d375b0 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/bigquery.py @@ -0,0 +1,618 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for GCP's Bigquery EDW service.""" + +import copy +import datetime +import json +import logging +import os +import re +from typing import Dict, List, Text, Tuple + +from absl import flags +from perfkitbenchmarker import data +from perfkitbenchmarker import edw_service +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.linux_packages import google_cloud_sdk +from perfkitbenchmarker.providers.gcp import util as gcp_util + + +FLAGS = flags.FLAGS + +DEFAULT_TABLE_EXPIRATION = 3600 * 24 * 365 # seconds + + +def GetBigQueryClientInterface( + project_id: str, dataset_id: str) -> edw_service.EdwClientInterface: + """Builds and Returns the requested BigQuery client Interface. + + Args: + project_id: String name of the BigQuery project to benchmark + dataset_id: String name of the BigQuery dataset to benchmark + + Returns: + A concrete Client Interface object (subclass of GenericClientInterface) + + Raises: + RuntimeError: if an unsupported bq_client_interface is requested + """ + if FLAGS.bq_client_interface == 'CLI': + return CliClientInterface(project_id, dataset_id) + if FLAGS.bq_client_interface == 'JAVA': + return JavaClientInterface(project_id, dataset_id) + if FLAGS.bq_client_interface == 'SIMBA_JDBC_1_2_4_1007': + return JdbcClientInterface(project_id, dataset_id) + raise RuntimeError('Unknown BigQuery Client Interface requested.') + + +class GenericClientInterface(edw_service.EdwClientInterface): + """Generic Client Interface class for BigQuery. + + Attributes: + project_id: String name of the BigQuery project to benchmark + dataset_id: String name of the BigQuery dataset to benchmark + """ + + def __init__(self, project_id: str, dataset_id: str): + self.project_id = project_id + self.dataset_id = dataset_id + + def GetMetadata(self) -> Dict[str, str]: + """Gets the Metadata attributes for the Client Interface.""" + return {'client': FLAGS.bq_client_interface} + + +class CliClientInterface(GenericClientInterface): + """Command Line Client Interface class for BigQuery. + + Uses the native Bigquery client that ships with the google_cloud_sdk + https://cloud.google.com/bigquery/docs/bq-command-line-tool. + """ + + def Prepare(self, package_name: str) -> None: + """Prepares the client vm to execute query. + + Installs the bq tool dependencies and authenticates using a service account. + + Args: + package_name: String name of the package defining the preprovisioned data + (certificates, etc.) 
to extract and use during client vm preparation. + """ + self.client_vm.Install('pip') + self.client_vm.RemoteCommand('sudo pip install absl-py') + self.client_vm.Install('google_cloud_sdk') + + # Push the service account file to the working directory on client vm + key_file_name = FLAGS.gcp_service_account_key_file.split('/')[-1] + if '/' in FLAGS.gcp_service_account_key_file: + self.client_vm.PushFile(FLAGS.gcp_service_account_key_file) + else: + self.client_vm.InstallPreprovisionedPackageData( + package_name, [FLAGS.gcp_service_account_key_file], '') + + # Authenticate using the service account file + vm_gcloud_path = google_cloud_sdk.GCLOUD_PATH + activate_cmd = ('{} auth activate-service-account {} --key-file={}'.format( + vm_gcloud_path, FLAGS.gcp_service_account, key_file_name)) + self.client_vm.RemoteCommand(activate_cmd) + + # Push the framework to execute a sql query and gather performance details + service_specific_dir = os.path.join('edw', Bigquery.SERVICE_TYPE) + self.client_vm.PushFile( + data.ResourcePath( + os.path.join(service_specific_dir, 'script_runner.sh'))) + runner_permission_update_cmd = 'chmod 755 {}'.format('script_runner.sh') + self.client_vm.RemoteCommand(runner_permission_update_cmd) + self.client_vm.PushFile( + data.ResourcePath(os.path.join('edw', 'script_driver.py'))) + self.client_vm.PushFile( + data.ResourcePath( + os.path.join(service_specific_dir, + 'provider_specific_script_driver.py'))) + + def ExecuteQuery(self, query_name: Text) -> Tuple[float, Dict[str, str]]: + """Executes a query and returns performance details. + + Args: + query_name: String name of the query to execute + + Returns: + A tuple of (execution_time, execution details) + execution_time: A Float variable set to the query's completion time in + secs. -1.0 is used as a sentinel value implying the query failed. For a + successful query the value is expected to be positive. + performance_details: A dictionary of query execution attributes eg. job_id + """ + query_command = ('python script_driver.py --script={} --bq_project_id={} ' + '--bq_dataset_id={}').format(query_name, + self.project_id, + self.dataset_id) + stdout, _ = self.client_vm.RemoteCommand(query_command) + performance = json.loads(stdout) + details = copy.copy(self.GetMetadata()) # Copy the base metadata + details['job_id'] = performance[query_name]['job_id'] + return float(performance[query_name]['execution_time']), details + + +class JdbcClientInterface(GenericClientInterface): + """JDBC Client Interface class for BigQuery. + + https://cloud.google.com/bigquery/providers/simba-drivers + """ + + def SetProvisionedAttributes(self, benchmark_spec): + super(JdbcClientInterface, + self).SetProvisionedAttributes(benchmark_spec) + self.project_id = re.split(r'\.', + benchmark_spec.edw_service.cluster_identifier)[0] + self.dataset_id = re.split(r'\.', + benchmark_spec.edw_service.cluster_identifier)[1] + + def Prepare(self, package_name: str) -> None: + """Prepares the client vm to execute query. + + Installs + a) Java Execution Environment, + b) BigQuery Authnetication Credentials, + c) JDBC Application to execute a query and gather execution details, + d) Simba JDBC BigQuery client code dependencencies, and + e) The Simba JDBC interface jar + + Args: + package_name: String name of the package defining the preprovisioned data + (certificates, etc.) to extract and use during client vm preparation. 
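+
+    This interface is only selected when --bq_client_interface is set to
+    SIMBA_JDBC_1_2_4_1007 (see GetBigQueryClientInterface above). The pushed
+    jars are later invoked by ExecuteQuery below, roughly as (illustrative):
+      java -cp bq-jdbc-client-1.0.jar:GoogleBigQueryJDBC42.jar \
+          com.google.cloud.performance.edw.App --project <project> ...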
+ """ + self.client_vm.Install('openjdk') + + # Push the service account file to the working directory on client vm + self.client_vm.InstallPreprovisionedPackageData( + package_name, [FLAGS.gcp_service_account_key_file], '') + + # Push the executable jars to the working directory on client vm + self.client_vm.InstallPreprovisionedPackageData( + package_name, ['bq-jdbc-client-1.0.jar', 'GoogleBigQueryJDBC42.jar'], + '') + + def ExecuteQuery(self, query_name: Text) -> Tuple[float, Dict[str, str]]: + """Executes a query and returns performance details. + + Args: + query_name: String name of the query to execute + + Returns: + A tuple of (execution_time, execution details) + execution_time: A Float variable set to the query's completion time in + secs. -1.0 is used as a sentinel value implying the query failed. For a + successful query the value is expected to be positive. + performance_details: A dictionary of query execution attributes eg. job_id + """ + query_command = ( + 'java -cp bq-jdbc-client-1.0.jar:GoogleBigQueryJDBC42.jar ' + 'com.google.cloud.performance.edw.App --project {} --service_account ' + '{} --credentials_file {} --dataset {} --query_file {}'.format( + self.project_id, FLAGS.gcp_service_account, + FLAGS.gcp_service_account_key_file, self.dataset_id, query_name)) + stdout, _ = self.client_vm.RemoteCommand(query_command) + details = copy.copy(self.GetMetadata()) # Copy the base metadata + details.update(json.loads(stdout)['details']) + return json.loads(stdout)['performance'], details + + +class JavaClientInterface(GenericClientInterface): + """Native Java Client Interface class for BigQuery. + + https://cloud.google.com/bigquery/docs/reference/libraries#client-libraries-install-java + """ + + def Prepare(self, package_name: str) -> None: + """Prepares the client vm to execute query. + + Installs the Java Execution Environment and a uber jar with + a) BigQuery Java client libraries, + b) An application to execute a query and gather execution details, and + c) their dependencies. + + Args: + package_name: String name of the package defining the preprovisioned data + (certificates, etc.) to extract and use during client vm preparation. + """ + self.client_vm.Install('openjdk') + + # Push the service account file to the working directory on client vm + if '/' in FLAGS.gcp_service_account_key_file: + self.client_vm.PushFile(FLAGS.gcp_service_account_key_file) + else: + self.client_vm.InstallPreprovisionedPackageData( + package_name, [FLAGS.gcp_service_account_key_file], '') + # Push the executable jar to the working directory on client vm + self.client_vm.InstallPreprovisionedPackageData(package_name, + ['bq-java-client-2.3.jar'], + '') + + def ExecuteQuery(self, query_name: Text) -> Tuple[float, Dict[str, str]]: + """Executes a query and returns performance details. + + Args: + query_name: String name of the query to execute. + + Returns: + A tuple of (execution_time, execution details) + execution_time: A Float variable set to the query's completion time in + secs. -1.0 is used as a sentinel value implying the query failed. For a + successful query the value is expected to be positive. + performance_details: A dictionary of query execution attributes eg. 
job_id + """ + key_file_name = FLAGS.gcp_service_account_key_file + if '/' in FLAGS.gcp_service_account_key_file: + key_file_name = FLAGS.gcp_service_account_key_file.split('/')[-1] + + query_command = ('java -cp bq-java-client-2.3.jar ' + 'com.google.cloud.performance.edw.Single --project {} ' + '--credentials_file {} --dataset {} ' + '--query_file {}').format(self.project_id, key_file_name, + self.dataset_id, query_name) + stdout, _ = self.client_vm.RemoteCommand(query_command) + details = copy.copy(self.GetMetadata()) # Copy the base metadata + details.update(json.loads(stdout)['details']) + return json.loads(stdout)['query_wall_time_in_secs'], details + + def ExecuteSimultaneous(self, submission_interval: int, + queries: List[str]) -> str: + """Executes queries simultaneously on client and return performance details. + + Simultaneous app expects queries as white space separated query file names. + + Args: + submission_interval: Simultaneous query submission interval in + milliseconds. + queries: List of strings (names) of queries to execute. + + Returns: + A serialized dictionary of execution details. + """ + key_file_name = FLAGS.gcp_service_account_key_file + if '/' in FLAGS.gcp_service_account_key_file: + key_file_name = os.path.basename(FLAGS.gcp_service_account_key_file) + cmd = ('java -cp bq-java-client-2.3.jar ' + 'com.google.cloud.performance.edw.Simultaneous --project {} ' + '--credentials_file {} --dataset {} --submission_interval {} ' + '--query_files {}'.format(self.project_id, key_file_name, + self.dataset_id, submission_interval, + ' '.join(queries))) + stdout, _ = self.client_vm.RemoteCommand(cmd) + return stdout + + def ExecuteThroughput(self, concurrency_streams: List[List[str]]) -> str: + """Executes a throughput test and returns performance details. + + Args: + concurrency_streams: List of streams to execute simultaneously, each of + which is a list of string names of queries. + + Returns: + A serialized dictionary of execution details. + """ + key_file_name = FLAGS.gcp_service_account_key_file + if '/' in FLAGS.gcp_service_account_key_file: + key_file_name = os.path.basename(FLAGS.gcp_service_account_key_file) + cmd = ('java -cp bq-java-client-2.3.jar ' + 'com.google.cloud.performance.edw.Throughput --project {} ' + '--credentials_file {} --dataset {} --query_streams {}'.format( + self.project_id, key_file_name, self.dataset_id, + ' '.join([','.join(stream) for stream in concurrency_streams]))) + stdout, _ = self.client_vm.RemoteCommand(cmd) + return stdout + + +class Bigquery(edw_service.EdwService): + """Object representing a Bigquery cluster. + + Attributes: + job_id_prefix: A string prefix for the job id for bigquery job. + """ + + CLOUD = providers.GCP + SERVICE_TYPE = 'bigquery' + + def __init__(self, edw_service_spec): + super(Bigquery, self).__init__(edw_service_spec) + project_id = re.split(r'\.', self.cluster_identifier)[0] + dataset_id = re.split(r'\.', self.cluster_identifier)[1] + self.client_interface = GetBigQueryClientInterface(project_id, dataset_id) + + def _Create(self): + """Create a BigQuery cluster. + + Bigquery clusters creation is out of scope of the benchmarking. + """ + raise NotImplementedError + + def _Exists(self): + """Method to validate the existence of a Bigquery cluster. + + Returns: + Boolean value indicating the existence of a cluster. + """ + return True + + def _Delete(self): + """Delete a BigQuery cluster. + + Bigquery cluster deletion is out of scope of benchmarking. 
+ """ + raise NotImplementedError + + def GetMetadata(self): + """Return a dictionary of the metadata for the BigQuery cluster.""" + basic_data = super(Bigquery, self).GetMetadata() + basic_data.update(self.client_interface.GetMetadata()) + return basic_data + + def FormatProjectAndDatasetForCommand(self, dataset=None): + """Returns the project and dataset in the format needed for bq commands. + + E.g., project:dataset. + + Args: + dataset: The dataset to run commands against. If None, extracts the + dataset from the cluster identifier whose format is "project.dataset"). + """ + return ((self.cluster_identifier.split('.')[0] + ':' + + dataset) if dataset else self.cluster_identifier.replace('.', ':')) + + def GetDatasetLastUpdatedTime(self, dataset=None): + """Get the formatted last modified timestamp of the dataset.""" + cmd = [ + 'bq', 'show', '--format=prettyjson', + self.FormatProjectAndDatasetForCommand(dataset) + ] + dataset_metadata, _, _ = vm_util.IssueCommand(cmd) + metadata_json = json.loads(str(dataset_metadata)) + return datetime.datetime.fromtimestamp( + float(metadata_json['lastModifiedTime']) / + 1000.0).strftime('%Y-%m-%d_%H-%M-%S') + + def GetAllTablesInDataset(self, dataset=None): + """Returns a list of the IDs of all the tables in the dataset.""" + cmd = [ + 'bq', 'ls', '--format=prettyjson', + self.FormatProjectAndDatasetForCommand(dataset) + ] + tables_list, _, _ = vm_util.IssueCommand(cmd) + all_tables = [] + for table in json.loads(str(tables_list)): + if table['type'] == 'TABLE': + all_tables.append(table['tableReference']['tableId']) + return all_tables + + def ExtractDataset(self, + dest_bucket, + dataset=None, + tables=None, + dest_format='CSV'): + """Extract all tables in a dataset to a GCS bucket. + + Args: + dest_bucket: Name of the bucket to extract the data to. Should already + exist. + dataset: Optional name of the dataset. If none, will be extracted from the + cluster_identifier. + tables: Optional list of table names to extract. If none, all tables in + the dataset will be extracted. + dest_format: Format to extract data in. Can be one of: CSV, JSON, or Avro. + """ + if tables is None: + tables = self.GetAllTablesInDataset(dataset) + gcs_uri = 'gs://' + dest_bucket + + # Make sure the bucket is empty. + vm_util.IssueCommand(['gsutil', '-m', 'rm', gcs_uri + '/**'], + raise_on_failure=False) + + project_dataset = self.FormatProjectAndDatasetForCommand(dataset) + for table in tables: + cmd = [ + 'bq', 'extract', + '--destination_format=%s' % dest_format, + '%s.%s' % (project_dataset, table), + '%s/%s/*.csv' % (gcs_uri, table) + ] + _, stderr, retcode = vm_util.IssueCommand(cmd) + # There is a 10T daily limit on extracting from BQ. Large datasets will + # inherently hit this limit and benchmarks shouldn't use those. + gcp_util.CheckGcloudResponseKnownFailures(stderr, retcode) + + def RemoveDataset(self, dataset=None): + """Removes a dataset. + + See https://cloud.google.com/bigquery/docs/managing-tables#deleting_tables + + Args: + dataset: Optional name of the dataset. If none, will be extracted from the + cluster_identifier. + """ + project_dataset = self.FormatProjectAndDatasetForCommand(dataset) + vm_util.IssueCommand(['bq', 'rm', '-r', '-f', '-d', project_dataset], + raise_on_failure=False) + + def CreateDataset(self, dataset=None, description=None): + """Creates a new dataset. + + See https://cloud.google.com/bigquery/docs/tables + + Args: + dataset: Optional name of the dataset. If none, will be extracted from the + cluster_identifier. 
+ description: Optional description of the dataset. Escape double quotes. + """ + project_dataset = self.FormatProjectAndDatasetForCommand(dataset) + cmd = [ + 'bq', 'mk', '--dataset', + '--default_table_expiration=%d' % DEFAULT_TABLE_EXPIRATION + ] + if description: + cmd.extend(['--description', '"%s"' % description]) + cmd.append(project_dataset) + vm_util.IssueCommand(cmd) + + cmd = ['bq', 'update'] + for key, value in gcp_util.GetDefaultTags().items(): + cmd.extend(['--set_label', f'{key}:{value}']) + cmd.append(project_dataset) + vm_util.IssueCommand(cmd) + + def LoadDataset(self, + source_bucket, + tables, + schema_dir, + dataset=None, + append=True, + skip_header_row=True, + field_delimiter=','): + """Load all tables in a dataset to a database from CSV object storage. + + See https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv + + Args: + source_bucket: Name of the bucket to load the data from. Should already + exist. Each table must have its own subfolder in the bucket named after + the table, containing one or more csv files that make up the table data. + tables: List of table names to load. + schema_dir: GCS directory containing json schemas of all tables to load. + dataset: Optional name of the dataset. If none, will be extracted from the + cluster_identifier. + append: If True, appends loaded data to the existing set. If False, + replaces the existing data (if any). + skip_header_row: If True, skips the first row of data being loaded. + field_delimiter: The separator for fields in the CSV file. + """ + project_dataset = self.FormatProjectAndDatasetForCommand(dataset) + for table in tables: + schema_path = schema_dir + table + '.json' + local_schema = './%s.json' % table + vm_util.IssueCommand(['gsutil', 'cp', schema_path, local_schema]) + cmd = [ + 'bq', 'load', '--noreplace' if append else '--replace', + '--source_format=CSV', + '--field_delimiter=%s' % field_delimiter, + '--skip_leading_rows=%d' % (1 if skip_header_row else 0), + '%s.%s' % (project_dataset, table), + 'gs://%s/%s/*.csv' % (source_bucket, table), local_schema + ] + _, stderr, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) + if retcode: + logging.warning('Loading table %s failed. stderr: %s, retcode: %s', + table, stderr, retcode) + + cmd = ['bq', 'update'] + for key, value in gcp_util.GetDefaultTags().items(): + cmd.extend(['--set_label', f'{key}:{value}']) + cmd.append(f'{project_dataset}.{table}') + vm_util.IssueCommand(cmd) + + +class Endor(Bigquery): + """Class representing BigQuery Endor service.""" + + SERVICE_TYPE = 'endor' + + def GetMetadata(self) -> Dict[str, str]: + """Return a dictionary of the metadata for the BigQuery Endor service. + + Returns: + A dictionary set to Endor service details. + """ + basic_data = super(Endor, self).GetMetadata() + basic_data['edw_service_type'] = 'endor' + basic_data.update(self.client_interface.GetMetadata()) + basic_data.update(self.GetDataDetails()) + return basic_data + + def GetDataDetails(self) -> Dict[str, str]: + """Returns a dictionary with underlying data details. + + cluster_identifier = . + Data details are extracted from the dataset_id that follows the format: + ____ + eg. + tpch100_parquet_uncompressed_unpartitoned_s3 + + Returns: + A dictionary set to underlying data's details (format, etc.) 
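+
+    For the example dataset_id above, the parsed result would be:
+      {'format': 'parquet', 'compression': 'uncompressed',
+       'partitioning': 'unpartitoned', 'location': 's3'}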
+ """ + data_details = {} + dataset_id = re.split(r'\.', self.cluster_identifier)[1] + parsed_id = re.split(r'_', dataset_id) + data_details['format'] = parsed_id[1] + data_details['compression'] = parsed_id[2] + data_details['partitioning'] = parsed_id[3] + data_details['location'] = parsed_id[4] + return data_details + + +class Endorazure(Endor): + """Class representing BigQuery Endor Azure service.""" + + SERVICE_TYPE = 'endorazure' + + def GetMetadata(self) -> Dict[str, str]: + """Return a dictionary of the metadata for the BigQuery Endor Azure service. + + Returns: + A dictionary set to Endor Azure service details. + """ + basic_data = super(Endorazure, self).GetMetadata() + basic_data['edw_service_type'] = 'endorazure' + return basic_data + + +class Bqfederated(Bigquery): + """Class representing BigQuery Federated service.""" + + SERVICE_TYPE = 'bqfederated' + + def GetMetadata(self) -> Dict[str, str]: + """Return a dictionary of the metadata for the BigQuery Federated service. + + Returns: + A dictionary set to Federated service details. + """ + basic_data = super(Bqfederated, self).GetMetadata() + basic_data['edw_service_type'] = Bqfederated.SERVICE_TYPE + basic_data.update(self.client_interface.GetMetadata()) + basic_data.update(self.GetDataDetails()) + return basic_data + + def GetDataDetails(self) -> Dict[str, str]: + """Returns a dictionary with underlying data details. + + cluster_identifier = . + Data details are extracted from the dataset_id that follows the format: + ____ + eg. + tpch10000_parquet_compressed_partitoned_gcs + + Returns: + A dictionary set to underlying data's details (format, etc.) + """ + data_details = {} + dataset_id = re.split(r'\.', self.cluster_identifier)[1] + parsed_id = re.split(r'_', dataset_id) + data_details['format'] = parsed_id[1] + data_details['compression'] = parsed_id[2] + data_details['partitioning'] = parsed_id[3] + data_details['location'] = parsed_id[4] + return data_details diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/flags.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/flags.py new file mode 100644 index 0000000..da48ea5 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/flags.py @@ -0,0 +1,172 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing flags applicable across benchmark run on GCP.""" + +from absl import flags + +# Sentinel value for unspecified platform. +GCP_MIN_CPU_PLATFORM_NONE = 'none' + +flags.DEFINE_string('gcloud_path', 'gcloud', 'The path for the gcloud utility.') +flags.DEFINE_list('additional_gcloud_flags', [], + 'Additional flags to pass to gcloud.') +flags.DEFINE_integer( + 'gce_num_local_ssds', 0, + 'The number of ssds that should be added to the VM. 
Note ' + 'that this is currently only supported in certain zones ' + '(see https://cloud.google.com/compute/docs/local-ssd).') +flags.DEFINE_string( + 'gcloud_scopes', None, 'If set, space-separated list of ' + 'scopes to apply to every created machine') +flags.DEFINE_boolean('gce_migrate_on_maintenance', True, 'If true, allow VM ' + 'migration on GCE host maintenance.') +flags.DEFINE_boolean('gce_automatic_restart', False, 'If true, allow VM ' + 'to restart when crashes.') +flags.DEFINE_boolean('gce_preemptible_vms', False, 'If true, use preemptible ' + 'VMs on GCE.') +flags.DEFINE_string( + 'image_family', None, 'The family of the image that the boot disk will be ' + 'initialized with. The --image flag will take priority over this flag. See:' + ' https://cloud.google.com/sdk/gcloud/reference/compute/instances/create') +flags.DEFINE_string( + 'image_project', None, 'The project against which all image references will' + ' be resolved. See: ' + 'https://cloud.google.com/sdk/gcloud/reference/compute/disks/create') +flags.DEFINE_string( + 'gce_network_name', None, 'The name of an already created ' + 'network to use instead of creating a new one.') +flags.DEFINE_string( + 'gce_subnet_name', None, 'The name of an already created ' + 'subnet to use instead of creating a new one.') +flags.DEFINE_string( + 'gce_subnet_region', None, 'Region to create subnet in ' + 'instead of automatically creating one in every region.') +flags.DEFINE_string( + 'gce_subnet_addr', '10.128.0.0/20', 'Address range to the ' + 'subnet, given in CDR notation. Not used unless ' + '--gce_subnet_region is given.') +flags.DEFINE_string( + 'gce_remote_access_firewall_rule', None, 'The name of an ' + 'already created firewall rule which allows remote access ' + 'instead of creating a new one.') +flags.DEFINE_multi_string( + 'gcp_instance_metadata_from_file', [], + 'A colon separated key-value pair that will be added to the ' + '"--metadata-from-file" flag of the gcloud cli (with the colon replaced by ' + 'the equal sign). Multiple key-value pairs may be specified by separating ' + 'each pair by commas. This option can be repeated multiple times. For ' + 'information about GCP instance metadata, see: --metadata-from-file from ' + '`gcloud help compute instances create`.') +flags.DEFINE_multi_string( + 'gcp_instance_metadata', [], + 'A colon separated key-value pair that will be added to the ' + '"--metadata" flag of the gcloud cli (with the colon replaced by the equal ' + 'sign). Multiple key-value pairs may be specified by separating each pair ' + 'by commas. This option can be repeated multiple times. 
For information ' + 'about GCP instance metadata, see: --metadata from ' + '`gcloud help compute instances create`.') +flags.DEFINE_integer('gce_boot_disk_size', None, + 'The boot disk size in GB for GCP VMs.') +flags.DEFINE_enum('gce_boot_disk_type', None, ['pd-standard', 'pd-ssd', 'pd-extreme'], + 'The boot disk type for GCP VMs.') +flags.DEFINE_enum('gce_ssd_interface', 'SCSI', ['SCSI', 'NVME'], + 'The ssd interface for GCE local SSD.') +flags.DEFINE_enum('gce_nic_type', 'VIRTIO_NET', ['VIRTIO_NET', 'GVNIC'], + 'The virtual NIC type of GCE VMs.') +EGRESS_BANDWIDTH_TIER = flags.DEFINE_enum( + 'gce_egress_bandwidth_tier', None, ['TIER_1'], + 'Egress bandwidth tier of the GCE VMs.') + +flags.DEFINE_string('gcp_node_type', None, + 'The node type of all sole tenant hosts that get created.') +flags.DEFINE_enum( + 'gcp_min_cpu_platform', None, [ + GCP_MIN_CPU_PLATFORM_NONE, 'sandybridge', 'ivybridge', 'haswell', + 'broadwell', 'skylake', 'cascadelake', 'icelake', 'rome', 'milan' + ], 'When specified, the VM will have either the specified ' + 'architecture or a newer one. Architecture availability is zone dependent.') +flags.DEFINE_string( + 'gce_accelerator_type_override', None, + 'When specified, override the accelerator_type string passed to the gcloud ' + 'compute instance create command.') +flags.DEFINE_string('gcp_preprovisioned_data_bucket', None, + 'GCS bucket where pre-provisioned data has been copied.') +flags.DEFINE_integer('gcp_redis_gb', 5, 'Size of redis cluster in gb') +flags.DEFINE_string('gcp_service_account', None, 'Service account to use for ' + 'authorization.') +flags.DEFINE_string( + 'gcp_service_account_key_file', None, + 'Local path to file that contains a private authorization ' + 'key, used to activate gcloud.') +flags.DEFINE_list('gce_tags', None, 'List of --tags when creating a VM') +flags.DEFINE_boolean('gke_enable_alpha', False, + 'Whether to enable alpha kubernetes clusters.') +flags.DEFINE_string('gcp_dataproc_subnet', None, + 'Specifies the subnet that the cluster will be part of.') +flags.DEFINE_multi_string('gcp_dataproc_property', [], + 'Specifies configuration properties for installed ' + 'packages, such as Hadoop and Spark. Properties are ' + 'mapped to configuration files by specifying a prefix' + ', such as "core:io.serializations". ' + 'See https://cloud.google.com/dataproc/docs/concepts/' + 'configuring-clusters/cluster-properties ' + 'for details.') +flags.DEFINE_string('gcp_dataproc_image', None, + 'Specifies the custom image URI or the custom image name ' + 'that will be used to create a cluster.') +flags.DEFINE_boolean('gcp_internal_ip', False, + 'Use internal ips for ssh or scp commands. gcloud beta' + 'components must be installed to use this flag.') +flags.DEFINE_enum('gce_network_tier', 'premium', ['premium', 'standard'], + 'Network tier to use for all GCE VMs. Note that standard ' + 'networking is only available in certain regions. See ' + 'https://cloud.google.com/network-tiers/docs/overview') +flags.DEFINE_boolean( + 'gce_shielded_secure_boot', False, + 'Whether the image uses the shielded VM feature') +flags.DEFINE_boolean('gce_firewall_rules_clean_all', False, + 'Determines whether all the gce firewall rules should be ' + 'cleaned up before deleting the network. If firewall ' + 'rules are added manually, PKB will not know about all of ' + 'them. 
However, they must be deleted in order to ' + 'successfully delete the PKB-created network.') +flags.DEFINE_enum('bq_client_interface', 'CLI', + ['CLI', 'JAVA', 'SIMBA_JDBC_1_2_4_1007'], + 'The Runtime Interface used when interacting with BigQuery.') +flags.DEFINE_string('gcp_preemptible_status_bucket', None, + 'The GCS bucket to store the preemptible status when ' + 'running on GCP.') +flags.DEFINE_boolean( + 'gce_confidential_compute', False, + 'Whether the image uses the confidential VM feature') +flags.DEFINE_integer( + 'gcp_provisioned_iops', 100000, + 'Iops to provision for pd-extreme. Defaults to the gcloud ' + 'default of 100000.') +API_OVERRIDE = flags.DEFINE_string( + 'gcp_cloud_redis_api_override', + default='https://redis.googleapis.com/', + help='Cloud redis API endpoint override. Defaults to prod.') + + +def _ValidatePreemptFlags(flags_dict): + if flags_dict['gce_preemptible_vms']: + return bool(flags_dict['gcp_preemptible_status_bucket']) + return True + + +flags.register_multi_flags_validator( + ['gce_preemptible_vms', 'gcp_preemptible_status_bucket'], + _ValidatePreemptFlags, 'When gce_preemptible_vms is specified, ' + 'gcp_preemptible_status_bucket must be specified.') diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gce_disk.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gce_disk.py new file mode 100644 index 0000000..47985e9 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gce_disk.py @@ -0,0 +1,192 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing classes related to GCE disks. + +Disks can be created, deleted, attached to VMs, and detached from VMs. +Use 'gcloud compute disk-types list' to determine valid disk types. 
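+
+A rough usage sketch (disk_spec, names, zone and project are illustrative):
+
+  scratch = GceDisk(disk_spec, 'pkb-scratch-0', 'us-central1-a', 'my-project')
+  scratch.Create()                  # gcloud compute disks create ...
+  scratch.Attach(vm)                # gcloud compute instances attach-disk ...
+  device_path = scratch.GetDevicePath()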
+""" + +import json + +from absl import flags +from perfkitbenchmarker import disk +from perfkitbenchmarker import errors +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.gcp import util + +FLAGS = flags.FLAGS + +PD_STANDARD = 'pd-standard' +PD_SSD = 'pd-ssd' +PD_BALANCED = 'pd-balanced' +PD_EXTREME = 'pd-extreme' + +DISK_TYPE = {disk.STANDARD: PD_STANDARD, disk.REMOTE_SSD: PD_SSD} + +REGIONAL_DISK_SCOPE = 'regional' + +DISK_METADATA = { + PD_STANDARD: { + disk.MEDIA: disk.HDD, + disk.REPLICATION: disk.ZONE, + }, + PD_BALANCED: { + disk.MEDIA: disk.SSD, + disk.REPLICATION: disk.ZONE, + }, + PD_SSD: { + disk.MEDIA: disk.SSD, + disk.REPLICATION: disk.ZONE, + }, + PD_EXTREME: { + disk.MEDIA: disk.SSD, + disk.REPLICATION: disk.ZONE, + }, + disk.LOCAL: { + disk.MEDIA: disk.SSD, + disk.REPLICATION: disk.NONE, + }, +} + +SCSI = 'SCSI' +NVME = 'NVME' + +disk.RegisterDiskTypeMap(providers.GCP, DISK_TYPE) + + +class GceDisk(disk.BaseDisk): + """Object representing an GCE Disk.""" + + def __init__(self, + disk_spec, + name, + zone, + project, + image=None, + image_project=None, + replica_zones=None): + super(GceDisk, self).__init__(disk_spec) + self.attached_vm_name = None + self.image = image + self.image_project = image_project + self.name = name + self.zone = zone + self.project = project + self.replica_zones = replica_zones + self.region = util.GetRegionFromZone(self.zone) + self.provisioned_iops = None + if self.disk_type == PD_EXTREME: + self.provisioned_iops = FLAGS.gcp_provisioned_iops + + disk_metadata = DISK_METADATA[disk_spec.disk_type] + if self.replica_zones: + disk_metadata[disk.REPLICATION] = disk.REGION + self.metadata['replica_zones'] = replica_zones + self.metadata.update(DISK_METADATA[disk_spec.disk_type]) + if self.disk_type == disk.LOCAL: + self.metadata['interface'] = FLAGS.gce_ssd_interface + if self.provisioned_iops and self.disk_type == PD_EXTREME: + self.metadata['provisioned_iops'] = self.provisioned_iops + + def _Create(self): + """Creates the disk.""" + cmd = util.GcloudCommand(self, 'compute', 'disks', 'create', self.name) + cmd.flags['size'] = self.disk_size + cmd.flags['type'] = self.disk_type + if self.provisioned_iops and self.disk_type == PD_EXTREME: + cmd.flags['provisioned-iops'] = self.provisioned_iops + cmd.flags['labels'] = util.MakeFormattedDefaultTags() + if self.image: + cmd.flags['image'] = self.image + if self.image_project: + cmd.flags['image-project'] = self.image_project + + if self.replica_zones: + cmd.flags['region'] = self.region + cmd.flags['replica-zones'] = ','.join(self.replica_zones) + del cmd.flags['zone'] + + _, stderr, retcode = cmd.Issue(raise_on_failure=False) + util.CheckGcloudResponseKnownFailures(stderr, retcode) + + def _Delete(self): + """Deletes the disk.""" + cmd = util.GcloudCommand(self, 'compute', 'disks', 'delete', self.name) + if self.replica_zones: + cmd.flags['region'] = self.region + del cmd.flags['zone'] + cmd.Issue(raise_on_failure=False) + + def _Exists(self): + """Returns true if the disk exists.""" + cmd = util.GcloudCommand(self, 'compute', 'disks', 'describe', self.name) + + if self.replica_zones: + cmd.flags['region'] = self.region + del cmd.flags['zone'] + + stdout, _, _ = cmd.Issue(suppress_warning=True, raise_on_failure=False) + try: + json.loads(stdout) + except ValueError: + return False + return True + + @vm_util.Retry() + def Attach(self, vm): + """Attaches the disk to a VM. 
+ + Args: + vm: The GceVirtualMachine instance to which the disk will be attached. + """ + self.attached_vm_name = vm.name + cmd = util.GcloudCommand(self, 'compute', 'instances', 'attach-disk', + self.attached_vm_name) + cmd.flags['device-name'] = self.name + cmd.flags['disk'] = self.name + + if self.replica_zones: + cmd.flags['disk-scope'] = REGIONAL_DISK_SCOPE + + stdout, stderr, retcode = cmd.Issue(raise_on_failure=False) + # Gcloud attach-disk commands may still attach disks despite being rate + # limited. + if retcode: + if (cmd.rate_limited and 'is already being used' in stderr and + FLAGS.retry_on_rate_limited): + return + debug_text = ('Ran: {%s}\nReturnCode:%s\nSTDOUT: %s\nSTDERR: %s' % + (' '.join(cmd.GetCommand()), retcode, stdout, stderr)) + raise errors.VmUtil.CalledProcessException( + 'Command returned a non-zero exit code:\n{}'.format(debug_text)) + + def Detach(self): + """Detaches the disk from a VM.""" + cmd = util.GcloudCommand(self, 'compute', 'instances', 'detach-disk', + self.attached_vm_name) + cmd.flags['device-name'] = self.name + + if self.replica_zones: + cmd.flags['disk-scope'] = REGIONAL_DISK_SCOPE + cmd.IssueRetryable() + self.attached_vm_name = None + + def GetDevicePath(self): + """Returns the path to the device inside the VM.""" + if self.disk_type == disk.LOCAL and FLAGS.gce_ssd_interface == NVME: + return '/dev/%s' % self.name + else: + # by default, gce_ssd_interface == SCSI and returns this name id + return '/dev/disk/by-id/google-%s' % self.name diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gce_network.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gce_network.py new file mode 100644 index 0000000..d836386 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gce_network.py @@ -0,0 +1,991 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing classes related to GCE VM networking. + +The Firewall class provides a way of opening VM ports. The Network class allows +VMs to communicate via internal ips and isolates PerfKitBenchmarker VMs from +others in the +same project. See https://developers.google.com/compute/docs/networking for +more information about GCE VM networking. 
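+
+A rough sketch of the firewall side (the port and rule name are illustrative):
+
+  firewall = GceFirewall()
+  firewall.AllowPort(vm, 22)    # creates rule 'perfkit-firewall-<run_uri>-22-22'
+  firewall.DisallowAllPorts()   # removes the rules created above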
+""" + + +import json +import logging +import threading +from typing import Any, Dict, List, Optional, Set, Tuple, Union + +from absl import flags +from perfkitbenchmarker import context +from perfkitbenchmarker import errors +from perfkitbenchmarker import network +from perfkitbenchmarker import placement_group +from perfkitbenchmarker import providers +from perfkitbenchmarker import resource +from perfkitbenchmarker import vm_util +from perfkitbenchmarker import vpn_service +from perfkitbenchmarker.providers.gcp import gce_placement_group +from perfkitbenchmarker.providers.gcp import util +import six + +FLAGS = flags.FLAGS +NETWORK_RANGE = '10.0.0.0/8' +ALLOW_ALL = 'tcp:1-65535,udp:1-65535,icmp' + + +class GceVpnGateway(network.BaseVpnGateway): + """Object representing a GCE VPN Gateway.""" + CLOUD = providers.GCP + + def __init__(self, name: str, network_name: str, region: str, cidr: str, + project: str): + super(GceVpnGateway, self).__init__() + + self.forwarding_rules: Dict[str, GceForwardingRule] = {} + self.forwarding_rules_lock = threading.Lock() + self.tunnels: Dict[str, GceStaticTunnel] = {} + self.routes: Dict[str, GceRoute] = {} + self.ip_resource = None + self.vpn_gateway_resource = GceVpnGatewayResource( + name, network_name, region, cidr, project) + self.vpn_gateway_resource_lock = threading.Lock() + + self.name = name + self.network_name = network_name + self.region = region + self.cidr = cidr + self.project = project + + self.ip_address = None + self.ip_address_lock = threading.Lock() + self.created = False + self.require_target_to_init = False + self.routing = None + self.psk = None + + # Add gateway to benchmark spec at init(). + benchmark_spec = context.GetThreadBenchmarkSpec() + if benchmark_spec is None: + raise errors.Error('GetNetwork called in a thread without a ' + 'BenchmarkSpec.') + key = self.name + with benchmark_spec.vpn_gateways_lock: + if key not in benchmark_spec.vpn_gateways: + benchmark_spec.vpn_gateways[key] = self + + def ConfigureTunnel(self, tunnel_config: vpn_service.TunnelConfig): + """Updates tunnel config with new information. + + Args: + tunnel_config: The tunnel configuration for this VPN. + """ + logging.debug('Configuring Tunnel with params:') + logging.debug(tunnel_config) + + # update tunnel_config if needed + if self.name not in tunnel_config.endpoints: + logging.debug('tunnel_config: This endpoint isnt registered yet... 
%s', + self.name) + tunnel_config.endpoints[self.name] = { + 'is_configured': False, + 'cidr': self.cidr, + 'project': self.project, + 'network_name': self.network_name, + 'region': self.region, + 'require_target_to_init': self.require_target_to_init, + } + + # attach public IP to this gateway if doesnt exist + # and update tunnel_config if needed + # requires project, region, name + with self.ip_address_lock: + if not self.ip_address: + if not self.ip_resource: + self.ip_resource = GceIPAddress(self.project, self.region, self.name) + self.ip_resource.Create() + self.ip_address = self.ip_resource.ip_address + if 'ip_address' not in tunnel_config.endpoints[self.name]: + logging.debug('tunnel_config: Configuring IP for %s', self.name) + tunnel_config.endpoints[self.name]['ip_address'] = self.ip_address + + # configure forwarding + # requires: - + with self.forwarding_rules_lock: + if len(self.forwarding_rules) == 3: + logging.debug('tunnel_config: Forwarding already configured, skipping') + else: + logging.debug('tunnel_config: Setting up forwarding') + self._SetupForwarding(tunnel_config) + + # Abort if we don't have a target info configured yet + if len(tunnel_config.endpoints) < 2: + logging.debug('tunnel_config: Only found %d endpoints... ' + 'waiting for target to configure', + len(tunnel_config.endpoints)) + return + + # Get target endpoint config key + target_endpoint = [k for k in tunnel_config.endpoints.keys() + if k not in self.name][0] + + # configure tunnel resources + # requires: target_ip_address, IKE version (default 1), + if 'ip_address' not in tunnel_config.endpoints[target_endpoint]: + logging.debug('tunnel_config: Target IP needed... ' + 'waiting for target to configure') + return + if not hasattr(tunnel_config, 'psk'): + logging.debug('tunnel_config: PSK not provided... setting to runid') + tunnel_config.psk = 'key' + FLAGS.run_uri + self._SetupTunnel(tunnel_config) + + # configure routing + # requires: next_hop_tunnel_id, target_cidr, + # TODO(dlott) Should be Optional[str], but that requires making endpoints a + # proper class rather than a dictionary of string and bool. See TunnelConfig + dest_cidr: Optional[Any] = tunnel_config.endpoints[target_endpoint].get( + 'cidr') + if not dest_cidr or not dest_cidr.strip(): + logging.debug('tunnel_config: destination CIDR needed... ' + 'waiting for target to configure') + return + self._SetupRouting( + tunnel_config.suffix, + tunnel_config.endpoints[self.name]['tunnel_id'], + tunnel_config.endpoints[target_endpoint]['cidr']) + + tunnel_config.endpoints[self.name]['is_configured'] = True + + def IsTunnelReady(self, tunnel_id: str) -> bool: + """Returns True if the tunnel is up and ready for traffic. + + Args: + tunnel_id: The id of the tunnel to check. + + Returns: + boolean. + + """ + return self.tunnels[tunnel_id].IsReady() + + def _SetupTunnel(self, tunnel_config: vpn_service.TunnelConfig): + """Register a new GCE VPN tunnel for this endpoint. + + Args: + tunnel_config: VPN tunnel configuration. 
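+
+    The tunnel is created as 'tun-<gateway name>-<suffix>' and its name is
+    recorded in tunnel_config.endpoints[self.name]['tunnel_id'].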
+ """ + target_endpoint = [k for k in tunnel_config.endpoints.keys() + if k not in self.name][0] + project = tunnel_config.endpoints[self.name]['project'] + region = tunnel_config.endpoints[self.name]['region'] + vpn_gateway_id = self.name + target_ip = tunnel_config.endpoints[target_endpoint]['ip_address'] + psk = tunnel_config.psk + ike_version = tunnel_config.ike_version + suffix = tunnel_config.suffix + name = 'tun-' + self.name + '-' + suffix + if name not in self.tunnels: + self.tunnels[name] = GceStaticTunnel( + project, region, name, vpn_gateway_id, target_ip, ike_version, psk) + self.tunnels[name].Create() + tunnel_config.endpoints[self.name]['tunnel_id'] = name + + def _SetupForwarding(self, tunnel_config: vpn_service.TunnelConfig): + """Create IPSec forwarding rules. + + Forwards ESP protocol, and UDP 500/4500 for tunnel setup. + + Args: + tunnel_config: The tunnel configuration for this VPN. + """ + if len(self.forwarding_rules) == 3: + return # backout if already set + suffix = tunnel_config.suffix + # GCP doesnt like uppercase names?!? + fr_UDP500_name = ('fr-udp500-%s-%s' % + (suffix, FLAGS.run_uri)) + fr_UDP4500_name = ('fr-udp4500-%s-%s' % + (suffix, FLAGS.run_uri)) + fr_ESP_name = ('fr-esp-%s-%s' % + (suffix, FLAGS.run_uri)) + + if fr_UDP500_name not in self.forwarding_rules: + fr_UDP500 = GceForwardingRule( + fr_UDP500_name, 'UDP', self, 500) + self.forwarding_rules[fr_UDP500_name] = fr_UDP500 + fr_UDP500.Create() + if fr_UDP4500_name not in self.forwarding_rules: + fr_UDP4500 = GceForwardingRule( + fr_UDP4500_name, 'UDP', self, 4500) + self.forwarding_rules[fr_UDP4500_name] = fr_UDP4500 + fr_UDP4500.Create() + if fr_ESP_name not in self.forwarding_rules: + fr_ESP = GceForwardingRule( + fr_ESP_name, 'ESP', self) + self.forwarding_rules[fr_ESP_name] = fr_ESP + fr_ESP.Create() + + def _SetupRouting(self, suffix: str, next_hop_tun: str, dest_cidr: str): + """Create IPSec routing rules between the source and target gateways.""" + + route_name = 'route-' + self.name + '-' + suffix + if route_name not in self.routes: + self.routes[route_name] = GceRoute( + route_name, dest_cidr, self.network_name, next_hop_tun, + self.region, self.project) + self.routes[route_name].Create() + + def Create(self): + """Creates the actual VpnGateway.""" + benchmark_spec = context.GetThreadBenchmarkSpec() + if benchmark_spec is None: + raise errors.Error('GetNetwork called in a thread without a ' + 'BenchmarkSpec.') + if self.created: + return + self.vpn_gateway_resource.Create() + + self.created = True + + def Delete(self): + """Deletes the actual VpnGateway.""" + if self.ip_resource: + self.ip_resource.Delete() + + if self.tunnels: + vm_util.RunThreaded(lambda tun: self.tunnels[tun].Delete(), + list(self.tunnels.keys())) + + if self.forwarding_rules: + vm_util.RunThreaded(lambda fr: self.forwarding_rules[fr].Delete(), + list(self.forwarding_rules.keys())) + + if self.routes: + vm_util.RunThreaded(lambda route: self.routes[route].Delete(), + list(self.routes.keys())) + + if self.vpn_gateway_resource: + self.vpn_gateway_resource.Delete() + + self.created = False + + +class GceVpnGatewayResource(resource.BaseResource): + """Object representing a GCE VPN Gateway Resource.""" + + def __init__(self, name: str, network_name: str, region: str, cidr: str, + project: str): + super(GceVpnGatewayResource, self).__init__() + self.name = name + self.network_name = network_name + self.region = region + self.cidr = cidr + self.project = project + + def _Create(self): + cmd = util.GcloudCommand(self, 'compute', 
'target-vpn-gateways', 'create', + self.name) + cmd.flags['network'] = self.network_name + cmd.flags['region'] = self.region + cmd.Issue() + + def _Exists(self): + cmd = util.GcloudCommand(self, 'compute', 'target-vpn-gateways', 'describe', + self.name) + cmd.flags['region'] = self.region + _, _, retcode = cmd.Issue(suppress_warning=True, raise_on_failure=False) + return not retcode + + def _Delete(self): + cmd = util.GcloudCommand(self, 'compute', 'target-vpn-gateways', 'delete', + self.name) + cmd.flags['region'] = self.region + cmd.Issue(raise_on_failure=False) + + +class GceIPAddress(resource.BaseResource): + """Object representing a GCE IP address.""" + + def __init__(self, project: str, region: str, name: str): + super(GceIPAddress, self).__init__() + self.project = project + self.region = region + self.name = name + self.ip_address = None + + def _Create(self): + """Allocates a public IP for the VPN gateway.""" + cmd = util.GcloudCommand(self, 'compute', 'addresses', 'create', self.name) + cmd.flags['region'] = self.region + cmd.Issue() + + def _PostCreate(self): + cmd = util.GcloudCommand(self, 'compute', 'addresses', 'describe', + self.name) + cmd.flags['region'] = self.region + cmd.flags['format'] = 'value(address)' + stdout, _, _ = cmd.Issue() + self.ip_address = stdout.rstrip() + + def _Delete(self): + """Deletes a public IP for the VPN gateway.""" + cmd = util.GcloudCommand(self, 'compute', 'addresses', 'delete', self.name) + cmd.flags['region'] = self.region + cmd.Issue(raise_on_failure=False) + + def _Exists(self) -> bool: + """Returns True if the IP address exists.""" + cmd = util.GcloudCommand(self, 'compute', 'addresses', 'describe', + self.name) + cmd.flags['region'] = self.region + _, _, retcode = cmd.Issue(suppress_warning=True, raise_on_failure=False) + return not retcode + + +class GceStaticTunnel(resource.BaseResource): + """An object representing a GCE Tunnel.""" + + def __init__(self, project: str, region: str, name: str, vpn_gateway_id: str, + target_ip: str, ike_version: str, psk: str): + super(GceStaticTunnel, self).__init__() + self.project = project + self.region = region + self.name = name + self.vpn_gateway_id = vpn_gateway_id + self.target_ip = target_ip + self.ike_version = ike_version + self.psk = psk + + def _Create(self): + """Creates the Tunnel.""" + cmd = util.GcloudCommand(self, 'compute', 'vpn-tunnels', 'create', + self.name) + cmd.flags['peer-address'] = self.target_ip + cmd.flags['target-vpn-gateway'] = self.vpn_gateway_id + cmd.flags['ike-version'] = self.ike_version + cmd.flags['local-traffic-selector'] = '0.0.0.0/0' + cmd.flags['remote-traffic-selector'] = '0.0.0.0/0' + cmd.flags['shared-secret'] = self.psk + cmd.flags['region'] = self.region + cmd.Issue() + + def _Delete(self): + """Delete IPSec tunnel.""" + cmd = util.GcloudCommand(self, 'compute', 'vpn-tunnels', 'delete', + self.name) + cmd.flags['region'] = self.region + cmd.Issue(raise_on_failure=False) + + def _Exists(self) -> bool: + """Returns True if the tunnel exists.""" + cmd = util.GcloudCommand(self, 'compute', 'vpn-tunnels', 'describe', + self.name) + cmd.flags['region'] = self.region + _, _, retcode = cmd.Issue(suppress_warning=True, raise_on_failure=False) + return not retcode + + def IsReady(self) -> bool: + cmd = util.GcloudCommand(self, 'compute', 'vpn-tunnels', 'describe', + self.name) + cmd.flags['region'] = self.region + response = cmd.Issue(suppress_warning=True) + return 'established' in str(response).lower() + + +class GceRoute(resource.BaseResource): + """An object 
representing a GCE Route.""" + + def __init__(self, route_name: str, dest_cidr: str, network_name: str, + next_hop_tun: str, next_hop_region: str, project: str): + super(GceRoute, self).__init__() + self.name = route_name + self.dest_cidr = dest_cidr + self.next_hop_region = next_hop_region + self.next_hop_tun = next_hop_tun + self.network_name = network_name + self.project = project + + def _Create(self): + """Creates the Route.""" + cmd = util.GcloudCommand(self, 'compute', 'routes', 'create', self.name) + cmd.flags['destination-range'] = self.dest_cidr + cmd.flags['network'] = self.network_name + cmd.flags['next-hop-vpn-tunnel'] = self.next_hop_tun + cmd.flags['next-hop-vpn-tunnel-region'] = self.next_hop_region + cmd.Issue() + + def _Delete(self): + """Delete route.""" + cmd = util.GcloudCommand(self, 'compute', 'routes', 'delete', self.name) + cmd.Issue(raise_on_failure=False) + + def _Exists(self) -> bool: + """Returns True if the Route exists.""" + cmd = util.GcloudCommand(self, 'compute', 'routes', 'describe', + self.name) + _, _, retcode = cmd.Issue(suppress_warning=True, raise_on_failure=False) + return not retcode + + +class GceForwardingRule(resource.BaseResource): + """An object representing a GCE Forwarding Rule.""" + + def __init__(self, + name: str, + protocol: str, + src_vpn_gateway: GceVpnGateway, + port: Optional[int] = None): + super(GceForwardingRule, self).__init__() + self.name = name + self.protocol = protocol + self.port = port + self.target_name = src_vpn_gateway.name + self.target_ip = src_vpn_gateway.ip_address + self.src_region = src_vpn_gateway.region + self.project = src_vpn_gateway.project + + def __eq__(self, other: 'GceForwardingRule') -> bool: + """Defines equality to make comparison easy.""" + return (self.name == other.name and + self.protocol == other.protocol and + self.port == other.port and + self.target_name == other.target_name and + self.target_ip == other.target_ip and + self.src_region == other.src_region) + + def _Create(self): + """Creates the Forwarding Rule.""" + cmd = util.GcloudCommand(self, 'compute', 'forwarding-rules', 'create', + self.name) + cmd.flags['ip-protocol'] = self.protocol + if self.port: + cmd.flags['ports'] = self.port + cmd.flags['address'] = self.target_ip + cmd.flags['target-vpn-gateway'] = self.target_name + cmd.flags['region'] = self.src_region + cmd.Issue() + + def _Delete(self): + """Deletes the Forwarding Rule.""" + cmd = util.GcloudCommand(self, 'compute', 'forwarding-rules', 'delete', + self.name) + cmd.flags['region'] = self.src_region + cmd.Issue(raise_on_failure=False) + + def _Exists(self) -> bool: + """Returns True if the Forwarding Rule exists.""" + cmd = util.GcloudCommand(self, 'compute', 'forwarding-rules', 'describe', + self.name) + cmd.flags['region'] = self.src_region + _, _, retcode = cmd.Issue(suppress_warning=True, raise_on_failure=False) + return not retcode + + +class GceFirewallRule(resource.BaseResource): + """An object representing a GCE Firewall Rule.""" + + def __init__(self, + name: str, + project: str, + allow: str, + network_name: str, + source_range: Optional[str] = None): + super(GceFirewallRule, self).__init__() + self.name = name + self.project = project + self.allow = allow + self.network_name = network_name + self.source_range = source_range + + def __eq__(self, other: 'GceFirewallRule') -> bool: + """Defines equality to make comparison easy.""" + return (self.name == other.name and + self.allow == other.allow and + self.project == other.project and + self.network_name == 
other.network_name and + self.source_range == other.source_range) + + def _Create(self): + """Creates the Firewall Rule.""" + cmd = util.GcloudCommand(self, 'compute', 'firewall-rules', 'create', + self.name) + cmd.flags['allow'] = self.allow + cmd.flags['network'] = self.network_name + if self.source_range: + cmd.flags['source-ranges'] = self.source_range + # Gcloud create commands may still create firewalls despite being rate + # limited. + stdout, stderr, retcode = cmd.Issue(raise_on_failure=False) + if retcode: + if cmd.rate_limited and 'already exists' in stderr: + return + debug_text = ('Ran: {%s}\nReturnCode:%s\nSTDOUT: %s\nSTDERR: %s' % + (' '.join(cmd.GetCommand()), retcode, stdout, stderr)) + raise errors.VmUtil.IssueCommandError(debug_text) + + def _Delete(self): + """Deletes the Firewall Rule.""" + cmd = util.GcloudCommand(self, 'compute', 'firewall-rules', 'delete', + self.name) + cmd.Issue(raise_on_failure=False) + + def _Exists(self) -> bool: + """Returns True if the Firewall Rule exists.""" + cmd = util.GcloudCommand(self, 'compute', 'firewall-rules', 'describe', + self.name) + _, _, retcode = cmd.Issue(suppress_warning=True, raise_on_failure=False) + return not retcode + + +class GceFirewall(network.BaseFirewall): + """An object representing the GCE Firewall.""" + + CLOUD = providers.GCP + + def __init__(self): + """Initialize GCE firewall class.""" + self._lock = threading.Lock() + # TODO(user): make the key always have the same number of elements + self.firewall_rules: Dict[Tuple[Any, ...], GceFirewallRule] = {} + self.firewall_icmp_rules: Dict[Tuple[Any, ...], GceFirewallRule] = {} + + def AllowPort( + self, + vm, # gce_virtual_machine.GceVirtualMachine + start_port: int, + end_port: Optional[int] = None, + source_range: Optional[List[str]] = None): + """Opens a port on the firewall. + + Args: + vm: The BaseVirtualMachine object to open the port for. + start_port: The first local port to open in a range. + end_port: The last local port to open in a range. If None, only start_port + will be opened. + source_range: List of source CIDRs to allow for this port. If none, all + sources are allowed. + """ + if vm.is_static: + return + if source_range: + source_range = ','.join(source_range) + with self._lock: + if end_port is None: + end_port = start_port + if vm.cidr: # Allow multiple networks per zone. + cidr_string = network.BaseNetwork.FormatCidrString(vm.cidr) + firewall_name = ('perfkit-firewall-%s-%s-%d-%d' % + (cidr_string, FLAGS.run_uri, start_port, end_port)) + key = (vm.project, vm.cidr, start_port, end_port, source_range) + else: + firewall_name = ('perfkit-firewall-%s-%d-%d' % + (FLAGS.run_uri, start_port, end_port)) + key = (vm.project, start_port, end_port, source_range) + if key in self.firewall_rules: + return + allow = ','.join('{0}:{1}-{2}'.format(protocol, start_port, end_port) + for protocol in ('tcp', 'udp')) + firewall_rule = GceFirewallRule( + firewall_name, vm.project, allow, + vm.network.network_resource.name, source_range) + self.firewall_rules[key] = firewall_rule + firewall_rule.Create() + + def DisallowAllPorts(self): + """Closes all ports on the firewall.""" + for firewall_rule in six.itervalues(self.firewall_rules): + firewall_rule.Delete() + for firewall_rule in six.itervalues(self.firewall_icmp_rules): + firewall_rule.Delete() + + def AllowIcmp(self, vm): + """Opens the ICMP protocol on the firewall. + + Args: + vm: The BaseVirtualMachine object to open the ICMP protocol for. 
+ """ + if vm.is_static: + return + with self._lock: + if vm.cidr: # Allow multiple networks per zone. + cidr_string = network.BaseNetwork.FormatCidrString(vm.cidr) + firewall_name = ('perfkit-firewall-icmp-%s-%s' % + (cidr_string, FLAGS.run_uri)) + key = (vm.project, vm.cidr) + else: + firewall_name = ('perfkit-firewall-icmp-%s' % + (FLAGS.run_uri)) + key = (vm.project) + + if key in self.firewall_icmp_rules: + return + + allow = 'ICMP' + firewall_rule = GceFirewallRule( + firewall_name, vm.project, allow, + vm.network.network_resource.name) + self.firewall_icmp_rules[key] = firewall_rule + firewall_rule.Create() + + +class GceNetworkSpec(network.BaseNetworkSpec): + """Object representing a GCE Network specification.""" + + def __init__(self, + project: Optional[str] = None, + mtu: Optional[int] = None, + **kwargs): + """Initializes the GceNetworkSpec. + + Args: + project: The project for which the Network should be created. + mtu: The MTU (max transmission unit) to use, if any. + **kwargs: Additional key word arguments passed to BaseNetworkSpec. + """ + super(GceNetworkSpec, self).__init__(**kwargs) + self.project = project + self.mtu = mtu + + +class GceNetworkResource(resource.BaseResource): + """Object representing a GCE Network resource.""" + + def __init__(self, + name: str, + mode: str, + project: str, + mtu: Optional[int] = None): + super(GceNetworkResource, self).__init__() + self.name = name + self.mode = mode + self.project = project + self.mtu = mtu + + def _Create(self): + """Creates the Network resource.""" + cmd = util.GcloudCommand(self, 'compute', 'networks', 'create', self.name) + cmd.flags['subnet-mode'] = self.mode + if self.mtu: + cmd.flags['mtu'] = self.mtu + cmd.Issue() + + def _Delete(self): + """Deletes the Network resource.""" + if FLAGS.gce_firewall_rules_clean_all: + for firewall_rule in self._GetAllFirewallRules(): + firewall_rule.Delete() + + cmd = util.GcloudCommand(self, 'compute', 'networks', 'delete', self.name) + cmd.Issue(raise_on_failure=False) + + def _Exists(self) -> bool: + """Returns True if the Network resource exists.""" + cmd = util.GcloudCommand(self, 'compute', 'networks', 'describe', self.name) + _, _, retcode = cmd.Issue(suppress_warning=True, raise_on_failure=False) + return not retcode + + def _GetAllFirewallRules(self) -> List[GceFirewallRule]: + """Returns all the firewall rules that use the network.""" + cmd = util.GcloudCommand(self, 'compute', 'firewall-rules', 'list') + cmd.flags['filter'] = 'network=%s' % self.name + + stdout, _, _ = cmd.Issue(suppress_warning=True) + result = json.loads(stdout) + return [GceFirewallRule(entry['name'], self.project, ALLOW_ALL, self.name, + NETWORK_RANGE) for entry in result] + + +class GceSubnetResource(resource.BaseResource): + """Object representing a GCE subnet resource.""" + + def __init__(self, name: str, network_name: str, region: str, addr_range: str, + project: str): + super(GceSubnetResource, self).__init__() + self.name = name + self.network_name = network_name + self.region = region + self.addr_range = addr_range + self.project = project + + def _Create(self): + cmd = util.GcloudCommand(self, 'compute', 'networks', 'subnets', 'create', + self.name) + cmd.flags['network'] = self.network_name + cmd.flags['region'] = self.region + cmd.flags['range'] = self.addr_range + cmd.Issue() + + def _Exists(self) -> bool: + cmd = util.GcloudCommand(self, 'compute', 'networks', 'subnets', 'describe', + self.name) + if self.region: + cmd.flags['region'] = self.region + _, _, retcode = 
cmd.Issue(suppress_warning=True, raise_on_failure=False) + return not retcode + + def _Delete(self): + cmd = util.GcloudCommand(self, 'compute', 'networks', 'subnets', 'delete', + self.name) + if self.region: + cmd.flags['region'] = self.region + cmd.Issue(raise_on_failure=False) + + +class GceNetwork(network.BaseNetwork): + """Object representing a GCE Network.""" + + CLOUD = providers.GCP + + def __init__(self, network_spec: GceNetworkSpec): + super(GceNetwork, self).__init__(network_spec) + self.project: Optional[str] = network_spec.project + self.vpn_gateway: Dict[str, GceVpnGateway] = {} + + # Figuring out the type of network here. + # Precedence: User Managed > MULTI > SINGLE > DEFAULT + self.net_type = network.NetType.DEFAULT.value + self.cidr = NETWORK_RANGE + if FLAGS.gce_subnet_region: + self.net_type = network.NetType.SINGLE.value + self.cidr = FLAGS.gce_subnet_addr + if network_spec.cidr: + self.net_type = network.NetType.MULTI.value + self.cidr = network_spec.cidr + self.mtu = network_spec.mtu + + name = self._MakeGceNetworkName() + + subnet_region = (FLAGS.gce_subnet_region if not network_spec.cidr else + util.GetRegionFromZone(network_spec.zone)) + mode = 'auto' if subnet_region is None else 'custom' + self.network_resource = GceNetworkResource(name, mode, self.project, + self.mtu) + if subnet_region is None: + self.subnet_resource = None + else: + self.subnet_resource = GceSubnetResource(FLAGS.gce_subnet_name or name, + name, subnet_region, + self.cidr, self.project) + + # Stage FW rules. + self.all_nets = self._GetNetworksFromSpec( + network_spec) # Holds the different networks in this run. + # Holds FW rules for any external subnets. + self.external_nets_rules: Dict[str, GceFirewallRule] = {} + + # Set the default rule to allow all traffic within this network's subnet. + firewall_name = self._MakeGceFWRuleName() + self.default_firewall_rule = GceFirewallRule( + firewall_name, self.project, ALLOW_ALL, name, self.cidr) + + # Set external rules to allow traffic from other subnets in this benchmark. + for ext_net in self.all_nets: + if ext_net == self.cidr: + continue # We've already added our own network to the default rule. + rule_name = self._MakeGceFWRuleName(dst_cidr=ext_net) + self.external_nets_rules[rule_name] = GceFirewallRule(rule_name, + self.project, + ALLOW_ALL, name, + ext_net) + + # Add VpnGateways to the network. + if FLAGS.use_vpn: + for gatewaynum in range(0, FLAGS.vpn_service_gateway_count): + vpn_gateway_name = 'vpngw-%s-%s-%s' % ( + util.GetRegionFromZone(network_spec.zone), gatewaynum, + FLAGS.run_uri) + self.vpn_gateway[vpn_gateway_name] = GceVpnGateway( + vpn_gateway_name, name, util.GetRegionFromZone(network_spec.zone), + network_spec.cidr, self.project) + + # Add GCE Placement Group + no_placement_group = ( + not FLAGS.placement_group_style or + FLAGS.placement_group_style == placement_group.PLACEMENT_GROUP_NONE) + if no_placement_group: + self.placement_group = None + else: + placement_group_spec = gce_placement_group.GcePlacementGroupSpec( + 'GcePlacementGroupSpec', + flag_values=FLAGS, + zone=network_spec.zone, + project=self.project, + num_vms=self._GetNumberVms()) + self.placement_group = gce_placement_group.GcePlacementGroup( + placement_group_spec) + + def _GetNetworksFromSpec(self, network_spec: GceNetworkSpec) -> Set[str]: + """Returns a list of distinct CIDR networks for this benchmark. + + All GCP networks that aren't set explicitly with a vm_group cidr property + are assigned to the default network. 
The default network is either + specified as a single region with the gce_subnet_region and + gce_subnet_addr flags, or assigned to an auto created /20 in each region + if no flags are passed. + + Args: + network_spec: The network spec for the network. + Returns: + A set of CIDR strings used by this benchmark. + """ + nets = set() + gce_default_subnet = (FLAGS.gce_subnet_addr if FLAGS.gce_subnet_region + else NETWORK_RANGE) + + if hasattr(network_spec, 'custom_subnets'): + for (_, v) in network_spec.custom_subnets.items(): + if not v['cidr'] and v['cloud'] != 'GCP': + pass # @TODO handle other providers defaults in net_util + elif not v['cidr']: + nets.add(gce_default_subnet) + else: + nets.add(v['cidr']) + return nets + + def _MakeGceNetworkName(self, + net_type: Optional[str] = None, + cidr: Optional[str] = None, + uri: Optional[str] = None) -> str: + """Build the current network's name string. + + Uses current instance properties if none provided. + Must match regex: r'(?:[a-z](?:[-a-z0-9]{0,61}[a-z0-9])?)' + + Args: + net_type: One of ['default', 'single', 'multi'] + cidr: The CIDR range of this network. + uri: A network suffix (if different than FLAGS.run_uri) + Returns: + String The name of this network. + """ + if FLAGS.gce_network_name: # Return user managed network name if defined. + return FLAGS.gce_network_name + + net_type = net_type or self.net_type + cidr = cidr or self.cidr + uri = uri or FLAGS.run_uri + + name = 'pkb-network-%s' % uri # Assume the default network naming. + + if net_type in (network.NetType.SINGLE.value, + network.NetType.MULTI.value): + name = 'pkb-network-%s-%s-%s' % ( + net_type, self.FormatCidrString(cidr), uri) + + return name + + def _MakeGceFWRuleName(self, + net_type: Optional[str] = None, + src_cidr: Optional[str] = None, + dst_cidr: Optional[str] = None, + port_range_lo: Optional[str] = None, + port_range_hi: Optional[str] = None, + uri: Optional[str] = None) -> str: + """Build a firewall name string. + + Firewall rule names must be unique within a project so we include source + and destination nets to disambiguate. + + Args: + net_type: One of ['default', 'single', 'multi'] + src_cidr: The CIDR range of this network. + dst_cidr: The CIDR range of the remote network. + port_range_lo: The low port to open + port_range_hi: The high port to open in range. + uri: A firewall suffix (if different than FLAGS.run_uri) + Returns: + The name of this firewall rule. 
+ """ + net_type = net_type or self.net_type + uri = uri or FLAGS.run_uri + src_cidr = src_cidr or self.cidr + dst_cidr = dst_cidr or self.cidr + + prefix = None if src_cidr == dst_cidr else 'perfkit-firewall' + src_cidr = 'internal' if src_cidr == dst_cidr else self.FormatCidrString( + src_cidr) + dst_cidr = self.FormatCidrString(dst_cidr) + port_lo = port_range_lo + port_hi = None if port_range_lo == port_range_hi else port_range_hi + + firewall_name = '-'.join(str(i) for i in ( + prefix, net_type, src_cidr, dst_cidr, + port_lo, port_hi, uri) if i) + + return firewall_name + + @staticmethod + def _GetNetworkSpecFromVm(vm) -> GceNetworkSpec: + """Returns a BaseNetworkSpec created from VM attributes.""" + return GceNetworkSpec( + project=vm.project, zone=vm.zone, cidr=vm.cidr, mtu=vm.mtu) + + @classmethod + def _GetKeyFromNetworkSpec( + cls, spec) -> Union[Tuple[str, str], Tuple[str, str, str]]: + """Returns a key used to register Network instances.""" + if spec.cidr: + return (cls.CLOUD, spec.project, spec.cidr) + return (cls.CLOUD, spec.project) + + def _GetNumberVms(self) -> int: + """Counts the number of VMs to be used in this benchmark. + + Cannot do a len(benchmark_spec.vms) as that hasn't been populated yet. Go + through all the group_specs and sum up the vm_counts. + + Returns: + Count of the number of VMs in the benchmark. + """ + benchmark_spec = context.GetThreadBenchmarkSpec() + return sum((group_spec.vm_count - len(group_spec.static_vms)) + for group_spec in benchmark_spec.config.vm_groups.values()) + + def Create(self): + """Creates the actual network.""" + if not FLAGS.gce_network_name: + self.network_resource.Create() + if self.subnet_resource: + self.subnet_resource.Create() + if self.default_firewall_rule: + self.default_firewall_rule.Create() + if self.external_nets_rules: + vm_util.RunThreaded( + lambda rule: self.external_nets_rules[rule].Create(), + list(self.external_nets_rules.keys())) + if getattr(self, 'vpn_gateway', False): + vm_util.RunThreaded( + lambda gateway: self.vpn_gateway[gateway].Create(), + list(self.vpn_gateway.keys())) + if self.placement_group: + self.placement_group.Create() + + def Delete(self): + """Deletes the actual network.""" + if self.placement_group: + self.placement_group.Delete() + if not FLAGS.gce_network_name: + if getattr(self, 'vpn_gateway', False): + vm_util.RunThreaded( + lambda gateway: self.vpn_gateway[gateway].Delete(), + list(self.vpn_gateway.keys())) + if self.default_firewall_rule.created: + self.default_firewall_rule.Delete() + if self.external_nets_rules: + vm_util.RunThreaded( + lambda rule: self.external_nets_rules[rule].Delete(), + list(self.external_nets_rules.keys())) + if self.subnet_resource: + self.subnet_resource.Delete() + self.network_resource.Delete() diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gce_nfs_service.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gce_nfs_service.py new file mode 100644 index 0000000..5c9c4d8 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gce_nfs_service.py @@ -0,0 +1,103 @@ +"""Resource for GCE NFS service.""" + +import json +import logging + +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import nfs_service +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.gcp import gce_network +from perfkitbenchmarker.providers.gcp import util + +FLAGS = flags.FLAGS + + +class GceNfsService(nfs_service.BaseNfsService): + """Resource for GCE 
NFS service.""" + CLOUD = providers.GCP + NFS_TIERS = ( + 'STANDARD', + 'PREMIUM' + ) + DEFAULT_NFS_VERSION = '3.0' + DEFAULT_TIER = 'STANDARD' + user_managed = False + + def __init__(self, disk_spec, zone): + super(GceNfsService, self).__init__(disk_spec, zone) + self.name = 'nfs-%s' % FLAGS.run_uri + self.server_directory = '/vol0' + + @property + def network(self): + spec = gce_network.GceNetworkSpec(project=FLAGS.project) + network = gce_network.GceNetwork.GetNetworkFromNetworkSpec(spec) + return network.network_resource.name + + def GetRemoteAddress(self): + return self._Describe()['networks'][0]['ipAddresses'][0] + + def _Create(self): + logging.info('Creating NFS server %s', self.name) + volume_arg = 'name={0},capacity={1}'.format( + self.server_directory.strip('/'), self.disk_spec.disk_size) + network_arg = 'name={0}'.format(self.network) + args = [ + '--file-share', volume_arg, '--network', network_arg, '--labels', + util.MakeFormattedDefaultTags() + ] + if self.nfs_tier: + args += ['--tier', self.nfs_tier] + try: + self._NfsCommand('create', *args) + except errors.Error as ex: + # if this NFS service already exists reuse it + if self._Exists(): + logging.info('Reusing existing NFS server %s', self.name) + else: + raise errors.Resource.RetryableCreationError( + 'Error creating NFS service %s' % self.name, ex) + + def _Delete(self): + logging.info('Deleting NFS server %s', self.name) + try: + self._NfsCommand('delete', '--async') + except errors.Error as ex: + if self._Exists(): + raise errors.Resource.RetryableDeletionError(ex) + else: + logging.info('NFS server %s already deleted', self.name) + + def _Exists(self): + try: + self._Describe() + return True + except errors.Error: + return False + + def _IsReady(self): + try: + return self._Describe().get('state', None) == 'READY' + except errors.Error: + return False + + def _Describe(self): + return self._NfsCommand('describe') + + def _GetLocation(self): + return self.zone + + def _NfsCommand(self, verb, *args): + cmd = [FLAGS.gcloud_path, 'alpha', '--quiet', '--format', 'json'] + if FLAGS.project: + cmd += ['--project', FLAGS.project] + cmd += ['filestore', 'instances', verb, self.name] + cmd += [str(arg) for arg in args] + cmd += ['--location', self._GetLocation()] + stdout, stderr, retcode = vm_util.IssueCommand( + cmd, raise_on_failure=False, timeout=1800) + if retcode: + raise errors.Error('Error running command %s : %s' % (verb, stderr)) + return json.loads(stdout) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gce_placement_group.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gce_placement_group.py new file mode 100644 index 0000000..e87b290 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gce_placement_group.py @@ -0,0 +1,156 @@ +# Copyright 2020 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Class to represent an GCP Placement Group object. + +GCP specific implementations of Placement Group. 
+https://cloud.google.com/compute/docs/instances/define-instance-placement +""" + +import json +import logging + +from absl import flags +from perfkitbenchmarker import context +from perfkitbenchmarker import errors +from perfkitbenchmarker import placement_group +from perfkitbenchmarker import providers +from perfkitbenchmarker.configs import option_decoders +from perfkitbenchmarker.providers.gcp import util as gcp_util + +FLAGS = flags.FLAGS + +flags.DEFINE_integer( + 'gce_availability_domain_count', + 1, + 'Number of fault domains to create for SPREAD placement group', + lower_bound=1, + upper_bound=8) + + +class GcePlacementGroupSpec(placement_group.BasePlacementGroupSpec): + """Object containing the information needed to create a GcePlacementGroup. + + Attributes: + project: GCE project, used in creating the resource policy URL. + region: GCE region, used in creating the resource policy URL. + num_vms: Number of VMs to put into the resource group. + """ + + CLOUD = providers.GCP + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword + arguments to construct in order to decode the named option. + """ + result = super(GcePlacementGroupSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'project': (option_decoders.StringDecoder, {'none_ok': False}), + 'num_vms': (option_decoders.IntDecoder, {'none_ok': False}), + 'placement_group_style': (option_decoders.EnumDecoder, { + 'valid_values': set([placement_group.PLACEMENT_GROUP_SUPERCLUSTER] + + list(placement_group.PLACEMENT_GROUP_OPTIONS)), + 'default': placement_group.PLACEMENT_GROUP_NONE, + }) + }) + return result + + +class GcePlacementGroup(placement_group.BasePlacementGroup): + """Object representing a GCE Placement Group.""" + + CLOUD = providers.GCP + + def __init__(self, gce_placement_group_spec): + """Init method for GcePlacementGroup. + + Args: + gce_placement_group_spec: Object containing the + information needed to create a GcePlacementGroup. + """ + super(GcePlacementGroup, self).__init__(gce_placement_group_spec) + self.project = gce_placement_group_spec.project + self.region = gcp_util.GetRegionFromZone(gce_placement_group_spec.zone) + self.zone = None + self.num_vms = gce_placement_group_spec.num_vms + self.name = 'perfkit-{}'.format(context.GetThreadBenchmarkSpec().uuid) + self.style = gce_placement_group_spec.placement_group_style + self.availability_domain_count = FLAGS.gce_availability_domain_count + self.metadata.update({ + 'placement_group_name': self.name, + 'placement_group_style': self.style + }) + + def _Create(self): + """Creates the GCE placement group.""" + + cmd = gcp_util.GcloudCommand(self, 'compute', 'resource-policies', + 'create', 'group-placement', self.name) + + placement_policy = { + 'format': 'json', + 'region': self.region, + } + + if self.style == placement_group.PLACEMENT_GROUP_CLUSTER: + placement_policy['collocation'] = 'COLLOCATED' + placement_policy['vm-count'] = self.num_vms + + elif self.style == placement_group.PLACEMENT_GROUP_SUPERCLUSTER: + placement_policy['collocation'] = 'CLUSTERED' + placement_policy['vm-count'] = self.num_vms + # Only the alpha API supports CLUSTERED.
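+ # Illustrative sketch of the command assembled for this branch (NAME, N and + # REGION are placeholders; the exact argument rendering comes from + # GcloudCommand and the placement_policy flags set below): + # gcloud alpha compute resource-policies create group-placement NAME + # --collocation CLUSTERED --vm-count N --region REGION --format json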
+ cmd = gcp_util.GcloudCommand(self, 'alpha', 'compute', + 'resource-policies', 'create', + 'group-placement', self.name) + + else: + placement_policy[ + 'availability-domain-count'] = self.availability_domain_count + + cmd.flags.update(placement_policy) + + _, stderr, retcode = cmd.Issue(raise_on_failure=False) + + if retcode and "Quota 'RESOURCE_POLICIES' exceeded" in stderr: + raise errors.Benchmarks.QuotaFailure(stderr) + elif retcode: + raise errors.Resource.CreationError( + 'Failed to create placement group: %s return code: %s' % + (stderr, retcode)) + + def _Exists(self): + """See base class.""" + cmd = gcp_util.GcloudCommand(self, 'compute', 'resource-policies', + 'describe', self.name) + cmd.flags.update({'region': self.region, 'format': 'json'}) + stdout, _, retcode = cmd.Issue(raise_on_failure=False) + if retcode: + return False + status = json.loads(stdout)['status'] + logging.info('Status of placement group %s: %s', self.name, status) + return True + + def _Delete(self): + """See base class.""" + logging.info('Deleting placement group %s', self.name) + cmd = gcp_util.GcloudCommand(self, 'compute', 'resource-policies', + 'delete', self.name) + cmd.flags.update({'region': self.region, 'format': 'json'}) + cmd.Issue(raise_on_failure=False) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gce_virtual_machine.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gce_virtual_machine.py new file mode 100644 index 0000000..c78563f --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gce_virtual_machine.py @@ -0,0 +1,1465 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Class to represent a GCE Virtual Machine object. + +Zones: +run 'gcloud compute zones list' +Machine Types: +run 'gcloud compute machine-types list' +Images: +run 'gcloud compute images list' + +All VM specifics are self-contained and the class provides methods to +operate on the VM: boot, shutdown, etc. 
+""" + + +import collections +import copy +import itertools +import json +import logging +import posixpath +import re +import threading +from typing import Dict, List, Optional, Tuple + +from absl import flags +from perfkitbenchmarker import custom_virtual_machine_spec +from perfkitbenchmarker import disk +from perfkitbenchmarker import errors +from perfkitbenchmarker import flag_util +from perfkitbenchmarker import linux_virtual_machine as linux_vm +from perfkitbenchmarker import placement_group +from perfkitbenchmarker import providers +from perfkitbenchmarker import resource +from perfkitbenchmarker import virtual_machine +from perfkitbenchmarker import vm_util +from perfkitbenchmarker import windows_virtual_machine +from perfkitbenchmarker.configs import option_decoders +from perfkitbenchmarker.providers.gcp import flags as gcp_flags +from perfkitbenchmarker.providers.gcp import gce_disk +from perfkitbenchmarker.providers.gcp import gce_network +from perfkitbenchmarker.providers.gcp import gcs +from perfkitbenchmarker.providers.gcp import gcsfuse_disk +from perfkitbenchmarker.providers.gcp import util +import six +from six.moves import range +import yaml +import ipaddress +import math + +try: + unicode +except NameError: + unicode = str + +FLAGS = flags.FLAGS + +NVME = 'NVME' +SCSI = 'SCSI' +GVNIC = 'GVNIC' +_INSUFFICIENT_HOST_CAPACITY = ('does not have enough resources available ' + 'to fulfill the request.') +_FAILED_TO_START_DUE_TO_PREEMPTION = ( + 'Instance failed to start due to preemption.') +_UNSUPPORTED_RESOURCE = 'Could not fetch resource' +_GCE_VM_CREATE_TIMEOUT = 1200 +_GCE_NVIDIA_GPU_PREFIX = 'nvidia-tesla-' +_SHUTDOWN_SCRIPT = 'su "{user}" -c "echo | gsutil cp - {preempt_marker}"' +_WINDOWS_SHUTDOWN_SCRIPT_PS1 = 'Write-Host | gsutil cp - {preempt_marker}' +_METADATA_PREEMPT_URI = 'http://metadata.google.internal/computeMetadata/v1/instance/preempted' +_METADATA_PREEMPT_CMD = f'curl {_METADATA_PREEMPT_URI} -H "Metadata-Flavor: Google"' +_METADATA_PREEMPT_CMD_WIN = (f'Invoke-RestMethod -Uri {_METADATA_PREEMPT_URI} ' + '-Headers @{"Metadata-Flavor"="Google"}') +_NON_SEV_COMPATIBLE_IMAGE = ( + 'can only be set when using an SEV compatible image') + +_ARM_MACHINE_TYPES = [ + 't2a' +] + +class GceUnexpectedWindowsAdapterOutputError(Exception): + """Raised when querying the status of a windows adapter failed.""" + + +class GceDriverDoesntSupportFeatureError(Exception): + """Raised if there is an attempt to set a feature not supported.""" + + +class GceVmSpec(virtual_machine.BaseVmSpec): + """Object containing the information needed to create a GceVirtualMachine. + + Attributes: + cpus: None or int. Number of vCPUs for custom VMs. + memory: None or string. For custom VMs, a string representation of the size + of memory, expressed in MiB or GiB. Must be an integer number of MiB + (e.g. "1280MiB", "7.5GiB"). + num_local_ssds: int. The number of local SSDs to attach to the instance. + preemptible: boolean. True if the VM should be preemptible, False otherwise. + project: string or None. The project to create the VM in. + image_family: string or None. The image family used to locate the image. + image_project: string or None. The image project used to locate the + specifed image. + boot_disk_size: None or int. The size of the boot disk in GB. + boot_disk_type: string or None. The type of the boot disk. 
+ """ + + CLOUD = providers.GCP + + def __init__(self, *args, **kwargs): + self.num_local_ssds: int = None + self.preemptible: bool = None + self.boot_disk_size: int = None + self.boot_disk_type: str = None + self.project: str = None + self.image_family: str = None + self.image_project: str = None + self.node_type: str = None + self.min_cpu_platform: str = None + self.threads_per_core: int = None + self.gce_tags: List[str] = None + self.min_node_cpus: int = None + super(GceVmSpec, self).__init__(*args, **kwargs) + + if isinstance(self.machine_type, + custom_virtual_machine_spec.CustomMachineTypeSpec): + logging.warning('Specifying a custom machine in the format of ' + '{cpus: [NUMBER_OF_CPUS], memory: [GB_OF_MEMORY]} ' + 'creates a custom machine in the n1 machine family. ' + 'To create custom machines in other machine families, ' + 'use [MACHINE_FAMILY]-custom-[NUMBER_CPUS]-[NUMBER_MiB] ' + 'nomaclature. e.g. n2-custom-2-4096.') + self.cpus = self.machine_type.cpus + self.memory = self.machine_type.memory + self.machine_type = None + else: + self.cpus = None + self.memory = None + + # The A2 machine family, unlike other GCP offerings has a preset number of + # GPUs, so we set them directly from the machine_type + # https://cloud.google.com/blog/products/compute/announcing-google-cloud-a2-vm-family-based-on-nvidia-a100-gpu + if self.machine_type and self.machine_type.startswith('a2-'): + a2_lookup = { + 'a2-highgpu-1g': 1, + 'a2-highgpu-2g': 2, + 'a2-highgpu-4g': 4, + 'a2-highgpu-8g': 8, + 'a2-megagpu-16g': 16 + } + self.gpu_count = a2_lookup[self.machine_type] + self.gpu_type = virtual_machine.GPU_A100 + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Modifies config options based on runtime flag values. + + Can be overridden by derived classes to add support for specific flags. + + Args: + config_values: dict mapping config option names to provided values. May + be modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. + """ + super(GceVmSpec, cls)._ApplyFlags(config_values, flag_values) + if flag_values['gce_num_local_ssds'].present: + config_values['num_local_ssds'] = flag_values.gce_num_local_ssds + if flag_values['gce_preemptible_vms'].present: + config_values['preemptible'] = flag_values.gce_preemptible_vms + if flag_values['gce_boot_disk_size'].present: + config_values['boot_disk_size'] = flag_values.gce_boot_disk_size + if flag_values['gce_boot_disk_type'].present: + config_values['boot_disk_type'] = flag_values.gce_boot_disk_type + if flag_values['machine_type'].present: + config_values['machine_type'] = yaml.safe_load(flag_values.machine_type) + if flag_values['project'].present: + config_values['project'] = flag_values.project + if flag_values['image_family'].present: + config_values['image_family'] = flag_values.image_family + if flag_values['image_project'].present: + config_values['image_project'] = flag_values.image_project + if flag_values['gcp_node_type'].present: + config_values['node_type'] = flag_values.gcp_node_type + if flag_values['gcp_min_cpu_platform'].present: + if (flag_values.gcp_min_cpu_platform != + gcp_flags.GCP_MIN_CPU_PLATFORM_NONE): + config_values['min_cpu_platform'] = flag_values.gcp_min_cpu_platform + else: + # Specifying gcp_min_cpu_platform explicitly removes any config. 
+ config_values.pop('min_cpu_platform', None) + if flag_values['disable_smt'].present: + config_values['threads_per_core'] = 1 + if flag_values['gce_tags'].present: + config_values['gce_tags'] = flag_values.gce_tags + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword + arguments to construct in order to decode the named option. + """ + result = super(GceVmSpec, cls)._GetOptionDecoderConstructions() + result.update({ + 'machine_type': (custom_virtual_machine_spec.MachineTypeDecoder, { + 'default': None + }), + 'num_local_ssds': (option_decoders.IntDecoder, { + 'default': 0, + 'min': 0 + }), + 'preemptible': (option_decoders.BooleanDecoder, { + 'default': False + }), + 'boot_disk_size': (option_decoders.IntDecoder, { + 'default': None + }), + 'boot_disk_type': (option_decoders.StringDecoder, { + 'default': None + }), + 'project': (option_decoders.StringDecoder, { + 'default': None + }), + 'image_family': (option_decoders.StringDecoder, { + 'default': None + }), + 'image_project': (option_decoders.StringDecoder, { + 'default': None + }), + 'node_type': (option_decoders.StringDecoder, { + 'default': 'n1-node-96-624' + }), + 'min_cpu_platform': (option_decoders.StringDecoder, { + 'default': None + }), + 'threads_per_core': (option_decoders.IntDecoder, { + 'default': None + }), + 'gce_tags': (option_decoders.ListDecoder, { + 'item_decoder': option_decoders.StringDecoder(), + 'default': None + }), + }) + return result + + +class GceSoleTenantNodeTemplate(resource.BaseResource): + """Object representing a GCE sole tenant node template. + + Attributes: + name: string. The name of the node template. + node_type: string. The node type of the node template. + zone: string. The zone of the node template, converted to region. + """ + + def __init__(self, name, node_type, zone, project): + super(GceSoleTenantNodeTemplate, self).__init__() + self.name = name + self.node_type = node_type + self.region = util.GetRegionFromZone(zone) + self.project = project + + def _Create(self): + """Creates the node template.""" + cmd = util.GcloudCommand(self, 'compute', 'sole-tenancy', + 'node-templates', 'create', self.name) + cmd.flags['node-type'] = self.node_type + cmd.flags['region'] = self.region + cmd.Issue() + + def _Exists(self): + """Returns True if the node template exists.""" + cmd = util.GcloudCommand(self, 'compute', 'sole-tenancy', + 'node-templates', 'describe', self.name) + cmd.flags['region'] = self.region + _, _, retcode = cmd.Issue(suppress_warning=True, raise_on_failure=False) + return not retcode + + def _Delete(self): + """Deletes the node template.""" + cmd = util.GcloudCommand(self, 'compute', 'sole-tenancy', + 'node-templates', 'delete', self.name) + cmd.flags['region'] = self.region + cmd.Issue(raise_on_failure=False) + + +class GceSoleTenantNodeGroup(resource.BaseResource): + """Object representing a GCE sole tenant node group. + + Attributes: + name: string. The name of the node group. + node_template: string. The node template of the node group. + zone: string. The zone of the node group.
+ """ + + _counter_lock = threading.Lock() + _counter = itertools.count() + + def __init__(self, node_type, zone, project): + super(GceSoleTenantNodeGroup, self).__init__() + with self._counter_lock: + self.instance_number = next(self._counter) + self.name = 'pkb-node-group-%s-%s' % (FLAGS.run_uri, self.instance_number) + self.node_type = node_type + self.node_template = None + self.zone = zone + self.project = project + self.fill_fraction = 0.0 + + def _Create(self): + """Creates the host.""" + cmd = util.GcloudCommand(self, 'compute', 'sole-tenancy', + 'node-groups', 'create', self.name) + cmd.flags['node-template'] = self.node_template.name + cmd.flags['target-size'] = 1 + _, stderr, retcode = cmd.Issue(raise_on_failure=False) + util.CheckGcloudResponseKnownFailures(stderr, retcode) + + def _CreateDependencies(self): + super(GceSoleTenantNodeGroup, self)._CreateDependencies() + node_template_name = self.name.replace('group', 'template') + node_template = GceSoleTenantNodeTemplate( + node_template_name, self.node_type, self.zone, self.project) + node_template.Create() + self.node_template = node_template + + def _DeleteDependencies(self): + if self.node_template: + self.node_template.Delete() + + def _Exists(self): + """Returns True if the host exists.""" + cmd = util.GcloudCommand(self, 'compute', 'sole-tenancy', + 'node-groups', 'describe', self.name) + _, _, retcode = cmd.Issue(suppress_warning=True, raise_on_failure=False) + return not retcode + + def _Delete(self): + """Deletes the host.""" + cmd = util.GcloudCommand(self, 'compute', 'sole-tenancy', + 'node-groups', 'delete', self.name) + cmd.Issue(raise_on_failure=False) + + +def GenerateAcceleratorSpecString(accelerator_type, accelerator_count): + """Generates a string to be used to attach accelerators to a VM using gcloud. + + This function takes a cloud-agnostic accelerator type (k80, p100, etc.) and + returns a gce-specific accelerator name (nvidia-tesla-k80, etc). + + If FLAGS.gce_accelerator_type_override is specified, the value of said flag + will be used as the name of the accelerator. + + Args: + accelerator_type: cloud-agnostic accelerator type (p100, k80, etc.) + accelerator_count: number of accelerators to attach to the VM + + Returns: + String to be used by gcloud to attach accelerators to a VM. + Must be prepended by the flag '--accelerator'. + """ + gce_accelerator_type = (FLAGS.gce_accelerator_type_override or + _GCE_NVIDIA_GPU_PREFIX + accelerator_type) + return 'type={0},count={1}'.format( + gce_accelerator_type, + accelerator_count) + + +class GceVirtualMachine(virtual_machine.BaseVirtualMachine): + """Object representing a Google Compute Engine Virtual Machine.""" + + CLOUD = providers.GCP + + # Subclasses should override the default image OR + # both the image family and image_project. + DEFAULT_IMAGE = None + DEFAULT_IMAGE_FAMILY = None + DEFAULT_IMAGE_PROJECT = None + + # Subclasses may override these, but are recommended to leave them up to GCE. + BOOT_DISK_SIZE_GB = None + BOOT_DISK_TYPE = None + + NVME_START_INDEX = 1 + + _host_lock = threading.Lock() + deleted_hosts = set() + host_map = collections.defaultdict(list) + + def __init__(self, vm_spec): + """Initialize a GCE virtual machine. + + Args: + vm_spec: virtual_machine.BaseVmSpec object of the vm. + + Raises: + errors.Config.MissingOption: If the spec does not include a "machine_type" + or both "cpus" and "memory". + errors.Config.InvalidValue: If the spec contains both "machine_type" and + at least one of "cpus" or "memory". 
+ """ + super(GceVirtualMachine, self).__init__(vm_spec) + self.boot_metadata = {} + self.cpus = vm_spec.cpus + self.image = self.image or self.DEFAULT_IMAGE + self.max_local_disks = vm_spec.num_local_ssds + self.memory_mib = vm_spec.memory + self.preemptible = vm_spec.preemptible + self.spot_early_termination = False + self.preemptible_status_code = None + self.project = vm_spec.project or util.GetDefaultProject() + self.image_family = vm_spec.image_family or self.DEFAULT_IMAGE_FAMILY + + if self.machine_type.split("-")[0] in _ARM_MACHINE_TYPES: + self.image_family = self.image_family + "-arm64" + + self.image_project = vm_spec.image_project or self.DEFAULT_IMAGE_PROJECT + self.backfill_image = False + self.mtu: Optional[int] = FLAGS.mtu + self.network = self._GetNetwork() + self.firewall = gce_network.GceFirewall.GetFirewall() + self.boot_disk_size = vm_spec.boot_disk_size or self.BOOT_DISK_SIZE_GB + self.boot_disk_type = vm_spec.boot_disk_type or self.BOOT_DISK_TYPE + self.id = None + self.node_type = vm_spec.node_type + self.node_group = None + self.use_dedicated_host = vm_spec.use_dedicated_host + self.num_vms_per_host = vm_spec.num_vms_per_host + self.min_cpu_platform = vm_spec.min_cpu_platform + self.threads_per_core = vm_spec.threads_per_core + self.gce_remote_access_firewall_rule = FLAGS.gce_remote_access_firewall_rule + self.gce_accelerator_type_override = FLAGS.gce_accelerator_type_override + self.gce_tags = vm_spec.gce_tags + self.gce_network_tier = FLAGS.gce_network_tier + self.gce_nic_type = FLAGS.gce_nic_type + self.gce_egress_bandwidth_tier = gcp_flags.EGRESS_BANDWIDTH_TIER.value + self.gce_shielded_secure_boot = FLAGS.gce_shielded_secure_boot + self.gce_confidential_compute = FLAGS.gce_confidential_compute + # Default to GCE default (Live Migration) + self.on_host_maintenance = None + # https://cloud.google.com/compute/docs/instances/live-migration#gpusmaintenance + # https://cloud.google.com/compute/docs/instances/define-instance-placement#restrictions + # TODO(pclay): Update if this assertion ever changes + if (FLAGS['gce_migrate_on_maintenance'].present and + FLAGS.gce_migrate_on_maintenance and + (self.gpu_count or self.network.placement_group)): + raise errors.Config.InvalidValue( + 'Cannot set flag gce_migrate_on_maintenance on instances with GPUs ' + 'or network placement groups, as it is not supported by GCP.') + if (FLAGS['gce_migrate_on_maintenance'].present and + FLAGS.gce_migrate_on_maintenance and self.gce_confidential_compute): + raise errors.Config.InvalidValue( + 'Cannot set flag gce_migrate_on_maintenance on instances with confidential VM enabled, ' + 'as it is not supported by GCP.') + if (not FLAGS.gce_migrate_on_maintenance or + self.gpu_count or self.network.placement_group or + self.gce_confidential_compute): + self.on_host_maintenance = 'TERMINATE' + self.automatic_restart = FLAGS.gce_automatic_restart + if self.preemptible: + self.preempt_marker = f'gs://{FLAGS.gcp_preemptible_status_bucket}/{FLAGS.run_uri}/{self.name}' + # Can not configure a preemptible instance to live migrate + self.on_host_maintenance = 'TERMINATE' + + def _GetNetwork(self): + """Returns the GceNetwork to use.""" + return gce_network.GceNetwork.GetNetwork(self) + + @property + def host_list(self): + """Returns the list of hosts that are compatible with this VM.""" + return self.host_map[(self.project, self.zone)] + + def _GenerateCreateCommand(self, ssh_keys_path): + """Generates a command to create the VM instance. + + Args: + ssh_keys_path: string. 
Path to a file containing the sshKeys metadata. + + Returns: + GcloudCommand. gcloud command to issue in order to create the VM instance. + """ + args = ['compute', 'instances', 'create', self.name] + + cmd = util.GcloudCommand(self, *args) + + # https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration + if self.gce_egress_bandwidth_tier: + network_performance_configs = f'total-egress-bandwidth-tier={self.gce_egress_bandwidth_tier}' + cmd.flags['network-performance-configs'] = network_performance_configs + + # https://cloud.google.com/compute/docs/instances/setting-instance-scheduling-options + if self.on_host_maintenance: + maintenance_flag = 'maintenance-policy' + cmd.flags[maintenance_flag] = self.on_host_maintenance + + # Bundle network-related arguments with --network-interface + # This flag is mutually exclusive with any of these flags: + # --address, --network, --network-tier, --subnet, --private-network-ip. + # gcloud compute instances create ... --network-interface= + ni_args = [] + if self.network.subnet_resource is not None: + ni_args.append(f'subnet={self.network.subnet_resource.name}') + else: + ni_args.append(f'network={self.network.network_resource.name}') + ni_args.append(f'network-tier={self.gce_network_tier.upper()}') + if self.gce_confidential_compute: + self.gce_nic_type = GVNIC + ni_args.append(f'nic-type={self.gce_nic_type.upper()}') + cmd.flags['network-interface'] = ','.join(ni_args) + + if self.image: + cmd.flags['image'] = self.image + elif self.image_family: + cmd.flags['image-family'] = self.image_family + if self.image_project is not None: + cmd.flags['image-project'] = self.image_project + cmd.flags['boot-disk-auto-delete'] = True + if self.boot_disk_size: + cmd.flags['boot-disk-size'] = self.boot_disk_size + if self.boot_disk_type: + cmd.flags['boot-disk-type'] = self.boot_disk_type + if self.machine_type is None: + cmd.flags['custom-cpu'] = self.cpus + cmd.flags['custom-memory'] = '{0}MiB'.format(self.memory_mib) + else: + cmd.flags['machine-type'] = self.machine_type + + if self.min_cpu_platform: + cmd.flags['min-cpu-platform'] = self.min_cpu_platform + + if self.threads_per_core: + cmd.flags['threads-per-core'] = self.threads_per_core + + if self.gpu_count and self.machine_type and 'a2-' not in self.machine_type: + # A2 machine type already has predefined GPU type and count. + cmd.flags['accelerator'] = GenerateAcceleratorSpecString(self.gpu_type, + self.gpu_count) + cmd.flags['tags'] = ','.join(['perfkitbenchmarker'] + (self.gce_tags or [])) + if not self.automatic_restart: + cmd.flags['no-restart-on-failure'] = True + self.metadata['automatic_restart'] = self.automatic_restart + if self.node_group: + cmd.flags['node-group'] = self.node_group.name + if self.gce_shielded_secure_boot: + cmd.flags['shielded-secure-boot'] = True + if self.gce_confidential_compute: + cmd.flags['confidential-compute'] = True + + if self.network.placement_group: + self.metadata.update(self.network.placement_group.GetResourceMetadata()) + cmd.flags['resource-policies'] = self.network.placement_group.name + else: + self.metadata[ + 'placement_group_style'] = placement_group.PLACEMENT_GROUP_NONE + + metadata_from_file = {'sshKeys': ssh_keys_path} + parsed_metadata_from_file = flag_util.ParseKeyValuePairs( + FLAGS.gcp_instance_metadata_from_file) + for key, value in six.iteritems(parsed_metadata_from_file): + if key in metadata_from_file: + logging.warning('Metadata "%s" is set internally. 
Cannot be overridden ' + 'from command line.', key) + continue + metadata_from_file[key] = value + cmd.flags['metadata-from-file'] = ','.join([ + '%s=%s' % (k, v) for k, v in six.iteritems(metadata_from_file) + ]) + + metadata = {} + metadata.update(self.boot_metadata) + metadata.update(util.GetDefaultTags()) + + additional_metadata = {} + additional_metadata.update(self.vm_metadata) + additional_metadata.update( + flag_util.ParseKeyValuePairs(FLAGS.gcp_instance_metadata)) + + for key, value in six.iteritems(additional_metadata): + if key in metadata: + logging.warning('Metadata "%s" is set internally. Cannot be overridden ' + 'from command line.', key) + continue + metadata[key] = value + + if self.preemptible: + cmd.flags['preemptible'] = True + metadata.update([self._PreemptibleMetadataKeyValue()]) + + cmd.flags['metadata'] = util.FormatTags(metadata) + cmd.flags['local-ssd'] = (['interface={0}'.format( + FLAGS.gce_ssd_interface)] * self.max_local_disks) + if FLAGS.gcloud_scopes: + cmd.flags['scopes'] = ','.join(re.split(r'[,; ]', FLAGS.gcloud_scopes)) + cmd.flags['labels'] = util.MakeFormattedDefaultTags() + if self.gce_confidential_compute: + if self.machine_type and 'n2d-' not in self.machine_type: + raise errors.Config.InvalidValue( + 'Cannot set flag gce_confidential_compute on instances with machine family ' + 'other than n2d.') + if self.gce_nic_type.upper() != GVNIC: + raise errors.Config.InvalidValue( + 'Cannot set flag gce_confidential_compute on instances with virtual NIC ' + 'type other than {}.'.format(GVNIC)) + if self.max_local_disks > 0 and FLAGS.gce_ssd_interface.upper() != NVME: + raise errors.Config.InvalidValue( + 'Cannot set flag gce_confidential_compute on instances with ssd interface ' + 'type other than {}.'.format(NVME)) + + # gce_egress_bandwidth_tier flag could only be set when nic type is GVNIC + # and only work with VMs in the N2, N2D, C2, or C2D series + if self.gce_egress_bandwidth_tier: + if self.machine_type and not self._HighBandwidthVMType(self.machine_type): + raise errors.Config.InvalidValue( + 'Cannot set flag gce_egress_bandwidth_tier on instances with machine family ' + 'other than n2, n2d, c2, c2d.') + if self.gce_nic_type.upper() != GVNIC: + raise errors.Config.InvalidValue( + 'Cannot set flag gce_egress_bandwidth_tier on instances with virtual NIC ' + 'type other than {}.'.format(GVNIC)) + return cmd + + def _HighBandwidthVMType(self, machine_type): + if 'n2' in machine_type or 'c2' in machine_type: + return True + return False + + def _AddShutdownScript(self): + cmd = util.GcloudCommand( + self, 'compute', 'instances', 'add-metadata', self.name) + key, value = self._PreemptibleMetadataKeyValue() + cmd.flags['metadata'] = f'{key}={value}' + cmd.Issue() + + def _RemoveShutdownScript(self): + # Removes shutdown script which copies status when it is interrupted + cmd = util.GcloudCommand( + self, 'compute', 'instances', 'remove-metadata', self.name) + key, _ = self._PreemptibleMetadataKeyValue() + cmd.flags['keys'] = key + cmd.Issue(raise_on_failure=False) + + def Reboot(self): + if self.preemptible: + self._RemoveShutdownScript() + super().Reboot() + if self.preemptible: + self._AddShutdownScript() + + def _Start(self): + """Starts the VM.""" + start_cmd = util.GcloudCommand(self, 'compute', 'instances', 'start', + self.name) + # After start, IP address is changed + stdout, _, _ = start_cmd.Issue() + response = json.loads(stdout) + # Response is a list of size one + self._ParseDescribeResponse(response[0]) + + def _Stop(self): + """Stops the 
VM.""" + stop_cmd = util.GcloudCommand(self, 'compute', 'instances', 'stop', + self.name) + stop_cmd.Issue() + + def _PreDelete(self): + super()._PreDelete() + if self.preemptible: + self._RemoveShutdownScript() + + def _Create(self): + """Create a GCE VM instance.""" + num_hosts = len(self.host_list) + with open(self.ssh_public_key) as f: + public_key = f.read().rstrip('\n') + with vm_util.NamedTemporaryFile(mode='w', dir=vm_util.GetTempDir(), + prefix='key-metadata') as tf: + tf.write('%s:%s\n' % (self.user_name, public_key)) + tf.close() + create_cmd = self._GenerateCreateCommand(tf.name) + _, stderr, retcode = create_cmd.Issue(timeout=_GCE_VM_CREATE_TIMEOUT, + raise_on_failure=False) + + if (self.use_dedicated_host and retcode and + _INSUFFICIENT_HOST_CAPACITY in stderr): + if self.num_vms_per_host: + raise errors.Resource.CreationError( + 'Failed to create host: %d vms of type %s per host exceeds ' + 'memory capacity limits of the host' % + (self.num_vms_per_host, self.machine_type)) + else: + logging.warning( + 'Creation failed due to insufficient host capacity. A new host will ' + 'be created and instance creation will be retried.') + with self._host_lock: + if num_hosts == len(self.host_list): + host = GceSoleTenantNodeGroup(self.node_type, + self.zone, self.project) + self.host_list.append(host) + host.Create() + self.node_group = self.host_list[-1] + raise errors.Resource.RetryableCreationError() + if (not self.use_dedicated_host and retcode and + _INSUFFICIENT_HOST_CAPACITY in stderr): + logging.error(util.STOCKOUT_MESSAGE) + raise errors.Benchmarks.InsufficientCapacityCloudFailure( + util.STOCKOUT_MESSAGE) + util.CheckGcloudResponseKnownFailures(stderr, retcode) + if retcode: + if (create_cmd.rate_limited and 'already exists' in stderr and + FLAGS.retry_on_rate_limited): + # Gcloud create commands may still create VMs despite being rate + # limited. + return + if util.RATE_LIMITED_MESSAGE in stderr: + raise errors.Benchmarks.QuotaFailure.RateLimitExceededError(stderr) + if self.preemptible and _FAILED_TO_START_DUE_TO_PREEMPTION in stderr: + self.spot_early_termination = True + raise errors.Benchmarks.InsufficientCapacityCloudFailure( + 'Interrupted before VM started') + if self.gce_confidential_compute and _NON_SEV_COMPATIBLE_IMAGE in stderr: + raise errors.Resource.CreationError('Failed to create VM: Selected image ' + 'is not SEV compatible when using ' + 'gce_confidential_compute flag') + if _UNSUPPORTED_RESOURCE in stderr: + raise errors.Benchmarks.UnsupportedConfigError(stderr) + raise errors.Resource.CreationError( + 'Failed to create VM: %s return code: %s' % (stderr, retcode)) + + def _CreateDependencies(self): + super(GceVirtualMachine, self)._CreateDependencies() + # Create necessary VM access rules *prior* to creating the VM, such that it + # doesn't affect boot time. 
+ self.AllowRemoteAccessPorts() + + if self.use_dedicated_host: + with self._host_lock: + if (not self.host_list or (self.num_vms_per_host and + self.host_list[-1].fill_fraction + + 1.0 / self.num_vms_per_host > 1.0)): + host = GceSoleTenantNodeGroup(self.node_type, + self.zone, self.project) + self.host_list.append(host) + host.Create() + self.node_group = self.host_list[-1] + if self.num_vms_per_host: + self.node_group.fill_fraction += 1.0 / self.num_vms_per_host + + def _DeleteDependencies(self): + if self.node_group: + with self._host_lock: + if self.node_group in self.host_list: + self.host_list.remove(self.node_group) + if self.node_group not in self.deleted_hosts: + self.node_group.Delete() + self.deleted_hosts.add(self.node_group) + + def _ParseDescribeResponse(self, describe_response): + """Sets the ID and IP addresses from a response to the describe command. + + Args: + describe_response: JSON-loaded response to the describe gcloud command. + + Raises: + KeyError, IndexError: If the ID and IP addresses cannot be parsed. + """ + self.id = describe_response['id'] + network_interface = describe_response['networkInterfaces'][0] + self.internal_ip = network_interface['networkIP'] + self.ip_address = network_interface['accessConfigs'][0]['natIP'] + + @property + def HasIpAddress(self): + """Returns True when the IP has been retrieved from a describe response.""" + return not self._NeedsToParseDescribeResponse() + + def _NeedsToParseDescribeResponse(self): + """Returns whether the ID and IP addresses still need to be set.""" + return not self.id or not self.internal_ip or not self.ip_address + + @vm_util.Retry() + def _PostCreate(self): + """Get the instance's data.""" + if self._NeedsToParseDescribeResponse(): + getinstance_cmd = util.GcloudCommand(self, 'compute', 'instances', + 'describe', self.name) + stdout, _, _ = getinstance_cmd.Issue() + response = json.loads(stdout) + self._ParseDescribeResponse(response) + if not all((self.image, self.boot_disk_size, self.boot_disk_type)): + getdisk_cmd = util.GcloudCommand( + self, 'compute', 'disks', 'describe', self.name) + stdout, _, _ = getdisk_cmd.Issue() + response = json.loads(stdout) + if not self.image: + self.image = response['sourceImage'].split('/')[-1] + self.backfill_image = True + if not self.boot_disk_size: + self.boot_disk_size = response['sizeGb'] + if not self.boot_disk_type: + self.boot_disk_type = response['type'].split('/')[-1] + + def _Delete(self): + """Delete a GCE VM instance.""" + delete_cmd = util.GcloudCommand(self, 'compute', 'instances', 'delete', + self.name) + delete_cmd.Issue(raise_on_failure=False) + + def _Suspend(self): + """Suspend a GCE VM instance.""" + util.GcloudCommand(self, 'beta', 'compute', 'instances', 'suspend', + self.name).Issue() + + def _Resume(self): + """Resume a GCE VM instance.""" + resume_cmd = util.GcloudCommand(self, 'beta', 'compute', 'instances', + 'resume', self.name) + + # After resume, IP address is refreshed + stdout, _, _ = resume_cmd.Issue() + response = json.loads(stdout) + # Response is a list of size one + self._ParseDescribeResponse(response[0]) + + def _Exists(self): + """Returns true if the VM exists.""" + getinstance_cmd = util.GcloudCommand(self, 'compute', 'instances', + 'describe', self.name) + stdout, _, _ = getinstance_cmd.Issue(suppress_warning=True, + raise_on_failure=False) + try: + response = json.loads(stdout) + except ValueError: + return False + try: + # The VM may exist before we can fully parse the describe response for the + # IP address or ID of the VM. 
For example, if the VM has a status of + # provisioning, we can't yet parse the IP address. If this is the case, we + # will continue to invoke the describe command in _PostCreate above. + # However, if we do have this information now, it's better to stash it and + # avoid invoking the describe command again. + self._ParseDescribeResponse(response) + except (KeyError, IndexError): + pass + return True + + def CreateScratchDisk(self, disk_spec): + """Create a VM's scratch disk. + + Args: + disk_spec: virtual_machine.BaseDiskSpec object of the disk. + """ + disks = [] + replica_zones = FLAGS.data_disk_zones + + for i in range(disk_spec.num_striped_disks): + if disk_spec.disk_type == disk.LOCAL: + name = '' + if FLAGS.gce_ssd_interface == SCSI: + name = 'local-ssd-%d' % self.local_disk_counter + disk_number = self.local_disk_counter + 1 + elif FLAGS.gce_ssd_interface == NVME: + # Device can either be /dev/nvme0n1 or /dev/nvme1n1. Find out which. + name, _ = self.RemoteCommand('find /dev/nvme*n%d' % + (self.local_disk_counter + 1)) + name = name.strip().split('/')[-1] + disk_number = self.local_disk_counter + self.NVME_START_INDEX + else: + raise errors.Error('Unknown Local SSD Interface.') + data_disk = gce_disk.GceDisk(disk_spec, name, self.zone, self.project, + replica_zones=replica_zones) + data_disk.disk_number = disk_number + self.local_disk_counter += 1 + if self.local_disk_counter > self.max_local_disks: + raise errors.Error('Not enough local disks.') + elif disk_spec.disk_type == disk.NFS: + data_disk = self._GetNfsService().CreateNfsDisk() + elif disk_spec.disk_type == disk.OBJECT_STORAGE: + data_disk = gcsfuse_disk.GcsFuseDisk(disk_spec) + else: + name = '%s-data-%d-%d' % (self.name, len(self.scratch_disks), i) + data_disk = gce_disk.GceDisk(disk_spec, name, self.zone, self.project, + replica_zones=replica_zones) + # Remote disk numbers start at 1+max_local_disks (0 is the system disk + # and local disks occupy 1-max_local_disks). + data_disk.disk_number = (self.remote_disk_counter + + 1 + self.max_local_disks) + self.remote_disk_counter += 1 + disks.append(data_disk) + + self._CreateScratchDiskFromDisks(disk_spec, disks) + + def AddMetadata(self, **kwargs): + """Adds metadata to disk.""" + # vm metadata added to vm on creation. + cmd = util.GcloudCommand( + self, 'compute', 'disks', 'add-labels', self.name) + cmd.flags['labels'] = util.MakeFormattedDefaultTags() + cmd.Issue() + + def AllowRemoteAccessPorts(self): + """Creates firewall rules for remote access if required.""" + + # If gce_remote_access_firewall_rule is specified, access is already + # granted by that rule. + # If not, GCE firewall rules are created for all instances in a + # network. + if not self.gce_remote_access_firewall_rule: + super(GceVirtualMachine, self).AllowRemoteAccessPorts() + + def GetResourceMetadata(self): + """Returns a dict containing metadata about the VM. + + Returns: + dict mapping string property key to value. + """ + result = super(GceVirtualMachine, self).GetResourceMetadata() + for attr_name in 'cpus', 'memory_mib', 'preemptible', 'project': + attr_value = getattr(self, attr_name) + if attr_value: + result[attr_name] = attr_value + # Only record image_family flag when it is used in vm creation command. + # Note, when using non-debian/ubuntu based custom images, user will need + # to use --os_type flag. In that case, we do not want to + # record image_family in metadata. 
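+    # (backfill_image is set in _PostCreate only when the image name had to be
+    # read back from the boot disk, rather than being supplied explicitly.)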
+ if self.backfill_image and self.image_family: + result['image_family'] = self.image_family + if self.image_project: + result['image_project'] = self.image_project + if self.use_dedicated_host: + result['node_type'] = self.node_type + result['num_vms_per_host'] = self.num_vms_per_host + if self.gpu_count: + result['gpu_type'] = self.gpu_type + result['gpu_count'] = self.gpu_count + if self.gce_accelerator_type_override: + result['accelerator_type_override'] = self.gce_accelerator_type_override + if self.gce_tags: + result['gce_tags'] = ','.join(self.gce_tags) + if self.max_local_disks: + result['gce_local_ssd_count'] = self.max_local_disks + result['gce_local_ssd_interface'] = FLAGS.gce_ssd_interface + result['gce_network_tier'] = self.gce_network_tier + result['gce_nic_type'] = self.gce_nic_type + if self.gce_egress_bandwidth_tier: + result['gce_egress_bandwidth_tier'] = self.gce_egress_bandwidth_tier + result['gce_shielded_secure_boot'] = self.gce_shielded_secure_boot + result['gce_confidential_compute'] = self.gce_confidential_compute + result['boot_disk_type'] = self.boot_disk_type + result['boot_disk_size'] = self.boot_disk_size + if self.threads_per_core: + result['threads_per_core'] = self.threads_per_core + if self.network.mtu: + result['mtu'] = self.network.mtu + return result + + def SimulateMaintenanceEvent(self): + """Simulates a maintenance event on the VM.""" + cmd = util.GcloudCommand(self, 'compute', 'instances', + 'simulate-maintenance-event', self.name, '--async') + _, _, retcode = cmd.Issue(raise_on_failure=False) + if retcode: + raise errors.VirtualMachine.VirtualMachineError( + 'Unable to simulate maintenance event.') + + def DownloadPreprovisionedData(self, install_path, module_name, filename): + """Downloads a data file from a GCS bucket with pre-provisioned data. + + Use --gce_preprovisioned_data_bucket to specify the name of the bucket. + + Args: + install_path: The install path on this VM. + module_name: Name of the module associated with this data file. + filename: The name of the file that was downloaded. + """ + # TODO(deitz): Add retry logic. 
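+    # For reference, the generated command is a plain gsutil copy of the form
+    # (placeholder names; see GenerateDownloadPreprovisionedDataCommand below):
+    #   gsutil -q cp gs://<gcp_preprovisioned_data_bucket>/<module_name>/<filename> \
+    #       <install_path>/<filename>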
+ self.RemoteCommand(GenerateDownloadPreprovisionedDataCommand( + install_path, module_name, filename)) + + def InstallCli(self): + """Installs the gcloud cli on this GCP vm.""" + self.Install('google_cloud_sdk') + + def ShouldDownloadPreprovisionedData(self, module_name, filename): + """Returns whether or not preprovisioned data is available.""" + return FLAGS.gcp_preprovisioned_data_bucket and self.TryRemoteCommand( + GenerateStatPreprovisionedDataCommand(module_name, filename)) + + def _UpdateInterruptibleVmStatusThroughMetadataService(self): + _, _, retcode = vm_util.IssueCommand( + [FLAGS.gsutil_path, 'stat', self.preempt_marker], + raise_on_failure=False, suppress_warning=True) + # The VM is preempted if the command exits without an error + self.spot_early_termination = not bool(retcode) + if self.WasInterrupted(): + return + stdout, _ = self.RemoteCommand(self._MetadataPreemptCmd) + self.spot_early_termination = stdout.strip().lower() == 'true' + + @property + def _MetadataPreemptCmd(self): + return _METADATA_PREEMPT_CMD + + def _PreemptibleMetadataKeyValue(self) -> Tuple[str, str]: + """See base class.""" + return 'shutdown-script', _SHUTDOWN_SCRIPT.format( + preempt_marker=self.preempt_marker, user=self.user_name) + + def _AcquireWritePermissionsLinux(self): + gcs.GoogleCloudStorageService.AcquireWritePermissionsLinux(self) + + def OnStartup(self): + super().OnStartup() + if self.preemptible: + # Prepare VM to use GCS. When an instance is interrupt, the shutdown + # script will copy the status a GCS bucket. + self._AcquireWritePermissionsLinux() + + def _UpdateInterruptibleVmStatusThroughApi(self): + # If the run has failed then do a check that could throw an exception. + vm_without_zone = copy.copy(self) + vm_without_zone.zone = None + gcloud_command = util.GcloudCommand(vm_without_zone, 'compute', + 'operations', 'list') + gcloud_command.flags['filter'] = f'targetLink.scope():{self.name}' + gcloud_command.flags['zones'] = self.zone + stdout, _, _ = gcloud_command.Issue() + self.spot_early_termination = any( + operation['operationType'] == 'compute.instances.preempted' + for operation in json.loads(stdout)) + + def UpdateInterruptibleVmStatus(self, use_api=False): + """Updates the interruptible status if the VM was preempted.""" + if not self.IsInterruptible(): + return + if self.WasInterrupted(): + return + try: + self._UpdateInterruptibleVmStatusThroughMetadataService() + except errors.VirtualMachine.RemoteCommandError as error: + if use_api and 'connection timed out' in str(error).lower(): + self._UpdateInterruptibleVmStatusThroughApi() + + def IsInterruptible(self): + """Returns whether this vm is an interruptible vm (spot vm). + + Returns: True if this vm is an interruptible vm (spot vm). + """ + return self.preemptible + + def WasInterrupted(self): + """Returns whether this spot vm was terminated early by GCP. + + Returns: True if this vm was terminated early by GCP. + """ + return self.spot_early_termination + + def GetVmStatusCode(self): + """Returns the early termination code if any. + + Returns: Early termination code. 
+    """
+    return self.preemptible_status_code
+
+  # Intel - lower priority of security scanner
+  def LowerSecurityScannerPriority(self):
+    run_cmd = "/opt/nessus_agent/sbin/nessuscli"
+    # Guard with -x: nessuscli is an executable file, not a directory.
+    self.RemoteCommand(f"[ -x {run_cmd} ] && sudo {run_cmd} fix --set process_priority=low",
+                       ignore_failure=True,
+                       suppress_warning=True
+                       )
+    self.RemoteCommand(f"[ -x {run_cmd} ] && sudo {run_cmd} --set scan_performance_mode=low",
+                       ignore_failure=True,
+                       suppress_warning=True
+                       )
+  # End Intel contribution
+
+  def GetInterruptableStatusPollSeconds(self):
+    """Get seconds between preemptible status polls.
+
+    Returns:
+      Seconds between polls
+    """
+    return 3600
+
+
+class BaseLinuxGceVirtualMachine(GceVirtualMachine,
+                                 linux_vm.BaseLinuxMixin):
+  """Class supporting Linux GCE virtual machines.
+
+  Currently looks for gVNIC capabilities.
+  TODO(pclay): Make more generic and move to BaseLinuxMixin.
+  """
+
+  # regex to get the network devices from "ip link show"
+  _IP_LINK_RE = re.compile(r'^\d+: (?P<device_name>\S+):.*mtu (?P<mtu>\d+)')
+  # devices to ignore from "ip link show"
+  _IGNORE_NETWORK_DEVICES = ('lo',)
+  # ethtool properties output should match this regex
+  _ETHTOOL_RE = re.compile(r'^(?P<key>.*?):\s*(?P<value>.*)\s*')
+  # the "device" value in ethtool properties for gvnic
+  _GVNIC_DEVICE_NAME = 'gve'
+
+  def __init__(self, vm_spec):
+    super(BaseLinuxGceVirtualMachine, self).__init__(vm_spec)
+    self._gvnic_version = None
+    self._discovered_mtu: Optional[int] = None
+
+  def GetResourceMetadata(self):
+    """See base class."""
+    metadata = super(BaseLinuxGceVirtualMachine,
+                     self).GetResourceMetadata().copy()
+    if self._gvnic_version:
+      metadata['gvnic_version'] = self._gvnic_version
+    if self._discovered_mtu:
+      metadata['mtu'] = self._discovered_mtu
+    return metadata
+
+  def OnStartup(self):
+    """See base class.
Sets the _gvnic_version.""" + super(BaseLinuxGceVirtualMachine, self).OnStartup() + self._gvnic_version = self.GetGvnicVersion() + devices = self._GetNetworkDevices() + all_mtus = set(devices.values()) + if len(all_mtus) == 1: + self._discovered_mtu = list(all_mtus)[0] + else: + logging.warning('To record MTU must only have 1 unique MTU value not: %s', + devices) + + def GetGvnicVersion(self) -> Optional[str]: + """Returns the gvnic network driver version.""" + all_device_properties = {} + for device_name in self._GetNetworkDevices(): + device = self._GetNetworkDeviceProperties(device_name) + all_device_properties[device_name] = device + driver = device.get('driver') + driver_version = device.get('version') + if not driver: + logging.error( + 'Network device %s lacks a driver %s', device_name, device) + elif driver == self._GVNIC_DEVICE_NAME: + logging.info('gvnic properties %s', device) + if driver_version: + return driver_version + raise ValueError(f'No version in {device}') + + def _GetNetworkDeviceProperties(self, device_name: str) -> Dict[str, str]: + """Returns a dict of the network device properties.""" + # ethtool can exist under /usr/sbin or needs to be installed (debian9) + if self.HasPackage('ethtool'): + self.InstallPackages('ethtool') + try: + stdout, _ = self.RemoteCommand( + f'PATH="${{PATH}}":/usr/sbin ethtool -i {device_name}') + except errors.VirtualMachine.RemoteCommandError: + logging.info('ethtool not installed', exc_info=True) + return {} + properties = {} + for line in stdout.splitlines(): + m = self._ETHTOOL_RE.match(line) + if m: + properties[m['key']] = m['value'] + return properties + + def _GetNetworkDevices(self) -> Dict[str, int]: + """Returns network device names and their MTUs.""" + stdout, _ = self.RemoteCommand('PATH="${PATH}":/usr/sbin ip link show up') + devices = {} + for line in stdout.splitlines(): + m = self._IP_LINK_RE.match(line) + if m: + device_name = m['device_name'] + if device_name not in self._IGNORE_NETWORK_DEVICES: + devices[device_name] = int(m['mtu']) + return devices + + +class Debian9BasedGceVirtualMachine( + BaseLinuxGceVirtualMachine, linux_vm.Debian9Mixin): + DEFAULT_IMAGE_FAMILY = 'debian-9' + DEFAULT_IMAGE_PROJECT = 'debian-cloud' + + def _BeforeSuspend(self): + self.InstallPackages('dbus') + self.RemoteCommand('sudo systemctl restart systemd-logind.service') + + +class Debian10BasedGceVirtualMachine( + BaseLinuxGceVirtualMachine, linux_vm.Debian10Mixin): + DEFAULT_IMAGE_FAMILY = 'debian-10' + DEFAULT_IMAGE_PROJECT = 'debian-cloud' + + +class Debian11BasedGceVirtualMachine( + BaseLinuxGceVirtualMachine, linux_vm.Debian11Mixin): + DEFAULT_IMAGE_FAMILY = 'debian-11' + DEFAULT_IMAGE_PROJECT = 'debian-cloud' + + +class Rhel7BasedGceVirtualMachine( + BaseLinuxGceVirtualMachine, linux_vm.Rhel7Mixin): + DEFAULT_IMAGE_FAMILY = 'rhel-7' + DEFAULT_IMAGE_PROJECT = 'rhel-cloud' + + +class Rhel8BasedGceVirtualMachine( + BaseLinuxGceVirtualMachine, linux_vm.Rhel8Mixin): + DEFAULT_IMAGE_FAMILY = 'rhel-8' + DEFAULT_IMAGE_PROJECT = 'rhel-cloud' + + +class CentOs7BasedGceVirtualMachine( + BaseLinuxGceVirtualMachine, linux_vm.CentOs7Mixin): + DEFAULT_IMAGE_FAMILY = 'centos-7' + DEFAULT_IMAGE_PROJECT = 'centos-cloud' + + +class CentOsStream8BasedGceVirtualMachine(BaseLinuxGceVirtualMachine, + linux_vm.CentOsStream8Mixin): + DEFAULT_IMAGE_FAMILY = 'centos-stream-8' + DEFAULT_IMAGE_PROJECT = 'centos-cloud' + + +class RockyLinux8BasedGceVirtualMachine(BaseLinuxGceVirtualMachine, + linux_vm.RockyLinux8Mixin): + DEFAULT_IMAGE_FAMILY = 'rocky-linux-8' + 
DEFAULT_IMAGE_PROJECT = 'rocky-linux-cloud' + + +class ContainerOptimizedOsBasedGceVirtualMachine( + BaseLinuxGceVirtualMachine, linux_vm.ContainerOptimizedOsMixin): + DEFAULT_IMAGE_FAMILY = 'cos-stable' + DEFAULT_IMAGE_PROJECT = 'cos-cloud' + + +class CoreOsBasedGceVirtualMachine( + BaseLinuxGceVirtualMachine, linux_vm.CoreOsMixin): + DEFAULT_IMAGE_FAMILY = 'fedora-coreos-stable' + DEFAULT_IMAGE_PROJECT = 'fedora-coreos-cloud' + + def __init__(self, vm_spec): + super(CoreOsBasedGceVirtualMachine, self).__init__(vm_spec) + # Fedora CoreOS only creates the core user + self.user_name = 'core' + + +class Ubuntu1804BasedGceVirtualMachine( + BaseLinuxGceVirtualMachine, linux_vm.Ubuntu1804Mixin): + DEFAULT_IMAGE_FAMILY = 'ubuntu-1804-lts' + DEFAULT_IMAGE_PROJECT = 'ubuntu-os-cloud' + + +class Ubuntu2004BasedGceVirtualMachine( + BaseLinuxGceVirtualMachine, linux_vm.Ubuntu2004Mixin): + DEFAULT_IMAGE_FAMILY = 'ubuntu-2004-lts' + DEFAULT_IMAGE_PROJECT = 'ubuntu-os-cloud' + + +class Ubuntu2204BasedGceVirtualMachine( + BaseLinuxGceVirtualMachine, linux_vm.Ubuntu2204Mixin): + DEFAULT_IMAGE_FAMILY = 'ubuntu-2204-lts' + DEFAULT_IMAGE_PROJECT = 'ubuntu-os-cloud' + + +class BaseWindowsGceVirtualMachine(GceVirtualMachine, + windows_virtual_machine.BaseWindowsMixin): + """Class supporting Windows GCE virtual machines.""" + + DEFAULT_IMAGE_PROJECT = 'windows-cloud' + + NVME_START_INDEX = 0 + + def __init__(self, vm_spec): + """Initialize a Windows GCE virtual machine. + + Args: + vm_spec: virtual_machine.BaseVmSpec object of the vm. + """ + super(BaseWindowsGceVirtualMachine, self).__init__(vm_spec) + self.boot_metadata[ + 'windows-startup-script-ps1'] = windows_virtual_machine.STARTUP_SCRIPT + + def _GenerateResetPasswordCommand(self): + """Generates a command to reset a VM user's password. + + Returns: + GcloudCommand. gcloud command to issue in order to reset the VM user's + password. + """ + cmd = util.GcloudCommand(self, 'compute', 'reset-windows-password', + self.name) + cmd.flags['user'] = self.user_name + return cmd + + def _PostCreate(self): + super(BaseWindowsGceVirtualMachine, self)._PostCreate() + reset_password_cmd = self._GenerateResetPasswordCommand() + stdout, _ = reset_password_cmd.IssueRetryable() + response = json.loads(stdout) + self.password = response['password'] + + def _PreemptibleMetadataKeyValue(self) -> Tuple[str, str]: + """See base class.""" + return 'windows-shutdown-script-ps1', _WINDOWS_SHUTDOWN_SCRIPT_PS1.format( + preempt_marker=self.preempt_marker) + + @vm_util.Retry( + max_retries=10, + retryable_exceptions=(GceUnexpectedWindowsAdapterOutputError, + errors.VirtualMachine.RemoteCommandError)) + def GetResourceMetadata(self): + """Returns a dict containing metadata about the VM. + + Returns: + dict mapping metadata key to value. + """ + result = super(BaseWindowsGceVirtualMachine, self).GetResourceMetadata() + result['disable_rss'] = self.disable_rss + return result + + def DisableRSS(self): + """Disables RSS on the GCE VM. + + Raises: + GceDriverDoesntSupportFeatureError: If RSS is not supported. + GceUnexpectedWindowsAdapterOutputError: If querying the RSS state + returns unexpected output. 
+ """ + # First ensure that the driver supports interrupt moderation + net_adapters, _ = self.RemoteCommand('Get-NetAdapter') + if 'Red Hat VirtIO Ethernet Adapter' not in net_adapters: + raise GceDriverDoesntSupportFeatureError( + 'Driver not tested with RSS disabled in PKB.') + + command = 'netsh int tcp set global rss=disabled' + self.RemoteCommand(command) + try: + self.RemoteCommand('Restart-NetAdapter -Name "Ethernet"') + except IOError: + # Restarting the network adapter will always fail because + # the winrm connection used to issue the command will be + # broken. + pass + + # Verify the setting went through + stdout, _ = self.RemoteCommand('netsh int tcp show global') + if 'Receive-Side Scaling State : enabled' in stdout: + raise GceUnexpectedWindowsAdapterOutputError('RSS failed to disable.') + + def _AcquireWritePermissionsLinux(self): + gcs.GoogleCloudStorageService.AcquireWritePermissionsWindows(self) + + @property + def _MetadataPreemptCmd(self): + return _METADATA_PREEMPT_CMD_WIN + + +class Windows2012CoreGceVirtualMachine( + BaseWindowsGceVirtualMachine, windows_virtual_machine.Windows2012CoreMixin): + DEFAULT_IMAGE_FAMILY = 'windows-2012-r2-core' + + +class Windows2016CoreGceVirtualMachine( + BaseWindowsGceVirtualMachine, windows_virtual_machine.Windows2016CoreMixin): + DEFAULT_IMAGE_FAMILY = 'windows-2016-core' + + +class Windows2019CoreGceVirtualMachine( + BaseWindowsGceVirtualMachine, windows_virtual_machine.Windows2019CoreMixin): + DEFAULT_IMAGE_FAMILY = 'windows-2019-core' + + +class Windows2012DesktopGceVirtualMachine( + BaseWindowsGceVirtualMachine, + windows_virtual_machine.Windows2012DesktopMixin): + DEFAULT_IMAGE_FAMILY = 'windows-2012-r2' + + +class Windows2016DesktopGceVirtualMachine( + BaseWindowsGceVirtualMachine, + windows_virtual_machine.Windows2016DesktopMixin): + DEFAULT_IMAGE_FAMILY = 'windows-2016' + + +class Windows2019DesktopGceVirtualMachine( + BaseWindowsGceVirtualMachine, + windows_virtual_machine.Windows2019DesktopMixin): + DEFAULT_IMAGE_FAMILY = 'windows-2019' + + +class Windows2022DesktopGceVirtualMachine( + BaseWindowsGceVirtualMachine, + windows_virtual_machine.Windows2022DesktopMixin): + DEFAULT_IMAGE_FAMILY = 'windows-2022' + + +class Windows2019DesktopSQLServer2017StandardGceVirtualMachine( + BaseWindowsGceVirtualMachine, + windows_virtual_machine.Windows2019SQLServer2017Standard): + DEFAULT_IMAGE_FAMILY = 'sql-std-2017-win-2019' + DEFAULT_IMAGE_PROJECT = 'windows-sql-cloud' + + +class Windows2019DesktopSQLServer2017EnterpriseGceVirtualMachine( + BaseWindowsGceVirtualMachine, + windows_virtual_machine.Windows2019SQLServer2017Enterprise): + DEFAULT_IMAGE_FAMILY = 'sql-ent-2017-win-2019' + DEFAULT_IMAGE_PROJECT = 'windows-sql-cloud' + + +class Windows2019DesktopSQLServer2019StandardGceVirtualMachine( + BaseWindowsGceVirtualMachine, + windows_virtual_machine.Windows2019SQLServer2019Standard): + DEFAULT_IMAGE_FAMILY = 'sql-std-2019-win-2019' + DEFAULT_IMAGE_PROJECT = 'windows-sql-cloud' + + +class Windows2019DesktopSQLServer2019EnterpriseGceVirtualMachine( + BaseWindowsGceVirtualMachine, + windows_virtual_machine.Windows2019SQLServer2019Enterprise): + DEFAULT_IMAGE_FAMILY = 'sql-ent-2019-win-2019' + DEFAULT_IMAGE_PROJECT = 'windows-sql-cloud' + + +class Windows2022DesktopSQLServer2019StandardGceVirtualMachine( + BaseWindowsGceVirtualMachine, + windows_virtual_machine.Windows2022SQLServer2019Standard): + DEFAULT_IMAGE_FAMILY = 'sql-std-2019-win-2022' + DEFAULT_IMAGE_PROJECT = 'windows-sql-cloud' + + +class 
Windows2022DesktopSQLServer2019EnterpriseGceVirtualMachine( + BaseWindowsGceVirtualMachine, + windows_virtual_machine.Windows2022SQLServer2019Enterprise): + DEFAULT_IMAGE_FAMILY = 'sql-ent-2019-win-2022' + DEFAULT_IMAGE_PROJECT = 'windows-sql-cloud' + + +def GenerateDownloadPreprovisionedDataCommand(install_path, module_name, + filename): + """Returns a string used to download preprovisioned data.""" + return 'gsutil -q cp gs://%s/%s/%s %s' % ( + FLAGS.gcp_preprovisioned_data_bucket, module_name, filename, + posixpath.join(install_path, filename)) + + +def GenerateStatPreprovisionedDataCommand(module_name, filename): + """Returns a string used to download preprovisioned data.""" + return 'gsutil stat gs://%s/%s/%s' % ( + FLAGS.gcp_preprovisioned_data_bucket, module_name, filename) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_bigtable.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_bigtable.py new file mode 100644 index 0000000..6731d77 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_bigtable.py @@ -0,0 +1,397 @@ +# Copyright 2016 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for GCP's bigtable instances. + +Clusters can be created and deleted. +""" + +import json +import logging +from typing import Any, Dict, List, Optional + +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import non_relational_db +from perfkitbenchmarker.configs import option_decoders +from perfkitbenchmarker.providers.gcp import util +import requests + +FLAGS = flags.FLAGS + + +flags.DEFINE_string('google_bigtable_instance_name', None, + 'Bigtable instance name. If not specified, new instance ' + 'will be created and deleted on the fly. If specified, ' + 'the instance is considered user managed and will not ' + 'created/deleted by PKB.') +flags.DEFINE_integer( + 'bigtable_node_count', None, + 'Number of nodes to create in the bigtable cluster. ' + 'Ignored if --bigtable_autoscaling_min_nodes is set.' 
+ 'TODO: Consider merging the two flags for better user-friendliness.') +_AUTOSCALING_MIN_NODES = flags.DEFINE_integer( + 'bigtable_autoscaling_min_nodes', None, + 'Minimum number of nodes for autoscaling.') +_AUTOSCALING_MAX_NODES = flags.DEFINE_integer( + 'bigtable_autoscaling_max_nodes', None, + 'Maximum number of nodes for autoscaling.') +_AUTOSCALING_CPU_TARGET = flags.DEFINE_integer( + 'bigtable_autoscaling_cpu_target', None, + 'The target CPU utilization percent for autoscaling.') +flags.DEFINE_enum('bigtable_storage_type', None, ['ssd', 'hdd'], + 'Storage class for the cluster') +flags.DEFINE_string('google_bigtable_zone', None, + 'Bigtable zone.') +flags.DEFINE_boolean('bigtable_replication_cluster', None, + 'Whether to create a Bigtable replication cluster.') +flags.DEFINE_string('bigtable_replication_cluster_zone', None, + 'Zone in which to create a Bigtable replication cluster.') +flags.DEFINE_boolean('bigtable_multicluster_routing', None, + 'Whether to use multi-cluster routing.') + +_DEFAULT_NODE_COUNT = 3 +_DEFAULT_STORAGE_TYPE = 'ssd' +_DEFAULT_ZONE = 'us-central1-b' +_DEFAULT_REPLICATION_ZONE = 'us-central1-c' + + +class BigtableSpec(non_relational_db.BaseNonRelationalDbSpec): + """Configurable options of a Bigtable instance. See below for descriptions.""" + + SERVICE_TYPE = non_relational_db.BIGTABLE + + name: str + zone: str + project: str + node_count: int + storage_type: str + replication_cluster: bool + replication_cluster_zone: str + multicluster_routing: bool + autoscaling_min_nodes: int + autoscaling_max_nodes: int + autoscaling_cpu_target: int + + def __init__(self, component_full_name, flag_values, **kwargs): + super().__init__(component_full_name, flag_values=flag_values, **kwargs) + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes / constructor args for each configurable option.""" + result = super()._GetOptionDecoderConstructions() + none_ok = {'default': None, 'none_ok': True} + result.update({ + 'name': (option_decoders.StringDecoder, none_ok), + 'zone': (option_decoders.StringDecoder, none_ok), + 'project': (option_decoders.StringDecoder, none_ok), + 'node_count': (option_decoders.IntDecoder, none_ok), + 'storage_type': (option_decoders.StringDecoder, none_ok), + 'replication_cluster': (option_decoders.BooleanDecoder, none_ok), + 'replication_cluster_zone': (option_decoders.StringDecoder, none_ok), + 'multicluster_routing': (option_decoders.BooleanDecoder, none_ok), + 'autoscaling_min_nodes': (option_decoders.IntDecoder, none_ok), + 'autoscaling_max_nodes': (option_decoders.IntDecoder, none_ok), + 'autoscaling_cpu_target': (option_decoders.IntDecoder, none_ok), + }) + return result + + @classmethod + def _ValidateConfig(cls, config_values) -> None: + """Verifies correct usage of the bigtable config options.""" + if (config_values.get('multicluster_routing', False) and + not config_values.get('replication_cluster', False)): + raise errors.Config.InvalidValue( + 'bigtable_replication_cluster must be set if ' + 'bigtable_multicluster_routing is True.') + + @classmethod + def _ApplyFlags(cls, config_values, flag_values) -> None: + """Modifies config options based on runtime flag values. + + Can be overridden by derived classes to add support for specific flags. + + Args: + config_values: dict mapping config option names to provided values. May be + modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. 
+ """ + super()._ApplyFlags(config_values, flag_values) + option_name_from_flag = { + 'google_bigtable_instance_name': 'name', + 'google_bigtable_zone': 'zone', + 'bigtable_storage_type': 'storage_type', + 'bigtable_node_count': 'node_count', + 'bigtable_replication_cluster': 'replication_cluster', + 'bigtable_replication_cluster_zone': 'replication_cluster_zone', + 'bigtable_multicluster_routing': 'multicluster_routing', + 'bigtable_autoscaling_min_nodes': 'autoscaling_min_nodes', + 'bigtable_autoscaling_max_nodes': 'autoscaling_max_nodes', + 'bigtable_autoscaling_cpu_target': 'autoscaling_cpu_target', + } + for flag_name, option_name in option_name_from_flag.items(): + if flag_values[flag_name].present: + config_values[option_name] = flag_values[flag_name].value + + cls._ValidateConfig(config_values) + + def __repr__(self) -> str: + return str(self.__dict__) + + +class GcpBigtableInstance(non_relational_db.BaseNonRelationalDb): + """Object representing a GCP Bigtable Instance. + + See https://cloud.google.com/bigtable/docs/overview. + + For replication settings, see + + For autoscaling/multicluster attributes, see + https://cloud.google.com/bigtable/docs/autoscaling. + + Attributes: + name: Instance and cluster name. + project: Enclosing project for the instance. + zone: zone of the instance's cluster. + node_count: Number of nodes in the instance's cluster. + storage_type: Storage class for the cluster. + replication_cluster: Whether the instance has a replication cluster. + replication_cluster_zone: Zone for the replication cluster. + multicluster_routing: Whether the instance uses multicluster_routing. + autoscaling_min_nodes: Minimum number of nodes for autoscaling. + autoscaling_max_nodes: Maximum number of nodes for autoscaling. + autoscaling_cpu_target: CPU utilization percent for autoscaling. 
+ """ + + SERVICE_TYPE = non_relational_db.BIGTABLE + + def __init__(self, + name: Optional[str], + project: Optional[str], + zone: Optional[str], + node_count: Optional[int], + storage_type: Optional[str], + replication_cluster: Optional[bool], + replication_cluster_zone: Optional[str], + multicluster_routing: Optional[bool], + autoscaling_min_nodes: Optional[int], + autoscaling_max_nodes: Optional[int], + autoscaling_cpu_target: Optional[int]): + super(GcpBigtableInstance, self).__init__() + if name is not None: + self.user_managed = True + self.name: str = name or f'pkb-bigtable-{FLAGS.run_uri}' + self.zone: str = zone or FLAGS.google_bigtable_zone + self.project: str = project or FLAGS.project or util.GetDefaultProject() + self.node_count: int = node_count or _DEFAULT_NODE_COUNT + self.storage_type: str = storage_type or _DEFAULT_STORAGE_TYPE + self.replication_cluster: bool = replication_cluster or False + self.replication_cluster_zone: str = ( + replication_cluster_zone or _DEFAULT_REPLICATION_ZONE) + self.multicluster_routing: bool = multicluster_routing or False + self.autoscaling_min_nodes: Optional[int] = autoscaling_min_nodes or None + self.autoscaling_max_nodes: Optional[int] = autoscaling_max_nodes or None + self.autoscaling_cpu_target: Optional[int] = autoscaling_cpu_target or None + + @classmethod + def FromSpec(cls, spec: BigtableSpec) -> 'GcpBigtableInstance': + return cls( + name=spec.name, + zone=spec.zone, + project=spec.project, + node_count=spec.node_count, + storage_type=spec.storage_type, + replication_cluster=spec.replication_cluster, + replication_cluster_zone=spec.replication_cluster_zone, + multicluster_routing=spec.multicluster_routing, + autoscaling_min_nodes=spec.autoscaling_min_nodes, + autoscaling_max_nodes=spec.autoscaling_max_nodes, + autoscaling_cpu_target=spec.autoscaling_cpu_target) + + def _BuildClusterConfigs(self) -> List[str]: + """Return flag values for --cluster_config when creating an instance. + + Returns: + List of strings for repeated --cluster_config flag values. + """ + flag_values = [] + cluster_config = { + 'id': f'{self.name}-0', + 'zone': self.zone, + 'nodes': self.node_count, + # Depending on flag settings, the config may be incomplete, but we rely + # on gcloud to validate for us. + 'autoscaling-min-nodes': self.autoscaling_min_nodes, + 'autoscaling-max-nodes': self.autoscaling_max_nodes, + 'autoscaling-cpu-target': self.autoscaling_cpu_target, + } + + # Ignore nodes if autoscaling is configured. --bigtable_node_count has a + # default value so we want to maintain backwards compatibility. 
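+    # Illustrative example (hypothetical autoscaling values): the rendered
+    # --cluster-config value ends up looking like
+    #   id=pkb-bigtable-<run_uri>-0,zone=us-central1-b,autoscaling-min-nodes=1,autoscaling-max-nodes=5,autoscaling-cpu-target=60
+    # with the fixed 'nodes' key dropped just below.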
+ if self.autoscaling_min_nodes: + del cluster_config['nodes'] + + keys_to_remove = [] + for k, v in cluster_config.items(): + if v is None: + keys_to_remove.append(k) + for key in keys_to_remove: + del cluster_config[key] + + flag_values.append(','.join( + '{}={}'.format(k, v) for (k, v) in cluster_config.items())) + + if self.replication_cluster: + replication_cluster_config = cluster_config.copy() + replication_cluster_config['id'] = f'{self.name}-1' + replication_cluster_config['zone'] = self.replication_cluster_zone + flag_values.append(','.join( + '{}={}'.format(k, v) + for (k, v) in replication_cluster_config.items())) + + return flag_values + + def _Create(self): + """Creates the instance.""" + cmd = util.GcloudCommand(self, 'bigtable', 'instances', 'create', self.name) + cmd.flags['display-name'] = self.name + cmd.flags['cluster-storage-type'] = self.storage_type + cmd.flags['project'] = self.project + cmd.flags['cluster-config'] = self._BuildClusterConfigs() + # The zone flag makes this command fail. + cmd.flags['zone'] = [] + + logging.info('Creating instance %s.', self.name) + + _, stderr, _ = cmd.Issue() + if 'Insufficient node quota' in stderr: + raise errors.Benchmarks.QuotaFailure( + f'Insufficient node quota in project {self.project} ' + f'and zone {self.zone}') + + self._UpdateLabels(util.GetDefaultTags()) + + if self.multicluster_routing: + cmd = util.GcloudCommand( + self, 'bigtable', 'app-profiles', 'update', 'default') + cmd.flags['instance'] = self.name + cmd.flags['route-any'] = True + cmd.flags['force'] = True + cmd.flags['zone'] = [] + cmd.Issue() + + def _GetLabels(self) -> Dict[str, Any]: + """Gets labels from the current instance.""" + return self._DescribeInstance().get('labels', {}) + + def _UpdateLabels(self, labels: Dict[str, Any]) -> None: + """Updates the labels of the current instance.""" + header = {'Authorization': f'Bearer {util.GetAccessToken()}'} + url = ('https://bigtableadmin.googleapis.com/v2/' + f'projects/{self.project}/instances/{self.name}') + # Keep any existing labels + tags = self._GetLabels() + tags.update(labels) + response = requests.patch( + url, + headers=header, + params={'updateMask': 'labels'}, + json={'labels': tags}) + logging.info('Update labels: status code %s, %s', response.status_code, + response.text) + if response.status_code != 200: + raise errors.Resource.UpdateError( + f'Unable to update Bigtable instance: {response.text}') + + def _UpdateTimeout(self, timeout_minutes: int) -> None: + """See base class.""" + self._UpdateLabels(util.GetDefaultTags(timeout_minutes)) + + def _Delete(self): + """Deletes the instance.""" + cmd = util.GcloudCommand(self, 'bigtable', 'instances', 'delete', self.name) + # The zone flag makes this command fail. + cmd.flags['zone'] = [] + cmd.Issue(raise_on_failure=False) + + def _DescribeInstance(self) -> Dict[str, Any]: + cmd = util.GcloudCommand( + self, 'bigtable', 'instances', 'describe', self.name) + # The zone flag makes this command fail. 
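+    # (util.GcloudCommand adds a --zone flag by default; assigning [] here
+    # drops the flag so the describe call succeeds.)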
+ cmd.flags['zone'] = [] + stdout, stderr, retcode = cmd.Issue( + suppress_warning=True, raise_on_failure=False) + if retcode != 0: + logging.error('Describing instance %s failed: %s', self.name, stderr) + return {} + return json.loads(stdout) + + def _Exists(self): + """Returns true if the instance exists.""" + instance = self._DescribeInstance() + if not instance: + return False + return instance['state'] == 'READY' + + def GetResourceMetadata(self) -> Dict[str, Any]: + metadata = {} + if self.user_managed: + clusters = GetClustersDescription(self.name, self.project) + metadata['bigtable_zone'] = [ + cluster['zone'] for cluster in clusters] + metadata['bigtable_storage_type'] = [ + cluster['defaultStorageType'] for cluster in clusters] + metadata['bigtable_node_count'] = [ + cluster['serveNodes'] for cluster in clusters] + else: + metadata['bigtable_zone'] = self.zone + metadata['bigtable_replication_zone'] = self.replication_cluster_zone + metadata['bigtable_storage_type'] = self.storage_type + metadata['bigtable_node_count'] = self.node_count + metadata['bigtable_multicluster_routing'] = self.multicluster_routing + return metadata + + +def GetClustersDescription(instance_name, project): + """Gets descriptions of all the clusters given the instance and project. + + This is a module function to allow getting description of clusters not created + by pkb. + + Args: + instance_name: Instance to get cluster descriptions for. + project: Project where instance is in. + + Returns: + A list of cluster descriptions dicts. + """ + cmd = util.GcloudCommand(None, 'bigtable', 'clusters', 'list') + cmd.flags['instances'] = instance_name + cmd.flags['project'] = project + stdout, stderr, retcode = cmd.Issue( + suppress_warning=True, raise_on_failure=False) + if retcode: + logging.error('Command "%s" failed:\nSTDOUT:\n%s\nSTDERR:\n%s', + repr(cmd), stdout, stderr) + output = json.loads(stdout) + + result = [] + for cluster_details in output: + current_instance_name = cluster_details['name'].split('/')[3] + if current_instance_name == instance_name: + cluster_details['name'] = cluster_details['name'].split('/')[5] + cluster_details['zone'] = cluster_details['location'].split('/')[3] + result.append(cluster_details) + + return result diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_cloud_redis.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_cloud_redis.py new file mode 100644 index 0000000..92500a4 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_cloud_redis.py @@ -0,0 +1,213 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for GCP's cloud redis instances. + +Instances can be created and deleted. 
+""" +import json +import logging +import time + +from absl import flags +from google.cloud import monitoring_v3 +from google.cloud.monitoring_v3.types import TimeInterval +from perfkitbenchmarker import errors +from perfkitbenchmarker import managed_memory_store +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.gcp import flags as gcp_flags +from perfkitbenchmarker.providers.gcp import util + +FLAGS = flags.FLAGS +STANDARD_TIER = 'STANDARD' +BASIC_TIER = 'BASIC' +COMMAND_TIMEOUT = 600 # 10 minutes +# Default redis api endpoint +API_ENDPOINT = 'https://redis.googleapis.com/' + + +class CloudRedis(managed_memory_store.BaseManagedMemoryStore): + """Object representing a GCP cloud redis instance.""" + + CLOUD = providers.GCP + MEMORY_STORE = managed_memory_store.REDIS + + def __init__(self, spec): + super(CloudRedis, self).__init__(spec) + self.project = FLAGS.project + self.size = FLAGS.gcp_redis_gb + self.redis_region = FLAGS.cloud_redis_region + self.redis_version = spec.config.cloud_redis.redis_version + self.failover_style = FLAGS.redis_failover_style + if self.failover_style == managed_memory_store.Failover.FAILOVER_NONE: + self.tier = BASIC_TIER + elif self.failover_style == managed_memory_store.Failover.FAILOVER_SAME_REGION: + self.tier = STANDARD_TIER + cmd = util.GcloudCommand(self, 'config', 'set', + 'api_endpoint_overrides/redis', + gcp_flags.API_OVERRIDE.value) + cmd.Issue() + + @staticmethod + def CheckPrerequisites(benchmark_config): + if FLAGS.redis_failover_style == managed_memory_store.Failover.FAILOVER_SAME_ZONE: + raise errors.Config.InvalidValue( + 'GCP cloud redis does not support same zone failover') + if (FLAGS.managed_memory_store_version and + FLAGS.managed_memory_store_version + not in managed_memory_store.REDIS_VERSIONS): + raise errors.Config.InvalidValue('Invalid Redis version.') + + def GetResourceMetadata(self): + """Returns a dict containing metadata about the instance. + + Returns: + dict mapping string property key to value. 
+ """ + result = { + 'cloud_redis_failover_style': self.failover_style, + 'cloud_redis_size': self.size, + 'cloud_redis_tier': self.tier, + 'cloud_redis_region': self.redis_region, + 'cloud_redis_version': self.ParseReadableVersion(self.redis_version), + } + return result + + @staticmethod + def ParseReadableVersion(version): + """Parses Redis major and minor version number.""" + if version.count('_') < 2: + logging.info( + 'Could not parse version string correctly, ' + 'full Redis version returned: %s', version) + return version + return '.'.join(version.split('_')[1:]) + + def _Create(self): + """Creates the instance.""" + cmd = util.GcloudCommand(self, 'redis', 'instances', 'create', self.name) + cmd.flags['region'] = self.redis_region + cmd.flags['zone'] = FLAGS.zone[0] + cmd.flags['network'] = FLAGS.gce_network_name + cmd.flags['tier'] = self.tier + cmd.flags['size'] = self.size + cmd.flags['redis-version'] = self.redis_version + cmd.flags['labels'] = util.MakeFormattedDefaultTags() + cmd.Issue(timeout=COMMAND_TIMEOUT) + + def _IsReady(self): + """Returns whether cluster is ready.""" + instance_details, _, _ = self.DescribeInstance() + return json.loads(instance_details).get('state') == 'READY' + + def _Delete(self): + """Deletes the instance.""" + cmd = util.GcloudCommand(self, 'redis', 'instances', 'delete', self.name) + cmd.flags['region'] = self.redis_region + cmd.Issue(timeout=COMMAND_TIMEOUT, raise_on_failure=False) + reset_cmd = util.GcloudCommand(self, 'config', 'set', + 'api_endpoint_overrides/redis', + 'https://redis.googleapis.com/') + reset_cmd.Issue(timeout=COMMAND_TIMEOUT, raise_on_failure=False) + + def _Exists(self): + """Returns true if the instance exists.""" + _, _, retcode = self.DescribeInstance() + return retcode == 0 + + def DescribeInstance(self): + """Calls describe instance using the gcloud tool. + + Returns: + stdout, stderr, and retcode. + """ + cmd = util.GcloudCommand(self, 'redis', 'instances', 'describe', self.name) + cmd.flags['region'] = self.redis_region + stdout, stderr, retcode = cmd.Issue( + suppress_warning=True, raise_on_failure=False) + if retcode != 0: + logging.info('Could not find redis instance %s', self.name) + return stdout, stderr, retcode + + @vm_util.Retry(max_retries=5) + def _PopulateEndpoint(self): + """Populates endpoint information about the instance. 
+ + Raises: + errors.Resource.RetryableGetError: + Failed to retrieve information on instance + """ + stdout, _, retcode = self.DescribeInstance() + if retcode != 0: + raise errors.Resource.RetryableGetError( + 'Failed to retrieve information on {}'.format(self.name)) + self._ip = json.loads(stdout)['host'] + self._port = json.loads(stdout)['port'] + + def MeasureCpuUtilization(self, interval_length): + """Measure the average CPU utilization on GCP instance in percentage.""" + now = time.time() + seconds = int(now) + interval = TimeInterval() + interval.end_time.seconds = seconds + interval.start_time.seconds = seconds - interval_length + client = monitoring_v3.MetricServiceClient() + + api_filter = ( + 'metric.type = "redis.googleapis.com/stats/cpu_utilization" ' + 'AND resource.labels.instance_id = "projects/' + ) + self.project + '/locations/' + self.redis_region + '/instances/' + self.name + '"' + + time_series = client.list_time_series( + name='projects/' + self.project, + filter_=api_filter, + interval=interval, + view=monitoring_v3.enums.ListTimeSeriesRequest.TimeSeriesView.FULL) + + return self._ParseMonitoringTimeSeries(time_series) + + def _ParseMonitoringTimeSeries(self, time_series): + """Parses time series data and returns average CPU across intervals in %. + + For example, an interval of 3 minutes would be represented as [x, y, z], + where x, y, and z are cpu seconds. + average CPU usage per minute in cpu seconds = (x + y + z) / 3 + average cpu usage in percentage = [(x + y + z) / 3] / 60 + + Args: + time_series: time series of cpu seconds returned by monitoring. + + Returns: + Percentage CPU use. + """ + intervals = [] + # For each of the four types of load, sum the CPU across all intervals + for i, time_interval in enumerate(time_series): + for j, interval in enumerate(time_interval.points): + if i == 0: + intervals.append(interval.value.double_value) + else: + intervals[j] += interval.value.double_value + + if intervals: + # Average over all minute intervals captured + averaged = sum(intervals) / len(intervals) + # averaged is in the unit of cpu seconds per minute. + # So divide by 60sec in 1 min to get a percentage usage over the minute. + return averaged / 60 + return None + + def GetInstanceSize(self): + """Return the size of the GCP instance in gigabytes.""" + return self.size diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_dataproc.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_dataproc.py new file mode 100644 index 0000000..74b6b4b --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_dataproc.py @@ -0,0 +1,261 @@ +# Copyright 2016 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for GCP's spark service. + +Spark clusters can be created and deleted. 
+""" + +import datetime +import json +import logging +import os +import re + +from absl import flags +from perfkitbenchmarker import providers +from perfkitbenchmarker import spark_service +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.gcp import util + + +FLAGS = flags.FLAGS + + +class GcpDataproc(spark_service.BaseSparkService): + """Object representing a GCP Dataproc cluster. + + Attributes: + cluster_id: ID of the cluster. + project: ID of the project. + """ + + CLOUD = providers.GCP + SERVICE_NAME = 'dataproc' + + def __init__(self, spark_service_spec): + super(GcpDataproc, self).__init__(spark_service_spec) + self.project = self.spec.master_group.vm_spec.project + self.region = self.zone.rsplit('-', 1)[0] + + @staticmethod + def _ParseTime(state_time): + """Parses time from json output. + + Args: + state_time: string. the state start time. + + Returns: + datetime. + """ + try: + return datetime.datetime.strptime(state_time, '%Y-%m-%dT%H:%M:%S.%fZ') + except ValueError: + return datetime.datetime.strptime(state_time, '%Y-%m-%dT%H:%M:%SZ') + + @staticmethod + def _GetStats(stdout): + results = json.loads(stdout) + stats = {} + done_time = GcpDataproc._ParseTime(results['status']['stateStartTime']) + pending_time = None + start_time = None + for state in results['statusHistory']: + if state['state'] == 'PENDING': + pending_time = GcpDataproc._ParseTime(state['stateStartTime']) + elif state['state'] == 'RUNNING': + start_time = GcpDataproc._ParseTime(state['stateStartTime']) + + if done_time and start_time: + stats[spark_service.RUNTIME] = (done_time - start_time).total_seconds() + if start_time and pending_time: + stats[spark_service.WAITING] = ( + (start_time - pending_time).total_seconds()) + return stats + + def DataprocGcloudCommand(self, *args): + all_args = ('dataproc',) + args + cmd = util.GcloudCommand(self, *all_args) + cmd.flags['region'] = self.region + return cmd + + def _Create(self): + """Creates the cluster.""" + + if self.cluster_id is None: + self.cluster_id = 'pkb-' + FLAGS.run_uri + cmd = self.DataprocGcloudCommand('clusters', 'create', self.cluster_id) + if self.project is not None: + cmd.flags['project'] = self.project + cmd.flags['num-workers'] = self.spec.worker_group.vm_count + + for group_type, group_spec in [ + ('worker', self.spec.worker_group), + ('master', self.spec.master_group)]: + flag_name = group_type + '-machine-type' + cmd.flags[flag_name] = group_spec.vm_spec.machine_type + + if group_spec.vm_spec.num_local_ssds: + ssd_flag = 'num-{0}-local-ssds'.format(group_type) + cmd.flags[ssd_flag] = group_spec.vm_spec.num_local_ssds + + if group_spec.vm_spec.boot_disk_size: + disk_flag = group_type + '-boot-disk-size' + cmd.flags[disk_flag] = group_spec.vm_spec.boot_disk_size + + if group_spec.vm_spec.boot_disk_type: + disk_flag = group_type + '-boot-disk-type' + cmd.flags[disk_flag] = group_spec.vm_spec.boot_disk_type + + if FLAGS.gcp_dataproc_subnet: + cmd.flags['subnet'] = FLAGS.gcp_dataproc_subnet + cmd.additional_flags.append('--no-address') + + if FLAGS.gcp_dataproc_property: + cmd.flags['properties'] = ','.join(FLAGS.gcp_dataproc_property) + + if FLAGS.gcp_dataproc_image: + cmd.flags['image'] = FLAGS.gcp_dataproc_image + + cmd.flags['metadata'] = util.MakeFormattedDefaultTags() + cmd.flags['labels'] = util.MakeFormattedDefaultTags() + cmd.Issue() + + def _Delete(self): + """Deletes the cluster.""" + cmd = self.DataprocGcloudCommand('clusters', 'delete', self.cluster_id) + # If we don't put this here, zone is automatically 
added, which
+    # breaks the dataproc clusters delete
+    cmd.flags['zone'] = []
+    cmd.Issue(raise_on_failure=False)
+
+  def _Exists(self):
+    """Check to see whether the cluster exists."""
+    cmd = self.DataprocGcloudCommand('clusters', 'describe', self.cluster_id)
+    # If we don't put this here, zone is automatically added to
+    # the command, which breaks dataproc clusters describe
+    cmd.flags['zone'] = []
+    _, _, retcode = cmd.Issue(raise_on_failure=False)
+    return retcode == 0
+
+  def SubmitJob(self, jarfile, classname, job_script=None,
+                job_poll_interval=None,
+                job_arguments=None, job_stdout_file=None,
+                job_type=spark_service.SPARK_JOB_TYPE):
+    cmd = self.DataprocGcloudCommand('jobs', 'submit', job_type)
+    cmd.flags['cluster'] = self.cluster_id
+    cmd.flags['labels'] = util.MakeFormattedDefaultTags()
+    # If we don't put this here, zone is automatically added to the command,
+    # which breaks dataproc jobs submit
+    cmd.flags['zone'] = []
+
+    cmd.additional_flags = []
+    if classname and jarfile:
+      cmd.flags['jars'] = jarfile
+      cmd.flags['class'] = classname
+    elif jarfile:
+      cmd.flags['jar'] = jarfile
+    elif job_script:
+      cmd.additional_flags += [job_script]
+
+    # Dataproc gives as stdout an object describing job execution.
+    # Its stderr contains a mix of the stderr of the job and the
+    # stdout of the job. We can set the driver log level to FATAL
+    # to suppress those messages, and then try to separate the job's
+    # standard output from the log messages.
+    cmd.flags['driver-log-levels'] = 'root={}'.format(
+        FLAGS.spark_service_log_level)
+    if job_arguments:
+      cmd.additional_flags += ['--'] + job_arguments
+    stdout, stderr, retcode = cmd.Issue(timeout=None, raise_on_failure=False)
+    if retcode != 0:
+      return {spark_service.SUCCESS: False}
+
+    stats = self._GetStats(stdout)
+    stats[spark_service.SUCCESS] = True
+
+    if job_stdout_file:
+      with open(job_stdout_file, 'w') as f:
+        lines = stderr.splitlines(True)
+        if (not re.match(r'Job \[.*\] submitted.', lines[0]) or
+            not re.match(r'Waiting for job output...', lines[1])):
+          raise Exception('Dataproc output in unexpected format.')
+        i = 2
+        if job_type == spark_service.SPARK_JOB_TYPE:
+          if not re.match(r'\r', lines[i]):
+            raise Exception('Dataproc output in unexpected format.')
+          i += 1
+          # Eat these status lines. They end in \r, so they overwrite
+          # themselves at the console or when you cat a file. But they
+          # are part of this string.
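+          # A typical eaten progress line looks like (illustrative):
+          #   [Stage 7:=============>        (14 + 4) / 24]\r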
+ while re.match(r'\[Stage \d+:', lines[i]): + i += 1 + if not re.match(r' *\r$', lines[i]): + raise Exception('Dataproc output in unexpected format.') + + while i < len(lines) and not re.match(r'Job \[.*\]', lines[i]): + f.write(lines[i]) + i += 1 + if i != len(lines) - 1: + raise Exception('Dataproc output in unexpected format.') + return stats + + def ExecuteOnMaster(self, script_path, script_args): + master_name = self.cluster_id + '-m' + script_name = os.path.basename(script_path) + if FLAGS.gcp_internal_ip: + scp_cmd = ['gcloud', 'beta', 'compute', 'scp', '--internal-ip'] + else: + scp_cmd = ['gcloud', 'compute', 'scp'] + scp_cmd += ['--zone', self.GetZone(), '--quiet', script_path, + 'pkb@' + master_name + ':/tmp/' + script_name] + vm_util.IssueCommand(scp_cmd, force_info_log=True) + ssh_cmd = ['gcloud', 'compute', 'ssh'] + if FLAGS.gcp_internal_ip: + ssh_cmd += ['--internal-ip'] + ssh_cmd += ['--zone=' + self.GetZone(), '--quiet', + 'pkb@' + master_name, '--', + 'chmod +x /tmp/' + script_name + '; sudo /tmp/' + script_name + + ' ' + ' '.join(script_args)] + vm_util.IssueCommand(ssh_cmd, force_info_log=True) + + def CopyFromMaster(self, remote_path, local_path): + master_name = self.cluster_id + '-m' + if FLAGS.gcp_internal_ip: + scp_cmd = ['gcloud', 'beta', 'compute', 'scp', '--internal-ip'] + else: + scp_cmd = ['gcloud', 'compute', 'scp'] + scp_cmd += ['--zone=' + self.GetZone(), '--quiet', + 'pkb@' + master_name + ':' + + remote_path, local_path] + vm_util.IssueCommand(scp_cmd, force_info_log=True) + + def SetClusterProperty(self): + pass + + def GetMetadata(self): + basic_data = super(GcpDataproc, self).GetMetadata() + if self.spec.worker_group.vm_spec.num_local_ssds: + basic_data.update( + {'ssd_count': str(self.spec.worker_group.vm_spec.num_local_ssds)}) + return basic_data + + def GetZone(self): + cmd = self.DataprocGcloudCommand('clusters', 'describe', self.cluster_id) + cmd.flags['zone'] = [] + cmd.flags['format'] = ['value(config.gceClusterConfig.zoneUri)'] + r = cmd.Issue() + logging.info(r) + zone = r[0].strip().split('/')[-1] + logging.info(zone) + return zone diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_dpb_dataflow.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_dpb_dataflow.py new file mode 100644 index 0000000..98f9ec9 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_dpb_dataflow.py @@ -0,0 +1,147 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for GCP's dataflow service. 
+ +No Clusters can be created or destroyed, since it is a managed solution +See details at: https://cloud.google.com/dataflow/ +""" + +import os + +from absl import flags +from perfkitbenchmarker import beam_benchmark_helper +from perfkitbenchmarker import dpb_service +from perfkitbenchmarker import errors +from perfkitbenchmarker import providers +from perfkitbenchmarker import vm_util + +flags.DEFINE_string('dpb_dataflow_staging_location', None, + 'Google Cloud Storage bucket for Dataflow to stage the ' + 'binary and any temporary files. You must create this ' + 'bucket ahead of time, before running your pipeline.') +flags.DEFINE_string('dpb_dataflow_runner', 'DataflowRunner', + 'Flag to specify the pipeline runner at runtime.') +flags.DEFINE_string('dpb_dataflow_sdk', None, + 'SDK used to build the Dataflow executable.') + + +FLAGS = flags.FLAGS + +GCP_TIME_FORMAT = '%Y-%m-%dT%H:%M:%S.%fZ' + +DATAFLOW_WC_INPUT = 'gs://dataflow-samples/shakespeare/kinglear.txt' + + +class GcpDpbDataflow(dpb_service.BaseDpbService): + """Object representing GCP Dataflow Service.""" + + CLOUD = providers.GCP + SERVICE_TYPE = 'dataflow' + + def __init__(self, dpb_service_spec): + super(GcpDpbDataflow, self).__init__(dpb_service_spec) + self.project = None + + @staticmethod + def _GetStats(stdout): + """Get Stats. + + TODO(saksena): Hook up the metrics API of dataflow to retrieve performance + metrics when available + """ + pass + + @staticmethod + def CheckPrerequisites(benchmark_config): + del benchmark_config # Unused + if not FLAGS.dpb_job_jarfile or not os.path.exists(FLAGS.dpb_job_jarfile): + raise errors.Config.InvalidValue('Job jar missing.') + if not FLAGS.dpb_dataflow_sdk: + raise errors.Config.InvalidValue('Dataflow SDK version missing.') + + def Create(self): + """See base class.""" + pass + + def Delete(self): + """See base class.""" + pass + + # TODO(saksena): Make this actually follow the contract or better yet delete + # this class. + def SubmitJob( + self, + jarfile='', + classname=None, + job_poll_interval=None, + job_arguments=None, + job_stdout_file=None, + job_type=None): + """See base class.""" + + if job_type == self.BEAM_JOB_TYPE: + full_cmd, base_dir = beam_benchmark_helper.BuildBeamCommand( + self.spec, classname, job_arguments) + _, _, retcode = vm_util.IssueCommand( + full_cmd, + cwd=base_dir, + timeout=FLAGS.beam_it_timeout, + raise_on_failure=False) + assert retcode == 0, 'Integration Test Failed.' 
+ return + + worker_machine_type = self.spec.worker_group.vm_spec.machine_type + num_workers = self.spec.worker_count + max_num_workers = self.spec.worker_count + if (self.spec.worker_group.disk_spec and + self.spec.worker_group.disk_spec.disk_size): + disk_size_gb = self.spec.worker_group.disk_spec.disk_size + elif self.spec.worker_group.vm_spec.boot_disk_size: + disk_size_gb = self.spec.worker_group.vm_spec.boot_disk_size + else: + disk_size_gb = None + + cmd = [] + + # Needed to verify java executable is on the path + dataflow_executable = 'java' + if not vm_util.ExecutableOnPath(dataflow_executable): + raise errors.Setup.MissingExecutableError( + 'Could not find required executable "%s"' % dataflow_executable) + cmd.append(dataflow_executable) + + cmd.append('-cp') + cmd.append(jarfile) + + cmd.append(classname) + cmd += job_arguments + + cmd.append('--workerMachineType={}'.format(worker_machine_type)) + cmd.append('--numWorkers={}'.format(num_workers)) + cmd.append('--maxNumWorkers={}'.format(max_num_workers)) + + if disk_size_gb: + cmd.append('--diskSizeGb={}'.format(disk_size_gb)) + cmd.append('--defaultWorkerLogLevel={}'.format(FLAGS.dpb_log_level)) + _, _, _ = vm_util.IssueCommand(cmd) + + def SetClusterProperty(self): + pass + + def GetMetadata(self): + """Return a dictionary of the metadata for this cluster.""" + basic_data = super(GcpDpbDataflow, self).GetMetadata() + basic_data['dpb_dataflow_runner'] = FLAGS.dpb_dataflow_runner + basic_data['dpb_dataflow_sdk'] = FLAGS.dpb_dataflow_sdk + return basic_data diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_dpb_dataproc.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_dpb_dataproc.py new file mode 100644 index 0000000..30d5d12 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_dpb_dataproc.py @@ -0,0 +1,557 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for GCP's dataproc service. + +Clusters can be created, have jobs submitted to them and deleted. 
See details +at https://cloud.google.com/dataproc/ +""" + +import datetime +import json +import logging +from typing import Any, Dict, Optional + +from absl import flags +from perfkitbenchmarker import dpb_service +from perfkitbenchmarker import errors +from perfkitbenchmarker import flag_util +from perfkitbenchmarker import providers +from perfkitbenchmarker.linux_packages import aws_credentials +from perfkitbenchmarker.providers.gcp import gcs +from perfkitbenchmarker.providers.gcp import util + +FLAGS = flags.FLAGS +flags.DEFINE_string('dpb_dataproc_image_version', None, + 'The image version to use for the cluster.') + +disk_to_hdfs_map = { + 'pd-standard': 'HDD', + 'pd-balanced': 'SSD (Balanced)', + 'pd-ssd': 'SSD', +} + + +class GcpDpbBaseDataproc(dpb_service.BaseDpbService): + """Base class for all Dataproc-based services (cluster or serverless).""" + + def __init__(self, dpb_service_spec): + super().__init__(dpb_service_spec) + self.dpb_service_type = self.SERVICE_TYPE + self.project = FLAGS.project + if FLAGS.dpb_dataproc_image_version: + self.dpb_version = FLAGS.dpb_dataproc_image_version + if not self.dpb_service_zone: + raise errors.Setup.InvalidSetupError( + 'dpb_service_zone must be provided, for provisioning.') + self.region = self.dpb_service_zone.rsplit('-', 1)[0] + self.storage_service = gcs.GoogleCloudStorageService() + self.storage_service.PrepareService(location=self.region) + self.persistent_fs_prefix = 'gs://' + self._cluster_create_time = None + + @staticmethod + def _ParseTime(state_time: str) -> datetime.datetime: + """Parses time from json output. + + Args: + state_time: string. the state start time. + + Returns: + Parsed datetime. + """ + try: + return datetime.datetime.strptime(state_time, '%Y-%m-%dT%H:%M:%S.%fZ') + except ValueError: + return datetime.datetime.strptime(state_time, '%Y-%m-%dT%H:%M:%SZ') + + @staticmethod + def CheckPrerequisites(benchmark_config): + del benchmark_config # Unused + + def DataprocGcloudCommand(self, *args): + all_args = ('dataproc',) + tuple(args) + cmd = util.GcloudCommand(self, *all_args) + cmd.flags['region'] = self.region + return cmd + + def MigrateCrossCloud(self, + source_location, + destination_location, + dest_cloud='AWS'): + """Method to copy data cross cloud using a distributed job on the cluster. + + Currently the only supported destination cloud is AWS. + TODO(user): Add support for other destination clouds. + + Args: + source_location: The source GCS path to migrate. + destination_location: The destination path. + dest_cloud: The cloud to copy data to. + + Returns: + A dictionary with key 'success' and boolean value set to the status of + data migration command. + """ + if dest_cloud == 'AWS': + dest_prefix = 's3a://' + else: + raise ValueError('Unsupported destination cloud.') + s3_access_key, s3_secret_key = aws_credentials.GetCredentials() + return self.DistributedCopy( + 'gs://' + source_location, + dest_prefix + destination_location, + properties={ + 'fs.s3a.access.key': s3_access_key, + 'fs.s3a.secret.key': s3_secret_key, + }) + + +class GcpDpbDataproc(GcpDpbBaseDataproc): + """Object representing a managed GCP Dataproc cluster. + + Attributes: + project: ID of the project. 
+ """ + + CLOUD = providers.GCP + SERVICE_TYPE = 'dataproc' + + def __init__(self, dpb_service_spec): + super().__init__(dpb_service_spec) + if self.user_managed and not FLAGS.dpb_service_bucket: + self.bucket = self._GetCluster()['config']['tempBucket'] + + def GetClusterCreateTime(self) -> Optional[float]: + """Returns the cluster creation time. + + On this implementation, the time returned is based on the timestamps + reported by the Dataproc API (which is stored in the _cluster_create_time + attribute). + + Returns: + A float representing the creation time in seconds or None. + """ + return self._cluster_create_time + + def _Create(self): + """Creates the cluster.""" + cmd = self.DataprocGcloudCommand('clusters', 'create', self.cluster_id) + if self.project is not None: + cmd.flags['project'] = self.project + + if self.spec.worker_count: + # The number of worker machines in the cluster + cmd.flags['num-workers'] = self.spec.worker_count + else: + cmd.flags['single-node'] = True + + # Initialize applications on the dataproc cluster + if self.spec.applications: + logging.info('Include the requested applications') + cmd.flags['optional-components'] = ','.join(self.spec.applications) + + # Enable component gateway for debuggability. Does not impact performance. + cmd.flags['enable-component-gateway'] = True + + # TODO(pclay): stop ignoring spec.master_group? + for role in ['worker', 'master']: + # Set machine type + if self.spec.worker_group.vm_spec.machine_type: + self._AddToCmd(cmd, '{0}-machine-type'.format(role), + self.spec.worker_group.vm_spec.machine_type) + # Set boot_disk_size + if self.spec.worker_group.disk_spec.disk_size: + size_in_gb = '{}GB'.format( + str(self.spec.worker_group.disk_spec.disk_size)) + self._AddToCmd(cmd, '{0}-boot-disk-size'.format(role), size_in_gb) + # Set boot_disk_type + if self.spec.worker_group.disk_spec.disk_type: + self._AddToCmd(cmd, '{0}-boot-disk-type'.format(role), + self.spec.worker_group.disk_spec.disk_type) + self.dpb_hdfs_type = disk_to_hdfs_map[ + self.spec.worker_group.disk_spec.disk_type] + + # Set ssd count + if self.spec.worker_group.vm_spec.num_local_ssds: + self._AddToCmd(cmd, 'num-{0}-local-ssds'.format(role), + self.spec.worker_group.vm_spec.num_local_ssds) + # This will actually be used for storage + self.dpb_hdfs_type = 'Local SSD' + # Set zone + cmd.flags['zone'] = self.dpb_service_zone + if self.dpb_version: + cmd.flags['image-version'] = self.dpb_version + + if FLAGS.gcp_dataproc_image: + cmd.flags['image'] = FLAGS.gcp_dataproc_image + + if FLAGS.dpb_cluster_properties: + cmd.flags['properties'] = ','.join(FLAGS.dpb_cluster_properties) + + # Ideally DpbServiceSpec would have a network spec, which we would create to + # Resolve the name, but because EMR provisions its own VPC and we are + # generally happy using pre-existing networks for Dataproc. Just use the + # underlying flag instead. 
+ if FLAGS.gce_network_name: + cmd.flags['network'] = FLAGS.gce_network_name + + metadata = util.GetDefaultTags() + metadata.update(flag_util.ParseKeyValuePairs(FLAGS.gcp_instance_metadata)) + cmd.flags['metadata'] = util.FormatTags(metadata) + cmd.flags['labels'] = util.MakeFormattedDefaultTags() + timeout = 900 # 15 min + stdout, stderr, retcode = cmd.Issue(timeout=timeout, raise_on_failure=False) + self._cluster_create_time = self._ParseClusterCreateTime(stdout) + if retcode: + util.CheckGcloudResponseKnownFailures(stderr, retcode) + raise errors.Resource.CreationError(stderr) + + @classmethod + def _ParseClusterCreateTime(cls, stdout: str) -> Optional[float]: + """Parses the cluster create time from a raw API response.""" + try: + creation_data = json.loads(stdout) + except json.JSONDecodeError: + creation_data = {} + can_parse = creation_data.get('status', {}).get('state') == 'RUNNING' + status_history = creation_data.get('statusHistory', []) + can_parse = can_parse and len( + status_history) == 1 and status_history[0]['state'] == 'CREATING' + if not can_parse: + logging.warning('Unable to parse cluster creation duration.') + return None + creation_start = cls._ParseTime(status_history[0]['stateStartTime']) + creation_end = cls._ParseTime(creation_data['status']['stateStartTime']) + return (creation_end - creation_start).total_seconds() + + def _Delete(self): + """Deletes the cluster.""" + cmd = self.DataprocGcloudCommand('clusters', 'delete', self.cluster_id) + cmd.Issue(raise_on_failure=False) + + def _GetCluster(self) -> Optional[Dict[str, Any]]: + """Get the cluster resource in a dict.""" + cmd = self.DataprocGcloudCommand('clusters', 'describe', self.cluster_id) + stdout, _, retcode = cmd.Issue(raise_on_failure=False) + if not retcode: + return json.loads(stdout) + + def _Exists(self): + """Check to see whether the cluster exists.""" + return self._GetCluster() is not None + + def SubmitJob(self, + jarfile=None, + classname=None, + pyspark_file=None, + query_file=None, + job_poll_interval=None, + job_stdout_file=None, + job_arguments=None, + job_files=None, + job_jars=None, + job_type=None, + properties=None): + """See base class.""" + assert job_type + args = ['jobs', 'submit', job_type] + + if job_type == self.PYSPARK_JOB_TYPE: + args.append(pyspark_file) + + cmd = self.DataprocGcloudCommand(*args) + + cmd.flags['cluster'] = self.cluster_id + cmd.flags['labels'] = util.MakeFormattedDefaultTags() + + job_jars = job_jars or [] + if classname: + if jarfile: + # Dataproc does not support both a main class and a main jar so just + # make the main jar an additional jar instead. + job_jars.append(jarfile) + cmd.flags['class'] = classname + elif jarfile: + cmd.flags['jar'] = jarfile + + if query_file: + cmd.flags['file'] = query_file + + if job_files: + cmd.flags['files'] = ','.join(job_files) + if job_jars: + cmd.flags['jars'] = ','.join(job_jars) + + # Dataproc gives as stdout an object describing job execution. + # Its stderr contains a mix of the stderr of the job, and the + # stdout of the job. We set the driver log level to FATAL + # to suppress those messages, and we can then separate, hopefully + # the job standard out from the log messages. 
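+    # The root driver log level applied here is taken from the
+    # --dpb_log_level flag.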
+ cmd.flags['driver-log-levels'] = 'root={}'.format(FLAGS.dpb_log_level) + + all_properties = self.GetJobProperties() + all_properties.update(properties or {}) + if all_properties: + # For commas: https://cloud.google.com/sdk/gcloud/reference/topic/escaping + cmd.flags['properties'] = '^@^' + '@'.join( + '{}={}'.format(k, v) for k, v in all_properties.items()) + + if job_arguments: + cmd.additional_flags = ['--'] + job_arguments + + stdout, stderr, retcode = cmd.Issue(timeout=None, raise_on_failure=False) + if retcode != 0: + raise dpb_service.JobSubmissionError(stderr) + + results = json.loads(stdout) + # Otherwise retcode would not have been 0 + assert results['status']['state'] == 'DONE' + done_time = GcpDpbDataproc._ParseTime(results['status']['stateStartTime']) + pending_time = None + start_time = None + for state in results['statusHistory']: + if state['state'] == 'PENDING': + pending_time = GcpDpbDataproc._ParseTime(state['stateStartTime']) + elif state['state'] == 'RUNNING': + start_time = GcpDpbDataproc._ParseTime(state['stateStartTime']) + + assert pending_time and start_time and done_time + + return dpb_service.JobResult( + run_time=(done_time - start_time).total_seconds(), + pending_time=(start_time - pending_time).total_seconds()) + + def _AddToCmd(self, cmd, cmd_property, cmd_value): + flag_name = cmd_property + cmd.flags[flag_name] = cmd_value + + +class GcpDpbDpgke(GcpDpbDataproc): + """Dataproc on GKE cluster. + + Extends from GcpDpbDataproc and not GcpDpbBaseDataproc as this represents a + cluster with managed infrastructure. + """ + + CLOUD = providers.GCP + SERVICE_TYPE = 'dataproc_gke' + + def __init__(self, dpb_service_spec): + super(GcpDpbDpgke, self).__init__(dpb_service_spec) + required_spec_attrs = [ + 'gke_cluster_name', 'gke_cluster_nodepools', + 'gke_cluster_location' + ] + missing_attrs = [ + attr for attr in required_spec_attrs + if not getattr(self.spec, attr, None) + ] + if missing_attrs: + raise errors.Setup.InvalidSetupError( + f'{missing_attrs} must be provided for provisioning DPGKE.') + + def _Create(self): + """Creates the dpgke virtual cluster.""" + cmd = self.DataprocGcloudCommand('clusters', 'gke', 'create', + self.cluster_id) + cmd.use_alpha_gcloud = True + cmd.flags['setup-workload-identity'] = True + cmd.flags['gke-cluster'] = self.spec.gke_cluster_name + cmd.flags['namespace'] = self.cluster_id + # replace ':' field delimiter with '=' since create cluster command + # only accept '=' as field delimiter but pkb doesn't allow overriding + # spec parameters containing '=' + cmd.flags['pools'] = self.spec.gke_cluster_nodepools.replace(':', '=') + cmd.flags['gke-cluster-location'] = self.spec.gke_cluster_location + if FLAGS.dpb_service_bucket: + cmd.flags['staging-bucket'] = FLAGS.dpb_service_bucket + if self.project is not None: + cmd.flags['project'] = self.project + cmd.flags['image-version'] = self.spec.version + if FLAGS.dpb_cluster_properties: + cmd.flags['properties'] = ','.join(FLAGS.dpb_cluster_properties) + timeout = 900 # 15 min + logging.info('Issuing command to create dpgke cluster. 
Flags %s, Args %s', + cmd.flags, cmd.args) + stdout, stderr, retcode = cmd.Issue(timeout=timeout, raise_on_failure=False) + self._cluster_create_time = self._ParseClusterCreateTime(stdout) + if retcode: + util.CheckGcloudResponseKnownFailures(stderr, retcode) + raise errors.Resource.CreationError(stderr) + + +class GcpDpbDataprocServerless(GcpDpbBaseDataproc): + """Resource that allows spawning serverless Dataproc Jobs.""" + + CLOUD = providers.GCP + SERVICE_TYPE = 'dataproc_serverless' + + def SubmitJob(self, + jarfile=None, + classname=None, + pyspark_file=None, + query_file=None, + job_poll_interval=None, + job_stdout_file=None, + job_arguments=None, + job_files=None, + job_jars=None, + job_type=None, + properties=None): + """See base class.""" + assert job_type + args = ['batches', 'submit', job_type] + additional_args = [] + + if job_type == self.PYSPARK_JOB_TYPE: + args.append(pyspark_file) + + cmd = self.DataprocGcloudCommand(*args) + + cmd.flags['batch'] = self.cluster_id + cmd.flags['labels'] = util.MakeFormattedDefaultTags() + + job_jars = job_jars or [] + if classname: + if jarfile: + # Dataproc does not support both a main class and a main jar so just + # make the main jar an additional jar instead. + job_jars.append(jarfile) + cmd.flags['class'] = classname + elif jarfile: + cmd.flags['jar'] = jarfile + + if query_file: + additional_args += query_file + + if job_files: + cmd.flags['files'] = ','.join(job_files) + if job_jars: + cmd.flags['jars'] = ','.join(job_jars) + + if FLAGS.gce_network_name: + cmd.flags['network'] = FLAGS.gce_network_name + + if self.dpb_version: + cmd.flags['version'] = self.dpb_version + if FLAGS.gcp_dataproc_image: + cmd.flags['container-image'] = FLAGS.gcp_dataproc_image + + all_properties = self.GetJobProperties() + all_properties.update(properties or {}) + if all_properties: + # For commas: https://cloud.google.com/sdk/gcloud/reference/topic/escaping + cmd.flags['properties'] = '^@^' + '@'.join( + '{}={}'.format(k, v) for k, v in all_properties.items()) + + if job_arguments: + additional_args += ['--'] + job_arguments + cmd.additional_flags = additional_args + + _, stderr, retcode = cmd.Issue(timeout=None, raise_on_failure=False) + if retcode != 0: + raise dpb_service.JobSubmissionError(stderr) + + fetch_batch_cmd = self.DataprocGcloudCommand( + 'batches', 'describe', self.cluster_id) + stdout, stderr, retcode = fetch_batch_cmd.Issue( + timeout=None, raise_on_failure=False) + if retcode != 0: + raise dpb_service.JobSubmissionError(stderr) + + results = json.loads(stdout) + # Otherwise retcode would not have been 0 + assert results['state'] == 'SUCCEEDED' + done_time = self._ParseTime(results['stateTime']) + pending_time = None + start_time = None + for state in results['stateHistory']: + if state['state'] == 'PENDING': + pending_time = self._ParseTime(state['stateStartTime']) + elif state['state'] == 'RUNNING': + start_time = self._ParseTime(state['stateStartTime']) + + assert pending_time and start_time and done_time + + return dpb_service.JobResult( + run_time=(done_time - start_time).total_seconds(), + pending_time=(start_time - pending_time).total_seconds()) + + def _Create(self): + # Since there's no managed infrastructure, this is a no-op. + pass + + def _Delete(self): + # Since there's no managed infrastructure, this is a no-op. 
+ pass + + def GetClusterCreateTime(self) -> Optional[float]: + return None + + def GetJobProperties(self) -> Dict[str, str]: + result = {} + if self.spec.dataproc_serverless_core_count: + result['spark.executor.cores'] = self.spec.dataproc_serverless_core_count + result['spark.driver.cores'] = self.spec.dataproc_serverless_core_count + if self.spec.dataproc_serverless_initial_executors: + result['spark.executor.instances'] = ( + self.spec.dataproc_serverless_initial_executors) + if self.spec.dataproc_serverless_min_executors: + result['spark.dynamicAllocation.minExecutors'] = ( + self.spec.dataproc_serverless_min_executors) + if self.spec.dataproc_serverless_max_executors: + result['spark.dynamicAllocation.maxExecutors'] = ( + self.spec.dataproc_serverless_max_executors) + if self.spec.worker_group.disk_spec.disk_size: + result['spark.dataproc.driver.disk_size'] = ( + f'{self.spec.worker_group.disk_spec.disk_size}g' + ) + result['spark.dataproc.executor.disk_size'] = ( + f'{self.spec.worker_group.disk_spec.disk_size}g' + ) + result.update(super().GetJobProperties()) + return result + + def GetMetadata(self): + basic_data = super().GetMetadata() + + if self.spec.dataproc_serverless_core_count: + cluster_shape = ( + f'dataproc-serverless-{self.spec.dataproc_serverless_core_count}') + else: + cluster_shape = 'dataproc-serverless-default' + + initial_executors = self.spec.dataproc_serverless_initial_executors + min_executors = self.spec.dataproc_serverless_min_executors + max_executors = self.spec.dataproc_serverless_max_executors + + cluster_size = None + if initial_executors == min_executors == max_executors: + cluster_size = initial_executors + + return { + 'dpb_service': basic_data['dpb_service'], + 'dpb_version': basic_data['dpb_version'], + 'dpb_service_version': basic_data['dpb_service_version'], + 'dpb_batch_id': basic_data['dpb_cluster_id'], + 'dpb_cluster_shape': cluster_shape, + 'dpb_cluster_size': cluster_size, + 'dpb_cluster_min_executors': min_executors, + 'dpb_cluster_max_executors': max_executors, + 'dpb_cluster_initial_executors': initial_executors, + 'dpb_cores_per_node': self.spec.dataproc_serverless_core_count, + 'dpb_hdfs_type': 'default-disk', + 'dpb_service_zone': basic_data['dpb_service_zone'], + 'dpb_job_properties': basic_data['dpb_job_properties'], + } diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_pubsub.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_pubsub.py new file mode 100644 index 0000000..598425a --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_pubsub.py @@ -0,0 +1,162 @@ +"""GCP PubSub interface for resources. + +This class handles resource creation/cleanup for messaging service benchmark +on GCP Cloud PubSub. 
https://cloud.google.com/pubsub/docs
+"""
+
+import json
+import logging
+import os
+from typing import Any, Dict
+
+from absl import flags
+from perfkitbenchmarker import errors
+from perfkitbenchmarker import messaging_service as msgsvc
+from perfkitbenchmarker import providers
+from perfkitbenchmarker.providers.gcp import util
+
+FLAGS = flags.FLAGS
+MESSAGING_SERVICE_SCRIPTS_VM_GCP_DIR = os.path.join(
+    msgsvc.MESSAGING_SERVICE_SCRIPTS_VM_LIB_DIR, 'gcp')
+MESSAGING_SERVICE_SCRIPTS_GCP_PREFIX = 'messaging_service_scripts/gcp'
+MESSAGING_SERVICE_SCRIPTS_GCP_FILES = ['__init__.py', 'gcp_pubsub_client.py']
+MESSAGING_SERVICE_SCRIPTS_GCP_BIN = 'messaging_service_scripts/gcp_benchmark.py'
+
+
+class GCPCloudPubSub(msgsvc.BaseMessagingService):
+  """GCP Cloud PubSub Interface Class for prepare phase.
+
+  This class has methods that allow us to run the provision/prepare and cleanup
+  phase for GCP from the benchmark VM. The provision/prepare phase involves
+  things like: installing specific packages on the client VM, uploading files
+  to the client VM, and resource creation on the cloud provider (PubSub needs a
+  topic and subscription).
+  """
+
+  CLOUD = providers.GCP
+
+  def __init__(self):
+    super().__init__()
+    self.project = FLAGS.project or util.GetDefaultProject()
+    self.pubsub_topic = 'pkb-topic-{0}'.format(FLAGS.run_uri)
+    self.pubsub_subscription = 'pkb-subscription-{0}'.format(FLAGS.run_uri)
+
+  def _Create(self):
+    """Handles provision of resources needed for GCP Pub/Sub benchmark."""
+    self._CreateTopic()
+    self._CreateSubscription()
+
+  def _Exists(self):
+    return self._TopicExists() and self._SubscriptionExists()
+
+  def _Delete(self):
+    self._DeleteSubscription()
+    self._DeleteTopic()
+
+  def _IsDeleting(self):
+    """Overrides BaseResource._IsDeleting.
+
+    Used internally while deleting to check if the deletion is still in
+    progress.
+
+    Returns:
+      A bool. True if the resource is not yet deleted, else False.
+    """
+    return self._SubscriptionExists() or self._TopicExists()
+
+  def _InstallCloudClients(self):
+    # Installs/uploads GCP-specific modules/files.
+    self.client_vm.RemoteCommand(
+        'sudo pip3 install --upgrade --ignore-installed google-cloud-pubsub',
+        ignore_failure=False)
+
+    self._CopyFiles(
+        MESSAGING_SERVICE_SCRIPTS_GCP_PREFIX,
+        MESSAGING_SERVICE_SCRIPTS_GCP_FILES,
+        MESSAGING_SERVICE_SCRIPTS_VM_GCP_DIR)
+    self.client_vm.PushDataFile(MESSAGING_SERVICE_SCRIPTS_GCP_BIN)
+
+  def Run(self, benchmark_scenario: str, number_of_messages: str,
+          message_size: str) -> Dict[str, Any]:
+    """Runs a benchmark on GCP PubSub from the client VM.
+
+    Runs a benchmark based on the configuration specified through the arguments:
+    benchmark_scenario, number_of_messages, and message_size. This contains
+    the GCP specific command that we need to run on the client VM to run the
+    benchmark.
+
+    Args:
+      benchmark_scenario: Specifies which benchmark scenario to run.
+      number_of_messages: Number of messages to use on the benchmark.
+      message_size: Size of the messages that will be used on the benchmark. It
+        specifies the number of characters in those messages.
+
+    Returns:
+      Dictionary produced by the benchmark with metric_name (mean_latency,
+      p50_latency...) as key and the results from the benchmark as the value:
+
+        data = {
+          'mean_latency': 0.3423443...
+          ...
+ } + """ + command = (f'python3 -m gcp_benchmark ' + f'--pubsub_project={self.project} ' + f'--pubsub_topic={self.pubsub_topic} ' + f'--pubsub_subscription={self.pubsub_subscription} ' + f'--benchmark_scenario={benchmark_scenario} ' + f'--number_of_messages={number_of_messages} ' + f'--message_size={message_size} ') + stdout, _ = self.client_vm.RemoteCommand(command) + metrics = json.loads(stdout) + return metrics + + def _CreateTopic(self): + """Handles topic creation on GCP Pub/Sub.""" + cmd = util.GcloudCommand(self, 'pubsub', 'topics', 'create', + self.pubsub_topic) + _, stderr, retcode = cmd.Issue(raise_on_failure=False) + if retcode != 0: + logging.error('Creation of GCP PubSub topic failed.') + raise errors.Resource.CreationError( + 'Failed to create PubSub Topic: %s return code: %s' % + (retcode, stderr)) + + def _TopicExists(self) -> bool: + """Check if subscription exists on GCP Pub/Sub.""" + cmd = util.GcloudCommand(self, 'pubsub', 'topics', 'describe', + self.pubsub_topic) + _, _, retcode = cmd.Issue(raise_on_failure=False) + return retcode == 0 + + def _DeleteTopic(self): + """Handles topic deletion on GCP Pub/Sub.""" + cmd = util.GcloudCommand(self, 'pubsub', 'topics', 'delete', + self.pubsub_topic) + cmd.Issue(raise_on_failure=False) + + def _CreateSubscription(self): + """Handles Subscription creation on GCP Pub/Sub.""" + cmd = util.GcloudCommand(self, 'pubsub', 'subscriptions', 'create', + self.pubsub_subscription) + cmd.flags['topic'] = self.pubsub_topic + cmd.flags['topic-project'] = self.project + _, stderr, retcode = cmd.Issue(raise_on_failure=False) + if retcode != 0: + logging.error('Creation of GCP PubSub subscription failed.') + raise errors.Resource.CreationError( + 'Failed to create PubSub Subscription: %s return code: %s' % + (retcode, stderr)) + + def _SubscriptionExists(self) -> bool: + """Check if subscription exists on GCP Pub/Sub..""" + cmd = util.GcloudCommand(self, 'pubsub', 'subscriptions', 'describe', + self.pubsub_subscription) + _, _, retcode = cmd.Issue(raise_on_failure=False) + return retcode == 0 + + def _DeleteSubscription(self): + """Handles subscription deletion on GCP Pub/Sub.""" + cmd = util.GcloudCommand(self, 'pubsub', 'subscriptions', 'delete', + self.pubsub_subscription) + cmd.Issue(raise_on_failure=False) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_relational_db.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_relational_db.py new file mode 100644 index 0000000..4485e68 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_relational_db.py @@ -0,0 +1,497 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Managed relational database provisioning for GCP. + +As of June 2017 to make this benchmark run for GCP you must install the +gcloud beta component. This is necessary because creating a Cloud SQL instance +with a non-default storage size is in beta right now. 
This can be removed when +this feature is part of the default components. +See https://cloud.google.com/sdk/gcloud/reference/beta/sql/instances/create +for more information. +""" + + +import datetime +import json +import logging +import time + +from absl import flags +from perfkitbenchmarker import data +from perfkitbenchmarker import providers +from perfkitbenchmarker import relational_db +from perfkitbenchmarker import sql_engine_utils +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.gcp import gce_network +from perfkitbenchmarker.providers.gcp import util +from six.moves import range + +FLAGS = flags.FLAGS + +GCP_DATABASE_VERSION_MAPPING = { + sql_engine_utils.MYSQL: { + '5.5': 'MYSQL_5_5', + '5.6': 'MYSQL_5_6', + '5.7': 'MYSQL_5_7', + '8.0': 'MYSQL_8_0' + }, + sql_engine_utils.POSTGRES: { + '9.6': 'POSTGRES_9_6', + '10': 'POSTGRES_10', + '11': 'POSTGRES_11', + '12': 'POSTGRES_12', + '13': 'POSTGRES_13' + }, + sql_engine_utils.SQLSERVER: { + '2017_Standard': 'SQLSERVER_2017_Standard', + '2017_Enterprise': 'SQLSERVER_2017_ENTERPRISE', + '2017_Express': 'SQLSERVER_2017_EXPRESS', + '2017_Web': 'SQLSERVER_2017_WEB' + } +} + + +DEFAULT_MYSQL_VERSION = '5.7' +DEFAULT_POSTGRES_VERSION = '9.6' +DEFAULT_SQL_SERVER_VERSION = '2017_Standard' + +DEFAULT_ENGINE_VERSIONS = { + sql_engine_utils.MYSQL: DEFAULT_MYSQL_VERSION, + sql_engine_utils.POSTGRES: DEFAULT_POSTGRES_VERSION, + sql_engine_utils.SQLSERVER: DEFAULT_SQL_SERVER_VERSION, +} + +# TODO(chunla): Move to engine specific module +DEFAULT_USERNAME = { + sql_engine_utils.MYSQL: 'root', + sql_engine_utils.POSTGRES: 'postgres', + sql_engine_utils.SQLSERVER: 'sqlserver', +} + +# PostgreSQL restrictions on memory. +# Source: https://cloud.google.com/sql/docs/postgres/instance-settings. +CUSTOM_MACHINE_CPU_MEM_RATIO_LOWER_BOUND = 0.9 +CUSTOM_MACHINE_CPU_MEM_RATIO_UPPER_BOUND = 6.5 +MIN_CUSTOM_MACHINE_MEM_MB = 3840 + +IS_READY_TIMEOUT = 600 # 10 minutes +DELETE_INSTANCE_TIMEOUT = 600 # 10 minutes +CREATION_TIMEOUT = 1200 # 20 minutes + + +class UnsupportedDatabaseEngineError(Exception): + pass + + +class GCPRelationalDb(relational_db.BaseRelationalDb): + """A GCP CloudSQL database resource. + + This class contains logic required to provision and teardown the database. + Currently, the database will be open to the world (0.0.0.0/0) which is not + ideal; however, a password is still required to connect. Currently only + MySQL 5.7 and Postgres 9.6 are supported. 
+ """ + CLOUD = providers.GCP + + def __init__(self, relational_db_spec): + super(GCPRelationalDb, self).__init__(relational_db_spec) + self.project = FLAGS.project or util.GetDefaultProject() + + self.unmanaged_db_exists = None if self.is_managed_db else False + + def _GetAuthorizedNetworks(self, vms): + """Get CIDR connections for list of VM specs that need to access the db.""" + for vm in vms: + if not vm.HasIpAddress: + raise Exception('Client vm needs to be initialized before database can ' + 'discover authorized network.') + # create the CIDR of the client VM that is configured to access + # the database + return ','.join('{0}/32'.format(vm.ip_address) for vm in vms) + + def _CreateGcloudSqlInstance(self): + storage_size = self.spec.db_disk_spec.disk_size + instance_zone = self.spec.db_spec.zone + + authorized_network = self._GetAuthorizedNetworks([self.client_vm]) + + database_version_string = self._GetEngineVersionString( + self.spec.engine, self.spec.engine_version) + + cmd_string = [ + self, + 'beta', + 'sql', + 'instances', + 'create', + self.instance_id, + '--quiet', + '--format=json', + '--activation-policy=ALWAYS', + '--assign-ip', + '--authorized-networks=%s' % authorized_network, + '--zone=%s' % instance_zone, + '--database-version=%s' % database_version_string, + '--storage-size=%d' % storage_size, + '--labels=%s' % util.MakeFormattedDefaultTags(), + ] + if self.spec.engine == sql_engine_utils.MYSQL: + cmd_string.append('--enable-bin-log') + + if self.spec.engine == sql_engine_utils.SQLSERVER: + # `--root-password` is required when creating SQL Server instances. + cmd_string.append('--root-password={0}'.format( + self.spec.database_password)) + + if (self.spec.db_spec.cpus and self.spec.db_spec.memory): + self._ValidateSpec() + memory = self.spec.db_spec.memory + cpus = self.spec.db_spec.cpus + self._ValidateMachineType(memory, cpus) + cmd_string.append('--cpu={}'.format(cpus)) + cmd_string.append('--memory={}MiB'.format(memory)) + elif hasattr(self.spec.db_spec, 'machine_type'): + machine_type_flag = '--tier=%s' % self.spec.db_spec.machine_type + cmd_string.append(machine_type_flag) + else: + raise Exception('Unspecified machine type') + + if self.spec.high_availability: + cmd_string.append(self._GetHighAvailabilityFlag()) + + if self.spec.backup_enabled: + cmd_string.append('--backup') + cmd_string.append('--backup-start-time={}'.format( + self.spec.backup_start_time)) + else: + cmd_string.append('--no-backup') + cmd = util.GcloudCommand(*cmd_string) + cmd.flags['project'] = self.project + + _, stderr, retcode = cmd.Issue(timeout=CREATION_TIMEOUT) + + util.CheckGcloudResponseKnownFailures(stderr, retcode) + + def _Create(self): + """Creates the Cloud SQL instance and authorizes traffic from anywhere. + + Raises: + UnsupportedDatabaseEngineError: + if the database is unmanaged and the engine isn't MYSQL. + Exception: if an invalid MySQL flag was used. + """ + if self.is_managed_db: + self._CreateGcloudSqlInstance() + else: + self._SetupUnmanagedDatabase() + + if FLAGS.ip_addresses == vm_util.IpAddressSubset.INTERNAL: + self.endpoint = self.server_vm.internal_ip + else: + self.endpoint = self.server_vm.ip_address + self.firewall = gce_network.GceFirewall() + self.firewall.AllowPort( + self.server_vm, self.port, source_range=[self.client_vm.ip_address]) + self.unmanaged_db_exists = True + + def _GetHighAvailabilityFlag(self): + """Returns a flag that enables high-availability. + + Returns: + Flag (as string) to be appended to the gcloud sql create command. 
+    """
+    return '--availability-type=REGIONAL'
+
+  def _ValidateSpec(self):
+    """Validates PostgreSQL spec for CPU and memory.
+
+    Raises:
+      data.ResourceNotFound: On missing memory or cpus in postgres benchmark
+        config.
+    """
+    if not hasattr(self.spec.db_spec, 'cpus') or not self.spec.db_spec.cpus:
+      raise data.ResourceNotFound(
+          'Must specify cpu count in benchmark config. See https://'
+          'cloud.google.com/sql/docs/postgres/instance-settings for more '
+          'details about size restrictions.')
+    if not hasattr(self.spec.db_spec, 'memory') or not self.spec.db_spec.memory:
+      raise data.ResourceNotFound(
+          'Must specify a memory amount in benchmark config. See https://'
+          'cloud.google.com/sql/docs/postgres/instance-settings for more '
+          'details about size restrictions.')
+
+  def _ValidateMachineType(self, memory, cpus):
+    """Validates the custom machine type configuration.
+
+    Memory and CPU must be within the parameters described here:
+    https://cloud.google.com/sql/docs/postgres/instance-settings
+
+    Args:
+      memory: (int) in MiB
+      cpus: (int)
+
+    Raises:
+      ValueError on invalid configuration.
+    """
+    if cpus not in [1] + list(range(2, 97, 2)):
+      raise ValueError(
+          'CPUs (%i) must be 1 or an even number between 2 and 96, '
+          'inclusive.' % cpus)
+
+    if memory % 256 != 0:
+      raise ValueError(
+          'Total memory (%dMiB) for a custom machine must be a multiple '
+          'of 256MiB.' % memory)
+    ratio = memory / 1024.0 / cpus
+    if (ratio < CUSTOM_MACHINE_CPU_MEM_RATIO_LOWER_BOUND or
+        ratio > CUSTOM_MACHINE_CPU_MEM_RATIO_UPPER_BOUND):
+      raise ValueError(
+          'The memory (%.2fGiB) per vCPU (%d) of a custom machine '
+          'type must be between %.2f GiB and %.2f GiB per vCPU, '
+          'inclusive.' %
+          (memory / 1024.0, cpus, CUSTOM_MACHINE_CPU_MEM_RATIO_LOWER_BOUND,
+           CUSTOM_MACHINE_CPU_MEM_RATIO_UPPER_BOUND))
+    if memory < MIN_CUSTOM_MACHINE_MEM_MB:
+      raise ValueError('The total memory (%dMiB) for a custom machine type '
+                       'must be at least %dMiB.' %
+                       (memory,
+                        MIN_CUSTOM_MACHINE_MEM_MB))
+
+  def _Delete(self):
+    """Deletes the underlying resource.
+
+    Implementations of this method should be idempotent since it may
+    be called multiple times, even if the resource has already been
+    deleted.
+    """
+    if not self.is_managed_db:
+      if hasattr(self, 'firewall'):
+        self.firewall.DisallowAllPorts()
+      self.unmanaged_db_exists = False
+      self.PrintUnmanagedDbStats()
+      return
+    if hasattr(self, 'replica_instance_id'):
+      cmd = util.GcloudCommand(self, 'sql', 'instances', 'delete',
+                               self.replica_instance_id, '--quiet')
+      cmd.Issue(raise_on_failure=False, timeout=DELETE_INSTANCE_TIMEOUT)
+
+    cmd = util.GcloudCommand(self, 'sql', 'instances', 'delete',
+                             self.instance_id, '--quiet', '--async')
+    cmd.Issue(raise_on_failure=False, timeout=DELETE_INSTANCE_TIMEOUT)
+
+  def _Exists(self):
+    """Returns true if the underlying resource exists.
+
+    Supplying this method is optional. If it is not implemented then the
+    default is to assume success when _Create and _Delete do not raise
+    exceptions.
+ """ + if not self.is_managed_db: + return self.unmanaged_db_exists + cmd = util.GcloudCommand(self, 'sql', 'instances', 'describe', + self.instance_id) + stdout, _, _ = cmd.Issue(raise_on_failure=False) + try: + json_output = json.loads(stdout) + return json_output['kind'] == 'sql#instance' + except: + return False + + def _IsDBInstanceReady(self, instance_id, timeout=IS_READY_TIMEOUT): + cmd = util.GcloudCommand(self, 'sql', 'instances', 'describe', + instance_id) + start_time = datetime.datetime.now() + + while True: + if (datetime.datetime.now() - start_time).seconds > timeout: + logging.exception('Timeout waiting for sql instance to be ready') + return False + stdout, _, _ = cmd.Issue(suppress_warning=True, raise_on_failure=False) + + try: + json_output = json.loads(stdout) + state = json_output['state'] + logging.info('Instance %s state: %s', instance_id, state) + if state == 'RUNNABLE': + break + except: + logging.exception('Error attempting to read stdout. Creation failure.') + return False + time.sleep(5) + + return True + + def _IsReady(self, timeout=IS_READY_TIMEOUT): + """Return true if the underlying resource is ready. + + Supplying this method is optional. Use it when a resource can exist + without being ready. If the subclass does not implement + it then it just returns true. + + Args: + timeout: how long to wait when checking if the DB is ready. + + Returns: + True if the resource was ready in time, False if the wait timed out. + """ + if not self.is_managed_db: + return self._IsReadyUnmanaged() + + if not self._IsDBInstanceReady(self.instance_id, timeout): + return False + if self.spec.high_availability and hasattr(self, 'replica_instance_id'): + if not self._IsDBInstanceReady(self.replica_instance_id, timeout): + return False + + cmd = util.GcloudCommand( + self, 'sql', 'instances', 'describe', self.instance_id) + stdout, _, _ = cmd.Issue() + json_output = json.loads(stdout) + self.endpoint = self._ParseEndpoint(json_output) + return True + + def _ParseEndpoint(self, describe_instance_json): + """Returns the IP of the resource given the metadata as JSON. + + Args: + describe_instance_json: JSON output. + Returns: + public IP address (string) + """ + if describe_instance_json is None: + return '' + try: + selflink = describe_instance_json['ipAddresses'][0]['ipAddress'] + except: + selflink = '' + logging.exception('Error attempting to read stdout. Creation failure.') + return selflink + + @vm_util.Retry(max_retries=4, poll_interval=2) + def SetManagedDatabasePassword(self): + # The hostname '%' means unrestricted access from any host. + cmd = util.GcloudCommand( + self, 'sql', 'users', 'create', self.spec.database_username, + '--host=%', '--instance={0}'.format(self.instance_id), + '--password={0}'.format(self.spec.database_password)) + _, _, _ = cmd.Issue() + + # By default the empty password is a security violation. + # Change the password to a non-default value. + default_user = DEFAULT_USERNAME[self.spec.engine] + + cmd = util.GcloudCommand( + self, 'sql', 'users', 'set-password', default_user, + '--host=%', '--instance={0}'.format(self.instance_id), + '--password={0}'.format(self.spec.database_password)) + _, _, _ = cmd.Issue() + + def _PostCreate(self): + """Creates the PKB user and sets the password. 
+    """
+    super()._PostCreate()
+
+    if self.is_managed_db:
+      self.SetManagedDatabasePassword()
+
+    self.client_vm_query_tools.InstallPackages()
+
+  def _ApplyManagedDbFlags(self):
+    cmd_string = [
+        self, 'sql', 'instances', 'patch', self.instance_id,
+        '--database-flags=%s' % ','.join(FLAGS.db_flags)
+    ]
+    cmd = util.GcloudCommand(*cmd_string)
+    _, stderr, _ = cmd.Issue()
+    if stderr:
+      # sql instance patch outputs information to stderr
+      # Reference to GCP documentation
+      # https://cloud.google.com/sdk/gcloud/reference/sql/instances/patch
+      # Example output
+      # Updated [https://sqladmin.googleapis.com/].
+      if 'Updated' in stderr:
+        return
+      raise Exception('Invalid flags: %s' % stderr)
+
+    self._Reboot()
+
+  def _Reboot(self):
+    cmd_string = [
+        self, 'sql', 'instances', 'restart', self.instance_id
+    ]
+    cmd = util.GcloudCommand(*cmd_string)
+    cmd.Issue()
+
+    if not self._IsReady():
+      raise Exception('Instance could not be set to ready after '
+                      'reboot')
+
+  @staticmethod
+  def GetDefaultEngineVersion(engine):
+    """Returns the default version of a given database engine.
+
+    Args:
+      engine (string): type of database (my_sql or postgres).
+
+    Returns:
+      (string): Default version for the given database engine.
+    """
+    if engine not in DEFAULT_ENGINE_VERSIONS:
+      raise NotImplementedError('Default engine not specified for '
+                                'engine {0}'.format(engine))
+    return DEFAULT_ENGINE_VERSIONS[engine]
+
+  @staticmethod
+  def _GetEngineVersionString(engine, version):
+    """Returns CloudSQL-specific version string for a given database engine.
+
+    Args:
+      engine: database engine
+      version: engine version
+
+    Returns:
+      (string): CloudSQL-specific name for requested engine and version.
+
+    Raises:
+      NotImplementedError on invalid engine / version combination.
+    """
+    if engine not in GCP_DATABASE_VERSION_MAPPING:
+      valid_databases = ', '.join(GCP_DATABASE_VERSION_MAPPING.keys())
+      raise NotImplementedError(
+          'Database {0} is not supported, supported '
+          'databases include {1}'.format(engine, valid_databases))
+
+    version_mapping = GCP_DATABASE_VERSION_MAPPING[engine]
+    if version not in version_mapping:
+      valid_versions = ', '.join(version_mapping.keys())
+      raise NotImplementedError(
+          'Version {0} is not supported, supported '
+          'versions include {1}'.format(version, valid_versions))
+
+    return version_mapping[version]
+
+  def _FailoverHA(self):
+    """Fail over from master to replica."""
+    cmd_string = [
+        self,
+        'sql',
+        'instances',
+        'failover',
+        self.instance_id,
+    ]
+    cmd = util.GcloudCommand(*cmd_string)
+    cmd.flags['project'] = self.project
+    # This command doesn't support the 'format' specifier.
+    del cmd.flags['format']
+    cmd.IssueRetryable()
diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_spanner.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_spanner.py
new file mode 100644
index 0000000..9f08237
--- /dev/null
+++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_spanner.py
@@ -0,0 +1,354 @@
+# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for GCP's spanner instances. + +Instances can be created and deleted. +""" + +import dataclasses +import json +import logging +from typing import Any, Dict, Optional + +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import resource +from perfkitbenchmarker.configs import freeze_restore_spec +from perfkitbenchmarker.configs import option_decoders +from perfkitbenchmarker.configs import spec +from perfkitbenchmarker.providers.gcp import util +import requests + + +FLAGS = flags.FLAGS +flags.DEFINE_string('cloud_spanner_config', + None, + 'The config for the Cloud Spanner instance. Use default ' + 'config if unset.') +flags.DEFINE_integer('cloud_spanner_nodes', None, + 'The number of nodes for the Cloud Spanner instance.') +flags.DEFINE_string('cloud_spanner_project', + None, + 'The project for the Cloud Spanner instance. Use default ' + 'project if unset.') + +# Valid GCP Spanner types: +DEFAULT_SPANNER_TYPE = 'default' + +_DEFAULT_REGION = 'us-central1' +_DEFAULT_DESCRIPTION = 'Spanner instance created by PKB.' +_DEFAULT_DDL = """ + CREATE TABLE pkb_table ( + id STRING(MAX), + field0 STRING(MAX) + ) PRIMARY KEY(id) + """ +_DEFAULT_NODES = 1 +_FROZEN_NODE_COUNT = 1 + +# Common decoder configuration option. +_NONE_OK = {'default': None, 'none_ok': True} + + +@dataclasses.dataclass +class SpannerSpec(freeze_restore_spec.FreezeRestoreSpec): + """Configurable options of a Spanner instance.""" + + # Needed for registering the spec class. + SPEC_TYPE = 'SpannerSpec' + SPEC_ATTRS = ['SERVICE_TYPE'] + SERVICE_TYPE = DEFAULT_SPANNER_TYPE + + service_type: str + name: str + description: str + database: str + ddl: str + config: str + nodes: int + project: str + + def __init__(self, + component_full_name: str, + flag_values: Optional[Dict[str, flags.FlagValues]] = None, + **kwargs): + super().__init__(component_full_name, flag_values=flag_values, **kwargs) + + @classmethod + def _GetOptionDecoderConstructions(cls): + """Gets decoder classes and constructor args for each configurable option. + + Returns: + dict. Maps option name string to a (ConfigOptionDecoder class, dict) pair. + The pair specifies a decoder class and its __init__() keyword arguments + to construct in order to decode the named option. + """ + result = super()._GetOptionDecoderConstructions() + result.update({ + 'service_type': ( + option_decoders.EnumDecoder, + { + 'valid_values': [ + DEFAULT_SPANNER_TYPE, + ], + 'default': DEFAULT_SPANNER_TYPE + }), + 'name': (option_decoders.StringDecoder, _NONE_OK), + 'database': (option_decoders.StringDecoder, _NONE_OK), + 'description': (option_decoders.StringDecoder, _NONE_OK), + 'ddl': (option_decoders.StringDecoder, _NONE_OK), + 'config': (option_decoders.StringDecoder, _NONE_OK), + 'nodes': (option_decoders.IntDecoder, _NONE_OK), + 'project': (option_decoders.StringDecoder, _NONE_OK), + }) + return result + + @classmethod + def _ApplyFlags(cls, config_values, flag_values): + """Modifies config options based on runtime flag values. + + Can be overridden by derived classes to add support for specific flags. + + Args: + config_values: dict mapping config option names to provided values. May + be modified by this function. + flag_values: flags.FlagValues. Runtime flags that may override the + provided config values. 
+ """ + super()._ApplyFlags(config_values, flag_values) + if flag_values['cloud_spanner_config'].present: + config_values['config'] = flag_values.cloud_spanner_config + if flag_values['cloud_spanner_nodes'].present: + config_values['nodes'] = flag_values.cloud_spanner_nodes + if flag_values['cloud_spanner_project'].present: + config_values['project'] = flag_values.cloud_spanner_project + + +def GetSpannerSpecClass(service_type) -> Optional[spec.BaseSpecMetaClass]: + """Return the SpannerSpec class corresponding to 'service_type'.""" + return spec.GetSpecClass(SpannerSpec, SERVICE_TYPE=service_type) + + +class GcpSpannerInstance(resource.BaseResource): + """Object representing a GCP Spanner Instance. + + The project and Cloud Spanner config must already exist. Instance and database + will be created and torn down before and after the test. + + The following parameters are overridden by the corresponding FLAGs. + project: FLAGS.cloud_spanner_project + config: FLAGS.cloud_spanner_config + nodes: FLAGS.cloud_spanner_nodes + + Attributes: + name: Name of the instance to create. + description: Description of the instance. + database: Name of the database to create + ddl: The schema of the database. + """ + # Required for registering the class. + RESOURCE_TYPE = 'GcpSpannerInstance' + REQUIRED_ATTRS = ['SERVICE_TYPE'] + SERVICE_TYPE = DEFAULT_SPANNER_TYPE + + def __init__(self, + name: Optional[str] = None, + description: Optional[str] = None, + database: Optional[str] = None, + ddl: Optional[str] = None, + config: Optional[str] = None, + nodes: Optional[int] = None, + project: Optional[str] = None, + **kwargs): + super(GcpSpannerInstance, self).__init__(**kwargs) + self.name = name or f'pkb-instance-{FLAGS.run_uri}' + self.database = database or f'pkb-database-{FLAGS.run_uri}' + self._description = description or _DEFAULT_DESCRIPTION + self._ddl = ddl or _DEFAULT_DDL + self._config = config or self._GetDefaultConfig() + self._nodes = nodes or _DEFAULT_NODES + self._end_point = None + + # Cloud Spanner may not explicitly set the following common flags. 
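+    # The project falls back from the spec value to --project and finally to
+    # the gcloud SDK's configured default project.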
+ self.project = ( + project or FLAGS.project or util.GetDefaultProject()) + self.zone = None + + def _GetDefaultConfig(self) -> str: + """Gets the config that corresponds the region used for the test.""" + try: + region = util.GetRegionFromZone( + FLAGS.zones[0] if FLAGS.zones else FLAGS.zone[0]) + except IndexError: + region = _DEFAULT_REGION + return f'regional-{region}' + + @classmethod + def FromSpec(cls, spanner_spec: SpannerSpec) -> 'GcpSpannerInstance': + """Initialize Spanner from the provided spec.""" + return cls( + name=spanner_spec.name, + description=spanner_spec.description, + database=spanner_spec.database, + ddl=spanner_spec.ddl, + config=spanner_spec.config, + nodes=spanner_spec.nodes, + project=spanner_spec.project, + enable_freeze_restore=spanner_spec.enable_freeze_restore, + create_on_restore_error=spanner_spec.create_on_restore_error, + delete_on_freeze_error=spanner_spec.delete_on_freeze_error) + + def _Create(self) -> None: + """Creates the instance, the database, and update the schema.""" + cmd = util.GcloudCommand(self, 'spanner', 'instances', 'create', self.name) + cmd.flags['description'] = self._description + cmd.flags['nodes'] = self._nodes + cmd.flags['config'] = self._config + _, _, retcode = cmd.Issue(raise_on_failure=False) + if retcode != 0: + logging.error('Create GCP Spanner instance failed.') + return + + self._UpdateLabels(util.GetDefaultTags()) + + cmd = util.GcloudCommand(self, 'spanner', 'databases', 'create', + self.database) + cmd.flags['instance'] = self.name + _, _, retcode = cmd.Issue(raise_on_failure=False) + if retcode != 0: + logging.error('Create GCP Spanner database failed.') + return + + cmd = util.GcloudCommand(self, 'spanner', 'databases', 'ddl', 'update', + self.database) + cmd.flags['instance'] = self.name + cmd.flags['ddl'] = self._ddl + _, _, retcode = cmd.Issue(raise_on_failure=False) + if retcode != 0: + logging.error('Update GCP Spanner database schema failed.') + else: + logging.info('Created GCP Spanner instance and database.') + + def _Delete(self) -> None: + """Deletes the instance.""" + cmd = util.GcloudCommand(self, 'spanner', 'instances', 'delete', + self.name) + _, _, retcode = cmd.Issue(raise_on_failure=False) + if retcode != 0: + logging.error('Delete GCP Spanner instance failed.') + else: + logging.info('Deleted GCP Spanner instance.') + + def _Exists(self, instance_only: bool = False) -> bool: + """Returns true if the instance and the database exists.""" + cmd = util.GcloudCommand(self, 'spanner', 'instances', 'describe', + self.name) + + # Do not log error or warning when checking existence. + _, _, retcode = cmd.Issue(suppress_warning=True, raise_on_failure=False) + if retcode != 0: + logging.info('Could not find GCP Spanner instance %s.', self.name) + return False + + if instance_only: + return True + + cmd = util.GcloudCommand(self, 'spanner', 'databases', 'describe', + self.database) + cmd.flags['instance'] = self.name + + # Do not log error or warning when checking existence. 
+ _, _, retcode = cmd.Issue(suppress_warning=True, raise_on_failure=False) + if retcode != 0: + logging.info('Could not find GCP Spanner database %s.', self.database) + return False + + return True + + def GetEndPoint(self) -> Optional[str]: + """Returns the end point for Cloud Spanner.""" + if self._end_point: + return self._end_point + + cmd = util.GcloudCommand(self, 'config', 'get-value', + 'api_endpoint_overrides/spanner') + stdout, _, retcode = cmd.Issue(raise_on_failure=False) + if retcode != 0: + logging.warning('Fail to retrieve cloud spanner end point.') + return None + self._end_point = json.loads(stdout) + return self._end_point + + def _SetNodes(self, nodes: int) -> None: + """Sets the number of nodes on the Spanner instance.""" + cmd = util.GcloudCommand(self, 'spanner', 'instances', 'update', self.name) + cmd.flags['nodes'] = nodes + cmd.Issue(raise_on_failure=True) + + def _Restore(self) -> None: + """See base class. + + Increases the number of nodes on the instance to the specified number. See + https://cloud.google.com/spanner/pricing for Spanner pricing info. + """ + self._SetNodes(self._nodes) + + def _Freeze(self) -> None: + """See base class. + + Lowers the number of nodes on the instance to one. Note there are + restrictions to being able to lower the number of nodes on an instance. See + https://cloud.google.com/spanner/docs/create-manage-instances. + """ + self._SetNodes(_FROZEN_NODE_COUNT) + + def _GetLabels(self) -> Dict[str, Any]: + """Gets labels from the current instance.""" + cmd = util.GcloudCommand(self, 'spanner', 'instances', 'describe', + self.name) + stdout, _, _ = cmd.Issue(raise_on_failure=True) + return json.loads(stdout).get('labels', {}) + + def _UpdateLabels(self, labels: Dict[str, Any]) -> None: + """Updates the labels of the current instance.""" + header = {'Authorization': f'Bearer {util.GetAccessToken()}'} + url = ('https://spanner.googleapis.com/v1/projects/' + f'{self.project}/instances/{self.name}') + # Keep any existing labels + tags = self._GetLabels() + tags.update(labels) + args = { + 'instance': { + 'labels': tags + }, + 'fieldMask': 'labels', + } + response = requests.patch(url, headers=header, json=args) + logging.info('Update labels: status code %s, %s', + response.status_code, response.text) + if response.status_code != 200: + raise errors.Resource.UpdateError( + f'Unable to update Spanner instance: {response.text}') + + def _UpdateTimeout(self, timeout_minutes: int) -> None: + """See base class.""" + labels = util.GetDefaultTags(timeout_minutes) + self._UpdateLabels(labels) + + +def GetSpannerClass( + service_type: str) -> Optional[resource.AutoRegisterResourceMeta]: + """Return the Spanner class associated with service_type.""" + return resource.GetResourceClass( + GcpSpannerInstance, SERVICE_TYPE=service_type) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_tpu.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_tpu.py new file mode 100644 index 0000000..21ff192 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcp_tpu.py @@ -0,0 +1,152 @@ +# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing class for GCP's cloud TPU. + +cloud TPU can be created and deleted. +""" + +import json +import logging +from absl import flags +from perfkitbenchmarker import cloud_tpu +from perfkitbenchmarker import errors +from perfkitbenchmarker import providers +from perfkitbenchmarker.providers.gcp import util + +FLAGS = flags.FLAGS +TPU_TIMEOUT = 1200 +_INSUFFICIENT_CAPACITY = 'There is no more capacity in the zone' + + +class GcpTpu(cloud_tpu.BaseTpu): + """class representing a GCP cloud TPU. + + Attributes: + name: Name of the cloud TPU to create. + project: the GCP project. + version: the TPU version. + zone: the GCP zone. + tpu_ip: the TPU IP. + """ + + CLOUD = providers.GCP + SERVICE_NAME = 'tpu' + TPU_IP = '10.240.{}.2' + DEFAULT_TPU_VERSION = '1.6' + + def __init__(self, tpu_spec): + super(GcpTpu, self).__init__(tpu_spec) + self.spec = tpu_spec + self.project = FLAGS.project or util.GetDefaultProject() + + def _Create(self): + """Create Cloud TPU.""" + cmd = util.GcloudCommand(self, 'compute', 'tpus', 'create', + self.spec.tpu_name) + cmd.flags['range'] = self.spec.tpu_cidr_range + if self.spec.tpu_accelerator_type: + cmd.flags['accelerator-type'] = self.spec.tpu_accelerator_type + if self.spec.tpu_description: + cmd.flags['description'] = self.spec.tpu_description + if self.spec.tpu_network: + cmd.flags['network'] = self.spec.tpu_network + if self.spec.tpu_tf_version: + cmd.flags['version'] = self.spec.tpu_tf_version + if self.spec.tpu_zone: + cmd.flags['zone'] = self.spec.tpu_zone + if self.spec.tpu_preemptible: + cmd.flags['preemptible'] = self.spec.tpu_preemptible + cmd.flags['project'] = self.project + _, stderr, retcode = cmd.Issue(raise_on_failure=False) + + if _INSUFFICIENT_CAPACITY in stderr: + logging.error(util.STOCKOUT_MESSAGE) + raise errors.Benchmarks.InsufficientCapacityCloudFailure( + util.STOCKOUT_MESSAGE) + + if retcode != 0: + logging.error('Create GCP cloud TPU failed.') + + def _Delete(self): + """Deletes the cloud TPU.""" + cmd = util.GcloudCommand(self, 'compute', 'tpus', 'delete', + self.spec.tpu_name) + if self.spec.tpu_zone: + cmd.flags['zone'] = self.spec.tpu_zone + cmd.flags['project'] = self.project + _, _, retcode = cmd.Issue(timeout=TPU_TIMEOUT, raise_on_failure=False) + if retcode != 0: + logging.error('Delete GCP cloud TPU failed.') + else: + logging.info('Deleted GCP cloud TPU.') + + def _GetTpuDescription(self): + """Gets the cloud TPU description.""" + cmd = util.GcloudCommand(self, 'compute', 'tpus', 'describe', + self.spec.tpu_name) + if self.spec.tpu_zone: + cmd.flags['zone'] = self.spec.tpu_zone + cmd.flags['project'] = self.project + stdout, _, retcode = cmd.Issue(raise_on_failure=False) + if retcode != 0: + logging.info('Could not found GCP cloud TPU %s.', + self.spec.tpu_name) + return stdout and json.loads(stdout), retcode + + def _Exists(self): + """Returns true if the cloud TPU exists.""" + _, retcode = self._GetTpuDescription() + return retcode == 0 + + def GetName(self): + """Gets the name of the cloud TPU.""" + return self.spec.tpu_name + + def GetMasterGrpcAddress(self): + """Gets the grpc address of the 0th 
NetworkEndpoint.""" + master_network_endpoint = self._GetTpuDescription()[0]['networkEndpoints'][ + 0] + + return 'grpc://{ip_address}:{port}'.format( + ip_address=master_network_endpoint['ipAddress'], + port=master_network_endpoint['port']) + + def GetNumShards(self): + """Gets the number of TPU shards.""" + num_tpus = len(self._GetTpuDescription()[0]['networkEndpoints']) + return num_tpus * FLAGS.tpu_cores_per_donut + + def GetZone(self): + """Gets the TPU zone.""" + return self.spec.tpu_zone + + def GetAcceleratorType(self): + """Gets the TPU accelerator type.""" + return self.spec.tpu_accelerator_type + + def GetResourceMetadata(self): + """Returns the metadata associated with the resource. + + All keys will be prefaced with tpu before + being published (done in publisher.py). + + Returns: + metadata: dict of GCP cloud TPU metadata. + """ + metadata = super(GcpTpu, self).GetResourceMetadata() + metadata.update({ + 'project': self.project, + 'cloud': self.CLOUD + }) + return metadata diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcs.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcs.py new file mode 100644 index 0000000..06aec50 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcs.py @@ -0,0 +1,371 @@ +# Copyright 2016 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Contains classes/functions related to Google Cloud Storage.""" + +import logging +import ntpath +import os +import posixpath +import re +from typing import List as TList + +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import linux_packages +from perfkitbenchmarker import object_storage_service +from perfkitbenchmarker import os_types +from perfkitbenchmarker import providers +from perfkitbenchmarker import temp_dir +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.providers.gcp import util + +_DEFAULT_GCP_SERVICE_KEY_FILE = 'gcp_credentials.json' +DEFAULT_GCP_REGION = 'us-central1' +GCLOUD_CONFIG_PATH = '.config/gcloud' +GCS_CLIENT_PYTHON = 'python' +GCS_CLIENT_BOTO = 'boto' +READER = 'objectViewer' +WRITER = 'objectCreator' + +flags.DEFINE_string('google_cloud_sdk_version', None, + 'Use a particular version of the Google Cloud SDK, e.g.: ' + '103.0.0') +flags.DEFINE_enum('gcs_client', GCS_CLIENT_BOTO, + [GCS_CLIENT_PYTHON, GCS_CLIENT_BOTO], + 'The GCS client library to use (default boto).') + +FLAGS = flags.FLAGS + + +class GoogleCloudStorageService(object_storage_service.ObjectStorageService): + """Interface to Google Cloud Storage.""" + + STORAGE_NAME = providers.GCP + + location: str + + def PrepareService(self, location): + self.location = location or DEFAULT_GCP_REGION + + def MakeBucket(self, bucket, raise_on_failure=True): + command = ['gsutil', 'mb'] + if self.location: + command.extend(['-l', self.location]) + if self.location and '-' in self.location: + # regional buckets + command.extend(['-c', 'regional']) + elif FLAGS.object_storage_storage_class is not None: + command.extend(['-c', FLAGS.object_storage_storage_class]) + if FLAGS.project: + command.extend(['-p', FLAGS.project]) + command.extend(['gs://%s' % bucket]) + + _, stderr, ret_code = vm_util.IssueCommand(command, raise_on_failure=False) + if ret_code and raise_on_failure: + raise errors.Benchmarks.BucketCreationError(stderr) + + command = ['gsutil', 'label', 'ch'] + for key, value in util.GetDefaultTags().items(): + command.extend(['-l', f'{key}:{value}']) + command.extend([f'gs://{bucket}']) + _, stderr, ret_code = vm_util.IssueCommand(command, raise_on_failure=False) + if ret_code and raise_on_failure: + raise errors.Benchmarks.BucketCreationError(stderr) + + def Copy(self, src_url, dst_url, recursive=False): + """See base class.""" + cmd = ['gsutil', 'cp'] + if recursive: + cmd += ['-r'] + cmd += [src_url, dst_url] + vm_util.IssueCommand(cmd) + + def CopyToBucket(self, src_path, bucket, object_path): + """See base class.""" + dst_url = self.MakeRemoteCliDownloadUrl(bucket, object_path) + vm_util.IssueCommand(['gsutil', 'cp', src_path, dst_url]) + + def MakeRemoteCliDownloadUrl(self, bucket, object_path): + """See base class.""" + path = posixpath.join(bucket, object_path) + return 'gs://' + path + + def GenerateCliDownloadFileCommand(self, src_url, local_path): + """See base class.""" + return 'gsutil cp "%s" "%s"' % (src_url, local_path) + + def List(self, bucket): + """See base class.""" + # Full URI is required by gsutil. + if not bucket.startswith('gs://'): + bucket = 'gs://' + bucket + stdout, _, _ = vm_util.IssueCommand(['gsutil', 'ls', bucket]) + return stdout + + def ListTopLevelSubfolders(self, bucket): + """Lists the top level folders (not files) in a bucket. + + Each folder is returned as its full uri, eg. "gs://pkbtpch1/customer/", so + just the folder name is extracted. 
When there's more than one, splitting + on the newline returns a final blank row, so blank values are skipped. + + Args: + bucket: Name of the bucket to list the top level subfolders of. + + Returns: + A list of top level subfolder names. Can be empty if there are no folders. + """ + return [ + obj.split('/')[-2].strip() + for obj in self.List(bucket).split('\n') + if obj and obj.endswith('/') + ] + + @vm_util.Retry() + def DeleteBucket(self, bucket): + # We want to retry rm and rb together because it's possible that + # we issue rm followed by rb, but then rb fails because the + # metadata store isn't consistent and the server that handles the + # rb thinks there are still objects in the bucket. It's also + # possible for rm to fail because the metadata store is + # inconsistent and rm doesn't find all objects, so can't delete + # them all. + self.EmptyBucket(bucket) + + def _bucket_not_found(stdout, stderr, retcode): + del stdout # unused + + return retcode and 'BucketNotFoundException' in stderr + + vm_util.IssueCommand(['gsutil', 'rb', 'gs://%s' % bucket], + suppress_failure=_bucket_not_found) + + def EmptyBucket(self, bucket): + # Ignore failures here and retry in DeleteBucket. See more comments there. + vm_util.IssueCommand( + ['gsutil', '-m', 'rm', '-r', + 'gs://%s/*' % bucket], raise_on_failure=False) + + def AclBucket(self, entity: str, roles: TList[str], bucket: str): + """Updates access control lists. + + Args: + entity: the user or group to grant permission. + roles: the IAM roles to be granted. + bucket: the name of the bucket to change + """ + vm_util.IssueCommand([ + 'gsutil', 'iam', 'ch', f"{entity}:{','.join(roles)}", f'gs://{bucket}' + ]) + + def MakeBucketPubliclyReadable(self, bucket, also_make_writable=False): + """See base class.""" + roles = [READER] + logging.warning('Making bucket %s publicly readable!', bucket) + if also_make_writable: + roles.append(WRITER) + logging.warning('Making bucket %s publicly writable!', bucket) + self.AclBucket('allUsers', roles, bucket) + + # Use JSON API over XML for URLs + def GetDownloadUrl(self, bucket, object_name, use_https=True): + """See base class.""" + # https://cloud.google.com/storage/docs/downloading-objects + scheme = 'https' if use_https else 'http' + return (f'{scheme}://storage.googleapis.com/storage/v1/' + f'b/{bucket}/o/{object_name}?alt=media') + + def GetUploadUrl(self, bucket, object_name, use_https=True): + """See base class.""" + # https://cloud.google.com/storage/docs/uploading-objects + # Note I don't believe GCS supports upload via HTTP. + scheme = 'https' if use_https else 'http' + return (f'{scheme}://storage.googleapis.com/upload/storage/v1/' + f'b/{bucket}/o?uploadType=media&name={object_name}') + + UPLOAD_HTTP_METHOD = 'POST' + + @classmethod + def AcquireWritePermissionsWindows(cls, vm): + """Prepare boto file on a remote Windows instance. + + If the boto file specifies a service key file, copy that service key file to + the VM and modify the .boto file on the VM to point to the copied file. + + Args: + vm: gce virtual machine object. 
+ """ + boto_src = object_storage_service.FindBotoFile() + boto_des = object_storage_service.DEFAULT_BOTO_LOCATION_USER + stdout, _ = vm.RemoteCommand(f'Test-Path {boto_des}') + if 'True' in stdout: + return + with open(boto_src) as f: + boto_contents = f.read() + match = re.search(r'gs_service_key_file\s*=\s*(.*)', boto_contents) + if match: + service_key_src = match.group(1) + service_key_des = ntpath.join(vm.home_dir, + posixpath.basename(service_key_src)) + boto_src = cls._PrepareGcsServiceKey(vm, boto_src, service_key_src, + service_key_des) + vm.PushFile(boto_src, boto_des) + + @classmethod + def AcquireWritePermissionsLinux(cls, vm): + """Prepare boto file on a remote Linux instance. + + If the boto file specifies a service key file, copy that service key file to + the VM and modify the .boto file on the VM to point to the copied file. + + Args: + vm: gce virtual machine object. + """ + vm_pwd, _ = vm.RemoteCommand('pwd') + home_dir = vm_pwd.strip() + boto_src = object_storage_service.FindBotoFile() + boto_des = object_storage_service.DEFAULT_BOTO_LOCATION_USER + if vm.TryRemoteCommand(f'test -f {boto_des}'): + return + with open(boto_src) as f: + boto_contents = f.read() + match = re.search(r'gs_service_key_file\s*=\s*(.*)', boto_contents) + if match: + service_key_src = match.group(1) + service_key_des = posixpath.join(home_dir, + posixpath.basename(service_key_src)) + boto_src = cls._PrepareGcsServiceKey(vm, boto_src, service_key_src, + service_key_des) + vm.PushFile(boto_src, boto_des) + + @classmethod + def _PrepareGcsServiceKey(cls, vm, boto_src, service_key_src, + service_key_des): + """Copy GS service key file to remote VM and update key path in boto file. + + Args: + vm: gce virtual machine object. + boto_src: string, the boto file path in local machine. + service_key_src: string, the gs service key file in local machine. + service_key_des: string, the gs service key file in remote VM. + + Returns: + The updated boto file path. + """ + vm.PushFile(service_key_src, service_key_des) + key = 'gs_service_key_file' + with open(boto_src, 'r') as src_file: + boto_path = os.path.join(temp_dir.GetRunDirPath(), + posixpath.basename(boto_src)) + with open(boto_path, 'w') as des_file: + for line in src_file: + if line.startswith(f'{key} = '): + des_file.write(f'{key} = {service_key_des}\n') + else: + des_file.write(line) + return boto_path + + def PrepareVM(self, vm): + vm.Install('wget') + # Unfortunately there isn't one URL scheme that works for both + # versioned archives and "always get the latest version". + if FLAGS.google_cloud_sdk_version is not None: + sdk_file = ('google-cloud-sdk-%s-linux-x86_64.tar.gz' % + FLAGS.google_cloud_sdk_version) + sdk_url = 'https://storage.googleapis.com/cloud-sdk-release/' + sdk_file + else: + sdk_file = 'google-cloud-sdk.tar.gz' + sdk_url = 'https://dl.google.com/dl/cloudsdk/release/' + sdk_file + vm.RemoteCommand('wget ' + sdk_url) + vm.RemoteCommand('tar xvf ' + sdk_file) + # Versioned and unversioned archives both unzip to a folder called + # 'google-cloud-sdk'. 
+ vm.RemoteCommand('bash ./google-cloud-sdk/install.sh ' + '--disable-installation-options ' + '--usage-report=false ' + '--rc-path=.bash_profile ' + '--path-update=true ' + '--bash-completion=true') + vm.Install('google_cloud_storage') + + vm.RemoteCommand('mkdir -p .config') + + if FLAGS.gcs_client == GCS_CLIENT_BOTO: + if vm.BASE_OS_TYPE == os_types.WINDOWS: + self.AcquireWritePermissionsWindows(vm) + else: + self.AcquireWritePermissionsLinux(vm) + vm.Install('gcs_boto_plugin') + + vm.gsutil_path, _ = vm.RemoteCommand('which gsutil', login_shell=True) + vm.gsutil_path = vm.gsutil_path.split()[0] + + # Detect if we need to install crcmod for gcp. + # See "gsutil help crc" for details. + raw_result, _ = vm.RemoteCommand('%s version -l' % vm.gsutil_path) + logging.info('gsutil version -l raw result is %s', raw_result) + search_string = 'compiled crcmod: True' + result_string = re.findall(search_string, raw_result) + if not result_string: + logging.info('compiled crcmod is not available, installing now...') + try: + # Try uninstall first just in case there is a pure python version of + # crcmod on the system already, this is required by gsutil doc: + # https://cloud.google.com/storage/docs/ + # gsutil/addlhelp/CRC32CandInstallingcrcmod + vm.Uninstall('crcmod') + except errors.VirtualMachine.RemoteCommandError: + logging.info('pip uninstall crcmod failed, could be normal if crcmod ' + 'is not available at all.') + vm.Install('crcmod') + vm.installed_crcmod = True + else: + logging.info('compiled crcmod is available, not installing again.') + vm.installed_crcmod = False + + def CleanupVM(self, vm): + vm.RemoveFile('google-cloud-sdk') + vm.RemoveFile(GCLOUD_CONFIG_PATH) + if FLAGS.gcs_client == GCS_CLIENT_BOTO: + vm.RemoveFile(object_storage_service.DEFAULT_BOTO_LOCATION_USER) + vm.Uninstall('gcs_boto_plugin') + + def CLIUploadDirectory(self, vm, directory, files, bucket): + return vm.RemoteCommand( + 'time %s -m cp %s/* gs://%s/' % ( + vm.gsutil_path, directory, bucket)) + + def CLIDownloadBucket(self, vm, bucket, objects, dest): + return vm.RemoteCommand( + 'time %s -m cp gs://%s/* %s' % (vm.gsutil_path, bucket, dest)) + + def Metadata(self, vm): + metadata = { + 'pkb_installed_crcmod': vm.installed_crcmod, + 'gcs_client': str(FLAGS.gcs_client) + } + if FLAGS.gcs_client == GCS_CLIENT_BOTO: + metadata.update({ + object_storage_service.BOTO_LIB_VERSION: + linux_packages.GetPipPackageVersion(vm, 'boto') + }) + return metadata + + def APIScriptArgs(self): + return ['--gcs_client=' + str(FLAGS.gcs_client)] + + @classmethod + def APIScriptFiles(cls): + return ['gcs.py', 'gcs_boto.py'] diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcsfuse_disk.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcsfuse_disk.py new file mode 100644 index 0000000..24e87e6 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/gcsfuse_disk.py @@ -0,0 +1,34 @@ +"""GCS FUSE based disk implementation.""" + +from absl import flags +from perfkitbenchmarker import disk + +FLAGS = flags.FLAGS + +DEFAULT_MOUNT_OPTIONS = [ + 'allow_other', + 'dir_mode=777', + 'file_mode=777', + 'implicit_dirs', +] + + +class GcsFuseDisk(disk.MountableDisk): + """GCS FUSE based disk implementation. + + Mount the bucket specified by flag gcsfuse_bucket at the mount_point. If not + specified, all the buckets are mounted as subdirectories. 
+ """ + + def Attach(self, vm): + vm.Install('gcsfuse') + + def Mount(self, vm): + vm.RemoteCommand( + f'sudo mkdir -p {self.mount_point} && ' + f'sudo chmod a+w {self.mount_point}') + + opts = ','.join(DEFAULT_MOUNT_OPTIONS + FLAGS.mount_options) + bucket = FLAGS.gcsfuse_bucket + target = self.mount_point + vm.RemoteCommand(f'sudo mount -t gcsfuse -o {opts} {bucket} {target}') diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py new file mode 100644 index 0000000..f867c5a --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py @@ -0,0 +1,301 @@ +# Copyright 2018 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains classes/functions related to GKE (Google Kubernetes Engine).""" + +import json +import logging +import math +import os +import re + +from absl import flags +from perfkitbenchmarker import container_service +from perfkitbenchmarker import data +from perfkitbenchmarker import errors +from perfkitbenchmarker import kubernetes_helper +from perfkitbenchmarker import providers +from perfkitbenchmarker.providers.gcp import gce_virtual_machine +from perfkitbenchmarker.providers.gcp import util +import six + +FLAGS = flags.FLAGS + +NVIDIA_DRIVER_SETUP_DAEMON_SET_SCRIPT = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml' +NVIDIA_UNRESTRICTED_PERMISSIONS_DAEMON_SET = 'nvidia_unrestricted_permissions_daemonset.yml' +DEFAULT_RELEASE_CHANNEL = 'regular' +SERVICE_ACCOUNT_PATTERN = r'.*((? int: + # Defaults are used for pod and services CIDR ranges: + # https://cloud.google.com/kubernetes-engine/docs/concepts/alias-ips#cluster_sizing_secondary_range_svcs) + # Each node requires a /24 CIDR range for pods + # The cluster requires a /20 CIDR range for services + # So 2^(32 - nodes) - 2^(32 - 20) >= 2^(32 - 24) * CIDR + # OR CIDR <= 32 - log2(2^8 * nodes + 2^12) + cidr_size = int(32 - math.log2((nodes << 8) + (1 << 12))) + # /19 is narrowest CIDR range GKE supports + return min(cidr_size, 19) + + +class GoogleContainerRegistry(container_service.BaseContainerRegistry): + """Class for building and storing container images on GCP.""" + + CLOUD = providers.GCP + + def __init__(self, registry_spec): + super(GoogleContainerRegistry, self).__init__(registry_spec) + self.project = self.project or util.GetDefaultProject() + + def GetFullRegistryTag(self, image): + """Gets the full tag of the image.""" + region = util.GetMultiRegionFromRegion(util.GetRegionFromZone(self.zone)) + hostname = '{region}.gcr.io'.format(region=region) + full_tag = '{hostname}/{project}/{name}'.format( + hostname=hostname, project=self.project, name=image) + return full_tag + + def Login(self): + """Configure docker to be able to push to remote repo.""" + # TODO(pclay): Don't edit user's docker config. It is idempotent. 
+ cmd = util.GcloudCommand(self, 'auth', 'configure-docker') + del cmd.flags['zone'] + cmd.Issue() + + def RemoteBuild(self, image): + """Build the image remotely.""" + full_tag = self.GetFullRegistryTag(image.name) + build_cmd = util.GcloudCommand(self, 'builds', 'submit', '--tag', full_tag, + image.directory) + del build_cmd.flags['zone'] + build_cmd.Issue() + + +class GkeCluster(container_service.KubernetesCluster): + """Class representing a Google Kubernetes Engine cluster.""" + + CLOUD = providers.GCP + + def __init__(self, spec): + super(GkeCluster, self).__init__(spec) + self.project = spec.vm_spec.project + self.cluster_version = ( + FLAGS.container_cluster_version or DEFAULT_RELEASE_CHANNEL) + self.use_application_default_credentials = True + self.zones = self.zone and self.zone.split(',') + if not self.zones: + raise errors.Config.MissingOption( + 'container_cluster.vm_spec.GCP.zone is required.') + elif len(self.zones) == 1 and util.IsRegion(self.zone): + self.region = self.zone + self.zones = [] + logging.info("Interpreting zone '%s' as a region", self.zone) + else: + self.region = util.GetRegionFromZone(self.zones[0]) + + def GetResourceMetadata(self): + """Returns a dict containing metadata about the cluster. + + Returns: + dict mapping string property key to value. + """ + result = super(GkeCluster, self).GetResourceMetadata() + result['project'] = self.project + if self.cluster_version in RELEASE_CHANNELS: + result['gke_release_channel'] = self.cluster_version + + result['boot_disk_type'] = self.vm_config.boot_disk_type + result['boot_disk_size'] = self.vm_config.boot_disk_size + if self.vm_config.max_local_disks: + result['gce_local_ssd_count'] = self.vm_config.max_local_disks + # TODO(pclay): support NVME when it leaves alpha + # Also consider moving FLAGS.gce_ssd_interface into the vm_spec. + result['gce_local_ssd_interface'] = gce_virtual_machine.SCSI + return result + + def _GcloudCommand(self, *args, **kwargs): + """Fix zone and region.""" + cmd = util.GcloudCommand(self, *args, **kwargs) + if len(self.zones) != 1: + del cmd.flags['zone'] + cmd.flags['region'] = self.region + return cmd + + def _Create(self): + """Creates the cluster.""" + cmd = self._GcloudCommand('container', 'clusters', 'create', self.name) + + self._AddNodeParamsToCmd(self.vm_config, self.num_nodes, + container_service.DEFAULT_NODEPOOL, cmd) + + if self.cluster_version in RELEASE_CHANNELS: + cmd.flags['release-channel'] = self.cluster_version + else: + cmd.flags['cluster-version'] = self.cluster_version + if FLAGS.gke_enable_alpha: + cmd.args.append('--enable-kubernetes-alpha') + cmd.args.append('--no-enable-autorepair') + cmd.args.append('--no-enable-autoupgrade') + + user = util.GetDefaultUser() + if FLAGS.gcp_service_account: + cmd.flags['service-account'] = FLAGS.gcp_service_account + # Matches service accounts that either definitely belongs to this project or + # are a GCP managed service account like the GCE default service account, + # which we can't tell to which project they belong. 
+ elif re.match(SERVICE_ACCOUNT_PATTERN, user): + logging.info('Re-using configured service-account for GKE Cluster: %s', + user) + cmd.flags['service-account'] = user + self.use_application_default_credentials = False + else: + logging.info('Using default GCE service account for GKE cluster') + cmd.flags['scopes'] = 'cloud-platform' + + if self.min_nodes != self.num_nodes or self.max_nodes != self.num_nodes: + cmd.args.append('--enable-autoscaling') + cmd.flags['max-nodes'] = self.max_nodes + cmd.flags['min-nodes'] = self.min_nodes + + cmd.flags['cluster-ipv4-cidr'] = f'/{_CalculateCidrSize(self.max_nodes)}' + + if self.vm_config.network: + cmd.flags['network'] = self.vm_config.network.network_resource.name + + cmd.flags['metadata'] = util.MakeFormattedDefaultTags() + cmd.flags['labels'] = util.MakeFormattedDefaultTags() + self._IssueResourceCreationCommand(cmd) + + self._CreateNodePools() + + def _CreateNodePools(self): + """Creates additional nodepools for the cluster, if applicable.""" + for name, nodepool in six.iteritems(self.nodepools): + cmd = self._GcloudCommand('container', 'node-pools', 'create', name, + '--cluster', self.name) + self._AddNodeParamsToCmd(nodepool.vm_config, nodepool.vm_count, name, cmd) + self._IssueResourceCreationCommand(cmd) + + def _IssueResourceCreationCommand(self, cmd): + """Issues a command to gcloud to create resources.""" + + # This command needs a long timeout due to the many minutes it + # can take to provision a large GPU-accelerated GKE cluster. + _, stderr, retcode = cmd.Issue(timeout=1200, raise_on_failure=False) + if retcode: + # Log specific type of failure, if known. + if 'ZONE_RESOURCE_POOL_EXHAUSTED' in stderr: + logging.exception('Container resources exhausted: %s', stderr) + raise errors.Benchmarks.InsufficientCapacityCloudFailure( + 'Container resources exhausted in zone %s: %s' % + (self.zone, stderr)) + util.CheckGcloudResponseKnownFailures(stderr, retcode) + raise errors.Resource.CreationError(stderr) + + def _AddNodeParamsToCmd(self, vm_config, num_nodes, name, cmd): + """Modifies cmd to include node specific command arguments.""" + + if vm_config.gpu_count: + cmd.flags['accelerator'] = ( + gce_virtual_machine.GenerateAcceleratorSpecString( + vm_config.gpu_type, + vm_config.gpu_count)) + if vm_config.min_cpu_platform: + cmd.flags['min-cpu-platform'] = vm_config.min_cpu_platform + + if vm_config.threads_per_core: + # TODO(user): Remove when threads-per-core is available in GA + cmd.use_alpha_gcloud = True + cmd.flags['threads-per-core'] = vm_config.threads_per_core + + if vm_config.boot_disk_size: + cmd.flags['disk-size'] = vm_config.boot_disk_size + if vm_config.boot_disk_type: + cmd.flags['disk-type'] = vm_config.boot_disk_type + if vm_config.max_local_disks: + # TODO(pclay): Switch to local-ssd-volumes which support NVME when it + # leaves alpha. 
See
+      # https://cloud.google.com/sdk/gcloud/reference/alpha/container/clusters/create
+      cmd.flags['local-ssd-count'] = vm_config.max_local_disks
+
+    cmd.flags['num-nodes'] = num_nodes
+    # vm_config.zone may be a comma-separated list of zones.
+    if vm_config.zone:
+      cmd.flags['node-locations'] = vm_config.zone
+
+    if vm_config.machine_type is None:
+      cmd.flags['machine-type'] = 'custom-{0}-{1}'.format(
+          vm_config.cpus,
+          vm_config.memory_mib)
+    else:
+      cmd.flags['machine-type'] = vm_config.machine_type
+
+    cmd.flags['node-labels'] = f'pkb_nodepool={name}'
+
+  def _PostCreate(self):
+    """Acquire cluster authentication."""
+    super(GkeCluster, self)._PostCreate()
+    cmd = self._GcloudCommand('container', 'clusters', 'get-credentials',
+                              self.name)
+    env = os.environ.copy()
+    env['KUBECONFIG'] = FLAGS.kubeconfig
+    cmd.IssueRetryable(env=env)
+
+    if self.vm_config.gpu_count:
+      kubernetes_helper.CreateFromFile(NVIDIA_DRIVER_SETUP_DAEMON_SET_SCRIPT)
+      kubernetes_helper.CreateFromFile(
+          data.ResourcePath(NVIDIA_UNRESTRICTED_PERMISSIONS_DAEMON_SET))
+
+    # GKE does not wait for kube-dns by default
+    logging.info('Waiting for kube-dns')
+    self.WaitForResource(
+        'deployment/kube-dns',
+        condition_name='Available',
+        namespace='kube-system')
+
+  def _GetInstanceGroups(self):
+    cmd = self._GcloudCommand('container', 'node-pools', 'list')
+    cmd.flags['cluster'] = self.name
+    stdout, _, _ = cmd.Issue()
+    json_output = json.loads(stdout)
+    instance_groups = []
+    for node_pool in json_output:
+      for group_url in node_pool['instanceGroupUrls']:
+        instance_groups.append(group_url.split('/')[-1])  # last url part
+    return instance_groups
+
+  def _IsDeleting(self):
+    cmd = self._GcloudCommand('container', 'clusters', 'describe', self.name)
+    stdout, _, _ = cmd.Issue(raise_on_failure=False)
+    return True if stdout else False
+
+  def _Delete(self):
+    """Deletes the cluster."""
+    super()._Delete()
+    cmd = self._GcloudCommand('container', 'clusters', 'delete', self.name)
+    cmd.args.append('--async')
+    cmd.Issue(raise_on_failure=False)
+
+  def _Exists(self):
+    """Returns True if the cluster exists."""
+    cmd = self._GcloudCommand('container', 'clusters', 'describe', self.name)
+    _, _, retcode = cmd.Issue(suppress_warning=True, raise_on_failure=False)
+    return retcode == 0
+
+  def GetDefaultStorageClass(self) -> str:
+    """Get the default storage class for the provider."""
+    # https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/gce-pd-csi-driver
+    # PD-SSD
+    return 'premium-rwo'
diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/memcache.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/memcache.py
new file mode 100644
index 0000000..171e3b5
--- /dev/null
+++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/memcache.py
@@ -0,0 +1,39 @@
+# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
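For reference, `_GetInstanceGroups` above just peels the last path segment off each `instanceGroupUrls` entry returned by `gcloud container node-pools list`. A sketch with a made-up, trimmed-down response:

```python
import json

# Hypothetical JSON output of `gcloud container node-pools list --format json`.
stdout = json.dumps([
    {'name': 'default', 'instanceGroupUrls': [
        'https://www.googleapis.com/compute/v1/projects/p/zones/us-central1-a/'
        'instanceGroupManagers/gke-pkb-default-pool-12345-grp',
    ]},
])

instance_groups = []
for node_pool in json.loads(stdout):
  for group_url in node_pool['instanceGroupUrls']:
    instance_groups.append(group_url.split('/')[-1])  # last URL part

assert instance_groups == ['gke-pkb-default-pool-12345-grp']
```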
+ +from perfkitbenchmarker import providers +from perfkitbenchmarker.memcache_service import MemcacheService + + +class MemcacheService(MemcacheService): + + CLOUD = providers.GCP + + def __init__(self): + pass + + def Create(self): + raise NotImplementedError + + def Destroy(self): + raise NotImplementedError + + def Flush(self): + raise NotImplementedError + + def GetHosts(self): + raise NotImplementedError + + def GetMetadata(self): + raise NotImplementedError diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/provider_info.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/provider_info.py new file mode 100644 index 0000000..9551c00 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/provider_info.py @@ -0,0 +1,24 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Provider info for Google Cloud Platform.""" + +from perfkitbenchmarker import provider_info +from perfkitbenchmarker import providers + + +class GCPProviderInfo(provider_info.BaseProviderInfo): + + UNSUPPORTED_BENCHMARKS = [] + CLOUD = providers.GCP diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/util.py b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/util.py new file mode 100644 index 0000000..37cea08 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/gcp/util.py @@ -0,0 +1,511 @@ +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utilities for working with Google Cloud Platform resources.""" + +import collections +import functools +import json +import logging +import re +from typing import Set +from absl import flags +from perfkitbenchmarker import context +from perfkitbenchmarker import errors +from perfkitbenchmarker import virtual_machine +from perfkitbenchmarker import vm_util +import six + +FLAGS = flags.FLAGS + +RATE_LIMITED_MESSAGE = 'Rate Limit Exceeded' +# regex to check API limits when tagging resources +# matches a string like: +# ERROR: (gcloud.compute.disks.add-labels) PERMISSION_DENIED: Quota exceeded +# for quota group 'ReadGroup' and limit 'Read requests per 100 seconds' of +# service 'compute.googleapis.com' for consumer 'project_number:012345678901'. +TAGGING_RATE_LIMITED_REGEX = re.compile( + "Quota exceeded for quota group '.*?' and limit " + "'.*? 
per.*?seconds' of service 'compute.googleapis.com'")
+RATE_LIMITED_MAX_RETRIES = 10
+# 200s is chosen because 1) quota is measured in 100s intervals and 2) fuzzing
+# causes a random number between 100 and this to be chosen.
+RATE_LIMITED_MAX_POLLING_INTERVAL = 200
+# This must be set. Otherwise, calling Issue() will fail in util_test.py.
+RATE_LIMITED_FUZZ = 0.5
+RATE_LIMITED_TIMEOUT = 1200
+STOCKOUT_MESSAGE = ('Creation failed due to insufficient capacity indicating a '
+                    'potential stockout scenario.')
+
+
+@functools.lru_cache()
+def GetDefaultProject():
+  """Get the default project."""
+  cmd = [FLAGS.gcloud_path, 'config', 'list', '--format=json']
+  stdout, _, _ = vm_util.IssueCommand(cmd)
+  result = json.loads(stdout)
+  return result['core']['project']
+
+
+@functools.lru_cache()
+def GetDefaultUser():
+  """Get the default user."""
+  cmd = [FLAGS.gcloud_path, 'config', 'list', '--format=json']
+  stdout, _, _ = vm_util.IssueCommand(cmd)
+  result = json.loads(stdout)
+  return result['core']['account']
+
+
+def GetRegionFromZone(zone):
+  """Returns the region name from a fully-qualified zone name.
+
+  Each fully-qualified GCP zone name is formatted as <region>-<zone> where, for
+  example, each region looks like us-central1, europe-west1, or asia-east1.
+  Therefore, we pull the first two parts of the fully qualified zone name
+  delimited by a dash and assume the rest is the name of the zone. See
+  https://cloud.google.com/compute/docs/regions-zones for more information.
+
+  Args:
+    zone: The fully-qualified name of a GCP zone.
+  """
+  parts = zone.split('-')
+  return '-'.join(parts[:2])
+
+
+def IsRegion(location: str) -> bool:
+  """Determine if a zone or region is a region."""
+  return bool(re.fullmatch(r'[a-z]+-[a-z]+[0-9]', location))
+
+
+def GetAllZones() -> Set[str]:
+  """Gets a list of valid zones."""
+  cmd = GcloudCommand(None, 'compute', 'zones', 'list')
+  cmd.flags = {
+      'format': 'value(name)',
+  }
+  stdout, _, _ = cmd.Issue()
+  return set(stdout.splitlines())
+
+
+def GetAllRegions() -> Set[str]:
+  """Gets a list of valid regions."""
+  cmd = GcloudCommand(None, 'compute', 'regions', 'list')
+  cmd.flags = {
+      'format': 'value(name)',
+  }
+  stdout, _, _ = cmd.Issue()
+  return set(stdout.splitlines())
+
+
+def GetZonesInRegion(region) -> Set[str]:
+  """Gets a list of zones for the given region."""
+  cmd = GcloudCommand(None, 'compute', 'zones', 'list')
+  cmd.flags = {
+      'filter': f"name~'{region}'",
+      'format': 'value(name)',
+  }
+  stdout, _, _ = cmd.Issue()
+  return set(stdout.splitlines())
+
+
+def GetZonesFromMachineType() -> Set[str]:
+  """Gets a list of zones for the given machine type."""
+  cmd = GcloudCommand(None, 'compute', 'machine-types', 'list')
+  cmd.flags = {
+      'filter': f"name~'{FLAGS.machine_type}'",
+      'format': 'value(zone)'
+  }
+  stdout, _, _ = cmd.Issue()
+  return set(stdout.splitlines())
+
+
+def GetGeoFromRegion(region: str) -> str:
+  """Gets valid geo from the region, i.e.
region us-central1 returns us.""" + return region.split('-')[0] + + +def GetRegionsInGeo(geo: str) -> Set[str]: + """Gets valid regions in the geo.""" + return {region for region in GetAllRegions() if region.startswith(geo)} + + +def GetMultiRegionFromRegion(region): + """Gets the closest multi-region location to the region.""" + if (region.startswith('us') or + region.startswith('northamerica') or + region.startswith('southamerica')): + return 'us' + elif region.startswith('europe'): + return 'eu' + elif region.startswith('asia') or region.startswith('australia'): + return 'asia' + else: + raise Exception('Unknown region "%s".' % region) + + +def IssueCommandFunction(cmd, **kwargs): + """Use vm_util to issue the given command. + + Args: + cmd: the gcloud command to run + **kwargs: additional arguments for the gcloud command + + Returns: + stdout, stderr, retcode tuple from running the command + """ + return vm_util.IssueCommand(cmd.GetCommand(), **kwargs) + + +def IssueRetryableCommandFunction(cmd, **kwargs): + """Use vm_util to issue the given retryable command. + + Args: + cmd: the gcloud command to run + **kwargs: additional arguments for the gcloud command + + Returns: + stdout, stderr, tuple from running the command + """ + return vm_util.IssueRetryableCommand(cmd.GetCommand(), **kwargs) + + +# The function that is used to issue a command, when given a GcloudCommand +# object and additional arguments. Can be overridden. +_issue_command_function = IssueCommandFunction + +# The function that is used to issue a retryable command, when given a +# GcloudCommand object and additional arguments. Can be overridden. +_issue_retryable_command_function = IssueRetryableCommandFunction + + +def SetIssueCommandFunction(func): + """Set the issue command function to be the given function. + + Args: + func: the function to run when issuing a GcloudCommand. + """ + global _issue_command_function + _issue_command_function = func + + +def SetIssueRetryableCommandFunction(func): + """Set the issue retryable command function to be the given function. + + Args: + func: the function to run when issuing a GcloudCommand. + """ + global _issue_retryable_command_function + _issue_retryable_command_function = func + + +class GcloudCommand(object): + """A gcloud command. + + Attributes: + args: list of strings. Non-flag args to pass to gcloud, typically + specifying an operation to perform (e.g. ['compute', 'images', 'list'] + to list available images). + flags: OrderedDict mapping flag name string to flag value. Flags to pass to + gcloud (e.g. {'project': 'my-project-id'}). If a provided value is + True, the flag is passed to gcloud without a value. If a provided value + is a list, the flag is passed to gcloud multiple times, once with each + value in the list. + additional_flags: list of strings. Additional flags to append unmodified to + the end of the gcloud command (e.g. ['--metadata', 'color=red']). + rate_limited: boolean. True if rate limited, False otherwise. + use_alpha_gcloud: boolean. Defaults to False. + """ + + def __init__(self, resource, *args): + """Initializes a GcloudCommand with the provided args and common flags. + + Args: + resource: A GCE resource of type BaseResource. + *args: sequence of strings. Non-flag args to pass to gcloud, typically + specifying an operation to perform (e.g. ['compute', 'images', 'list'] + to list available images). 
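The zone, region, and geo helpers above follow simple naming conventions; a quick doctest-style sketch (the location names are examples only):

```python
import re


def get_region_from_zone(zone: str) -> str:
  # The first two dash-separated parts form the region, e.g. us-central1-a.
  return '-'.join(zone.split('-')[:2])


def is_region(location: str) -> bool:
  return bool(re.fullmatch(r'[a-z]+-[a-z]+[0-9]', location))


assert get_region_from_zone('us-central1-a') == 'us-central1'
assert is_region('us-central1') and not is_region('us-central1-a')
# GetGeoFromRegion('us-central1') yields 'us'; GetMultiRegionFromRegion maps
# 'europe-west1' to 'eu' and 'asia-east1' to 'asia'.
```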
+ """ + self.args = list(args) + self.flags = collections.OrderedDict() + self.additional_flags = [] + self._AddCommonFlags(resource) + self.rate_limited = False + self.use_alpha_gcloud = False + + def GetCommand(self): + """Generates the gcloud command. + + Returns: + list of strings. When joined by spaces, forms the gcloud shell command. + + Raises: + ValueError: if passed a None value + """ + cmd = [FLAGS.gcloud_path] + cmd.extend(self.args) + for flag_name, values in sorted(self.flags.items()): + flag_name_str = '--{0}'.format(flag_name) + if values is True: + cmd.append(flag_name_str) + elif values is None: + raise ValueError(f'Flag {flag_name} is None. Please filter out.') + else: + values_iterable = values if isinstance(values, list) else [values] + for value in values_iterable: + cmd.append(flag_name_str) + cmd.append(str(value)) + cmd.extend(self.additional_flags) + if self.use_alpha_gcloud and len(cmd) > 1 and cmd[1] != 'alpha': + cmd.insert(1, 'alpha') + return cmd + + def __repr__(self): + return '{0}({1})'.format(type(self).__name__, ' '.join(self.GetCommand())) + + @staticmethod + def _IsIssueRateLimitMessage(text) -> bool: + if RATE_LIMITED_MESSAGE in text: + return True + match = TAGGING_RATE_LIMITED_REGEX.search(text) + if match: + return True + return False + + @vm_util.Retry( + poll_interval=RATE_LIMITED_MAX_POLLING_INTERVAL, + max_retries=RATE_LIMITED_MAX_RETRIES, + fuzz=RATE_LIMITED_FUZZ, + timeout=RATE_LIMITED_TIMEOUT, + retryable_exceptions=( + errors.Benchmarks.QuotaFailure.RateLimitExceededError,)) + def Issue(self, **kwargs): + """Tries to run the gcloud command once, retrying if Rate Limited. + + Args: + **kwargs: Keyword arguments to forward to vm_util.IssueCommand when + issuing the gcloud command. + + Returns: + A tuple of stdout, stderr, and retcode from running the gcloud command. + Raises: + RateLimitExceededError: if command fails with Rate Limit Exceeded. + QuotaFailure: if command fails without Rate Limit Exceeded and + retry_on_rate_limited is set to false + IssueCommandError: if command fails without Rate Limit Exceeded. + + """ + try: + stdout, stderr, retcode = _issue_command_function(self, **kwargs) + except errors.VmUtil.IssueCommandError as error: + error_message = str(error) + if GcloudCommand._IsIssueRateLimitMessage(error_message): + self._RaiseRateLimitedException(error_message) + else: + raise error + if retcode and GcloudCommand._IsIssueRateLimitMessage(stderr): + self._RaiseRateLimitedException(stderr) + + return stdout, stderr, retcode + + def _RaiseRateLimitedException(self, error): + """Raise rate limited exception based on the retry_on_rate_limited flag. + + Args: + error: Error message to raise + + Raises: + RateLimitExceededError: if command fails with Rate Limit Exceeded and + retry_on_rate_limited is set to true + QuotaFailure: if command fails without Rate Limit Exceeded and + retry_on_rate_limited is set to false + """ + self.rate_limited = True + if FLAGS.retry_on_rate_limited: + raise errors.Benchmarks.QuotaFailure.RateLimitExceededError(error) + raise errors.Benchmarks.QuotaFailure(error) + + def IssueRetryable(self, **kwargs): + """Tries running the gcloud command until it succeeds or times out. + + Args: + **kwargs: Keyword arguments to forward to vm_util.IssueRetryableCommand + when issuing the gcloud command. + + Returns: + (stdout, stderr) pair of strings from running the gcloud command. 
+ """ + return _issue_retryable_command_function(self, **kwargs) + + def _AddCommonFlags(self, resource): + """Adds common flags to the command. + + Adds common gcloud flags derived from the PKB flags and provided resource. + + Args: + resource: A GCE resource of type BaseResource. + """ + self.flags['format'] = 'json' + self.flags['quiet'] = True + if resource: + if resource.project is not None: + self.flags['project'] = resource.project + if hasattr(resource, 'zone') and resource.zone: + self.flags['zone'] = resource.zone + self.additional_flags.extend(FLAGS.additional_gcloud_flags or ()) + + +_QUOTA_EXCEEDED_REGEX = re.compile( + r"(Quota '.*' exceeded|Insufficient \w+ quota)") + +_NOT_ENOUGH_RESOURCES_STDERR = ('does not have enough resources available to ' + 'fulfill the request.') +_NOT_ENOUGH_RESOURCES_MESSAGE = 'Creation failed due to not enough resources: ' + + +def CheckGcloudResponseKnownFailures(stderr, retcode): + """Checks gcloud responses for quota exceeded errors. + + Args: + stderr: The stderr from a gcloud command. + retcode: The return code from a gcloud command. + """ + if retcode: + if _QUOTA_EXCEEDED_REGEX.search(stderr): + message = virtual_machine.QUOTA_EXCEEDED_MESSAGE + stderr + logging.error(message) + raise errors.Benchmarks.QuotaFailure(message) + if _NOT_ENOUGH_RESOURCES_STDERR in stderr: + message = _NOT_ENOUGH_RESOURCES_MESSAGE + stderr + logging.error(message) + raise errors.Benchmarks.InsufficientCapacityCloudFailure(message) + + +def AuthenticateServiceAccount(vm, vm_gcloud_path='gcloud', benchmark=None): + """Authorize gcloud to access Google Cloud Platform with a service account. + + If you want gcloud (and other tools in the Cloud SDK) to use service account + credentials to make requests, use this method to authenticate. + Account name is provided by FLAGS.gcp_service_account + Credentials are fetched from a file whose local path is provided by + FLAGS.gcp_service_account_key_file, which contains private authorization key. + In the absence of a locally supplied credential file, the file is retrieved + from pre-provisioned data bucket. + + Args: + vm: vm on which the gcloud library needs to be authenticated. + vm_gcloud_path: Optional path to the gcloud binary on the vm. + benchmark: The module for retrieving the associated service account file. + """ + if not FLAGS.gcp_service_account: + raise errors.Setup.InvalidFlagConfigurationError( + 'Authentication requires the service account name to be ' + 'specified via --gcp_service_account.') + if not FLAGS.gcp_service_account_key_file: + raise errors.Setup.InvalidFlagConfigurationError( + 'Authentication requires the service account credential json to be ' + 'specified via --gcp_service_account_key_file.') + if '/' in FLAGS.gcp_service_account_key_file: + vm.PushFile(FLAGS.gcp_service_account_key_file, vm_util.VM_TMP_DIR) + key_file_name = FLAGS.gcp_service_account_key_file.split('/')[-1] + else: + vm.InstallPreprovisionedBenchmarkData(benchmark, + [FLAGS.gcp_service_account_key_file], + vm_util.VM_TMP_DIR) + key_file_name = FLAGS.gcp_service_account_key_file + activate_cmd = ('{} auth activate-service-account {} --key-file={}/{}' + .format(vm_gcloud_path, FLAGS.gcp_service_account, + vm_util.VM_TMP_DIR, key_file_name)) + vm.RemoteCommand(activate_cmd) + + +def InstallGcloudComponents(vm, vm_gcloud_path='gcloud', component='alpha'): + """Install gcloud components on the target vm. + + Args: + vm: vm on which the gcloud's alpha components need to be installed. 
+ vm_gcloud_path: Optional path to the gcloud binary on the vm. + component: Gcloud component to install. + """ + install_cmd = '{} components install {} --quiet'.format(vm_gcloud_path, + component) + vm.RemoteCommand(install_cmd) + + +def FormatTags(tags_dict): + """Format a dict of tags into arguments. + + Args: + tags_dict: Tags to be formatted. + + Returns: + A string contains formatted tags + """ + return ','.join( + '{0}={1}'.format(k, v) for k, v in sorted(six.iteritems(tags_dict))) + + +def SplitTags(tags): + """Formats a string of joined tags into a dictionary. + + Args: + tags: A string containing tags formatted as key1=value1,key2=value2,... + + Returns: + An OrderedDict mapping tag keys to values in the order the tags were given. + """ + return collections.OrderedDict( + tag_pair.split('=') for tag_pair in tags.split(',')) + + +def GetDefaultTags(timeout_minutes=None): + """Get the default tags in a dictionary. + + Args: + timeout_minutes: Timeout used for setting the timeout_utc tag. + + Returns: + A dict of tags, contributed from the benchmark spec. + """ + benchmark_spec = context.GetThreadBenchmarkSpec() + if not benchmark_spec: + return {} + return benchmark_spec.GetResourceTags(timeout_minutes) + + +def MakeFormattedDefaultTags(timeout_minutes=None): + """Get the default tags formatted. + + Args: + timeout_minutes: Timeout used for setting the timeout_utc tag. + + Returns: + A string contains tags, contributed from the benchmark spec. + """ + return FormatTags(GetDefaultTags(timeout_minutes)) + + +def GetAccessToken(application_default: bool = True) -> str: + """Gets the access token for the default project. + + Args: + application_default: whether to use application-default in gcloud args. + + Returns: + Text string of the access token. + """ + cmd = [FLAGS.gcloud_path, 'auth'] + if application_default: + cmd.append('application-default') + cmd.append('print-access-token') + stdout, _, _ = vm_util.IssueCommand(cmd) + return stdout.strip() diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/__init__.py b/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/__init__.py new file mode 100644 index 0000000..d90275b --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/flags.py b/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/flags.py new file mode 100644 index 0000000..0ca6a58 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/flags.py @@ -0,0 +1,34 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from absl import flags + +flags.DEFINE_string('tencent_user_name', 'ubuntu', + 'This determines the user name that Perfkit will ' + 'attempt to use. This must be changed in order to ' + 'use any image other than ubuntu.') +flags.DEFINE_string('tencent_boot_disk_type', None, + '"CLOUD_BASIC" - HDD cloud disk, ' + '"CLOUD_PREMIUM" - premium cloud disk, ' + '"CLOUD_SSD" - cloud SSD disk, ' + '"LOCAL_BASIC" - local disk, ' + '"LOCAL_SSD" - local SSD disk') +flags.DEFINE_string('tencent_boot_disk_size', None, + 'Boot disk size in GB.') +flags.DEFINE_string('tencent_internet_bandwidth', None, + 'Internet bandwidth in Mbps.') +flags.DEFINE_string('tencent_image_id', None, + 'Image ID.') +flags.DEFINE_integer('tencent_project_id', 0, + 'Project ID.') diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/provider_info.py b/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/provider_info.py new file mode 100644 index 0000000..3ffb78c --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/provider_info.py @@ -0,0 +1,24 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Provider info for Tencent +""" + +from perfkitbenchmarker import providers +from perfkitbenchmarker import provider_info + + +class TencentProviderInfo(provider_info.BaseProviderInfo): + + CLOUD = providers.TENCENT diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/requirements.txt b/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/requirements.txt new file mode 100644 index 0000000..61bd443 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/requirements.txt @@ -0,0 +1,18 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Requirements for running PerfKit Benchmarker on Tencent. 
+-r../../../requirements.txt +tccli>=3.0.68.1 + diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/tencent_disk.py b/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/tencent_disk.py new file mode 100644 index 0000000..21cde97 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/tencent_disk.py @@ -0,0 +1,184 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module containing classes related to Tencent disks. +""" + +import json +import threading +import logging + +from perfkitbenchmarker import disk +from perfkitbenchmarker import vm_util +from perfkitbenchmarker import providers +from perfkitbenchmarker.providers.tencent import util + +TENCENT_CLOUD_EXCEPTION_STDOUT = '[TencentCloudSDKException]' +DISK_CHARGE_TYPE = 'POSTPAID_BY_HOUR' +LOCAL_DISK_TYPES = ['LOCAL_BASIC', 'LOCAL_SSD'] + + +class TencentWaitOnDiskRetryableError(Exception): + """Error for retrying _Exists when an Tencent Disk has an ID but is not yet created.""" + + +class TencentCloudDiskUnknownCLIRetryableError(Exception): + """Error for retrying commands when STDOUT returns an unexpected CLI error (and still exit 0)""" + + +class TencentDisk(disk.BaseDisk): + """Object representing a Tencent Disk.""" + _lock = threading.Lock() + vm_devices = {} + + def __init__(self, disk_spec, vm): + super(TencentDisk, self).__init__(disk_spec) + + self.id = None + self.vm = vm + self.zone = vm.zone + self.project_id = vm.project_id + self.region = util.GetRegionFromZone(self.zone) + self.device_letter = None + self.attached_vm_id = None + self.device_path = None + self.disk_recently_created = False + + @vm_util.Retry(poll_interval=5, log_errors=False, max_retries=5, + retryable_exceptions=(TencentCloudDiskUnknownCLIRetryableError,)) + def _Create(self): + """Creates the disk.""" + placement = { + "Zone": self.zone + } + create_cmd = util.TENCENT_PREFIX + [ + 'cbs', + 'CreateDisks', + '--region', self.region, + '--DiskType', self.disk_type, + '--DiskChargeType', DISK_CHARGE_TYPE, + '--Placement', json.dumps(placement), + '--DiskSize', str(self.disk_size)] + util.TENCENT_SUFFIX + + stdout, _, _ = vm_util.IssueCommand(create_cmd) + try: + response = json.loads(stdout) + except ValueError as e: + logging.warn("Encountered unexpected return from command '{}', retrying.".format(e)) + raise TencentCloudDiskUnknownCLIRetryableError + self.id = response['DiskIdSet'][0] + self.disk_recently_created = True + util.AddDefaultTags(self.id, self.region) + + @vm_util.Retry(poll_interval=5, log_errors=False, max_retries=10, + retryable_exceptions=(TencentCloudDiskUnknownCLIRetryableError)) + def _Delete(self): + """Deletes the disk.""" + delete_cmd = util.TENCENT_PREFIX + [ + 'cbs', + 'TerminateDisks', + '--region', self.region, + '--DiskIds', json.dumps([self.id])] + util.TENCENT_SUFFIX + logging.info('Deleting Tencent volume %s. 
This may fail if the disk is not ' + 'yet detached, but will be retried.', self.id) + stdout, _, _ = vm_util.IssueCommand(delete_cmd) + try: + json.loads(stdout) + except ValueError as e: + if TENCENT_CLOUD_EXCEPTION_STDOUT not in stdout: + logging.warn("Encountered unexpected return from command '{}', retrying.".format(e)) + raise TencentCloudDiskUnknownCLIRetryableError + self.disk_recently_created = False + + @vm_util.Retry(poll_interval=3, log_errors=False, max_retries=5, + retryable_exceptions=(TencentWaitOnDiskRetryableError, TencentCloudDiskUnknownCLIRetryableError)) + def _Exists(self): + """Returns true if the disk exists.""" + describe_cmd = util.TENCENT_PREFIX + [ + 'cbs', + 'DescribeDisks', + '--region', self.region, + '--DiskIds', json.dumps([self.id])] + util.TENCENT_SUFFIX + stdout, _ = util.IssueRetryableCommand(describe_cmd) + try: + response = json.loads(stdout) + except ValueError as e: + logging.warn("Encountered unexpected return from command '{}', retrying.".format(e)) + raise TencentCloudDiskUnknownCLIRetryableError + disks = response['DiskSet'] + assert len(disks) < 2, 'Too many volumes.' + if not disks: + if self.disk_recently_created: + raise TencentWaitOnDiskRetryableError + else: + return False + return len(disks) > 0 + + @vm_util.Retry(poll_interval=5, log_errors=False, max_retries=10, + retryable_exceptions=(TencentCloudDiskUnknownCLIRetryableError)) + def Attach(self, vm): + """Attaches the disk to a VM. + + Args: + vm: The Tencent instance to which the disk will be attached. + """ + self.attached_vm_id = vm.id + attach_cmd = util.TENCENT_PREFIX + [ + 'cbs', + 'AttachDisks', + '--region', self.region, + '--InstanceId', vm.id, + '--DiskIds', json.dumps([self.id])] + util.TENCENT_SUFFIX + logging.info('Attaching Tencent disk %s. 
This may fail if the disk is not '
+                 'ready, but will be retried.', self.id)
+    stdout, _ = util.IssueRetryableCommand(attach_cmd)
+    try:
+      json.loads(stdout)
+    except ValueError as e:
+      logging.warn("Encountered unexpected return from command '{}', retrying.".format(e))
+      raise TencentCloudDiskUnknownCLIRetryableError
+
+  @vm_util.Retry(poll_interval=5, log_errors=False, max_retries=10,
+                 retryable_exceptions=(TencentCloudDiskUnknownCLIRetryableError))
+  def Detach(self):
+    """Detaches the disk from a VM."""
+    detach_cmd = util.TENCENT_PREFIX + [
+        'cbs',
+        'DetachDisks',
+        '--region', self.region,
+        '--DiskIds', json.dumps([self.id])] + util.TENCENT_SUFFIX
+    stdout, _ = util.IssueRetryableCommand(detach_cmd)
+    try:
+      json.loads(stdout)
+    except ValueError as e:
+      logging.warn("Encountered unexpected return from command '{}', retrying.".format(e))
+      raise TencentCloudDiskUnknownCLIRetryableError
+    util.IssueRetryableCommand(detach_cmd)
+
+  def GetDevicePath(self):
+    """Returns the path to the device inside the VM."""
+    if not self.device_path:
+      self._GetPathFromRemoteHost()
+    return self.device_path
+
+  def SetDiskId(self, id):
+    """Sets Disk ID for the local disk case since Create() will not be called"""
+    self.id = id
+
+  @vm_util.Retry(log_errors=False, poll_interval=5, max_retries=10)
+  def _GetPathFromRemoteHost(self):
+    """Gets the device path for the disk from the remote host, retrying until it is available."""
+    readlink_cmd = 'readlink -e /dev/disk/by-id/virtio-%s' % self.id
+    resp, _ = self.vm.RemoteHostCommand(readlink_cmd, suppress_warning=True)
+    self.device_path = resp[:-1]
diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/tencent_network.py b/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/tencent_network.py
new file mode 100644
index 0000000..4aea85f
--- /dev/null
+++ b/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/tencent_network.py
@@ -0,0 +1,244 @@
+# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Module containing classes related to Tencent VM networking.
+"""
+import json
+import logging
+import threading
+
+from absl import flags
+from perfkitbenchmarker import network
+from perfkitbenchmarker import providers
+from perfkitbenchmarker import resource
+from perfkitbenchmarker import vm_util
+from perfkitbenchmarker.providers.tencent import util
+
+FLAGS = flags.FLAGS
+TENCENT_CLOUD_EXCEPTION_STDOUT = '[TencentCloudSDKException]'
+
+
+class TencentCloudVPCError(Exception):
+  """Error for VPC-related failures."""
+
+
+class TencentCloudNetworkUnknownCLIRetryableError(Exception):
+  """Error for retrying commands when STDOUT returns an unexpected CLI error (and still exit 0)"""
+
+
+class TencentNetwork(network.BaseNetwork):
+  """Object representing a Tencent Network.
+
+  Attributes:
+    region: The Tencent region the Network is in.
+    zone: The Zone within the region for this network.
+    vpc: The Tencent VPC for this network.
+    subnet: the Tencent subnet for this zone.
+ """ + + CLOUD = providers.TENCENT + + def __repr__(self): + return '%s(%r)' % (self.__class__, self.__dict__) + + def __init__(self, spec): + """Initializes TencentNetwork instances. + + Args: + spec: A BaseNetworkSpec object. + """ + super(TencentNetwork, self).__init__(spec) + self.region = util.GetRegionFromZone(spec.zone) + self.zone = spec.zone + self.vpc = TencentVpc(self.region) + self.subnet = None + self._create_lock = threading.Lock() + + def Create(self): + """Creates the network.""" + self.route_table = None + self.created = False + self.vpc.Create() + if self.subnet is None: + cidr = self.vpc.NextSubnetCidrBlock() + self.subnet = TencentSubnet(self.zone, self.vpc.id, + cidr_block=cidr) + self.subnet.Create() + + def Delete(self): + """Deletes the network.""" + self.vpc.Delete() + + +class TencentVpc(resource.BaseResource): + """An object representing an Tencent VPC.""" + + def __init__(self, region): + super(TencentVpc, self).__init__() + self.region = region + self.id = None + self.name = 'perfkit-vpc-{0}'.format(FLAGS.run_uri) + + # _subnet_index tracks the next unused 10.0.x.0/24 block. + self._subnet_index = 0 + # Lock protecting _subnet_index + self._subnet_index_lock = threading.Lock() + self.default_security_group_id = None + + @vm_util.Retry(poll_interval=5, log_errors=False, max_retries=5, + retryable_exceptions=(TencentCloudNetworkUnknownCLIRetryableError,)) + def _Create(self): + """Creates the VPC.""" + create_cmd = util.TENCENT_PREFIX + [ + 'vpc', 'CreateVpc', + '--region', self.region, + '--CidrBlock', '10.0.0.0/16', + '--VpcName', self.name + ] + util.TENCENT_SUFFIX + stdout, _, _ = vm_util.IssueCommand(create_cmd) + try: + response = json.loads(stdout) + except ValueError: + logging.error(stdout) + raise TencentCloudVPCError + self.id = response['Vpc']['VpcId'] + + def _PostCreate(self): + """Looks up the VPC default security group.""" + util.AddDefaultTags(self.id, self.region) + return + + @vm_util.Retry(poll_interval=5, log_errors=False, max_retries=5, + retryable_exceptions=(TencentCloudNetworkUnknownCLIRetryableError,)) + def _Exists(self): + """Returns true if the VPC exists.""" + describe_cmd = util.TENCENT_PREFIX + [ + 'vpc', 'DescribeVpcs', + '--region', self.region, + '--VpcIds', json.dumps([self.id]) + ] + util.TENCENT_SUFFIX + stdout, _ = util.IssueRetryableCommand(describe_cmd) + try: + response = json.loads(stdout) + except ValueError as e: + logging.warn("Encountered unexpected return from command '{}', retrying.".format(e)) + raise TencentCloudNetworkUnknownCLIRetryableError + assert response['TotalCount'] < 2, 'Too many VPCs.' + return response['TotalCount'] > 0 + + @vm_util.Retry(poll_interval=5, log_errors=False, max_retries=5, + retryable_exceptions=(TencentCloudNetworkUnknownCLIRetryableError,)) + def _Delete(self): + """Deletes the VPC.""" + delete_cmd = util.TENCENT_PREFIX + [ + 'vpc', 'DeleteVpc', + '--region', self.region, + '--VpcId', self.id + ] + util.TENCENT_SUFFIX + stdout, _, _ = vm_util.IssueCommand(delete_cmd) + try: + json.loads(stdout) + except ValueError as e: + if TENCENT_CLOUD_EXCEPTION_STDOUT not in stdout: + logging.warn("Encountered unexpected return from command '{}', retrying.".format(e)) + raise TencentCloudNetworkUnknownCLIRetryableError + + def NextSubnetCidrBlock(self): + """Returns the next available /24 CIDR block in this VPC. + + Each VPC has a 10.0.0.0/16 CIDR block. + Each subnet is assigned a /24 within this allocation. + Calls to this method return the next unused /24. 
+ + Returns: + A string representing the next available /24 block, in CIDR notation. + Raises: + ValueError: when no additional subnets can be created. + """ + with self._subnet_index_lock: + if self._subnet_index >= (1 << 8) - 1: + raise ValueError('Exceeded subnet limit ({0}).'.format( + self._subnet_index)) + cidr = '10.0.{0}.0/24'.format(self._subnet_index) + self._subnet_index += 1 + return cidr + + +class TencentSubnet(resource.BaseResource): + def __init__(self, zone, vpc_id, cidr_block='10.0.0.0/24'): + super(TencentSubnet, self).__init__() + self.zone = zone + self.region = util.GetRegionFromZone(zone) + self.vpc_id = vpc_id + self.id = None + self.cidr_block = cidr_block + self.name = self._GetSubNetName() + + @vm_util.Retry(poll_interval=5, log_errors=False, max_retries=5, + retryable_exceptions=(TencentCloudNetworkUnknownCLIRetryableError,)) + def _Create(self): + """Creates the subnet.""" + create_cmd = util.TENCENT_PREFIX + [ + 'vpc', 'CreateSubnet', + '--region', self.region, + '--VpcId', self.vpc_id, + '--SubnetName', self.name, + '--CidrBlock', self.cidr_block, + '--Zone', self.zone + ] + util.TENCENT_SUFFIX + stdout, _, _ = vm_util.IssueCommand(create_cmd) + try: + response = json.loads(stdout) + except ValueError as e: + logging.warn("Encountered unexpected return from command '{}', retrying.".format(e)) + raise TencentCloudNetworkUnknownCLIRetryableError + self.id = response['Subnet']['SubnetId'] + util.AddDefaultTags(self.id, self.region) + + def _Delete(self): + """Deletes the subnet.""" + delete_cmd = util.TENCENT_PREFIX + [ + 'vpc', 'DeleteSubnet', + '--region', self.region, + '--SubnetId', self.id + ] + util.TENCENT_SUFFIX + stdout, _, _ = vm_util.IssueCommand(delete_cmd) + try: + json.loads(stdout) + except ValueError as e: + if TENCENT_CLOUD_EXCEPTION_STDOUT not in stdout: + logging.warn("Encountered unexpected return from command '{}', retrying.".format(e)) + raise TencentCloudNetworkUnknownCLIRetryableError + + + @vm_util.Retry(poll_interval=5, log_errors=False, max_retries=5, + retryable_exceptions=(TencentCloudNetworkUnknownCLIRetryableError,)) + def _Exists(self): + """Returns true if the subnet exists.""" + describe_cmd = util.TENCENT_PREFIX + [ + 'vpc', 'DescribeSubnets', + '--region', self.region, + '--SubnetIds', json.dumps([self.id]) + ] + util.TENCENT_SUFFIX + stdout, _ = util.IssueRetryableCommand(describe_cmd) + try: + response = json.loads(stdout) + except ValueError as e: + logging.warn("Encountered unexpected return from command '{}', retrying.".format(e)) + raise TencentCloudNetworkUnknownCLIRetryableError + assert response['TotalCount'] < 2, 'Too many Subnets.' + return response['TotalCount'] > 0 + + @classmethod + def _GetSubNetName(cls): + return 'perfkit_subnet_{0}'.format(FLAGS.run_uri) diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/tencent_virtual_machine.py b/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/tencent_virtual_machine.py new file mode 100644 index 0000000..528fafd --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/tencent_virtual_machine.py @@ -0,0 +1,422 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Class to represent a Tencent Virtual Machine object. + +All VM specifics are self-contained and the class provides methods to +operate on the VM: boot, shutdown, etc. +""" + +import json +import threading +import logging +import base64 + +from absl import flags +from perfkitbenchmarker import virtual_machine +from perfkitbenchmarker import linux_virtual_machine +from perfkitbenchmarker import vm_util +from perfkitbenchmarker import disk +from perfkitbenchmarker import errors +from perfkitbenchmarker import providers +from perfkitbenchmarker.providers.tencent import tencent_disk +from perfkitbenchmarker.providers.tencent import tencent_network +from perfkitbenchmarker.providers.tencent import util +FLAGS = flags.FLAGS + +# Tencent CLI returns 0 even when an exception is encountered. +# In some cases, stdout must be scanned for this message. +TENCENT_CLOUD_EXCEPTION_STDOUT = '[TencentCloudSDKException]' +TENCENT_CLOUD_SOLD_OUT_MSG = 'ResourcesSoldOut' +TENCENT_CLOUD_INSUFFICIENT_BALANCE = 'InsufficientBalance' + +INSTANCE_TRANSITIONAL_STATUSES = frozenset(['TERMINATING', 'PENDING']) +DEFAULT_SYSTEM_DISK_SIZE = 50 +DEFAULT_SYSTEM_DISK_TYPE = 'CLOUD_PREMIUM' +DEFAULT_INTERNET_BANDWIDTH = 100 +DEFAULT_USER_NAME = 'perfkit' + + +class TencentTransitionalVmRetryableError(Exception): + """Error for retrying _Exists when an Tencent VM is in a transitional state.""" + + +class TencentCloudSDKExceptionRetryableError(Exception): + """Error for retrying commands when STDOUT returns TencentCLoudSDKException (and still exit 0)""" + + +class TencentCloudUnknownCLIRetryableError(Exception): + """Error for retrying commands when STDOUT returns an unexpected CLI error (and still exit 0)""" + + +class TencentCloudResourceSoldOut(Exception): + """Error for resouce sold out""" + + +class TencentCloudInsufficientBalance(Exception): + """Error for insufficient account balance""" + + +class TencentVirtualMachine(virtual_machine.BaseVirtualMachine): + """Object representing an Tencent Virtual Machine.""" + + IMAGE_NAME_FILTER = None + CLOUD = providers.TENCENT + + def __init__(self, vm_spec): + """Initialize a Tencent virtual machine. + + Args: + vm_spec: virtual_machine.BaseVirtualMachineSpec object of the VM. 
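The module comment above notes that tccli can exit 0 while printing '[TencentCloudSDKException]' to stdout, which is why every call site parses stdout as JSON and scans it for markers such as ResourcesSoldOut before deciding whether to retry. The snippet below is a self-contained sketch of that pattern using subprocess directly; classify_tccli_output is a hypothetical helper, and the commented-out invocation assumes a configured tccli installation.

import json
import subprocess

TENCENT_CLOUD_EXCEPTION_STDOUT = '[TencentCloudSDKException]'


def classify_tccli_output(cmd):
  # Illustrative sketch: the exit code alone is not trustworthy, so parse
  # stdout as JSON and fall back to scanning it for the SDK exception marker.
  proc = subprocess.run(cmd, capture_output=True, text=True)
  try:
    return json.loads(proc.stdout)  # well-formed response
  except ValueError:
    if TENCENT_CLOUD_EXCEPTION_STDOUT in proc.stdout:
      raise RuntimeError('Tencent SDK exception: %s' % proc.stdout.strip())
    raise RuntimeError('Unexpected tccli output: %s' % proc.stdout.strip())


# Example (hypothetical invocation):
# response = classify_tccli_output(
#     ['tccli', 'cvm', 'DescribeInstances', '--region', 'ap-guangzhou',
#      '--output', 'json'])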
+ """ + super(TencentVirtualMachine, self).__init__(vm_spec) + self.region = util.GetRegionFromZone(self.zone) + self.user_name = FLAGS.tencent_user_name + self.network = tencent_network.TencentNetwork.GetNetwork(self) + self.host = None + self.id = None + self.project_id = FLAGS.tencent_project_id + self.key_id = None + self.boot_disk_size = FLAGS.tencent_boot_disk_size or DEFAULT_SYSTEM_DISK_SIZE + self.boot_disk_type = FLAGS.tencent_boot_disk_type or DEFAULT_SYSTEM_DISK_TYPE + self.internet_bandwidth = FLAGS.tencent_internet_bandwidth or DEFAULT_INTERNET_BANDWIDTH + self.image_id = FLAGS.tencent_image_id or None + + @vm_util.Retry() + def _PostCreate(self): + """Get the instance's data.""" + describe_cmd = util.TENCENT_PREFIX + [ + 'cvm', 'DescribeInstances', + '--region', self.region, + '--InstanceIds', json.dumps([self.id])] + util.TENCENT_SUFFIX + logging.info('Getting instance %s public IP. This will fail until ' + 'a public IP is available, but will be retried.', self.id) + stdout, _ = util.IssueRetryableCommand(describe_cmd) + try: + response = json.loads(stdout) + except ValueError as e: + logging.warn("Encountered unexpected return from command '{}', retrying.".format(e)) + raise TencentCloudUnknownCLIRetryableError + instance = response['InstanceSet'][0] + if not instance['PublicIpAddresses']: + raise TencentTransitionalVmRetryableError + self.ip_address = instance['PublicIpAddresses'][0] + self.internal_ip = instance['PrivateIpAddresses'][0] + + def _CreateDependencies(self): + """Create VM dependencies.""" + self.image_id = self.image_id or self.GetDefaultImage(self.machine_type, + self.region) + self.key_id = TencentKeyFileManager.ImportKeyfile(self.region) + + + def _DeleteDependencies(self): + """Delete VM dependencies.""" + if self.key_id: + TencentKeyFileManager.DeleteKeyfile(self.region, self.key_id) + + @vm_util.Retry(poll_interval=5, log_errors=False, max_retries=5, + retryable_exceptions=(TencentCloudUnknownCLIRetryableError,)) + def _Create(self): + """Create a VM instance.""" + placement = { + 'Zone': self.zone, + 'ProjectId': self.project_id + + } + login_settings = { + 'KeyIds': [self.key_id] + } + vpc = { + 'VpcId': self.network.vpc.id, + 'SubnetId': self.network.subnet.id + } + system_disk = { + 'DiskType': self.boot_disk_type, + 'DiskSize': self.boot_disk_size + } + internet_accessible = { + 'PublicIpAssigned': True, + 'InternetMaxBandwidthOut': self.internet_bandwidth + } + + create_cmd = util.TENCENT_PREFIX + [ + 'cvm', 'RunInstances', + '--region', self.region, + '--Placement', json.dumps(placement), + '--InstanceType', self.machine_type, + '--ImageId', self.image_id, + '--VirtualPrivateCloud', json.dumps(vpc), + '--InternetAccessible', json.dumps(internet_accessible), + '--SystemDisk', json.dumps(system_disk), + '--LoginSettings', json.dumps(login_settings), + '--InstanceName', self.name + ] + util.TENCENT_SUFFIX + + # Create user and add SSH key if image doesn't have a default non-root user + if self.CREATE_NON_ROOT_USER: + public_key = TencentKeyFileManager.GetPublicKey() + user_data = util.ADD_USER_TEMPLATE.format(self.user_name, public_key) + logging.debug('encoding startup script: %s' % user_data) + create_cmd.extend(['--UserData', base64.b64encode(user_data.encode("utf-8"))]) + + # Tccli will exit 0 and provide a non-json formatted error msg to stdout in case of failure + stdout, _, _ = vm_util.IssueCommand(create_cmd) + try: + response = json.loads(stdout) + except ValueError: + if TENCENT_CLOUD_EXCEPTION_STDOUT in stdout: + if 
TENCENT_CLOUD_SOLD_OUT_MSG in stdout: + raise TencentCloudResourceSoldOut + elif TENCENT_CLOUD_INSUFFICIENT_BALANCE in stdout: + raise TencentCloudInsufficientBalance + else: + raise TencentCloudSDKExceptionRetryableError + else: + raise TencentCloudUnknownCLIRetryableError + + self.id = response['InstanceIdSet'][0] + util.AddDefaultTags(self.id, self.region) + + @vm_util.Retry(poll_interval=5, log_errors=False, max_retries=5, + retryable_exceptions=(TencentCloudUnknownCLIRetryableError,)) + def _Delete(self): + """Delete a VM instance.""" + delete_cmd = util.TENCENT_PREFIX + [ + 'cvm', + 'TerminateInstances', + '--region', self.region, + '--InstanceIds', json.dumps([self.id])] + util.TENCENT_SUFFIX + stdout, stderr, _ = vm_util.IssueCommand(delete_cmd) + try: + json.loads(stdout) + except ValueError as e: + if TENCENT_CLOUD_EXCEPTION_STDOUT not in stderr: + logging.warn("Encountered unexpected return from command '{}', retrying.".format(e)) + raise TencentCloudUnknownCLIRetryableError + + @vm_util.Retry(poll_interval=5, log_errors=False, + retryable_exceptions=(TencentTransitionalVmRetryableError, TencentCloudUnknownCLIRetryableError)) + def _Exists(self): + """Returns whether the VM exists.""" + describe_cmd = util.TENCENT_PREFIX + [ + 'cvm', 'DescribeInstances', + '--region', self.region, + '--InstanceIds', json.dumps([self.id])] + util.TENCENT_SUFFIX + stdout, _ = util.IssueRetryableCommand(describe_cmd) + try: + response = json.loads(stdout) + except ValueError as e: + logging.warn("Encountered unexpected return from command '{}', retrying.".format(e)) + raise TencentCloudUnknownCLIRetryableError + instance_set = response['InstanceSet'] + if not instance_set: + return False + # TODO There is potential issue here with tccli, possibly during high traffic periods where the instance query will return empty when an instnace is transitioning from PENDING -> RUNNING + if instance_set[0]['InstanceState'] in INSTANCE_TRANSITIONAL_STATUSES: + raise TencentTransitionalVmRetryableError + assert len(instance_set) < 2, 'Too many instances.' + return len(instance_set) > 0 + + @classmethod + @vm_util.Retry(poll_interval=5, log_errors=False, + retryable_exceptions=(TencentTransitionalVmRetryableError, TencentCloudUnknownCLIRetryableError)) + def GetDefaultImage(cls, machine_type, region): + """Returns Image ID of first match with IMAGE_NAME_MATCH. + Results from DescribeImages are evaluated in the order they are returned from the command (assumed arbitrary). + """ + if cls.IMAGE_NAME_MATCH is None: + return None + describe_cmd = util.TENCENT_PREFIX + [ + 'cvm', 'DescribeImages', + '--region', region, + '--Limit', '100', + '--InstanceType', machine_type + ] + util.TENCENT_SUFFIX + stdout, _ = util.IssueRetryableCommand(describe_cmd) + try: + response = json.loads(stdout) + except ValueError as e: + logging.warn("Encountered unexpected return from command '{}', retrying.".format(e)) + raise TencentCloudUnknownCLIRetryableError + for i in response['ImageSet']: + if i['ImageName'] in cls.IMAGE_NAME_MATCH and i['ImageSource'] == 'OFFICIAL': + logging.debug('Found image %s, %s' % (i['ImageName'], i['ImageId'])) + return i['ImageId'] + return None + + def CreateScratchDisk(self, disk_spec): + """Create a VM's scratch disk. + + Args: + disk_spec: virtual_machine.BaseDiskSpec object of the disk. 
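GetDefaultImage above filters DescribeImages results with the expression `i['ImageName'] in cls.IMAGE_NAME_MATCH`, so a string-valued IMAGE_NAME_MATCH acts as a substring/equality test while the list form (used by the Ubuntu 20.04 class later in this file) acts as exact membership. The sketch below only illustrates that difference; pick_image_id and the sample image records are fabricated for the example.

def pick_image_id(image_set, name_match):
  # Illustrative sketch mirroring the filter in GetDefaultImage: first
  # OFFICIAL image whose name satisfies `ImageName in name_match`.
  for image in image_set:
    if image['ImageName'] in name_match and image['ImageSource'] == 'OFFICIAL':
      return image['ImageId']
  return None


images = [
    {'ImageName': 'Ubuntu Server 20.04 LTS 64', 'ImageSource': 'OFFICIAL',
     'ImageId': 'img-111'},
    {'ImageName': 'Ubuntu 20.04(arm64)', 'ImageSource': 'OFFICIAL',
     'ImageId': 'img-222'},
]

print(pick_image_id(images, 'Ubuntu Server 20.04 LTS 64'))  # img-111
print(pick_image_id(images, ['Ubuntu 20.04(arm64)']))       # img-222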
+ """ + disk_ids = [] + if disk_spec.disk_type in tencent_disk.LOCAL_DISK_TYPES: + disk_spec.disk_type = disk.LOCAL + logging.debug("Querying instance for local disk ids") + disk_ids = self._GetDataDiskIds() + if len(disk_ids) != disk_spec.num_striped_disks: + raise errors.Error('Expected %s local disks but found %s local disks' % + (disk_spec.num_striped_disks, len(disk_ids))) + disks = [] + for i in range(disk_spec.num_striped_disks): + data_disk = tencent_disk.TencentDisk(disk_spec, self) + if disk_spec.disk_type == disk.LOCAL: + data_disk.SetDiskId(disk_ids[i]) + disks.append(data_disk) + self._CreateScratchDiskFromDisks(disk_spec, disks) + + @vm_util.Retry(poll_interval=2, log_errors=False, + retryable_exceptions=(TencentCloudUnknownCLIRetryableError,)) + def _GetDataDiskIds(self): + """Returns Ids of attached data disks""" + disk_ids = [] + describe_cmd = util.TENCENT_PREFIX + [ + 'cvm', 'DescribeInstances', + '--region', self.region, + '--InstanceIds', json.dumps([self.id])] + util.TENCENT_SUFFIX + stdout, _ = util.IssueRetryableCommand(describe_cmd) + try: + response = json.loads(stdout) + except ValueError as e: + logging.warn("Encountered unexpected return from command '{}', retrying.".format(e)) + raise TencentCloudUnknownCLIRetryableError + instance = response['InstanceSet'][0] + for data_disk in instance['DataDisks']: + disk_ids.append(data_disk['DiskId']) + return disk_ids + + +class TencentKeyFileManager(object): + """Object for managing Tencent Keyfiles.""" + _lock = threading.Lock() + imported_keyfile_set = set() + deleted_keyfile_set = set() + run_uri_key_ids = {} + + @classmethod + @vm_util.Retry(poll_interval=2, log_errors=False, + retryable_exceptions=(TencentCloudSDKExceptionRetryableError, TencentCloudUnknownCLIRetryableError)) + def ImportKeyfile(cls, region): + """Imports the public keyfile to Tencent.""" + with cls._lock: + if FLAGS.run_uri in cls.run_uri_key_ids: + return cls.run_uri_key_ids[FLAGS.run_uri] + public_key = cls.GetPublicKey() + import_cmd = util.TENCENT_PREFIX + [ + 'cvm', + 'ImportKeyPair', + '--ProjectId', '0', + '--region', region, + '--KeyName', cls.GetKeyNameForRun(), + '--PublicKey', public_key] + util.TENCENT_SUFFIX + stdout, _ = util.IssueRetryableCommand(import_cmd) + try: + response = json.loads(stdout) + except ValueError as e: + if TENCENT_CLOUD_EXCEPTION_STDOUT in stdout: + raise TencentCloudSDKExceptionRetryableError + else: + logging.warn("Encountered unexpected return from command '{}', retrying.".format(e)) + raise TencentCloudUnknownCLIRetryableError + + key_id = response['KeyId'] + cls.run_uri_key_ids[FLAGS.run_uri] = key_id + return key_id + + + @classmethod + @vm_util.Retry(poll_interval=2, log_errors=False, + retryable_exceptions=(TencentCloudSDKExceptionRetryableError, TencentCloudUnknownCLIRetryableError)) + def DeleteKeyfile(cls, region, key_id): + """Deletes the imported KeyPair for a run_uri.""" + with cls._lock: + if FLAGS.run_uri not in cls.run_uri_key_ids: + return + delete_cmd = util.TENCENT_PREFIX + [ + 'cvm', + 'DeleteKeyPairs', + '--region', region, + '--KeyIds', json.dumps([key_id])] + util.TENCENT_SUFFIX + stdout, _ = util.IssueRetryableCommand(delete_cmd) + try: + json.loads(stdout) + except ValueError as e: + if TENCENT_CLOUD_EXCEPTION_STDOUT in stdout: + raise TencentCloudSDKExceptionRetryableError + else: + logging.warn("Encountered unexpected return from command '{}', retrying.".format(e)) + raise TencentCloudUnknownCLIRetryableError + del cls.run_uri_key_ids[FLAGS.run_uri] + + @classmethod + def 
GetKeyNameForRun(cls): + return 'perfkit_key_{0}'.format(FLAGS.run_uri) + + @classmethod + def GetPublicKey(cls): + cat_cmd = ['cat', + vm_util.GetPublicKeyPath()] + keyfile, _ = vm_util.IssueRetryableCommand(cat_cmd) + return keyfile + + +class DebianBasedTencentVirtualMachine(TencentVirtualMachine, + linux_virtual_machine.BaseDebianMixin): + CREATE_NON_ROOT_USER = False + IMAGE_NAME_MATCH = 'Ubuntu Server 16.04.1 LTS 64' + + +class Ubuntu1604BasedTencentVirtualMachine(TencentVirtualMachine, + linux_virtual_machine.Ubuntu1604Mixin): + CREATE_NON_ROOT_USER = False + IMAGE_NAME_MATCH = 'Ubuntu Server 16.04.1 LTS 64' + + +class Ubuntu1804BasedTencentVirtualMachine(TencentVirtualMachine, + linux_virtual_machine.Ubuntu1804Mixin): + CREATE_NON_ROOT_USER = False + IMAGE_NAME_MATCH = 'Ubuntu Server 18.04.1 LTS 64' + + +class Ubuntu2004BasedTencentVirtualMachine(TencentVirtualMachine, + linux_virtual_machine.Ubuntu2004Mixin): + CREATE_NON_ROOT_USER = False + IMAGE_NAME_MATCH = ['Ubuntu Server 20.04 LTS 64', 'Ubuntu 20.04(arm64)'] + + def UpdateEnvironmentPath(self): + # Tencent's image for Ubuntu 20.04 seems to have root-owned files in user home + self.RemoteCommand('sudo chown -R {0}:{0} /home/{0}'.format(self.user_name)) + + +# TODO to be verified +class Ubuntu2204BasedTencentVirtualMachine(TencentVirtualMachine, + linux_virtual_machine.Ubuntu2204Mixin): + CREATE_NON_ROOT_USER = False + IMAGE_NAME_MATCH = 'Ubuntu Server 22.04 LTS 64' + + +class CentOs7BasedTencentVirtualMachine(TencentVirtualMachine, + linux_virtual_machine.CentOs7Mixin): + CREATE_NON_ROOT_USER = True + IMAGE_NAME_MATCH = 'CentOS 7.9 64' + + def __init__(self, vm_spec): + super(CentOs7BasedTencentVirtualMachine, self).__init__(vm_spec) + user_name_set = FLAGS['tencent_user_name'].present + self.user_name = FLAGS.tencent_user_name if user_name_set else DEFAULT_USER_NAME + self.python_package_config = 'python' + self.python_dev_package_config = 'python2-devel' + self.python_pip_package_config = 'python2-pip' diff --git a/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/util.py b/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/util.py new file mode 100644 index 0000000..f6af819 --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/providers/tencent/util.py @@ -0,0 +1,125 @@ +# Copyright 2015 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for working with Tencent Cloud resources.""" + + +import re +import shlex +import six + +from perfkitbenchmarker import errors +from absl import flags +from perfkitbenchmarker import vm_util + +TENCENT_PATH = 'tccli' +TENCENT_PREFIX = [TENCENT_PATH] +# Tencent positional args such as 'vpc' and 'CreateVpc' must come before flags, so format flag must come later. 
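TencentKeyFileManager above imports the public key at most once per run_uri by caching the resulting KeyId in a class-level dict guarded by a lock, and drops the cache entry again in DeleteKeyfile. The fragment below restates that memoize-under-lock pattern in isolation; the function names and the fake key ids are illustrative only.

import threading

_lock = threading.Lock()
_key_ids = {}  # run_uri -> imported key id


def import_key_once(run_uri, do_import):
  # Illustrative sketch: do_import() runs only for the first caller of a given
  # run_uri; later callers (including other threads) get the cached id.
  with _lock:
    if run_uri not in _key_ids:
      _key_ids[run_uri] = do_import()
    return _key_ids[run_uri]


def delete_key(run_uri, do_delete):
  with _lock:
    key_id = _key_ids.pop(run_uri, None)
    if key_id is not None:
      do_delete(key_id)


print(import_key_once('run123', lambda: 'skey-abc'))  # imports: skey-abc
print(import_key_once('run123', lambda: 'skey-xyz'))  # cached:  skey-abc
delete_key('run123', lambda key_id: print('deleting', key_id))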
+TENCENT_SUFFIX = ['--output', 'json'] +INSTANCE = 'instance' +FLAGS = flags.FLAGS + +ADD_USER_TEMPLATE = '''#!/bin/bash +echo "{0} ALL = NOPASSWD: ALL" >> /etc/sudoers +useradd {0} --home /home/{0} --shell /bin/bash -m +mkdir /home/{0}/.ssh +echo "{1}" >> /home/{0}/.ssh/authorized_keys +chown -R {0}:{0} /home/{0}/.ssh +chmod 700 /home/{0}/.ssh +chmod 600 /home/{0}/.ssh/authorized_keys +''' + + +def TokenizeCommand(cmd): + # This probably doesnt work for most Tencent commands that have JSON args - it strips quotes it seems + cmd_line = ' '.join(cmd) + cmd_args = shlex.split(cmd_line) + return cmd_args + + +def IsRegion(region_or_zone): + if not re.match(r'[a-z]{2}-[a-z]+$', region_or_zone): + return False + else: + return True + + +def GetRegionFromZone(zone): + """Returns the region a zone is in """ + m = re.match(r'([a-z]{2}-[a-z]+)-[0-9]?$', zone) + if not m: + raise ValueError( + '%s is not a valid Tencent zone' % zone) + return m.group(1) + + +@vm_util.Retry() +def IssueRetryableCommand(cmd, env=None): + """Tries running the provided command until it succeeds or times out. + + Args: + cmd: A list of strings such as is given to the subprocess.Popen() + constructor. + env: An alternate environment to pass to the Popen command. + + Returns: + A tuple of stdout and stderr from running the provided command. + """ + stdout, stderr, retcode = vm_util.IssueCommand(cmd, env=env) + if retcode: + raise errors.VmUtil.CalledProcessException( + 'Command returned a non-zero exit code.\n') + if stderr: + raise errors.VmUtil.CalledProcessException( + 'The command had output on stderr:\n%s' % stderr) + return stdout, stderr + + +def AddTags(resource_id, region, **kwargs): + """Adds tags to a Tencent cloud resource created by PerfKitBenchmarker. + + Args: + resource_id: An extant Tencent cloud resource to operate on. + region: The Tencent cloud region 'resource_id' was created in. + **kwargs: dict. Key-value pairs to set on the instance. + """ + if not kwargs: + return + + tag_cmd = TENCENT_PREFIX + [ + 'tag', 'AddResourceTag', + '--Resource', f'qcs::cvm:{region}::{INSTANCE}/{resource_id}' + ] + for _, (key, value) in enumerate(six.iteritems(kwargs)): + tmp_cmd = tag_cmd.copy() + tmp_cmd.extend([ + '--TagKey', str(key), + '--TagValue', str(value) + ]) + vm_util.IssueRetryableCommand(tmp_cmd) + + +def AddDefaultTags(resource_id, region): + """Adds tags to a Tencent cloud resource created by PerfKitBenchmarker. + + By default, resources are tagged with "owner" and "perfkitbenchmarker-run" + key-value + pairs. + + Args: + resource_id: An extant Tencent cloud resource to operate on. + region: The Tencent cloud region 'resource_id' was created in. + """ + tags = {'owner': FLAGS.owner, 'perfkitbenchmarker-run': FLAGS.run_uri} + AddTags(resource_id, region, **tags) diff --git a/script/cumulus/pkb/perfkitbenchmarker/publisher.py b/script/cumulus/pkb/perfkitbenchmarker/publisher.py new file mode 100755 index 0000000..75b775b --- /dev/null +++ b/script/cumulus/pkb/perfkitbenchmarker/publisher.py @@ -0,0 +1,1099 @@ +#!/usr/bin/env python + +# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
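GetRegionFromZone above derives the region by stripping the trailing availability-zone index with a regular expression. The short check below just exercises that same pattern; the zone strings are examples, and the single optional digit is a property of the regex as written.

import re

ZONE_RE = re.compile(r'([a-z]{2}-[a-z]+)-[0-9]?$')  # same pattern as above

for zone in ('ap-guangzhou-3', 'na-siliconvalley-1', 'ap-guangzhou'):
  m = ZONE_RE.match(zone)
  print(zone, '->', m.group(1) if m else 'not a valid Tencent zone')
# ap-guangzhou-3 -> ap-guangzhou
# na-siliconvalley-1 -> na-siliconvalley
# ap-guangzhou -> not a valid Tencent zone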
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Classes to collect and publish performance samples to various sinks.""" + + +import abc +import collections +import copy +import csv +import datetime +import fcntl +import itertools +import json +import logging +import math +import operator +import posixpath +import pprint +import requests +import sys +import time +from typing import List +import uuid + +from absl import flags +from perfkitbenchmarker import events +from perfkitbenchmarker import flag_util +from perfkitbenchmarker import log_util +from perfkitbenchmarker import sample as pkb_sample +from perfkitbenchmarker import stages +from perfkitbenchmarker import version +from perfkitbenchmarker import vm_util +import six +from six.moves import urllib +import six.moves.http_client as httplib + +FLAGS = flags.FLAGS + +flags.DEFINE_string( + 'product_name', + 'PerfKitBenchmarker', + 'The product name to use when publishing results.') + +flags.DEFINE_boolean( + 'official', + False, + 'A boolean indicating whether results are official or not. The ' + 'default is False. Official test results are treated and queried ' + 'differently from non-official test results.') + +flags.DEFINE_boolean( + 'hostname_metadata', + False, + 'A boolean indicating whether to publish VM hostnames as part of sample ' + 'metadata.') + +flags.DEFINE_string( + 'json_path', + None, + 'A path to write newline-delimited JSON results ' + 'Default: write to a run-specific temporary directory') +flags.DEFINE_enum( + 'json_write_mode', + 'w', + ['w', 'a'], + 'Open mode for file specified by --json_path. Default: overwrite file') +flags.DEFINE_boolean( + 'collapse_labels', + False, + 'Collapse entries in labels in JSON output.') +flags.DEFINE_string( + 'csv_path', + None, + 'A path to write CSV-format results') + +flags.DEFINE_string( + 'bigquery_table', + None, + 'The BigQuery table to publish results to. This should be of the form ' + '"[project_id:]dataset_name.table_name".') +flags.DEFINE_string( + 'bq_path', 'bq', 'Path to the "bq" executable.') +flags.DEFINE_string( + 'bq_project', None, 'Project to use for authenticating with BigQuery.') +flags.DEFINE_string( + 'service_account', None, 'Service account to use to authenticate with BQ.') +flags.DEFINE_string( + 'service_account_private_key', None, + 'Service private key for authenticating with BQ.') +flags.DEFINE_string( + 'application_default_credential_file', None, + 'Application default credentials file for authenticating with BQ.') + +flags.DEFINE_string( + 'gsutil_path', 'gsutil', 'path to the "gsutil" executable') +flags.DEFINE_string( + 'cloud_storage_bucket', + None, + 'GCS bucket to upload records to. Bucket must exist. ' + 'This flag differs from --hourly_partitioned_cloud_storage_bucket ' + 'by putting records directly in the bucket.') +PARTITIONED_GCS_URL = flags.DEFINE_string( + 'hourly_partitioned_cloud_storage_bucket', None, + 'GCS bucket to upload records to. Bucket must exist. This flag differs ' + 'from --cloud_storage_bucket by putting records in subfolders based on ' + 'time of publish. i.e. 
gs://bucket/YYYY/mm/dd/HH/data.') +flags.DEFINE_string( + 'es_uri', None, + 'The Elasticsearch address and port. e.g. http://localhost:9200') + +flags.DEFINE_string( + 'es_index', 'perfkit', 'Elasticsearch index name to store documents') + +flags.DEFINE_string('es_type', 'result', 'Elasticsearch document type') + +flags.DEFINE_multi_string( + 'metadata', + [], + 'A colon separated key-value pair that will be added to the labels field ' + 'of all samples as metadata. Multiple key-value pairs may be specified ' + 'by separating each pair by commas.') + +flags.DEFINE_string( + 'influx_uri', None, + 'The Influx DB address and port. Expects the format hostname:port' + 'If port is not passed in it assumes port 80. e.g. localhost:8086') + +flags.DEFINE_string( + 'influx_db_name', 'perfkit', + 'Name of Influx DB database that you wish to publish to or create') + +flags.DEFINE_boolean( + 'publish_config', + True, + 'A boolean indicating whether to publish user config including ' + 'command line and user configuration file contents.') + +flags.DEFINE_boolean( + 'record_log_publisher', True, + 'Whether to use the log publisher or not.') + +DEFAULT_JSON_OUTPUT_NAME = 'perfkitbenchmarker_results.json' +DEFAULT_CREDENTIALS_JSON = 'credentials.json' +GCS_OBJECT_NAME_LENGTH = 20 +# A list of SamplePublishers that can be extended to add support for publishing +# types beyond those in this module. The classes should not require any +# arguments to their __init__ methods. The SampleCollector will unconditionally +# call PublishSamples using Publishers added via this method. +EXTERNAL_PUBLISHERS = [] + + +def PublishRunStageSamples(benchmark_spec, samples): + """Publishes benchmark run-stage samples immediately. + + Typically, a benchmark publishes samples by returning them from the Run + function so that they can be pubished at set points (publish periods or at the + end of a run). This function can be called to publish the samples immediately. + + Note that metadata for the run number will not be added to such samples. + TODO(deitz): Can we still add the run number? This will require passing a run + number or callback to the benchmark Run functions (or some other mechanism). + + Args: + benchmark_spec: The BenchmarkSpec created for the benchmark. + samples: A list of samples to publish. + """ + events.samples_created.send( + stages.RUN, benchmark_spec=benchmark_spec, samples=samples) + collector = SampleCollector() + collector.AddSamples(samples, benchmark_spec.name, benchmark_spec) + collector.PublishSamples() + + +def GetLabelsFromDict(metadata): + """Converts a metadata dictionary to a string of labels sorted by key. + + Args: + metadata: a dictionary of string key value pairs. + + Returns: + A string of labels, sorted by key, in the format that Perfkit uses. 
+ """ + labels = [] + for k, v in sorted(six.iteritems(metadata)): + labels.append('|%s:%s|' % (k, v)) + return ','.join(labels) + + +def FormatTimestampForElasticsearch(epoch_us): + """Convert the floating epoch timestamp in micro seconds epoch_us to + yyyy-MM-dd HH:mm:ss.SSSSSS in string + """ + ts = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(epoch_us)) + num_dec = ("%.6f" % (epoch_us - math.floor(epoch_us))).split('.')[1] + new_ts = '%s.%s' % (ts, num_dec) + return new_ts + + +def DeDotKeys(res): + """Recursively replace dot with underscore in all keys in a dictionary.""" + for key, value in res.copy().items(): + if isinstance(value, dict): + DeDotKeys(value) + new_key = key.replace('.', '_') + if new_key != key: + res[new_key] = res.pop(key) + return res + + +class MetadataProvider(six.with_metaclass(abc.ABCMeta, object)): + """A provider of sample metadata.""" + + @abc.abstractmethod + def AddMetadata(self, metadata, benchmark_spec): + """Add metadata to a dictionary. + + Existing values will be overwritten. + + Args: + metadata: dict. Dictionary of metadata to update. + benchmark_spec: BenchmarkSpec. The benchmark specification. + + Returns: + Updated 'metadata'. + """ + raise NotImplementedError() + + +class DefaultMetadataProvider(MetadataProvider): + """Adds default metadata to samples.""" + + def AddMetadata(self, metadata, benchmark_spec): + metadata = metadata.copy() + metadata['perfkitbenchmarker_version'] = version.VERSION + if FLAGS.simulate_maintenance: + metadata['simulate_maintenance'] = True + if FLAGS.hostname_metadata: + metadata['hostnames'] = ','.join([vm.hostname + for vm in benchmark_spec.vms]) + if benchmark_spec.container_cluster: + cluster = benchmark_spec.container_cluster + for k, v in six.iteritems(cluster.GetResourceMetadata()): + metadata['container_cluster_' + k] = v + + if benchmark_spec.relational_db: + db = benchmark_spec.relational_db + for k, v in six.iteritems(db.GetResourceMetadata()): + # TODO(jerlawson): Rename to relational_db. + metadata['managed_relational_db_' + k] = v + + for name, tpu in six.iteritems(benchmark_spec.tpu_groups): + for k, v in six.iteritems(tpu.GetResourceMetadata()): + metadata['tpu_' + k] = v + + for name, vms in six.iteritems(benchmark_spec.vm_groups): + if len(vms) == 0: + continue + # Get a representative VM so that we can publish the cloud, zone, + # machine type, and image. + vm = vms[-1] + name_prefix = '' if name == 'default' else name + '_' + for k, v in six.iteritems(vm.GetResourceMetadata()): + metadata[name_prefix + k] = v + metadata[name_prefix + 'vm_count'] = len(vms) + for k, v in six.iteritems(vm.GetOSResourceMetadata()): + metadata[name_prefix + k] = v + + if vm.scratch_disks: + data_disk = vm.scratch_disks[0] + metadata[name_prefix + 'data_disk_count'] = len(vm.scratch_disks) + for key, value in six.iteritems(data_disk.GetResourceMetadata()): + metadata[name_prefix + 'data_disk_0_%s' % (key,)] = value + + if FLAGS.set_files: + metadata['set_files'] = ','.join(FLAGS.set_files) + if FLAGS.sysctl: + metadata['sysctl'] = ','.join(FLAGS.sysctl) + + # Flatten all user metadata into a single list (since each string in the + # FLAGS.metadata can actually be several key-value pairs) and then iterate + # over it. 
+ parsed_metadata = flag_util.ParseKeyValuePairs(FLAGS.metadata) + metadata.update(parsed_metadata) + return metadata + + +DEFAULT_METADATA_PROVIDERS = [DefaultMetadataProvider()] + + +class UserConfigurationMetadataProvider(MetadataProvider): + + def AddMetadata(self, metadata, benchmark_spec): + metadata = metadata.copy() + metadata['user_cmdline'] = " ".join(sys.argv[:]) + return metadata + + +class SamplePublisher(six.with_metaclass(abc.ABCMeta, object)): + """An object that can publish performance samples.""" + + @abc.abstractmethod + def PublishSamples(self, samples: List[pkb_sample.SampleDict]): + """Publishes 'samples'. + + PublishSamples will be called exactly once. Calling + SamplePublisher.PublishSamples multiple times may result in data being + overwritten. + + Args: + samples: list of dicts to publish. + """ + raise NotImplementedError() + + +class CSVPublisher(SamplePublisher): + """Publisher which writes results in CSV format to a specified path. + + The default field names are written first, followed by all unique metadata + keys found in the data. + """ + + _DEFAULT_FIELDS = ('timestamp', 'test', 'metric', 'value', 'unit', + 'product_name', 'official', 'owner', 'run_uri', + 'sample_uri') + + def __init__(self, path): + super().__init__() + self._path = path + + def PublishSamples(self, samples): + samples = list(samples) + # Union of all metadata keys. + meta_keys = sorted( + set(key for sample in samples for key in sample['metadata'])) + + logging.info('Writing CSV results to %s', self._path) + with open(self._path, 'w') as fp: + writer = csv.DictWriter(fp, list(self._DEFAULT_FIELDS) + meta_keys) + writer.writeheader() + + for sample in samples: + d = {} + d.update(sample) + d.update(d.pop('metadata')) + writer.writerow(d) + + +class PrettyPrintStreamPublisher(SamplePublisher): + """Writes samples to an output stream, defaulting to stdout. + + Samples are pretty-printed and summarized. Example output (truncated): + + -------------------------PerfKitBenchmarker Results Summary-------------- + COREMARK: + num_cpus="4" + Coremark Score 44145.237832 + End to End Runtime 289.477677 seconds + NETPERF: + client_machine_type="n1-standard-4" client_zone="us-central1-a" .... + TCP_RR_Transaction_Rate 1354.04 transactions_per_second (ip_type="ext ... + TCP_RR_Transaction_Rate 3972.70 transactions_per_second (ip_type="int ... + TCP_CRR_Transaction_Rate 449.69 transactions_per_second (ip_type="ext ... + TCP_CRR_Transaction_Rate 1271.68 transactions_per_second (ip_type="int ... + TCP_STREAM_Throughput 1171.04 Mbits/sec (ip_type="ext ... + TCP_STREAM_Throughput 6253.24 Mbits/sec (ip_type="int ... + UDP_RR_Transaction_Rate 1380.37 transactions_per_second (ip_type="ext ... + UDP_RR_Transaction_Rate 4336.37 transactions_per_second (ip_type="int ... + End to End Runtime 444.33 seconds + + ------------------------- + For all tests: cloud="GCP" image="ubuntu-14-04" machine_type="n1-standa ... + + Attributes: + stream: File-like object. Output stream to print samples. + """ + + def __init__(self, stream=None): + super().__init__() + self.stream = stream or sys.stdout + + def __repr__(self): + return '<{0} stream={1}>'.format(type(self).__name__, self.stream) + + def _FindConstantMetadataKeys(self, samples): + """Finds metadata keys which are constant across a collection of samples. + + Args: + samples: List of dicts, as passed to SamplePublisher.PublishSamples. + + Returns: + The set of metadata keys for which all samples in 'samples' have the same + value. 
+ """ + unique_values = {} + + for sample in samples: + for k, v in six.iteritems(sample['metadata']): + if len(unique_values.setdefault(k, set())) < 2 and v.__hash__: + unique_values[k].add(v) + + # Find keys which are not present in all samples + for sample in samples: + for k in frozenset(unique_values) - frozenset(sample['metadata']): + unique_values[k].add(None) + + return frozenset(k for k, v in six.iteritems(unique_values) + if len(v) == 1 and None not in v) + + def _FormatMetadata(self, metadata): + """Format 'metadata' as space-delimited key="value" pairs.""" + return ' '.join('{0}="{1}"'.format(k, v) + for k, v in sorted(six.iteritems(metadata))) + + def PublishSamples(self, samples): + # result will store the formatted text, then be emitted to self.stream and + # logged. + result = six.StringIO() + dashes = '-' * 25 + result.write('\n' + dashes + + 'PerfKitBenchmarker Results Summary' + + dashes + '\n') + + if not samples: + logging.debug('Pretty-printing results to %s:\n%s', self.stream, + result.getvalue()) + self.stream.write(result.getvalue()) + return + + key = operator.itemgetter('test') + samples = sorted(samples, key=key) + globally_constant_keys = self._FindConstantMetadataKeys(samples) + + for benchmark, test_samples in itertools.groupby(samples, key): + test_samples = list(test_samples) + # Drop end-to-end runtime: it always has no metadata. + non_endtoend_samples = [i for i in test_samples + if i['metric'] != 'End to End Runtime'] + locally_constant_keys = ( + self._FindConstantMetadataKeys(non_endtoend_samples) - + globally_constant_keys) + all_constant_meta = globally_constant_keys.union(locally_constant_keys) + + benchmark_meta = {k: v + for k, v in six.iteritems(test_samples[0]['metadata']) + if k in locally_constant_keys} + result.write('{0}:\n'.format(benchmark.upper())) + + if benchmark_meta: + result.write(' {0}\n'.format( + self._FormatMetadata(benchmark_meta))) + + for sample in test_samples: + meta = {k: v for k, v in six.iteritems(sample['metadata']) + if k not in all_constant_meta} + if not isinstance(sample['value'], six.string_types): + result.write(' {0:<30s} {1:>15f} {2:<30s}'.format( + sample['metric'], sample['value'], sample['unit'])) + else: + result.write(' {0:<30s} {1:<30s} {2:<30s}'.format( + sample['metric'], sample['value'], sample['unit'])) + + if meta: + result.write(' ({0})'.format(self._FormatMetadata(meta))) + result.write('\n') + + global_meta = {k: v for k, v in six.iteritems(samples[0]['metadata']) + if k in globally_constant_keys} + result.write('\n' + dashes + '\n') + result.write('For all tests: {0}\n'.format( + self._FormatMetadata(global_meta))) + + value = result.getvalue() + logging.debug('Pretty-printing results to %s:\n%s', self.stream, value) + self.stream.write(value) + + +class LogPublisher(SamplePublisher): + """Writes samples to a Python Logger. + + Attributes: + level: Logging level. Defaults to logging.INFO. + logger: Logger to publish to. Defaults to the root logger. 
+ """ + + def __init__(self, level=logging.INFO, logger=None): + super().__init__() + self.level = level + self.logger = logger or logging.getLogger() + self._pprinter = pprint.PrettyPrinter() + + def __repr__(self): + return '<{0} logger={1} level={2}>'.format(type(self).__name__, self.logger, + self.level) + + def PublishSamples(self, samples): + header = '\n' + '-' * 25 + 'PerfKitBenchmarker Complete Results' + '-' * 25 + self.logger.log(self.level, header) + for sample in samples: + self.logger.log(self.level, self._pprinter.pformat(sample)) + + +# TODO: Extract a function to write delimited JSON to a stream. +class NewlineDelimitedJSONPublisher(SamplePublisher): + """Publishes samples to a file as newline delimited JSON. + + The resulting output file is compatible with 'bq load' using + format NEWLINE_DELIMITED_JSON. + + If 'collapse_labels' is True, metadata is converted to a flat string with key + 'labels' via GetLabelsFromDict. + + Attributes: + file_path: string. Destination path to write samples. + mode: Open mode for 'file_path'. Set to 'a' to append. + collapse_labels: boolean. If true, collapse sample metadata. + """ + + def __init__(self, file_path, mode='wt', collapse_labels=True): + super().__init__() + self.file_path = file_path + self.mode = mode + self.collapse_labels = collapse_labels + + def __repr__(self): + return '<{0} file_path="{1}" mode="{2}">'.format( + type(self).__name__, self.file_path, self.mode) + + def PublishSamples(self, samples): + logging.info('Publishing %d samples to %s', len(samples), + self.file_path) + with open(self.file_path, self.mode) as fp: + fcntl.flock(fp, fcntl.LOCK_EX) + for sample in samples: + sample = sample.copy() + if self.collapse_labels: + sample['labels'] = GetLabelsFromDict(sample.pop('metadata', {})) + fp.write(json.dumps(sample) + '\n') + + +class BigQueryPublisher(SamplePublisher): + """Publishes samples to BigQuery. + + Attributes: + bigquery_table: string. The bigquery table to publish to, of the form + '[project_name:]dataset_name.table_name' + project_id: string. Project to use for authenticating with BigQuery. + bq_path: string. Path to the 'bq' executable'. + service_account: string. Use this service account email address for + authorization. For example, 1234567890@developer.gserviceaccount.com + service_account_private_key: Filename that contains the service account + private key. Must be specified if service_account is specified. + application_default_credential_file: Filename that holds Google applciation + default credentials. Cannot be set alongside service_account. 
+ """ + + def __init__(self, + bigquery_table, + project_id=None, + bq_path='bq', + service_account=None, + service_account_private_key_file=None, + application_default_credential_file=None): + super().__init__() + self.bigquery_table = bigquery_table + self.project_id = project_id + self.bq_path = bq_path + self.service_account = service_account + self.service_account_private_key_file = service_account_private_key_file + self._credentials_file = vm_util.PrependTempDir(DEFAULT_CREDENTIALS_JSON) + self.application_default_credential_file = ( + application_default_credential_file) + + if ((self.service_account is None) != + (self.service_account_private_key_file is None)): + raise ValueError('service_account and service_account_private_key ' + 'must be specified together.') + if (application_default_credential_file is not None and + self.service_account is not None): + raise ValueError('application_default_credential_file cannot be used ' + 'alongside service_account.') + + def __repr__(self): + return '<{0} table="{1}">'.format(type(self).__name__, self.bigquery_table) + + def PublishSamples(self, samples): + if not samples: + logging.warning('No samples: not publishing to BigQuery') + return + + with vm_util.NamedTemporaryFile(prefix='perfkit-bq-pub', + dir=vm_util.GetTempDir(), + suffix='.json') as tf: + json_publisher = NewlineDelimitedJSONPublisher(tf.name, + collapse_labels=True) + json_publisher.PublishSamples(samples) + tf.close() + logging.info('Publishing %d samples to %s', len(samples), + self.bigquery_table) + load_cmd = [self.bq_path] + if self.project_id: + load_cmd.append('--project_id=' + self.project_id) + if self.service_account: + assert self.service_account_private_key_file is not None + load_cmd.extend(['--service_account=' + self.service_account, + '--service_account_credential_file=' + + self._credentials_file, + '--service_account_private_key_file=' + + self.service_account_private_key_file]) + elif self.application_default_credential_file is not None: + load_cmd.append('--application_default_credential_file=' + + self.application_default_credential_file) + load_cmd.extend(['load', + '--autodetect', + '--source_format=NEWLINE_DELIMITED_JSON', + self.bigquery_table, + tf.name]) + vm_util.IssueRetryableCommand(load_cmd) + + +class CloudStoragePublisher(SamplePublisher): + """Publishes samples to a Google Cloud Storage bucket using gsutil. + + Samples are formatted using a NewlineDelimitedJSONPublisher, and written to a + the destination file within the specified bucket named: + +