diff --git a/LICENSE b/LICENSE
index 4032d952..0bb898dc 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,21 +1,188 @@
-MIT License
-
-Copyright (c) 2020 Noah's Ark Lab / Huawei
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+ Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+Apache License, Version 2.0
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
diff --git a/MANIFEST.in b/MANIFEST.in
index 5094c3bc..901d2a02 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,6 +1,5 @@
#dispatch files to site-packages
recursive-include docs *
-recursive-include evaluate_service *
recursive-include examples *
include LICENSE
include MANIFEST.in
diff --git a/README.cn.md b/README.cn.md
index b193cc3e..5ae3cbcc 100644
--- a/README.cn.md
+++ b/README.cn.md
@@ -9,13 +9,13 @@
---
-**Vega ver1.7.1 released**
+**Vega ver1.8.0 发布**
-- Bug修复:
+- 特性增强
- - 增加评估服务最大尝试次数限制.
- - 使用SafeLoader加载YAML文件.
- - 增加评估服务输入参数异常处理.
+ - 安全增强,组件间通信支持安全协议。
+ - 提供独立的评估服务安装。
+ - 更新Auto-lane模型,提供auto-lane推理代码。
---
@@ -30,16 +30,6 @@ Vega是诺亚方舟实验室自研的AutoML算法工具链,有主要特点:
5. 多Backend支持:支持PyTorch(GPU, Ascend 910), TensorFlow(GPU, Ascend 910), MindSpore(Ascend 910).。
6. 支持昇腾平台:支持在Ascend 910搜索和训练,支持在Ascend 310上模型评估。
-## AutoML工具特性
-
-| | 平台 | HPO算法 | NAS算法 | 端侧评估 | 模型过滤 | 统一网络 |
-| :--: | :-- | :-- | :-- | :-- | :-- | :-- |
-| **AutoGluon** | mxnet, PyTorch | Random Search, Bayesian, Hyper-Band | Random Search, RL | × | × | × |
-| **AutoKeras** | Keras | No Restrictions | Network Morphism | × | × | × |
-| **Model Search** | TensorFlow | No Restrictions | Random Search, Beam Search | × | × | × |
-| **NNI** | No Restrictions | Random Search and Grid Search, Bayesian, Annealing, Hyper-Band, Evolution, RL | Random Search, Gradient-Based, One-Shot | × | × | × |
-| **Vega** | PyTorch, TensorFlow, MindSpore | Random Search, Grid Search, Bayesian, Hyper-Band, Evolution | Random Search, Gradient-Based, Evalution, One-Shot | Ascend 310, Kirin 980/990 | Quota (在NAS搜索中根据parameters, flops, latency过滤模型) | 提供同时用于PyTorch、TensorFlow和MindSpore的网络 |
-
## 算法列表
| 分类 | 算法 | 说明 | 参考 |
@@ -67,14 +57,12 @@ Vega是诺亚方舟实验室自研的AutoML算法工具链,有主要特点:
## 安装
-执行如下命令安装Vega和相关开源软件:
+执行如下命令安装Vega:
```bash
pip3 install --user --upgrade noah-vega
```
-若需要在Ascend 910训练环境上安装,请联系我们。
-
## 使用
使用`vega`命令来运行Vega应用,比如可执行如下命令运行`CARS`算法:
@@ -86,12 +74,18 @@ vega ./examples/nas/cars/cars.yml
其中`cars.yml`中包含了pipeline、搜索算法、搜索空间、训练参数等定义。
Vega提供了40+示例供参考:[示例](https://github.com/huawei-noah/vega/tree/master/examples)、[示例参考](./docs/cn/user/examples.md)、[配置参考](./docs/cn/user/config_reference.md)。
+安全模式适用于通信安全要求高的场景,在运行前请执行[安全配置](./docs/cn/security.md):
+
+```bash
+vega ./examples/nas/cars/cars.yml -s
+```
+
## 参考
| 对象 | 参考 |
| :--: | :-- |
-| [**用户**
(用户指南)](./docs/cn/user/README.md) | [安装指导](./docs/cn/user/install.md)、[部署指导](./docs/cn/user/deployment.md)、[配置指导](./docs/cn/user/config_reference.md)、[示例参考](./docs/cn/user/examples.md)、[评估服务](./docs/cn/user/evaluate_service.md)、任务参考([分类](./docs/cn/tasks/classification.md)、[检测](./docs/cn/tasks/detection.md)、[分割](./docs/cn/tasks/segmentation.md)、[超分](./docs/cn/tasks/segmentation.md)) |
-| [**开发者**
(开发者指南)](./docs/cn/developer/README.md) | [开发者指导](./docs/cn/developer/developer_guide.md)、[快速入门指导](./docs/cn/developer/quick_start.md)、[数据集指导](./docs/cn/developer/datasets.md)、[算法开发指导](./docs/cn/developer/new_algorithm.md)、[细粒度搜索空间指导](./docs/cn/developer/fine_grained_space.md) |
+| **用户** | [安装指导](./docs/cn/user/install.md)、[部署指导](./docs/cn/user/deployment.md)、[安全配置](./docs/cn/security.md)、[配置指导](./docs/cn/user/config_reference.md)、[示例参考](./docs/cn/user/examples.md)、[评估服务](./evaluate_service/docs/cn/evaluate_service.md) |
+| **开发者** | [开发者指导](./docs/cn/developer/developer_guide.md)、[快速入门指导](./docs/cn/developer/quick_start.md)、[数据集指导](./docs/cn/developer/datasets.md)、[算法开发指导](./docs/cn/developer/new_algorithm.md) |
## FAQ
@@ -113,4 +107,3 @@ Vega提供了40+示例供参考:[示例](https://github.com/huawei-noah/vega/t
## 合作和贡献
欢迎大家使用Vega,有任何疑问、求助、修改bug、贡献算法、完善文档,请在社区提交issue,我们会及时回复沟通交流。
-欢迎大家加入我们的QQ群: **833345709** 。
diff --git a/README.md b/README.md
index eb0f7eb7..f376176b 100644
--- a/README.md
+++ b/README.md
@@ -8,13 +8,13 @@
---
-**Vega ver1.7.1 released**
+**Vega ver1.8.0 released**
-- Bug fixes:
+- Feature enhancement:
- - Maximum number of evaluation service attempts.
- - Use SafeLoader to load the YAML file.
- - Catch evaluation service input parameter exceptions.
+ - Security enhancement: inter-component communication now supports security protocols.
+ - Provide evaluation service release package.
+ - Update the auto-lane model and provide auto-lane inference sample code.
---
@@ -29,16 +29,6 @@ Vega is an AutoML algorithm tool chain developed by Noah's Ark Laboratory, the m
5. Multi-Backend: PyTorch (GPU and Ascend 910), TensorFlow (GPU and Ascend 910), MindSpore (Ascend 910).
6. Ascend platform: Search and training on the Ascend 910 and model evaluation on the Ascend 310.
-## AutoML Tools Features
-
-| | Supported Frameworks | HPO Algorithms | NAS Algorithms | Device-Side Evaluation | Model Filter | Universal Network |
-| :--: | :-- | :-- | :-- | :-- | :-- | :-- |
-| **AutoGluon** | mxnet, PyTorch | Random Search, Bayesian, Hyper-Band | Random Search, RL | × | × | × |
-| **AutoKeras** | Keras | No Restrictions | Network Morphism | × | × | × |
-| **Model Search** | TensorFlow | No Restrictions | Random Search, Beam Search | × | × | × |
-| **NNI** | No Restrictions | Random Search and Grid Search, Bayesian, Annealing, Hyper-Band, Evolution, RL | Random Search, Gradient-Based, One-Shot | × | × | × |
-| **Vega** | PyTorch, TensorFlow, MindSpore | Random Search, Grid Search, Bayesian, Hyper-Band, Evolution | Random Search, Gradient-Based, Evalution, One-Shot | Ascend 310, Kirin 980/990 | Quota (filter model based on parameters, flops, latency) | provides networks compatibility with PyTorch, TensorFlow, and MindSpore |
-
## Algorithm List
| Category | Algorithm | Description | reference |
@@ -68,14 +58,12 @@ Vega is an AutoML algorithm tool chain developed by Noah's Ark Laboratory, the m
## Installation
-Run the following commands to install Vega and related open-source software:
+Run the following commands to install Vega:
```bash
pip3 install --user --upgrade noah-vega
```
-If you need to install the Ascend 910 training environment, please contact us.
-
## Usage
Run the `vega` command to run the Vega application. For example, run the following command to run the `CARS` algorithm:
@@ -87,12 +75,18 @@ vega ./examples/nas/cars/cars.yml
The `cars.yml` file contains definitions such as pipeline, search algorithm, search space, and training parameters.
Vega provides more than 40 examples for reference: [Examples](https://github.com/huawei-noah/vega/tree/master/examples), [Example Guide](./docs/en/user/examples.md), and [Configuration Guide](./docs/en/user/config_reference.md).
+The security mode is applicable to communication with high security requirements. Before running this command, perform the [security configuration](./docs/en/security.md).
+
+```bash
+vega ./examples/nas/cars/cars.yml -s
+```
+
## Reference
-| object | refrence |
+| Reader | Reference |
| :--: | :-- |
-| [**User**
(User Guide)](./docs/en/user/README.md) | [Install Guide](./docs/en/user/install.md), [Deployment Guide](./docs/en/user/deployment.md), [Configuration Guide](./docs/en/user/config_reference.md), [Examples](./docs/en/user/examples.md), [Evaluate Service](./docs/en/user/evaluate_service.md) |
-| [**Developer**
(Developer Guide)](./docs/en/developer/README.md) | [Development Reference](./docs/en/developer/developer_guide.md), [Quick Start Guide](./docs/en/developer/quick_start.md), [Dataset Guide](./docs/en/developer/datasets.md), [Algorithm Development Guide](./docs/en/developer/new_algorithm.md), [Fine-Grained Search Space Guide](./docs/en/developer/fine_grained_space.md) |
+| **User** | [Install Guide](./docs/en/user/install.md), [Deployment Guide](./docs/en/user/deployment.md), [Configuration Guide](./docs/en/user/config_reference.md), [Security Configuration](./docs/en/security.md), [Examples](./docs/en/user/examples.md), [Evaluate Service](./evaluate_service/docs/en/evaluate_service.md) |
+| **Developer** | [Development Reference](./docs/en/developer/developer_guide.md), [Quick Start Guide](./docs/en/developer/quick_start.md), [Dataset Guide](./docs/en/developer/datasets.md), [Algorithm Development Guide](./docs/en/developer/new_algorithm.md) |
## FAQ
@@ -114,4 +108,3 @@ For common problems and exception handling, please refer to [FAQ](./docs/en/user
## Cooperation and Contribution
Welcome to use Vega. If you have any questions or suggestions, need help, fix bugs, contribute new algorithms, or improve the documentation, submit an issue in the community. We will reply to and communicate with you in a timely manner.
-Welcome to join our QQ chatroom (Chinese): **833345709**.
diff --git a/RELEASE.md b/RELEASE.md
index ba30fc80..9564fff2 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,4 +1,4 @@
-**Vega ver1.7.1 released:**
+**Vega ver1.8.0 released:**
**Introduction**
@@ -19,4 +19,3 @@ Install Vega and the open source softwares that Vega depends on:
**Cooperation and Contribution**
Welcome to use Vega. If you have any questions or suggestions, need help, fix bugs, contribute new algorithms, or improve the documentation, submit an issue in the community. We will reply to and communicate with you in a timely manner.
-Welcome to join our QQ chatroom (Chinese): **833345709**.
diff --git a/docs/cn/algorithms/adelaide_ea.md b/docs/cn/algorithms/adelaide_ea.md
index ad3b424d..6cb60f6f 100644
--- a/docs/cn/algorithms/adelaide_ea.md
+++ b/docs/cn/algorithms/adelaide_ea.md
@@ -98,16 +98,4 @@ mutate:
### 4. 算法输出
-输出结果包括一系列.pth文件(训练到配置文件中```num_iter```迭代次数的模型)、```result.csv```文件以及```pareto_front.csv```文件。```result.csv```文件记录了所有搜索模型,```pareto_front.csv```文件记录了所有```pareto_front```模型。.csv文件中包含了```encoding```、```flops```、```parameters```以及```mIOU```:
-
-1. ```encoding```:19位字符串表示了模型的结构,19位字符串以“_”为结尾(避免以“0”开头的```encoding```造成记录错误)。
-
-2. ```flops```:记录的是模型的Macc值,如:1371603728表示的就是1.277G。
-
-3. ```parameters```:记录的是模型的parameters值,如:3162900表示的就是3.016M。
-
-4. ```mIOU```:记录的是训练到配置文件中num_iter迭代次数后的模型mIOU。
-
-## 5. Benchmark
-
-请参考 [adelaide_ea.yml](https://github.com/huawei-noah/vega/blob/master/examples/nas/adelaide_ea/adelaide_ea.yml)。
+输出结果包括预训练模型、架构描述文件、和性能结果文件,其中架构描述文件中,`encoding`使用19位字符串表示了模型的结构,19位字符串以“_”为结尾(避免以“0”开头的`encoding`造成记录错误)。
diff --git a/docs/cn/algorithms/modnas.md b/docs/cn/algorithms/modnas.md
index afa0aeb4..3ef14438 100644
--- a/docs/cn/algorithms/modnas.md
+++ b/docs/cn/algorithms/modnas.md
@@ -302,10 +302,6 @@ search_space:
现在,我们在基础模型之上定义了一个超网,其中原来的卷积算子被指定的混合算子和原语替换。然后,可以通过将搜索空间与选定的优化器和估计器匹配来设置搜索例程。
-## 已知问题
-
-- 目前, ModularNAS例程在单独的线程中运行,并监听Vega中的条件变量,这可能导致死锁。
-
## 参考文献
[^fn1]: Liu, H., Simonyan, K., and Yang, Y. Darts: Differentiable architecture search. ArXiv, abs/1806.09055, 2019b.
diff --git a/docs/cn/algorithms/nago.md b/docs/cn/algorithms/nago.md
index 9ca0d424..a8b0c8d3 100644
--- a/docs/cn/algorithms/nago.md
+++ b/docs/cn/algorithms/nago.md
@@ -88,7 +88,4 @@ search_algorithm:
### 5. 算法输出
-以下两个输出文件会在指定的输出目录中生成(默认输出目录是 `./example/tasks//output/nas/` ):
-
-- `output.csv` 文件包含了BOHB推荐的最优网络结构生成器的超参数值。
-- `reports.csv` 文件包含了BOHB搜索过程中评估过的所有超参组合的数据。
+包含最优超参的文件 `desc_nn.json` 在目录`./tasks/<task id>/output/nas/`中。
diff --git a/docs/cn/algorithms/pba.md b/docs/cn/algorithms/pba.md
index 917934e3..5df6a6e3 100644
--- a/docs/cn/algorithms/pba.md
+++ b/docs/cn/algorithms/pba.md
@@ -105,18 +105,3 @@ PBA算法在vega pipeline上使用参数配置文件中的默认参数(搜索阶
|:--:|:--:|:--:|:--:|:--:|
|Ho et at.,2019|96.13%|96.92%|97.32%|97.42%|
|Vega Pipeline|96.26%|97.18%| \ |97.57%|
-
-最终输出文件和目录如下:
-
-```text
-output:
- best_hps.json: 其中为pba算法搜索得到的最佳数据增广策略表及其搜索阶段的ID与得分
- hps.csv: 其中为pba算法搜索阶段得到的16组数据增广策略表的ID与得分
- score_board.csv: 其中为pba算法搜索阶段得到的16组数据增广操作每轮迭代过程中的具体得分与状态
-workers:
- hpo: 其中16个文件夹分别为16组模型的最终结果,包括得分与模型等
- 0:
- 1:
- ...
- 16:
-```
diff --git a/docs/cn/algorithms/quant_ea.md b/docs/cn/algorithms/quant_ea.md
index 26b01d02..5241d242 100644
--- a/docs/cn/algorithms/quant_ea.md
+++ b/docs/cn/algorithms/quant_ea.md
@@ -73,8 +73,4 @@ nas和fully_train两个过程会依次进行,搜索过程会搜出Pareto前沿
### 5. 算法输出
-输出文件:
-
-- 搜索到的帕雷托前沿的模型经充分训练后得到的模型及结果
-- `reports.csv`包含了搜索过程中所有模型的encoding/flops/parameters/accuracy;
-- `output.csv`包含了搜索出来的pareto front的信息。
+输出为搜索到的帕雷托前沿的模型经充分训练后得到的模型及结果,在目录`./tasks/<task id>/output/nas/`中。
diff --git a/docs/cn/algorithms/sp_nas.md b/docs/cn/algorithms/sp_nas.md
index 4bdfd198..da90d624 100644
--- a/docs/cn/algorithms/sp_nas.md
+++ b/docs/cn/algorithms/sp_nas.md
@@ -137,4 +137,4 @@ fine_tune:
## Benchmark
-Benchmark配置信息请参考: [spnas.yml](https://github.com/huawei-noah/vega/tree/master/examples/nas/sp_nas/spnas.yml)
+Benchmark配置信息请参考: [spnas.yml](https://github.com/huawei-noah/vega/blob/master/examples/nas/sp_nas/spnas.yml)
diff --git a/docs/cn/algorithms/sr_ea.md b/docs/cn/algorithms/sr_ea.md
index 74bc01f8..52e5f704 100644
--- a/docs/cn/algorithms/sr_ea.md
+++ b/docs/cn/algorithms/sr_ea.md
@@ -81,7 +81,4 @@ mutate:
### 算法输出
-算法的输出有
-
-- 搜索到的帕雷托前沿的模型经充分训练后得到的模型及结果。
-- 随机搜索及进化搜索过程中所有模型的结果reports.csv,以及帕雷托前沿的结果output.csv。
+算法的输出为搜索到的帕雷托前沿的模型经充分训练后得到的模型及结果。
diff --git a/docs/cn/developer/developer_guide.md b/docs/cn/developer/developer_guide.md
index 4d5e5db2..c428446a 100644
--- a/docs/cn/developer/developer_guide.md
+++ b/docs/cn/developer/developer_guide.md
@@ -1,17 +1,11 @@
# 开发参考
+**已过时,待刷新。**
+
## 1. Vega简介
Vega的重点特性是网络架构搜索和超参优化,在网络架构搜索流程中,搜索空间`Search Space`、搜索算法`Search Algorithm`是核心部分,并通过`Generator`来控制搜索的采样、更新和结束等流程步骤。
-搜索空间和搜索算法的类图如下所示:
-
-![Search Space类图](../../images/search_space_classes.png)
-
-搜索空间和搜索算法的流程图如下所示:
-
-![Search Space流程图](../../images/search_space_flow.png)
-
以下就分别介绍下面几个部分:
* 搜索空间
diff --git a/docs/cn/developer/fine_grained_search_space.md b/docs/cn/developer/fine_grained_search_space.md
deleted file mode 100644
index fb2fe433..00000000
--- a/docs/cn/developer/fine_grained_search_space.md
+++ /dev/null
@@ -1,268 +0,0 @@
-# 搜索空间和细粒度网络指导
-
-## 1. 细粒度简介
-
-在Automl的大多数算法中搜索空间和网络是强相关的,每种搜索算法都会定义一系列与之识别的搜索空间和网络类型,这些网络类型大都在基础网络上做一些较少的改动,导致网络不能复用。另外,搜索空间和搜索算法也是强耦合的,每个算法都有自己的搜索空间的定义,这种搜索空间只能用于特定的场景,缺乏通用性和扩展能力。
-
-我们对这些问题进行了分析,提出了通用的SearchSpace细粒度网络的方案:
-
-- 能够统一搜索空间的定义方式,同一种搜索空间能够适配不同的搜索算法
-- 能够对基础网络进行复用,提供细粒度的网络,通过组合的模式构建出不同形式的网络。
-- 搜索空间能够根据定义出来的网络自由扩展。
-- 支持多个backend
-
-## 2. 细粒度演示
-
-### 2.1. 使用细粒度构建一个网络
-
-- 继承Module基类,并调用`@ClassFactory.register(ClassType.NETWORK)`注册网络
-- 沿用了pytorch的风格,我们会将`self.xx`的变量放入到模块中,默认按照顺序执行。
-- 如果需要自定义moduels的执行顺序,可以重写`call`方法
-
-```python
-from vega.common import ClassFactory, ClassType
-from vega.modules.module import Module
-from vega.modules.operators import ops
-
-@ClassFactory.register(ClassType.NETWORK)
-class SimpleCnn(Module):
- def __init__(self, block_nums=3, filters=32, kernel_size=3):
- super(SimpleCnn, self).__init__()
- in_channels = 3
- out_channels = filters
- output_size = 32
- for i in range(block_nums):
- block = ConvBlock(in_channels, out_channels, kernel_size)
- self.add_module("block{}".format(i), block)
- in_channels = out_channels
- output_size = (output_size - kernel_size + 1) // 2
- self.fc1 = ops.Linear(in_channels * output_size * output_size, 120)
- self.relu = ops.Relu()
- self.fc2 = ops.Linear(120, 10)
-
-@ClassFactory.register(ClassType.NETWORK)
-class ConvBlock(Module):
- def __init__(self, in_channels, out_channels, kernel_size=3):
- super(ConvBlock, self).__init__()
- self.conv = ops.Conv2d(in_channels, out_channels, kernel_size)
- self.bn = ops.BatchNorm2d(out_channels)
- self.relu = ops.Relu()
- self.pool = ops.MaxPool2d((2, 2))
-
- def call(x):
- x = self.conv(x)
- x = self.bn(x)
- x = self.relu(x)
- return self.pool(x)
-
-model = SimpleCnn()
-print(model)
-```
-
-### 2.2. 定义Search Space并使用随机搜索对网络进行搜索
-
-- 利用Vega的pipeline能力
-
- ```yaml
- pipeline: [hpo]
-
- hpo:
- pipe_step:
- type: SearchPipeStep
-
- search_algorithm:
- type: RandomSearch
-
- search_space:
- type: SearchSpace
- hyperparameters:
- - key: backbone.block1.conv.in_channels
- type: CATEGORY
- range: [8, 16, 32, 64, 128, 256]
- model:
- model_desc:
- modules: ["backbone"]
- backbone:
- type: SimpleCnn
- dataset:
- type: Cifar10
- common:
- data_path: /cache/datasets/cifar10/
- batch_size: 256
- trainer:
- type: Trainer
- epochs: 1
- ```
-
-- 编写代码单独使用
-
-```python
-from vega.algorithms.hpo.random_hpo import RandomSearch
-from vega.core.search_space import SearchSpace
-from vega.core.search_space.param_types import ParamTypes
-from vega.core.search_space.params_factory import ParamsFactory
-from vega.networks.network_desc import NetworkDesc
-
-# SearchSpace的定义
-params = ParamsFactory.create_search_space(
- param_name='backbone.block1.conv.in_channels',
- param_type=ParamTypes.CATEGORY,
- param_range=[8, 16, 32, 64, 128, 256])
-search_space = SearchSpace().add_hp(params)
-# 搜索算法
-id, desc = RandomSearch(search_space).search()
-# 解析成模型
-model = NetworkDesc(desc).to_model()
-print(model)
-```
-
-## 3. 网络模块化分组
-
-为了方便网络模块的重用,我们将细粒度的模块按照其功能的不同,进行了分组,每个分组都有其相应的特性。
-
-- **Networks**:定义一个常用的网络,属于粗粒度的网络,如ResNet 和FasterRcnn。网络是其他分组中的子模块。
-- **Backbone**:骨干网络。通常采用backbone+ head的模式组成一个网络。在很多场景下我们可以自由的替换不同的backbone已达到处理不同的featureMap。
-- **Head**:一般用于特征融合,例如作为分类或者回归问题。这样可以确保更换不同的头,以适应不同的场景。
-- **Cells:**组合多个blocks,我们定义了多种Cells来定义组合场景.
-- **Blocks**:由基本的算子构成,组合成一个特定功能的block。我们提供给了一些常用的block,这些Block可以用于不同的网络中。
-- **Connections**:定义模块之间的连接关系,包括Sequential、Add等,以及一些条件分支的实现语句,如Repeat。
-- **Operators:**定义底层算子,如conv、batch_normal等,我们在此对每个算子做了多个平台的适配,统一了对外的输入输出和接口调用。
-
-例如一个ResNet18的组成如下:
-
-![resnet](../../images/resnet.png)
-
-## 4. Search Space的定义
-
-Search Space 分为**hyper_parameters**和**condition**两部分:
-
-**hyper_parameters**
-
-用于表示超参的定义,包含key,type和value三个设置:key表示超参的名称,type指定了超参的类型即ParamType,系统根据ParamType选择不同的采样方式。range表示定义的采样的范围。
-
-我们当前预置了如下几种ParamType:
-
-- **INT**: 从一个整数范围上采样一个值,如果range=[0, 10],表示从0到10中随机采样出一个value
-
-- **INT_EXP:**在整数范围上按照10的指数级采样方式采样一个值,如range=[0, 1000],会通过log函数映射到[0,10,100,1000]这几个值上
-
-- **INT_CAT**:表示从多个INT类型的数值中选择一个,如range=[16, 32, 64, 128]
-
-- **FLOAT:** 从一个Float范围上采样一个值,如range=[0.001, 1],采样一个值
-
-- **FLOAT_EXP**:在Float类型范围上按照10的指数级采样方式采样一个值,如range=[0.001, 1],会通过log函数映射到[1,0.01,0.001]这几个值上
-
-- **FLOAT_CAT :** 表示从多个FLOAT类型的数值中选择一个,如range=[0.1, 0.01, 0.001, 0.99]
-
-- **STRING:** 表示从多个字符串中选择一个,如range=[‘block1’, 'block2', 'block3', 'block4']
-
-
-
-**condition**
-
-用于表示2个节点之间的关系,当parent满足一定条件时,child节点才会生效
-
-![img](http://hi3ms-image.huawei.com/hi/staticimages/hi3msh/images/2019/0731/15/5d414a699c009.png)![img](http://image.huawei.com/tiny-lts/v1/images/9ed3126327ed5a8abb80_844x290.png@900-0-90-f.png)
-
-这里用一个condition_range来传入条件的值或者范围。具体的:
-
-- **EQUAL**:condition_range只能包含一个parent的数值,表示child被选择,需要满足parent的值**等于**该数值;
-- **NOT_EQUAL**:condition_range可以包含一个或多个parent的数值,表示child被选择,需要满足parent的值**不等于**condition_range中的提供的所有数值;
-- **IN**:如果parent是range类型的,则condition_range必须包含两个值表示该cond_range的最小值和最大值,child被选中必须满足parent当前值落在该cond_range范围内;如果parent是CAT类型的,则condition_range必须包含一个或者多个parent数值,child被选中必须满足parent当前值落在condition_range中的某个数值上。
-
-**forbidden**
-
-用于表示2节点之间的值的互斥关系,节点1含有某个值时,节点2的某些值不会被选择
-
-## 5. 支持多个Backend
-
-我们对底层架构做了封装,统一上层的接口来适配多个不同的backend。其主要核心功能分为:
-
-- **Module**:实现自定义模块的需要继承的基类,统一了各个平台的对于模块内部操作的实现。
-- **ops**:上层调用算子的接口,统一了不同平台同一功能算子的命名和输入输出。
-- **Serializable:** 对模块中的超参和层次结构进行提取和解析,并序列化成json格式的字典。
-
-![fine_grained_space](../../images/fine_grained_space.png)
-
-## 6. 如何进行细粒度网络的开发
-
-对于算法开发者来说,我们希望其聚焦于网络结构和超参的搜索算法的开发,而不用关心网络本身构建。当前已预置了一些Modules和Networks能够提供该类型网络的超参定义和架构定义的描述,算法开发者只需要根据其描述通过搜索算法装配成新的网络。
-
-### 6.1 定义一个Modules
-
-为了方便大家的使用,我们继承了pytorch的开发习惯,仅仅需要几行的变化就可以成为细粒度中的一个Module。
-
-- 继承Module类,注册到`ClassFactory.register(ClassType.NETWORK)`中
-- 使用ops下的算子替换nn下的算子
-- 对于顺序执行的网络结构,我们默认会按照self的顺序生成网络,无需再实现forward方法
-
-```python
-@ClassFactory.register(ClassType.NETWORK)
-class ConvBlock(Module):
- def __init__(self, in_channels, out_channels, kernel_size=3):
- super(ConvBlock, self).__init__()
- self.conv = ops.conv2d(in_channels, out_channels, kernel_size)
- self.bn = ops.batch_norm2d(out_channels)
- self.relu = ops.relu()
- self.pool = ops.max_pool2d((2, 2))
-```
-
-- 如果对于输入需要进行特殊的处理,可以根据自己的需要重写`call`方法
-
- ```python
- @ClassFactory.register(ClassType.NETWORK)
- class MixedOp(Module):
-
- def __init__(self, C, stride, ops_cands):
- """Init MixedOp."""
- super(MixedOp, self).__init__()
- self.add_spaces(ops_cands, OPS[ops_cands](C, stride, True))
-
- def call(self, x, weights=None, *args, **kwargs):
- """Call function of MixedOp."""
- if weights is None:
- for model in self.children():
- x = model(x)
- return x
- return ops.add_n(weights[idx] * op(x) for idx, op in enumerate(self.children()) if weights[idx] != 0)
- ```
-
-### 6.2 使用Connections组装多个模块
-
-我们默认都会采用Sequential的方式组装多个网络,当其他的连接方法时需要手动调用连接的方法。如下面样例采用Add作为两个网络的加和拼接
-
-```python
-@ClassFactory.register(ClassType.NETWORK)
-class BasicBlock(Module):
- """Create BasicBlock SearchSpace."""
-
- def __init__(self, inchannel, outchannel, groups=1, base_width=64, stride=1):
- super(BasicBlock, self).__init__()
- base_conv = BasicConv(inchannel,outchannel)
- shortcut = ShortCut(inchannel,outchannel)
- self.add_block = Add(base_conv, shortcut)
- self.relu = ops.relu()
-```
-
-开发者也可以自己定义Connections:
-
-- 继承`ConnectionsDecorator`,并注册到`ClassFactory.register(ClassType.NETWORK)`
-- init函数接受入参为`*models`,表示接受多个模块,我们会自动调用add_module将这些模块设置到modules中
-- 重写`call`方法,通过`self.children()`获取已经添加的模块,并进行详细的操作
-
-```python
-@ClassFactory.register(ClassType.NETWORK)
-class Sequential(ConnectionsDecorator):
- """Sequential Connections."""
-
- def __init__(self, *models):
- super(Sequential, self).__init__(*models)
-
- def compile(self, inputs):
- """Override compile function, conect models into a seq."""
- output = inputs
- models = self.children()
- for model in models:
- output = model(output)
- return output
-```
diff --git a/docs/cn/developer/new_algorithm.md b/docs/cn/developer/new_algorithm.md
index 73439deb..dead967a 100644
--- a/docs/cn/developer/new_algorithm.md
+++ b/docs/cn/developer/new_algorithm.md
@@ -1,5 +1,7 @@
# 算法开发指导
+**已过时,待刷新。**
+
向Vega库中新增算法,如新的网络搜索算法、模型压缩算法、超参优化算法、数据增广算法等,需要基于Vega提供的基础类进行扩展。AutoML算法的核心的核心是搜索空间、搜索算法、网络构造和评估,新增算法主要考虑这几个方面。
## 1. 新增架构搜索算法
diff --git a/docs/cn/user/ascend_910.md b/docs/cn/user/ascend_910.md
new file mode 100644
index 00000000..5ba594db
--- /dev/null
+++ b/docs/cn/user/ascend_910.md
@@ -0,0 +1,146 @@
+# 部署Ascend环境
+
+请参考Ascend官方文档部署Ascend环境,如下安装指导是安装过程中的关键步骤,若安装过程中出现问题,请以官方文档为准。
+在进行部署前,请在官方网站下载安装包。
+
+## 1 检查已安装的Driver和CANN版本
+
+若是全新的Ascend主机,需要检查是否存在`/usr/local/HiAi`目录,若存在,需要使用root账号执行如下命令卸载该目录:
+
+```bash
+/usr/local/HiAi/uninstall.sh
+```
+
+需要使用非root账号执行如下命令创建`Ascend`目录,并给该目录设置为用户`HwHiAiUser`可访问:
+
+```bash
+mkdir /usr/local/Ascend/
+sudo chown -R :HwHiAiUser /usr/local/Ascend/
+sudo chmod -R 750 /usr/local/Ascend/
+```
+
+若`/usr/local/Ascend/`已存在,则需要在安装前需要检查是否已安装了较旧的Driver和CANN包,请使用如下命令查询各个组件的版本号:
+
+```bash
+cat /usr/local/Ascend/driver/version.info
+cat /usr/local/Ascend/firmware/version.info
+cat /usr/local/Ascend/nnae/latest/ascend_nnae_install.info
+cat /usr/local/Ascend/ascend-toolkit/latest/arm64-linux/ascend_toolkit_install.info
+cat /usr/local/Ascend/tfplugin/latest/ascend_tfplugin_install.info
+```
+
+如上`/usr/local/Ascend`目录是较常使用的安装目录,也可能是其他自定义的安装目录,请以实际安装目录为准。
+
+若版本号较低,需要使用root账号执行卸载:
+
+```bash
+/usr/local/Ascend/driver/script/uninstall.sh
+/usr/local/Ascend/firmware/script/uninstall.sh
+/usr/local/Ascend/nnae/latest/script/uninstall.sh
+/usr/local/Ascend/ascend-toolkit/latest/arm64-linux/script/uninstall.sh
+/usr/local/Ascend/tfplugin/latest/script/uninstall.sh
+```
+
+若nnae、ascend-toolkit、tfplugin使用非root安装,请使用该用户卸载。
+
+## 2 安装Driver和CANN
+
+使用root用户执行如下命令安装,如下版本号供参考:
+
+```bash
+chmod +x *.run
+./A800-9000-npu-driver_21.0.3.1_linux-aarch64.run --full
+./A800-9000-npu-firmware_1.79.22.4.220.run --full
+```
+
+执行如下命令,确认安装是否成功:
+
+```bash
+npu-smi info
+```
+
+使用非root用户安装其他包,在安装前,需要将该用户设置为和`HwHiAiUser`同组:
+
+```bash
+usermod -a -G HwHiAiUser <username>
+```
+
+```bash
+./Ascend-cann-nnae_5.0.T306_linux-aarch64.run --install
+./Ascend-cann-nnrt_5.0.T306_linux-aarch64.run --install
+./Ascend-cann-tfplugin_5.0.T306_linux-aarch64.run --install
+./Ascend-cann-toolkit_5.0.T306_linux-aarch64.run --install
+```
+
+安装完成后,根据提示需要重启主机。
+
+## 3 配置rank_table_file
+
+请参考Ascend的官方文档,执行`hccn_tool`命令,生成`rank_table_file`。
+
+## 4 配置环境变量
+
+需要配置如下环境变量,建议放入`~/.bashrc`中:
+
+```bash
+export HOME_DIR=/home/<user>
+export HOST_ASCEND_BASE=/usr/local/Ascend
+export JOB_ID=<job_id>
+export DEVICE_ID=0
+export RANK_TABLE_FILE=<rank_table_file>
+export RANK_ID=0
+export RANK_SIZE=8
+export NPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export BATCH_TASK_INDEX=0
+export TF_CPP_MIN_LOG_LEVEL=3
+export LD_PRELOAD=/lib64/libgomp.so.1:$HOME_DIR/.local/lib/python3.7/site-packages/sklearn/__check_build/../../scikit_learn.libs/libgomp-d22c30c5.so.1.0.0
+export GLOG_v=3
+export USE_NPU=True
+source /usr/local/Ascend/tfplugin/set_env.sh
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+source /usr/local/Ascend/nnae/set_env.sh
+export PATH=$HOME_DIR/.local/bin:$PATH
+export PYTHONPATH=$HOME_DIR/.local/lib/python3.7/site-packages:$PYTHONPATH
+export LD_LIBRARY_PATH=$HOME_DIR/.local/lib/python3.7/site-packages/vega/security/kmc/aarch64:$LD_LIBRARY_PATH
+```
+
+如上`<user>`为当前用户名,`<job_id>`请设置一个整数,如`10087`,`<rank_table_file>`请设置为该文件的全路径。
+
+## 5 安装Vega及依赖包
+
+先升级pip到最新版本:
+
+```bash
+pip3 install --user --upgrade pip
+```
+
+先安装nnae、topi、hccl等组件包:
+
+```bash
+export fwk_path='/usr/local/Ascend/nnae/latest'
+export te_path=${fwk_path}'/fwkacllib/lib64/te-*.whl'
+export topi_path=${fwk_path}'/fwkacllib/lib64/topi-*.whl'
+export hccl_path=${fwk_path}'/fwkacllib/lib64/hccl-*.whl'
+pip3 install --user ${te_path}
+pip3 install --user ${topi_path}
+pip3 install --user ${hccl_path}
+```
+
+再安装noah-vega,因Ascend环境特殊性,注意不要安装依赖包:
+
+```bash
+pip3 install --user --no-deps noah-vega
+```
+
+再通过如下的命令查看Vega的依赖包:
+
+```bash
+pip3 show noah-vega
+```
+
+另外要注意的是,dask和distributed这两个包,需要安装如下版本:
+
+```bash
+pip3 install --user distributed==2021.7.0
+pip3 install --user dask==2021.7.0
+```
diff --git a/docs/cn/user/config_reference.md b/docs/cn/user/config_reference.md
index 0ece95f3..a75672c7 100644
--- a/docs/cn/user/config_reference.md
+++ b/docs/cn/user/config_reference.md
@@ -159,6 +159,7 @@ fully_train:
common:
data_path: /cache/datasets/cifar10/
```
+**注**: HCCL支持多机多卡,Horovod目前只支持单机多卡。
## 3. NAS和HPO配置项
diff --git a/docs/cn/user/deployment.md b/docs/cn/user/deployment.md
index 90089324..6c81f525 100644
--- a/docs/cn/user/deployment.md
+++ b/docs/cn/user/deployment.md
@@ -2,18 +2,14 @@
## 1. 本地集群部署
-### 1.1 部署
-
本地集群部署Vega,需满足如下条件:
-1. Ubuntu 18.04 or later。
-2. CUDA 10.0
-3. Python 3.7
-4. pip3
-
-**注: 若需要在Ascend 910集群上部署,请和我们联系。**
+1. Ubuntu 18.04 or EulerOS 2.0 SP8
+2. CUDA 10.0 or CANN 20.1
+3. Python 3.7 or later
+4. pytorch, tensorflow(>1.14, <2.0) or mindspore
-集群在部署时,需要在每个集群节点中安装vega和一些必备的软件包,可执行如下命令进行安装:
+集群在部署时,需要在每个集群节点中安装vega:
```bash
pip3 install --user --upgrade noah-vega
@@ -25,23 +21,6 @@ pip3 install --user --upgrade noah-vega
以上工作完成后,集群已部署完成。
-### 1.2 校验
-
-集群部署完成后,请执行以下命令检查集群是否可用:
-
-```bash
-vega-verify-cluster -m -s ... -n
-```
-
-例如:
-
-```bash
-vega-verify-cluster -m 192.168.0.2 -s 192.168.0.3 192.168.0.4 -n /home/alan/nfs_folder
-```
-
-校验结束后,会有显示"All cluster check items have passed."。
-若校验中出现错误,请根据异常信息调整集群。
-
## 参考
### 安装MPI
@@ -76,6 +55,24 @@ vega-verify-cluster -m 192.168.0.2 -s 192.168.0.3 192.168.0.4 -n /home/alan/nfs_
### 构建NFS
+NFS是集群中用于数据共享的常用系统,若你所使用的集群中已经有NFS系统,请直接使用已有的NFS系统。
+
+以下配置NFS的简单指导,可能不适用于所有的NFS系统,请根据实际集群环境调整。
+
+在配置NFS服务器前,需要确定当前用户在集群中的各个主机上的UID是否是同样的数值。若UID不相同,会造成无法访问NFS共享目录,需要调整当前用户的UID为同一个数值,同时要避免和其他用户的UID冲突。
+
+查询当前用户的UID:
+
+```bash
+id
+```
+
+修改当前用户的UID(请慎重修改,请咨询集群系统管理员获取帮助):
+
+```bash
+sudo usermod -u <新UID> <用户名>
+```
+
NFS服务器设置:
1. 安装NFS服务器:
@@ -97,13 +94,7 @@ NFS服务器设置:
sudo bash -c "echo '/home//nfs_cache *(rw,sync,no_subtree_check,no_root_squash,all_squash)' >> /etc/exports"
```
-4. 将共享目录设置为`nobody`用户
-
- ```bash
- sudo chown -R nobody: //nfs_cache
- ```
-
-5. 重启nfs服务器:
+4. 重启nfs服务器:
```bash
sudo service nfs-kernel-server restart
diff --git a/docs/cn/user/evaluate_service.md b/docs/cn/user/evaluate_service.md
deleted file mode 100644
index 4d01af4b..00000000
--- a/docs/cn/user/evaluate_service.md
+++ /dev/null
@@ -1,319 +0,0 @@
-# Evaluate Service
-
-## 1. 简介
-
-模型评估服务是用于评估模型在特定硬件设备上的性能,如评估剪枝和量化后的模型在Atlas 200 DK上的准确率、模型大小和时延等。
-
-评估服务目前支持的硬件设备为Davinci推理芯片(Atlas200 DK、ATLAS300产品和开发板环境Evb)和手机,后继会扩展支持更多的设备。
-
-评估服务为CS架构, 评估服务在服务端部署, 客户端通过`REST`接口向服务端发送评估请求和获取结果。Vega在进行网络架构搜索时,可以利用评估服务进行实时检测模型性能。在搜索阶段产生备选网络后,可以将该网络模型发送给评估服务,评估服务完成模型评估后,返回评估结果给Vega,Vega根据评估结果,进行后继的搜索。这种实时的在实际的设备上的评估,有利于搜索出对实际硬件更加友好的网络结构。
-
-## 2. 规格
-
-支持的模型和硬件设备
-
-| 算法 | 模型 | Atlas 200 DK |Atlas 300 | Bolt |
-| :--: | :--: | :--: | :--: | :--: |
-| Prune-EA | ResNetGeneral | √ | √ | √|
-| ESR-EA | ESRN | | √ | √ |
-| SR-EA | MtMSR | | √ | √ |
-| Backbone-nas | ResNet | √ | √ | |
-| CARS | CARSDartsNetwork | | √ | |
-| Quant-EA | ResNetGeneral | √ | √ | √ |
-| CycleSR | CycleSRModel | | | |
-| Adlaide-EA | AdelaideFastNAS | | √ | |
-| Auto-Lane | ResNetVariantDet | | |
-| Auto-Lane | ResNeXtVariantDet | | |
-
-## 3. 评估服务部署
-
-### 3.1 环境安装配置(可选)
-
-根据评估硬件(Atlas200 DK 、Atlas300、或者手机),分别按照如下章节指导配置。
-
-### 3.1.1 安装 Atlas200 DK 环境(可选)
-
-#### 3.1.1.1 准备工作
-
-1. 准备好一张8GB以上SD卡及读卡器。
-2. 已安装 ubuntu 16.04.3 的服务器一台。
-3. 下载系统镜像: [ubuntu-16.04.3-server-arm64.iso](http://old-releases.ubuntu.com/releases/16.04.3/ubuntu-16.04.3-server-arm64.iso)
-4. 下载制卡脚本: make_sd_card.py 和 make_ubuntu_sd.sh,下载地址:
-5. 下载开发者运行包: mini_developerkit-1.3.T34.B891.rar,下载地址:
-6. 解压开发者运行包,并上传到用户目录下。
-
-#### 3.1.1.2 安装和配置Atlas200 DK
-
-1. 将SD卡放入读卡器,并将读卡器与Ubuntu服务器的USB接口连接。
-2. Ubuntu服务器上安装依赖项:
-
- ```bash
- apt-get install qemu-user-static binfmt-support python3-yaml gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
- ```
-
-3. 执行如下命令查找SD卡所在的USB设备名称。
-
- ```bash
- fdisk -l
- ```
-
-4. 运行SD制卡脚本开始制卡,此处“USB设备名称”即为上一步得到的名称。
-
- ```bash
- python3 make_sd_card.py local USB设备名称
- ```
-
-5. 制卡成功后,将SD卡从读卡器取出并插入Atlas 200 DK开发者板卡槽, 上电Atlas 200 DK开发者板。
-
-#### 3.1.1.3 安装和配置评估服务器环境
-
-1. 下载安装DDK包及同步lib库
-
- 下载地址:
- 安装步骤可参考官方文档:
-
-2. 配置交叉编译环境
- 需要在评估服务器上安装Atlas200 DK所需的编译环境,执行如下命令:
-
- ```bash
- sudo apt-get install g++-aarch64-linux-gnu
- ```
-
-3. 在服务器的 `/etc/profile` 中配置如下环境变量,注意文件中的`/home/`要配置为正确的路径:
-
- ```bash
- export DDK_PATH=/home//huawei/ddk
- export PYTHONPATH=$DDK_PATH/site-packages/te-0.4.0.egg:$DDK_PATH/site-packages/topi-0.4.0.egg
- export LD_LIBRARY_PATH=$DDK_PATH/uihost/lib:$DDK_PATH/lib/x86_64-linux-gcc5.4
- export PATH=$PATH:$DDK_PATH/toolchains/ccec-linux/bin:$DDK_PATH/uihost/bin
- export TVM_AICPU_LIBRARY_PATH=$DDK_PATH/uihost/lib/:$DDK_PATH/uihost/toolchains/ccec-linux/aicpu_lib
- export TVM_AICPU_INCLUDE_PATH=$DDK_PATH/include/inc/tensor_engine
- export TVM_AICPU_OS_SYSROOT=/home//tools/sysroot/aarch64_Ubuntu16.04.3
- export NPU_HOST_LIB=/home//tools/1.32.0.B080/RC/host-aarch64_Ubuntu16.04.3/lib
- export NPU_DEV_LIB=/home//tools/1.32.0.B080/RC/host-aarch64_Ubuntu16.04.3/lib
- ```
-
-4. 配置ssh互信
- 由于评估服务器和Atlas200 DK 之间需要进行文件传输以及远端命令的执行,因此需要分别在两个环境上配置ssh互信,确保脚本能够自动化运行。
-
- a. 安装ssh:`sudo apt-get install ssh`
- b. 生成密钥:`ssh-keygen -t rsa` 会在~/.ssh/文件下生成id_rsa, id_rsa.pub两个文件,其中id_rsa.pub是公钥
- c. 确认目录下的authorized_keys文件。若不存在需要创建, 并`chmod 600 ~/.ssh/authorized_keys`改变权限。
- d. 拷贝公钥:分别将公钥id_rsa.pub内容拷贝到其他机器的authorized_keys文件中。
- **注意**: 以上步骤需要在评估服务器和Atlas 200 DK 分别执行一遍, 确保这两台机器之间ssh互信。
-
-### 3.1.2 安装配置Atlas300环境(可选)
-
-参考华为图灵官方教程自行安装配置: Atlas 300I 推理卡 用户指南(型号 3000)
-
-注意:上述文档可能发生更新, 请及时关注我们发布的更新或自行获取得到相应的指导文档。环境安装后一般需要设置相应的环境变量, 请参考上述指导文档进行相应配置。为了方便您更好地进行环境配置, 我们提供了相关环境变量配置的模板 [env_atlas300.sh](https://github.com/huawei-noah/vega/blob/master/evaluate_service/hardwares/davinci/env/env_atlas300.sh) 供您参考, 请您以实际安装环境为准。
-
-由于Atlas300环境安装较为复杂, 为了确保您的环境安装正确, 请您完成安装后运行检查环境脚本[check_atlas300.sh](https://github.com/huawei-noah/vega/blob/master/evaluate_service/hardwares/davinci/env/check_atlas300.sh)。
-
-### 3.1.3 安装和配置手机环境(可选)
-
-#### 3.1.3.1 准备工作
-
-1. 准备Kirin 980手机1台,推荐Nova 5。
-2. 已安装 ubuntu 16.04.3 的服务器一台。
-
-#### 3.1.3.2 安装和配置评估服务器和手机
-
-1. 在linux 系统服务器上安装adb工具。
-
- ```bash
- apt install adb
- ```
-
-2. 通过USB端口将手机接入到评估服务器,并打开开发者选项,并在评估服务器上执行如下命令:
-
- ```bash
- adb devices
- ```
-
- 出现如下信息即为连接成功:
-
- ```text
- List of devices attached
- E5B0119506000260 device
- ```
-
-#### 3.1.3.3 设备连接失败的处理
-
-若在服务器上通过 `adb devices` 命令不能获取到设备,则可以通过以下步骤尝试连接:
-
-1. 在评估服务器上执行`lsusb`命令, 出现设备列表, 找到设备的ID。
-
-2. 编辑51-android.rules 文件:
-
- ```bash
- sudo vim /etc/udev/rules.d/51-android.rules
- ```
-
- 写入如下内容
-
- ```text
- SUBSYSTEM=="usb", ATTR{idVendor}=="12d1", ATTR{idProduct}=="107e", MODE="0666"
- ```
-
- 注意: 上面的12d1和107e是上一步查询到的ID。
-
-3. 编辑adb_usb.ini 文件:
-
- ```bash
- vim ~/.android/adb_usb.ini
- ```
-
- 写入如下内容:
-
- ```text
- 0x12d1
- ```
-
- 注意: 上面的12d1是步骤5.1查询到的ID。
-
-4. 重启adb服务
-
- ```bash
- sudo adb kill-server
- sudo adb start-server
- ```
-
-5. 再次执行`adb devices`,确认是否连接成功。
-
-### 3.1.4 安装和配置麒麟990手机NPU环境(可选)
-3.1.4.1 准备工作
-1. 准备Kirin 990手机1台,推荐mate30 pro。
-2. 已安装 ubuntu 16.04.3 的服务器一台。
-
-3.1.4.2 安装和部署
-1 下载HUAWEI HiAI DDK, 下载链接:https://developer.huawei.com/consumer/cn/doc/development/hiai-Library/ddk-download-0000001053590180,
-选择下载hwhiai-ddk-100.500.010.010.zip, 下载后解压到"/data/tools/"目录下, 解压后目录结构为"/data/tools/hwhiai-ddk-100.500.010.010/"。
-2 拷贝相关依赖文件到手机
-把tools_sysdbg目录下所有内容拷贝到手机上的/data/local/tmp目录下
-```bash
-adb push /data/tools/hwhiai-ddk-100.500.010.010/tools/tools_sysdbg/* /data/local/tmp/
-```
-3 进入到手机上, 设置环境变量, 添加文件执行权限
-```bash
-adb shell
-export LD_LIBRARY_PATH=/data/local/tmp/
-chmod +x /data/local/tmp/model_run_tool
-chmod +x /data/local/tmp/data_proc_tool
-```
-4 安装adb调试工具
-参考3.1.3.2节。
-
-### 3.2 安装和启动评估服务
-
-1 安装:在评估服务器上安装vega, 安装时加上`--no-dependencies`参数, 不安装依赖项;
-2 启动:运行命令`vega-evaluate_service-service -i {your_ip_adress} -w {your_work_path}`, 其中`-i`参数指定当前使用的服务器的ip地址,
-`-w`参数指定工作路径, 程序运行时的中间文件将存储在该目录下,请使用绝对路径。
-其他可选参数的设置可查看该命令的帮助信息, 一般情况下建议采用默认值。
-
-## 4. 使用评估服务
-
-使用评估服务时, 只需要在配置文件中进行简单的几行配置即可, 如下面示例所示:
-
-```yaml
-evaluator:
- type: Evaluator
- device_evaluator:
- type: DeviceEvaluator
- hardware: "Davinci"
- remote_host: "http://192.168.0.2:8888"
-```
-
-`evaluator`的配置与您的`trainer`配置处于同一层级。其中需要配置的参数有2个, `hardware`为您指定的需要评估的硬件设备,当前支持`Davinci`和`Bolt`两种,
-`remote_host`为您部署的评估服务器的ip和端口号。
-
-## 5. 自定义评估服务(可选)
-
-vega评估服务当前已经支持Davinci推理芯片和手机等端侧设备的评估, 但新的硬件设备是层出不穷的, 因此评估服务提供了可自定义的扩展能力。
-
-评估服务的流程是:
-
-1. 获取输入信息
-2. 根据需要评估的硬件实例化一个具体的硬件实例
-3. 模型转换
-4. 推理
-5. 返回推理结果
-
-对于不同的硬件, 步骤3和4可能是不同的。 因此当需要添加新的硬件时, 需要根据具体硬件的用法实现这2个步骤。具体来说, 分以下几个步骤:
-
-在hardwares目录下添加一个硬件类, 并实现`convert_model`和`inference`两个接口 如下:
-
-```python
-from class_factory import ClassFactory
-@ClassFactory.register()
-class MyHardware(object):
-
- def __init__(self, optional_params):
- pass
-
- def convert_model(self, backend, model, weight, **kwargs):
- pass
-
- def inference(self, converted_model, input_data, **kwargs):
-
- return latency, output
-```
-
-上面的示例中定义了`MyHardware`类, 并通过`@ClassFactory.register()`进行注册。 类中实现了`convert_model`和`inference`两个接口, `backend`表示模型是通过何种训练框架保存的, 如`pytorch`, `tensorflow`等, 为模型解析提供必要的辅助信息,`model`和`weight`分别表示需要转换的模型和权重,`weight`是非必须的,其值可能为空。`converted_model`和`input_data`分别表示转换之后的模型和输入数据。
-
-然后在hardware的`__init__.py`中加入自定义的类。
-
-```python
-from .my_hardware import MyHardware
-```
-
-## 6. FAQ
-
-### 6.1 Pytorch模型转换caffe模型
-
-如果需要将pytorch模型转换为caffe模型,请下载[PytorchToCaffe](https://github.com/xxradon/PytorchToCaffe)获取并放在`./third_party`目录下(third_party目录与vega处于同一目录层级)。
-
-注意: 该第三方开源软件不支持pytorch1.1版本, 并且如果您使用原生torchvisoin中的模型, 当torchvision版本高于0.2.0时, 您需要做以下额外修改:
-修改`pytorch_to_caffe.py`文件, 增加以下内容:
-
-```python
-
-def _flatten(raw , input, * args):
- x = raw(input, *args)
- if not NET_INITTED:
- return x
- layer_name=log.add_layer(name='flatten')
- top_blobs=log.add_blobs([x],name='flatten_blob')
- layer=caffe_net.Layer_param(name=layer_name,type='Reshape',
- bottom=[log.blobs(input)],top=top_blobs)
- start_dim = args[0]
- end_dim = len(x.shape)
- if len(args) > 1:
- end_dim = args[1]
- dims = []
- for i in range(start_dim):
- dims.append(x.shape[i])
- cum = 1
- for i in range(start_dim, end_dim):
- cum = cum * x.shape[i]
- dims.append(cum)
- if end_dim != len(x.shape):
- cum = 1
- for i in range(end_dim, len(x.shape)):
- cum = cum * x.shape[i]
- dims.append(cum)
- layer.param.reshape_param.shape.CopyFrom(caffe_net.pb.BlobShape(dim=dims))
- log.cnet.add_layer(layer)
- return x
-
-
-torch.flatten = Rp(torch.flatten,_flatten)
-```
-
-### 6.2 Pytorch 1.2版本及以下模型评估
-
-如果您使用的`Pytorch`版本在1.2及以下, 在`Pytorch`模型转换为`onnx`模型时可能会遇到算子不支持的情况。 如`upsample_bilinear2d`算子不支持, 您可以选择升级`Pytorch`版本到1.3及以上, 或者您可以从`Pytorch`官方代码库中获取`pytorch/torch/onnx/symbolic_opset10.py`, 拷贝到对应的`Pytorch`安装目录下。
-
-### 6.3 找不到`model_convert.sh`等脚本错误
-
-评估服务中有很多`shell`脚本, 其文件格式应该为`unix`格式, 如果在windows上打开过相应文件, 或是`git`下载代码时进行了相应转换, 文件格式可能会变成`dos`格式, 需要转换为`unix`格式。
diff --git a/docs/cn/user/faq.md b/docs/cn/user/faq.md
index 1a414c40..7d7a701d 100644
--- a/docs/cn/user/faq.md
+++ b/docs/cn/user/faq.md
@@ -2,22 +2,14 @@
## 1. 常见异常汇总
-### 1.1 异常 `ModuleNotFoundError: No module named 'mmdet'`
-
-运行SP-NAS等算法时,需要单独安装开源软件mmdetection,具体安装步骤请参考该软件的安装指导。
-
-### 1.2 异常 `ModuleNotFoundError: No module named 'nasbench'`
-
-运行Benchmark时,需要单独安装开源软件NASBench,具体安装步骤请参考该软件的安装指导。
-
-### 1.3 异常 `Exception: Failed to create model, model desc={}`
+### 1.1 异常 `Exception: Failed to create model, model desc={}`
出现该类问题的原因有两类:
1. 该网络未注册到Vega中,在调用该网络前,需要使用`@ClassFactory.register`注册该网络,可参考示例。
2. 该网络的模型描述文件有错误,可通过异常信息中的``定位问题的原因。
-### 1.5 异常 `ImportError: libgthread-2.0.so.0: cannot open shared object file: No such file or directory`
+### 1.2 异常 `ImportError: libgthread-2.0.so.0: cannot open shared object file: No such file or directory`
该异常可能是因为opencv-python缺少了系统依赖库,可尝试使用如下命令解决:
@@ -25,7 +17,7 @@
sudo apt install libglib2.0-0
```
-### 1.6 安装过程中出现异常 `ModuleNotFoundError: No module named 'skbuild'`,或者在安装过程中卡在`Running setup.py bdist_wheel for opencv-python-headless ...`
+### 1.3 安装过程中出现异常 `ModuleNotFoundError: No module named 'skbuild'`,或者在安装过程中卡在`Running setup.py bdist_wheel for opencv-python-headless ...`
该异常可能是pip的版本过低,可尝试使用如下命令解决:
@@ -33,17 +25,13 @@ sudo apt install libglib2.0-0
pip3 install --user --upgrade pip
```
-### 1.7 异常 `PermissionError: [Errno 13] Permission denied: 'dask-scheduler'`, `FileNotFoundError: [Errno 2] No such file or directory: 'dask-scheduler': 'dask-scheduler'`, 或者 `vega: command not found`
+### 1.4 异常 `PermissionError: [Errno 13] Permission denied: 'dask-scheduler'`, `FileNotFoundError: [Errno 2] No such file or directory: 'dask-scheduler': 'dask-scheduler'`, 或者 `vega: command not found`
这类异常一般是因为在 `PATH` 路径中未找到 `dask-scheduler` ,一般该文件会安装在 `//.local/bin` 路径下。
在安装完 Vega ,会自动添加 `//.local/bin/` 到 `PATH` 环境变量中,但不会即时生效,需要该用户执行`source ~/.profile`,或者再次登录服务器后才会生效。
若问题还未解决,可先检查在 `//.local/bin` 路径下是否存在 `dask-scheduler` 文件。
若该文件已存在,则需要手动添加 `//.local/bin` 到环境变量 `PATH` 中。
-### 1.8 Pytorch模型评估时,出现异常 `FileNotFoundError: [Errno 2] No such file or directory: '/torch2caffe.prototxt'`
-
-请参考文档 [Evaluate Service](./evaluate_service.md) 6.1 章节。
-
## 2. 常见配置问题汇总
### 2.1 如何配置多GPU/NPU支持
@@ -112,29 +100,11 @@ general:
level: info # debug|info|warn|error|
```
-### 2.5 如何实时查看搜索进展
-
-Vega提供了模型搜索过程可视化进展,用户只需在`USER.yml` 中配置`VisualCallBack`, 如下所示
-
-```yaml
- trainer:
- type: Trainer
- callbacks: [VisualCallBack, ]
-```
-
-可视化信息输出目录为:
-
-```text
-./tasks//visual
-```
-
-在主机上执行`tensorboard --logdir PATH`如下启动服务,在浏览器上查看进展。具体可参考tensorboard的相关命令和指导。
-
-### 2.6 如何终止后台运行的vega程序
+### 2.5 如何终止后台运行的vega程序
Vega在多个GPU/NPU场景中,会启动dask scheduler、dask worker及训练器,若仅仅杀死Vega主进程会造成部分进程不会及时的关闭,其占用的资源一直不会被释放。
-可使用如下命令终止Vega应用程序:
+在安全模式下,可使用如下命令终止Vega应用程序:
```bash
# 查询运行中的Vega主程序的进程ID
@@ -146,3 +116,57 @@ vega-kill -a
# 若主程序被非常正常关闭,还存在遗留的相关进程,可使用强制清理
vega-kill -f
```
+
+在普通模式下,使用如下命令:
+
+```bash
+vega-kill -s -l
+vega-kill -s -p
+vega-kill -s -a
+vega-kill -s -f
+```
+
+### 2.6 如何查询正在运行的vega程序
+
+在安全模式下,可通过如下命令查询正在运行的Vega应用程序:
+
+```bash
+vega-process
+```
+
+在普通模式下,可通过如下命令查询:
+
+```bash
+vega-process -s
+```
+
+### 2.7 如何查询vega程序运行进度
+
+在安全模式下,可通过如下命令查询正在运行的Vega程序运行进度:
+
+```bash
+vega-progress -t -r
+```
+
+在普通模式下,可通过如下命令查询:
+
+```bash
+vega-progress -s -t -r
+```
+
+### 2.8 如何使用vega程序执行模型推理
+
+可通过命令`vega-inference`执行分类模型推理,通过执行命令`vega-inference-det`执行检测模型推理。
+
+通过如下命令查询命令参数。
+
+```bash
+vega-inference --help
+vega-inference-det --help
+```
+
+## 3. 注意事项
+
+### 3.1 请预留足够的磁盘空间
+
+在Vega运行期间,会缓存每一个搜索到的网络模型,当搜索的数量较大时,需要较大的存储空间。请根据每个搜索算法搜索到的网络模型数量,预留足够的磁盘空间。
diff --git a/docs/cn/user/install.md b/docs/cn/user/install.md
index 522e728d..6a495813 100644
--- a/docs/cn/user/install.md
+++ b/docs/cn/user/install.md
@@ -6,8 +6,8 @@
1. Ubuntu 18.04 or EulerOS 2.0 SP8
2. CUDA 10.0 or CANN 20.1
-3. Python 3.7
-4. pip3
+3. Python 3.7 or later
+4. pytorch, tensorflow(>1.14, <2.0) or mindspore
## 2. 安装Vega
diff --git a/docs/cn/user/security_configure.md b/docs/cn/user/security_configure.md
index c5ed9672..087490f4 100644
--- a/docs/cn/user/security_configure.md
+++ b/docs/cn/user/security_configure.md
@@ -1,174 +1,258 @@
# vega 安全配置
-## 用户数据保护
-用户用于训练的模型脚本/文件、预训练模型以及数据集属于比较重要的数据文件,需要做好安全保护,可以通过设置正确的文件权限来提升其安全性。可以通过如下命令来设置正确的文件权限
+
+Vega的安全配置,包括如下步骤:
+
+1. 安装OpenSSL
+2. 生成CA根证书
+3. 生成评估服务用的证书
+4. 生成Dask用的证书
+5. 加密私钥口令
+6. 配置安全相关的配置文件
+7. 配置评估服务守护服务
+8. 安装dask和distributed
+9. 配置HCCL白名单
+10. 注意事项
+
+## 1.安装OpenSSL
+
+首先要安装OpenSSL 1.1.1,从源码编译安装,或者直接安装编译后的发行包。
+
+然后安装OpenSSL的python接口,如下:
+
```shell
-chmod 640 -R "file_path"
+pip3 install --user pyOpenSSL==19.0.0
```
-## 安全配置文件
-vega在启动时会尝试读取```~/.vega/vega.ini```配置文件中的内容,如果该文件不存在或者文件中的配置不正确,那么vega会报错并自动退出。
+## 2.生成CA证书
-用户在安装vega之后,可以通过命令```vega-security-config -i```初始化该文件,初始化之后该文件内容如下:
-```ini
-[security]
-enable = True
+执行如下命令生成CA证书:
-[https]
-cert_pem_file =
-secret_key_file =
+```shell
+openssl genrsa -out ca.key 4096
+openssl req -new -x509 -key ca.key -out ca.crt -subj "/C=<country>/ST=<province>/L=<city>/O=<organization>/OU=<group>/CN=<cn>"
```
-```[security] -> enable```的默认配置为True,此时用户还需要配置```[https]```段落下的```cert_pem_file```与```secret_key_file。```关于如何生成这2个文件请参考下面的章节,生成文件之后用户可以直接编辑vega.ini配置这2项内容,也可以通过如下命令来配置
+
+注意:以上`<country>`、`<province>`、`<city>`、`<organization>`、`<group>`、`<cn>`根据实际情况填写,本文后面的配置也是同样的。并且CA的配置需要和其他的不同。
+
+## 3. 生成评估服务使用的证书
+
+评估服务支持加密证书和普通证书:
+
+1. 若使用加密证书,需要安装华为公司的KMC安全组件,参考`生成加密证书`章节
+2. 若使用普通证书,参考`生成普通证书`章节
+
+### 3.1 生成加密证书
+
+执行如下脚本,生成评估服务器所使用的证书的加密私钥,执行该命令时,会提示输入加密密码,密码的强度要求如下:
+
+1. 密码长度大于等于8位
+2. 必须包含至少1位大写字母
+3. 必须包含至少1位小写字母
+4. 必须包含至少1位数字
+
```shell
-vega-security-config -m https -c "cert_file_path" -k "key_file_path"
-# 替换“cert_file_path”与“key_file_path"为真实的文件路径
+openssl genrsa -aes-256-ofb -out server.key 4096
```
-> 注意:用户也可以选择关闭安全配置,通过运行命令```vega-security-config -s 0```来实现。关闭安全配置之后,训练服务器与推理服务器之间的通信将不再使用https而是https协议,无法保证通信安全。
->
-> 用户在关闭安全配置后,可以通过命令```vega-security-config -s 1```来重新开启安全配置。
->
+然后再执行如下命令,生成证书,并删除临时文件:
-vega-security-config提供的操作vega.ini文件的命令总览如下:
```shell
-# 1. 初始化vega.ini文件
-vega-security-config -i
-# 2. 关闭安全配置
-vega-security-config -s 0
-# 3. 打开安全配置
-vega-security-config -s 1
-# 4. 查询当前的安全配置开关是否打开
-vega-security-config -q sec
-# 5. 查询https的证书与密钥配置
-vega-security-config -q https
-# 6. 配置https的证书与密钥文件路径
-vega-security-config -m https -c "cert_file_path" -k "key_file_path"
-# 7. 只配置https的证书路径(在训练服务器上)
-vega-security-config -m https -c "cert_file_path"
-```
-
-## 评估服务器
-### 评估服务器 https 安全配置
-#### 生成评估服务器密钥和证书
-
-在评估服务器上执行以下操作
-
-1.将/etc/pki/tls/openssl.cnf或者/etc/ssl/openssl.cnf拷贝到当前文件夹
-
-2.修改当前目录下的openssl.cnf文件内容,在[ v3_ca ]段落中添加内容
-```ini
-subjectAltName = IP:xx.xx.xx.xx
+openssl req -new -key server.key -out server.csr -extensions v3_ca -subj "/C=<country>/ST=<province>/L=<city>/O=<organization>/OU=<group>/CN=<cn>"
+openssl x509 -req -in server.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out server.crt
+rm server.csr
```
-> 注意:xx.xx.xx.xx修改为推理服务器的IP地址
->
-3.生成服务器密钥
+
+执行如下脚本生成评估服务客户端所使用的证书的加密私钥,执行该命令时,会提示输入加密密码,密码的强度要求同服务器端私钥,且须和服务器端私钥密码不同,请记录好该密码,后继还需使用:
+
```shell
-openssl genrsa -aes-256-ofb -out example_key.pem 4096
+openssl genrsa -aes-256-ofb -out client.key 4096
```
-> 注意:在这个阶段需要用户输入保护密钥的密码,此密码由用户自己记住,并且输入的密码强度需满足需求,具体的密码强度需求见下面的启动评估服务器章节
->
-4.生成证书请求文件
+
+然后再执行如下命令,生成证书,并删除临时文件:
+
```shell
-openssl req -new -key example_key.pem -out example.csr -extensions v3_ca \
--config openssl.cnf
+openssl req -new -key client.key -out client.csr -extensions v3_ca -subj "/C=<country>/ST=<province>/L=<city>/O=<organization>/OU=<group>/CN=<cn>"
+openssl x509 -req -in client.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out client.crt
+rm client.csr
```
-5.生成自签名证书
+
+### 3.2 生成普通证书
+
+执行如下脚本,生成评估服务器端和客户端使用的证书的私钥和证书:
+
```shell
-openssl x509 -req -days 365 -in example.csr -signkey example_key.pem \
--out example_crt.pem -extensions v3_ca -extfile openssl.cnf
+openssl genrsa -out server.key 4096
+openssl req -new -key server.key -out server.csr -extensions v3_ca -subj "/C=<country>/ST=<province>/L=<city>/O=<organization>/OU=<group>/CN=<cn>"
+openssl x509 -req -in server.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out server.crt
+rm server.csr
+
+openssl genrsa -out client.key 4096
+openssl req -new -key client.key -out client.csr -extensions v3_ca -subj "/C=<country>/ST=<province>/L=<city>/O=<organization>/OU=<group>/CN=<cn>"
+openssl x509 -req -in client.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out client.crt
+rm client.csr
```
-6.设置密钥/证书权限
-为了确保系统安全,需要正确配置密钥/证书文件的权限,用户可以使用如下命令进行配置
+
+## 4. 生成Dask使用的证书
+
+执行如下脚本,生成Dask服务器端和客户端使用的证书的私钥和证书:
+
```shell
-sudo chmod 600 example_key.pem example_crt.pem
+openssl genrsa -out server_dask.key 4096
+openssl req -new -key server_dask.key -out server_dask.csr -extensions v3_ca -subj "/C=<country>/ST=<province>/L=<city>/O=<organization>/OU=<group>/CN=<cn>"
+openssl x509 -req -in server_dask.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out server_dask.crt
+rm server_dask.csr
+
+openssl genrsa -out client_dask.key 4096
+openssl req -new -key client_dask.key -out client_dask.csr -extensions v3_ca -subj "/C=<country>/ST=<province>/L=<city>/O=<organization>/OU=<group>/CN=<cn>"
+openssl x509 -req -in client_dask.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out client_dask.crt
+rm client_dask.csr
```
-#### 评估服务器配置https密钥和证书
-将example_key.pem和example_crt.pem拷贝到```~/.vega```文件夹下
+删除CA私钥:
-修改配置文件`~/.vega/vega.ini` 配置密钥和证书
-```ini
-[security]
-enable = True # 需要配置成True才能启用https加密通信
+```shell
+rm ca.key
+```
-[https]
-cert_pem_file = /home//.vega/example_crt.pem # 修改username和证书文件名
-secret_key_file = /home//.vega/example_key.pem # 修改username和密钥文件名
+## 5. 加密私钥口令
+
+若评估服务使用加密证书,则需要执行本章节余下步骤,若使用普通证书,则跳过该章节。
+
+加密生成评估服务的服务器端和客户端的私钥口令,需要安装华为公司KMC安全组件,并将该安全组件动态链接库所在的目录添加到`LD_LIBRARY_PATH`中。
+
+```shell
+export LD_LIBRARY_PATH=<KMC动态链接库所在目录>:$LD_LIBRARY_PATH
```
+接下来安装Vega,使用Vega的密码加密工具调用KMC安全组件对密码加密。
+在执行如下命令时,请输入在生成私钥时输入的口令,该命令会生成加密后的口令,请注意保存,在配置文件中会使用到这两个加密后的口令:
-#### 评估服务器配置访问频率
-配置文件`~/.vega/vega.ini` 配置访问频率,默认限制每分钟最大100次访问
-```ini
-[limit]
-request_frequency_limit=5/minute # 配置为每分钟最大5次访问
+```shell
+vega-encrypt_key --cert=server.crt --key=server.key --key_component_1=ksmaster_server.dat --key_component_2=ksstandby_server.dat
+vega-encrypt_key --cert=client.crt --key=client.key --key_component_1=ksmaster_client.dat --key_component_2=ksstandby_client.dat
```
-#### 评估服务器配置请求大小限制
-配置文件`~/.vega/vega.ini` 配置请求大小限制,可以控制上传文件大小,默认配置 1G
-```ini
-[limit]
-max_content_length=100000 # 配置请求大小最大100K
+## 6. 配置安全配置文件
+
+请在当前用户的主目录下创建`.vega`目录,并将如上生成的秘钥、证书、加密材料等,拷贝到该目录下,并改变权限:
+
+```shell
+mkdir ~/.vega
+mv * ~/.vega/
+chmod 700 ~/.vega && chmod 600 ~/.vega/*
```
-#### 评估服务器配置白名单,仅可信的服务器连接评估服务器
-1. linux 白名单配置
- * 配置白名单:
- ```
- sudo iptables -I INPUT -p tcp --dport 评估端口 -j DROP
- sudo iptables -I INPUT -s 白名单IP地址1 -p tcp --dport 评估端口 -j ACCEPT
- sudo iptables -I INPUT -s 白名单IP地址2 -p tcp --dport 评估端口 -j ACCEPT
- sudo iptables -I INPUT -s 白名单IP地址3 -p tcp --dport 评估端口 -j ACCEPT
- sudo iptables -I INPUT -s 白名单IP地址4 -p tcp --dport 评估端口 -j ACCEPT
- ```
- * 如果需要从白名单中删除某一项
- 1. 查询白名单 ```sudo iptables -L -n --line-number```
- 2. 删除白名单 ```sudo iptables -D INPUT 查询的对应行编号```
+说明:
-2. 配置文件 `.vega/vega.ini` 配置白名单
- * 在配置中的 limit.white_list 中配置白名单,用逗号分隔
- ```ini
- [limit]
- white_list=127.0.0.1,10.174.183.95
- ```
+1. 如上的秘钥、证书、加密材料也可以放到其他目录位置,注意访问权限要设置为`600`,并在后继的配置文件中同步修改该文件的位置。
+2. 在训练集群上,需要保留`ca.crt`、`client.key`、`client.crt`、`ksmaster_client.dat`、`ksstandby_client.dat`、`server_dask.key`、`server_dask.crt`、`client_dask.key`、`client_dask.crt`,并删除其他文件。
+3. 评估服务上,需要保留`ca.crt`、`server.key`、`server.crt`、`ksmaster_server.dat`、`ksstandby_server.dat`,并删除其他文件。
-#### 启动评估服务器
-在配置了以上安全配置项之后,用户用以下命令启动评估服务器
+在`~/.vega`目录下创建`server.ini`和`client.ini`。
-```vega-evaluate_service-service -i {your_ip_adress} -w {your_work_path}```
+在训练集群中,需要配置`~/.vega/server.ini`和`~/.vega/client.ini`:
-其中`-i`参数指定当前使用的服务器的ip地址,
-`-w`参数指定工作路径, 程序运行时的中间文件将存储在该目录下,请使用绝对路径。 其他可选参数的设置可查看该命令的帮助信息, 一般情况下建议采用默认值。
+server.ini:
-在评估服务器启动时需要用户输入服务器密钥对应的密码(在生成密钥时输入的密码),系统会检查用户密码的强度,如果密码强度不符合需求,将会提示用户并自动退出。密码强度要求如下:
+```ini
+[security]
+    ca_cert=<~/.vega/ca.crt>
+ server_cert_dask=<~/.vega/server_dask.crt>
+ server_secret_key_dask=<~/.vega/server_dask.key>
+ client_cert_dask=<~/.vega/client_dask.crt>
+    client_secret_key_dask=<~/.vega/client_dask.key>
```
-1. 密码长度大于等于8位
-2. 必须包含至少1位大写字母
-3. 必须包含至少1位小写字母
-4. 必须包含至少1位数字
+
+client.ini:
+
+```ini
+[security]
+    ca_cert=<~/.vega/ca.crt>
+ client_cert=<~/.vega/client.crt>
+ client_secret_key=<~/.vega/client.key>
+ encrypted_password=<加密后的client端的口令> #如果使用普通证书, 此项配置为空
+ key_component_1=<~/.vega/ksmaster_client.dat> #如果使用普通证书, 此项配置为空
+ key_component_2=<~/.vega/ksstandby_client.dat> #如果使用普通证书, 此项配置为空
```
-## 训练服务器
-### 训练服务器安全配置
-训练服务器需要配置推理服务器的证书信息,才能正常向推理服务器发送请求进行推理。用户可以按照如下方法进行配置:
+在评估服务器上,需要配置`~/.vega/vega.ini`:
-修改配置文件`~/.vega/vega.ini` 配置密钥和证书
```ini
[security]
-enable = True # 需要配置成True才能启用https加密通信
+ca_cert=<~/.vega/ca.crt>
+server_cert=<~/.vega/server.crt>
+server_secret_key=<~/.vega/server.key>
+encrypted_password=<加密后的server端的口令> #如果使用普通证书, 此项配置为空
+key_component_1=<~/.vega/ksmaster_server.dat> #如果使用普通证书, 此项配置为空
+key_component_2=<~/.vega/ksstandby_server.dat> #如果使用普通证书, 此项配置为空
+```
+
+## 7. 配置评估服务守护服务
+
+使用systemctl管理评估服务器进程,当进程出现异常时自动重启,保证评估服务器连续性。
+
+首先创建一个启动评估服务的脚本`run_evaluate_service.sh`,内容如下,注意替换`<ip>`、`<work_path>`为真实IP和目录:
+
+```shell
+vega-evaluate_service-service -i <ip> -w <work_path>
+```
+
+然后再创建一个守护服务的文件`evaluate-service`,脚本内容如下,注意替换为真实的脚本位置:
-[https]
-cert_pem_file = /home//.vega/example_crt.pem # 修改username和证书文件名
+```ini
+[Unit]
+ Description=Vega Evaluate Service Daemon
+[Service]
+ Type=forking
+    ExecStart=/<脚本所在目录>/run_evaluate_service.sh
+ Restart=always
+ RestartSec=60
+[Install]
+ WantedBy=multi-user.target
```
-> 注意:这里的example_crt.pem为上面的步骤中生成的证书文件,用户需要手动将该证书文件拷贝到训练节点的对应目录下。
-### 训练服务器防火墙设置
-训练节点在进行多卡训练时需要启动dask和zmq服务,这些服务会随机监听本地127.0.0.1的27000 - 34000 端口。为了保护用户的服务不被恶意攻击,可以通过如下方式配置防火墙保护这些端口:
+然后将`evaluate-service`拷贝到目录`/usr/lib/systemd/system`中,并启动该服务:
```shell
-iptables -I OUTPUT -p tcp -m owner --uid-owner "user_id" -d 127.0.0.1 --match multiport --dports 27000:34000 -j ACCEPT
-iptables -A OUTPUT -p tcp --match multiport -d 127.0.0.1 --dports 27000:34000 -j DROP
+sudo cp evaluate-service /usr/lib/systemd/system/
+sudo systemctl daemon-reload
+sudo systemctl start evaluate-service
```
-其中```"user_id"```需要用户执行命令```id "username"```查看用户的id并镜像替换。
-> 注意:该配置限制了所有其他用户对端口27000-34000的访问,在多用户环境下如果其他用户也需要运行vega训练任务,需要使用其他用户的id去运行第一条命令,以便使该用户添加到防火墙的白名单中。
->
+## 8. 安装Dask和distributed
+
+安装Vega时,会自动安装Dask和Distributed的最新版本,我们发现在当前版本中Distributed关闭dashboard时存在bug,需要执行如下命令,安装如下版本的这两个组件:
+
+```shell
+pip3 install --user dask==2.11.0
+pip3 install --user distributed==2.11.0
+```
+
+## 9. 配置HCCL白名单
+
+请参考Ascend提供的[配置指导](https://support.huawei.com/enterprise/zh/doc/EDOC1100206668/8e964064)。
+
+## 10. 注意事项
+
+### 10.1 模型风险
+
+对于AI框架来说,模型就是程序,模型可能会读写文件、发送网络数据。例如Tensorflow提供了本地操作API tf.read_file, tf.write_file,返回值是一个operation,可以被Tensorflow直接执行。
+因此对于未知来源的模型,请谨慎使用,使用前应该排查该模型是否存在恶意操作,消除安全隐患。
+
+### 10.2 运行脚本风险
+
+Vega提供的script_runner功能可以调用外部脚本进行超参优化,请确认脚本来源,确保不存在恶意操作,谨慎运行未知来源脚本。
+
+### 10.3 KMC组件不支持多个用户同时使用
+
+若使用KMC组件对私钥密码加密,需要注意KMC组件不支持不同的用户同时使用KMC组件。若需要切换用户,需要在root用户下,使用如下命令查询当前信号量:
+
+```bash
+ipcs
+```
+
+然后删除查询到的当前所有的信号量:
+
+```bash
+ipcrm -S '<信号量>'
+```
diff --git a/docs/en/algorithms/adelaide_ea.md b/docs/en/algorithms/adelaide_ea.md
index 05e6e7b8..02dc25b6 100644
--- a/docs/en/algorithms/adelaide_ea.md
+++ b/docs/en/algorithms/adelaide_ea.md
@@ -89,8 +89,6 @@ mutate:
type: AdelaideMutate
codec: AdelaideCodec
max_sample: 100
- pareto_front_file: "{local_base_path}/output/random/pareto_front.csv"
- random_file: "{local_base_path}/output/random/random.csv"
```
## 3. Dataset
@@ -101,13 +99,6 @@ The dataset for image semantic segmentation needs to include RGB images and corr
### 4. Output
-The output includes a series of .pth files (models trained to the num_iter iteration times in the configuration file), the result.csv file, and the pause_front.csv file. The result.csv file records all search models, and the pareto_front.csv file records all pareto_front models. The .csv file contains encoding, flops, parameters, and mIOU.
+The output includes model files, network description files, performance files.
+The network descrition files has encoding item, using a 19-character string indicates the structure of a model, which ends with an underscore (_) to avoid record errors caused by encoding starting with 0.
-1. encoding: A 19-character string indicates the structure of a model, which ends with an underscore (_) to avoid record errors caused by encoding starting with 0.
-2. flops: Records the macc value of the model. For example, 1371603728 indicates 1.277 GB.
-3. parameters: Records the values of parameters in the model. For example, 3162900 indicates 3.016 MB.
-4. mIOU: Records the segmentation performance.
-
-## 5. Benchmark
-
-For details, see the benchmark configuration item in the [adelaide_ea.yml](https://github.com/huawei-noah/vega/blob/master/examples/nas/adelaide_ea/adelaide_ea.yml) configuration file.
diff --git a/docs/en/algorithms/modnas.md b/docs/en/algorithms/modnas.md
index f8b19a84..a84383d8 100644
--- a/docs/en/algorithms/modnas.md
+++ b/docs/en/algorithms/modnas.md
@@ -303,10 +303,6 @@ search_space:
Now we have a supernet on top of the base model where the original convolution operators are replaced with specified mixed operators and primitives. A search routine can then be set up by matching the search space with selected Optimizer and Estimators.
-## Known Issues
-
-- Currently the ModularNAS routine runs in a separate thread and listens on condition variables in Vega, which might lead to deadlocks.
-
## Reference
[^fn1]: Liu, H., Simonyan, K., and Yang, Y. Darts: Differentiable architecture search. ArXiv, abs/1806.09055, 2019b.
diff --git a/docs/en/algorithms/nago.md b/docs/en/algorithms/nago.md
index 37dbdc5a..430cc1c6 100644
--- a/docs/en/algorithms/nago.md
+++ b/docs/en/algorithms/nago.md
@@ -88,7 +88,4 @@ search_algorithm:
### 5. Output
-The following two files are generated in the specified output directory (the default directory is `./example/tasks//output/nas/`):
-
-- The `output.csv` file contains the best architecture generator hyperparameters found by BOHB
-- The `reports.csv` file contains all the architecture generator hyperparameters queried by BOHB at different epoch.
+The best hyperparameters are saved in the file `desc_nn.json` in the folder `./example/tasks//output/nas/`.
diff --git a/docs/en/algorithms/pba.md b/docs/en/algorithms/pba.md
index 60a8ce62..bfcaafd2 100644
--- a/docs/en/algorithms/pba.md
+++ b/docs/en/algorithms/pba.md
@@ -105,18 +105,3 @@ The PBA algorithm uses the default parameters in the parameter configuration fil
|:--:|:--:|:--:|:--:|:--:|
|Ho et at.,2019|96.13%|96.92%|97.32%|97.42%|
|Vega Pipeline|96.26%|97.18%| \ |97.57%|
-
-The final output files and directories are as follows:
-
-```text
-output:
- best_hps.json: best augmentation policy schedule obtained by the PBA algorithm and the ID and score of the search phase
- hps.csv: ID and score of 16 groups of augmentation policy schedules obtained by the PBA algorithm in the search phase
- score_board.csv: score and status of each round of iteration of the 16 groups of data augmentation operations obtained in the algorithm search phase.
-workers:
- hpo: The 16 folders are the final results of the 16 groups of models, including the score and model.
- 0:
- 1:
- ...
- 16:
-```
diff --git a/docs/en/algorithms/quant_ea.md b/docs/en/algorithms/quant_ea.md
index ffc744cb..aa4976e3 100644
--- a/docs/en/algorithms/quant_ea.md
+++ b/docs/en/algorithms/quant_ea.md
@@ -66,8 +66,4 @@ The two phases ("nas" and "fully_train") are performed in sequence. The Pareto f
### 5. Algorithm output
-The following two files are generated in the specified output directory:
-
-- The model on the found Pareto front after fully training.
-- The result.csv file contains the encoding, flops, parameters, and accuracy of all models during the search process.
-- pareto_front.csv contains the found pareto front information.
+The output files are generated in `./tasks//output/nas/`.
diff --git a/docs/en/algorithms/sp_nas.md b/docs/en/algorithms/sp_nas.md
index 43b3ed46..7547fdcb 100644
--- a/docs/en/algorithms/sp_nas.md
+++ b/docs/en/algorithms/sp_nas.md
@@ -130,7 +130,6 @@ fine_tune:
models_folder: "{local_base_path}/output/parallel/" # Get desc file and weights file from parallel pipe step
```
-
### Algorithm output
- The optimal models with fully training.
@@ -138,4 +137,4 @@ fine_tune:
## Benchmark
-Benchmark configuration: [sp_nas.yml](https://github.com/huawei-noah/vega/tree/master/examples/nas/sp_nas/spnas.yml)
+Benchmark configuration: [spnas.yml](https://github.com/huawei-noah/vega/blob/master/examples/nas/sp_nas/spnas.yml)
diff --git a/docs/en/algorithms/sr_ea.md b/docs/en/algorithms/sr_ea.md
index fe4751de..e0b79ded 100644
--- a/docs/en/algorithms/sr_ea.md
+++ b/docs/en/algorithms/sr_ea.md
@@ -79,8 +79,4 @@ mutate:
### Output
-The outputs are as follows:
-
-• The model on the found Pareto front after fully training.
-• Logs of all models in random search and evolutionary search process (result.csv)
-• Logs of Pareto front results (pareto_front.csv).
+The output is the model on the found Pareto front after fully training.
diff --git a/docs/en/developer/developer_guide.md b/docs/en/developer/developer_guide.md
index 2a4dd3a5..72f7b081 100644
--- a/docs/en/developer/developer_guide.md
+++ b/docs/en/developer/developer_guide.md
@@ -1,18 +1,11 @@
# Development Reference
+**Outdated and to be updated.**
+
## 1. Introduction
The key features of Vega are network architecture search and hyperparameter optimization. In the network architecture search process, the search space and search algorithm are the core parts, and the generator is used to control the sampling, update, and end of the search process.
-The following figure shows the class diagram of the search space and search algorithm.
-
-![Search Space diagram](../../images/search_space_classes.png)
-
-The following figure shows the search space and search algorithm process.
-
-![Search Space process](../../images/search_space_flow.png)
-
-Search space process
The following describes the following parts:
- search space
diff --git a/docs/en/developer/fine_grained_search_space.md b/docs/en/developer/fine_grained_search_space.md
deleted file mode 100644
index 52dd3377..00000000
--- a/docs/en/developer/fine_grained_search_space.md
+++ /dev/null
@@ -1,211 +0,0 @@
-# Search space and Fine-Grained Network guidance
-## 1. Fine-grained Introduction
-In most Automl algorithms, the search space is closely related to the network. Each search algorithm defines a series of search space and network types that are identified by the search space and network types. Most of these network types are slightly modified on the basic network, resulting in network reuse failure. In addition, the search space and search algorithm are strongly coupled. Each algorithm has its own search space definition. This search space can only be used in specific scenarios and lacks universality and scalability.
-After analyzing these problems, we propose a general searchspace fine-grained network solution.
-
-- Unified search space definition mode. The same search space can adapt to different search algorithms.
-- Reuses basic networks, provides fine-grained networks, and constructs different types of networks through combinations.
-- The search space can be expanded freely based on the defined network.
-- Multiple backends are supported.
-## 2. Fine-grained demonstration
-### 2.1 Building a Network with Fine Grain
-- Inherit the Module base class and call `@ClassFactory.register(ClassType.NETWORK)` to register the network.
-- The pytorch style is used. The `self.xx` variable is placed in the module. By default, the variable is executed in sequence.
-- If you need to customize the execution sequence of modules, rewrite the `call` method.
-```python
-from vega.common import ClassFactory, ClassType
-from vega.modules.module import Module
-from vega.modules.operators import ops
-@ClassFactory.register(ClassType.NETWORK)
-class SimpleCnn(Module):
- def __init__(self, block_nums=3, filters=32, kernel_size=3):
- super(SimpleCnn, self).__init__()
- in_channels = 3
- out_channels = filters
- output_size = 32
- for i in range(block_nums):
- block = ConvBlock(in_channels, out_channels, kernel_size)
- self.add_module("block{}".format(i), block)
- in_channels = out_channels
- output_size = (output_size - kernel_size + 1) // 2
- self.fc1 = ops.Linear(in_channels * output_size * output_size, 120)
- self.relu = ops.Relu()
- self.fc2 = ops.Linear(120, 10)
-
-@ClassFactory.register(ClassType.NETWORK)
-class ConvBlock(Module):
- def __init__(self, in_channels, out_channels, kernel_size=3):
- super(ConvBlock, self).__init__()
- self.conv = ops.Conv2d(in_channels, out_channels, kernel_size)
- self.bn = ops.BatchNorm2d(out_channels)
- self.relu = ops.Relu()
- self.pool = ops.MaxPool2d((2, 2))
- def call(x):
- x = self.conv(x)
- x = self.bn(x)
- x = self.relu(x)
- return self.pool(x)
-model = SimpleCnn()
-print(model)
-```
-### 2.2. Define Search Space and Use Random Search
-- Config in pipeline
-```yaml
-pipeline: [hpo]
-
-hpo:
- pipe_step:
- type: SearchPipeStep
-
- search_algorithm:
- type: RandomSearch
-
- search_space:
- type: SearchSpace
- hyperparameters:
- - key: backbone.block1.conv.in_channels
- type: CATEGORY
- range: [8, 16, 32, 64, 128, 256]
- model:
- model_desc:
- modules: ["backbone"]
- backbone:
- type: SimpleCnn
- dataset:
- type: Cifar10
- common:
- data_path: /cache/datasets/cifar10/
- batch_size: 256
- trainer:
- type: Trainer
- epochs: 1
-```
-- Use SearchSpace in code.
-```python
-from vega.algorithms.hpo.random_hpo import RandomSearch
-from vega.core.search_space import SearchSpace
-from vega.core.search_space.param_types import ParamTypes
-from vega.core.search_space.params_factory import ParamsFactory
-from vega.networks.network_desc import NetworkDesc
-
-# Definition of SearchSpace
-params = ParamsFactory.create_search_space(
-param_name='backbone.block1.conv.in_channels',
-param_type=ParamTypes.CATEGORY,
-param_range=[8, 16, 32, 64, 128, 256])
-search_space = SearchSpace().add_hp(params)
-# Search algorithm
-id, desc = RandomSearch(search_space).search()
-# Parse into a model.
-model = NetworkDesc(desc).to_model()
-print(model)
-```
-## 3. Module Groups
-To facilitate the reuse of network modules, fine-grained modules are grouped based on their functions. Each group has its own features.
-- **Networks**: defines a common network, which is a coarse-grained network, such as ResNet and FasterRCNN. Networks are submodules in other groups.
-- **Backbone**: backbone network. Generally, the backbone+head mode is used to form a network. In many scenarios, we can flexibly replace different backbones to process different featureMaps.
-- **Head**: used for feature fusion, for example, as a classification or regression problem. This ensures that different heads are replaced to accommodate different scenarios.
-- **Cells:** Multiple blocks are combined. Multiple cells are defined to define combined scenarios.
-- **Blocks**: consists of basic operators and forms a block with specific functions. We provide some common blocks that can be used in different networks.
-- **Connections**: defines the connection relationships between modules, including Sequential and Add, and the implementation statements of some condition branches, such as Repeat.
-- **Operators:** Defines underlying operators, such as conv and batch_normal. Each operator is adapted to multiple platforms to unify external input, output, and interface invoking.
-For example, the composition of a ResNet18 is as follows:
-![resnet](../../images/resnet.png)
-
-## 4. Definition of Search Space
-
-The search space consists of **hyper_parameters** and **condition**.
-**hyper_parameters**
-Specifies the definition of a hyperparameter, including key, type, and value. key indicates the name of a hyperparameter, and type indicates the type of a hyperparameter, that is, ParamType. The system selects a sampling mode based on ParamType. range: specifies the sampling range.
-The following param types are preconfigured:
-
-- **INT**: indicates that a value is sampled from an integer range. If the value range is [0, 10], a value is randomly sampled from 0 to 10.
-- **INT_EXP:** A value in the integer range is sampled in the exponential sampling mode of 10. For example, if range is [0, 1000], the value is mapped to [0, 10, 100, 1000] through the log function.
-- **INT_CAT**: Select a value from multiple INT types, for example, range=[16, 32, 64, 128].
-- **FLOAT:** Sampling a value from a floating range. For example, if range is [0.001, 1], a value is sampled.
-- **FLOAT_EXP**: sample a value in the Float type range in exponential sampling mode of 10. For example, if range is [0.001, 1], the value is mapped to [1, 0.01, 0.001] through the log function.
-- **FLOAT_CAT:** indicates that a value is selected from multiple FLOAT types, for example, range=[0.1, 0.01, 0.001, 0.99].
-- **STRING:** indicates that one character string is selected from multiple character strings, for example, range=['block1','block2','block3','block4'].
-**condition**
-Indicates the relationship between two nodes. A child node takes effect only when the parent node meets certain conditions.
-![img](http://hi3ms-image.huawei.com/hi/staticimages/hi3msh/images/2019/0731/15/5d414a699c009.png)![img](http://image.huawei.com/tiny-lts/v1/images/9ed3126327ed5a8abb80_844x290.png@900-0-90-f.png)
-The value or range of the condition is transferred by using condition_range. Specifically:
-- **EQUAL**: condition_range can contain only one parent value, indicating that the child is selected. The value of parent must be equal to **.
-- **NOT_EQUAL**: condition_range can contain one or more values of parent, indicating that child is selected. The value of parent ** must not be equal to all values provided in **condition_range.
-- **IN**: If parent is of the range type, condition_range must contain two values, indicating the minimum value and maximum value of cond_range. If child is selected, the current value of parent must be within the range of cond_range. If parent is of the CAT type, condition_range must contain one or more parent values. If child is selected, the current parent value must be within a certain value in condition_range.
-**forbidden**
-Indicates the mutually exclusive relationship between values of two nodes. If node 1 contains a value, some values of node 2 are not selected.
-## 5. Support for Multiple Backends
-We encapsulate the underlying architecture and unify upper-layer interfaces to adapt to multiple backends. The core functions are as follows:
-- **Module**: base class to be inherited for implementing customized modules, which unifies the implementation of internal module operations on each platform.
-- **ops**: upper-layer operator invoking interface, which unifies the names, input, and output of the same functional operator on different platforms.
-- **Serializable:** Extracts and parses hyperparameters and hierarchies in the module, and serializes them into a JSON dictionary.
-![fine_grained_space](../../images/fine_grained_space.png)
-
-## 6. How to Develop Fine-Grained Networks
-
-For algorithm developers, we want them to focus on the development of search algorithms for network structure and hyperparameters rather than the construction of the network itself. Currently, some modules and networks have been preconfigured that can provide the hyperparameter definition and architecture definition description of this type of network. Algorithm developers only need to assemble new networks using search algorithms based on the description.
-### 6.1 Defining a Module
-To facilitate your use, we inherit the development habits of pytorch. Only a few lines of changes are required to become a module of fine granularity.
-- Inherit the Module class and register it with the `ClassFactory.register(ClassType.NETWORK)`.
-- Replace the operator in nn with the operator in ops.
-- For the network structure that is executed in sequence, the network is generated in the sequence of self by default, and the forward method does not need to be implemented.
-```python
-@ClassFactory.register(ClassType.NETWORK)
-class ConvBlock(Module):
- def __init__(self, in_channels, out_channels, kernel_size=3):
- super(ConvBlock, self).__init__()
- self.conv = ops.conv2d(in_channels, out_channels, kernel_size)
- self.bn = ops.batch_norm2d(out_channels)
- self.relu = ops.relu()
- self.pool = ops.max_pool2d((2, 2))
-```
-- If special processing is required for input, rewrite the `call` method as required.
-```python
-@ClassFactory.register(ClassType.NETWORK)
-class MixedOp(Module):
- def __init__(self, C, stride, ops_cands):
- """Init MixedOp."""
- super(MixedOp, self).__init__()
- self.add_spaces(ops_cands, OPS[ops_cands](C, stride, True))
-
- def call(self, x, weights=None, *args, **kwargs):
- """Call function of MixedOp."""
- if weights is None:
- for model in self.children():
- x = model(x)
- return x
- return ops.add_n(weights[idx] * op(x) for idx, op in enumerate(self.children()) if weights[idx] != 0)
-```
-### 6.2 Using Connections to Assemble Multiple Modules
-By default, multiple networks are assembled in Sequential mode. When other connection methods are used, you need to manually invoke the connection method. In the following example, Add is used to add and combine the two networks.
-```python
-@ClassFactory.register(ClassType.NETWORK)
-class BasicBlock(Module):
- """Create BasicBlock SearchSpace."""
- def __init__(self, inchannel, outchannel, groups=1, base_width=64, stride=1):
- super(BasicBlock, self).__init__()
- base_conv = BasicConv(inchannel, outchannel)
- shortcut = ShortCut(inchannel, outchannel)
- self.add_block = Add(base_conv, shortcut)
- self.relu = ops.relu()
-```
-Developers can also define connections as follows:
-- Inherit `ConnectionsDecorator` and register with `ClassFactory.register(ClassType.NETWORK)`
-- The input parameter of the `init` function is `*models`, indicating that multiple modules are accepted. We will automatically invoke add_module to set these modules to modules.
-- Rewrite the `call` method, use `self.children()` to obtain added modules, and perform detailed operations.
-```python
-@ClassFactory.register(ClassType.NETWORK)
-class Sequential(ConnectionsDecorator):
- """Sequential Connections."""
- def __init__(self, *models):
- super(Sequential, self).__init__(*models)
-
- def compile(self, inputs):
- """Override compile function, conect models into a seq."""
- output = inputs
- models = self.children()
- for model in models:
- output = model(output)
- return output
-```
diff --git a/docs/en/developer/new_algorithm.md b/docs/en/developer/new_algorithm.md
index 02bb8577..4627fe66 100644
--- a/docs/en/developer/new_algorithm.md
+++ b/docs/en/developer/new_algorithm.md
@@ -1,5 +1,7 @@
# Algorithm Development Guide
+**Outdated and to be updated.**
+
New algorithms, such as new network search algorithms, model compression algorithm, hyperparameter optimization algorithms, and data augmentation algorithms, need to be extended based on the basic classes provided by Vega. The core of the AutoML algorithm is search space, search algorithm, network construction and evaluation. The new algorithm mainly considers these aspects.
## 1. Add a schema search algorithm
diff --git a/docs/en/user/ascend_910.md b/docs/en/user/ascend_910.md
new file mode 100644
index 00000000..f666e466
--- /dev/null
+++ b/docs/en/user/ascend_910.md
@@ -0,0 +1,153 @@
+# Deploy the Ascend environment.
+
+Deploy the Ascend environment by referring to the Ascend official document. The following
+installation guide covers the key steps of the installation. If an error occurs during the
+installation, refer to the official document.
+Before the deployment, download the installation package from the official website.
+
+## 1 Check the Installed Driver and CANN Versions
+
+For a new Ascend host, check whether the `/usr/local/HiAi` directory exists. If yes,
+run the following command as root user to uninstall the directory:
+
+```bash
+/usr/local/HiAi/uninstall.sh
+```
+
+Run the following commands as a non-root user to create the `Ascend` directory
+and make the directory accessible to the `HwHiAiUser` user:
+
+```bash
+mkdir /usr/local/Ascend/
+sudo chown -R :HwHiAiUser /usr/local/Ascend/
+sudo chmod -R 750 /usr/local/Ascend/
+```
+
+If `/usr/local/Ascend/` exists, check whether the old Driver and CANN packages have been
+installed before the installation. Run the following command to query the version number of
+each component:
+
+```bash
+cat /usr/local/Ascend/driver/version.info
+cat /usr/local/Ascend/firmware/version.info
+cat /usr/local/Ascend/nnae/latest/ascend_nnae_install.info
+cat /usr/local/Ascend/ascend-toolkit/latest/arm64-linux/ascend_toolkit_install.info
+cat /usr/local/Ascend/tfplugin/latest/ascend_tfplugin_install.info
+```
+
+If the version is older than expected, uninstall it as root user.
+
+```bash
+/usr/local/Ascend/driver/script/uninstall.sh
+/usr/local/Ascend/firmware/script/uninstall.sh
+/usr/local/Ascend/nnae/latest/script/uninstall.sh
+/usr/local/Ascend/ascend-toolkit/latest/arm64-linux/script/uninstall.sh
+/usr/local/Ascend/tfplugin/latest/script/uninstall.sh
+```
+
+If nnae, ascend-toolkit, and tfplugin are not installed by the root user, uninstall them as the user.
+
+## 2 Install the driver and CANN
+
+Run the following command as the root user to install the software. The following version
+is for reference only:
+
+```bash
+chmod +x *.run
+./A800-9000-npu-driver_21.0.3.1_linux-aarch64.run --full
+./A800-9000-npu-firmware_1.79.22.4.220.run --full
+```
+
+Run the following command to check whether the installation is successful:
+
+```bash
+npu-smi info
+```
+
+Before installing other packages as a non-root user, set this user to the same group as `HwHiAiUser`.
+
+```bash
+usermod -a -G HwHiAiUser <username>
+```
+
+```bash
+./Ascend-cann-nnae_5.0.T306_linux-aarch64.run --install
+./Ascend-cann-nnrt_5.0.T306_linux-aarch64.run --install
+./Ascend-cann-tfplugin_5.0.T306_linux-aarch64.run --install
+./Ascend-cann-toolkit_5.0.T306_linux-aarch64.run --install
+```
+
+After the installation is completed, restart the host as prompted.
+
+## 3 Configure rank_table_file.
+
+Run the `hccn_tool` command to generate `rank_table_file` by referring to the official Ascend document.
+
+## 4 Configure environment Variables
+
+The following environment variables need to be configured.
+You are advised to place them in the `~/.bashrc` file:
+
+```bash
+export HOME_DIR=/home/
+export HOST_ASCEND_BASE=/usr/local/Ascend
+export JOB_ID=
+export DEVICE_ID=0
+export RANK_TABLE_FILE=
+export RANK_ID=0
+export RANK_SIZE=8
+export NPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export BATCH_TASK_INDEX=0
+export TF_CPP_MIN_LOG_LEVEL=3
+export LD_PRELOAD=/lib64/libgomp.so.1:$HOME_DIR/.local/lib/python3.7/site-packages/sklearn/__check_build/../../scikit_learn.libs/libgomp-d22c30c5.so.1.0.0
+export GLOG_v=3
+export USE_NPU=True
+source /usr/local/Ascend/tfplugin/set_env.sh
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+source /usr/local/Ascend/nnae/set_env.sh
+export PATH=$HOME_DIR/.local/bin:$PATH
+export PYTHONPATH=$HOME_DIR/.local/lib/python3.7/site-packages:$PYTHONPATH
+export LD_LIBRARY_PATH=$HOME_DIR/.local/lib/python3.7/site-packages/vega/security/kmc/aarch64:$LD_LIBRARY_PATH
+```
+
+In the preceding command, `<user>` is the current user name. `<JOB_ID>` must be an integer,
+for example, `10087`. `<RANK_TABLE_FILE>` must be the full path of the file.
+
+## 5 Install Vega and Dependency Packages
+
+Upgrade the PIP to the latest version.
+
+```bash
+pip3 install --user --upgrade pip
+```
+
+Install the nnae, topi, and hccl component packages.
+
+```bash
+export fwk_path='/usr/local/Ascend/nnae/latest'
+export te_path=${fwk_path}'/fwkacllib/lib64/te-*.whl'
+export topi_path=${fwk_path}'/fwkacllib/lib64/topi-*.whl'
+export hccl_path=${fwk_path}'/fwkacllib/lib64/hccl-*.whl'
+pip3 install --user ${te_path}
+pip3 install --user ${topi_path}
+pip3 install --user ${hccl_path}
+```
+
+Install noah-vega. Do not install the dependency package because of the special environment of Ascend.
+
+```bash
+pip3 install --user --no-deps noah-vega
+```
+
+Run the following command to view the Vega dependency package:
+
+```bash
+pip3 show noah-vega
+```
+
+Note that the following versions must be installed for the dask and distributed packages:
+
+```bash
+pip3 install --user distributed==2021.7.0
+pip3 install --user dask==2021.7.0
+```
\ No newline at end of file
diff --git a/docs/en/user/config_reference.md b/docs/en/user/config_reference.md
index c6e61b34..dd66fcfc 100644
--- a/docs/en/user/config_reference.md
+++ b/docs/en/user/config_reference.md
@@ -157,6 +157,7 @@ fully_train:
common:
data_path: /cache/datasets/cifar10/
```
+**Note**: HCCL supports multi-machine multi-card training, while Horovod currently supports only single-machine multi-card training.
## 3. NAS and HPO configuration items
diff --git a/docs/en/user/deployment.md b/docs/en/user/deployment.md
index 6452edbe..31a9c3e2 100644
--- a/docs/en/user/deployment.md
+++ b/docs/en/user/deployment.md
@@ -6,14 +6,14 @@
The following conditions must be met when the Vega is deployed in a local cluster:
-1. Ubuntu 18.04 or later
-2. CUDA 10.0
-3. Python 3.7
-4. pip
+1. Ubuntu 18.04 or EulerOS 2.0 SP8
+2. CUDA 10.0 or CANN 20.1
+3. Python 3.7 or later
+4. pytorch, tensorflow(>1.14, <2.0) or mindspore
**Note: If you need to deploy the Ascend 910 cluster, contact us.**
-During cluster deployment, you need to install the Vega and some mandatory software packages on each cluster node by running the following commands:
+During cluster deployment, you need to install Vega on each cluster node:
```bash
pip3 install --user --upgrade noah-vega
@@ -25,23 +25,6 @@ After installing the preceding software on each host, you need to configure SSH
After the preceding operations are complete, the cluster has been deployed.
-### 1.2 Verify
-
-After the cluster is deployed, run the following command to check whether the cluster is available:
-
-```bash
-vega-verify-cluster -m -s ... -n
-```
-
-For example:
-
-```bash
-vega-verify-cluster -m 192.168.0.2 -s 192.168.0.3 192.168.0.4 -n /home/alan/nfs_folder
-```
-
-After the verification is complete, the message "All cluster check items have passed." is displayed.
-If an error occurs during the verification, please adjust the cluster based on the exception information.
-
## Reference
### Install MPI
@@ -74,6 +57,24 @@ Any two hosts on the network must support SSH mutual trust. The configuration me
### Building NFS
+NFS is a widely used system for data sharing in a cluster. If an NFS system already exists in the cluster, use the existing NFS system.
+
+The following instructions for configuring NFS may not apply to all NFS systems. Adjust the instructions based on the actual cluster environment.
+
+Before configuring the NFS server, check whether the UIDs of the current user on each host in the cluster are the same. If the UIDs are different, the NFS shared directory cannot be accessed. In this case, you need to change the UID of the current user to the same value to avoid conflicts with the UIDs of other users.
+
+To query the UID of the current user, run the following command:
+
+```bash
+id
+```
+
+Change the current UID (Change the value with caution, please contact the cluster system administrator for help):
+
+```bash
+sudo usermod -u <uid> <username>
+```
+
NFS server settings:
1. Install the NFS server.
@@ -95,13 +96,7 @@ NFS server settings:
sudo bash -c "echo '/home//nfs_cache *(rw,sync,no_subtree_check,no_root_squash,all_squash)' >> /etc/exports"
```
-4. Set the shared directory to the `nobody` user.
-
- ```bash
- sudo chown -R nobody: //nfs_cache
- ```
-
-5. Restart the NFS server.
+4. Restart the NFS server.
```bash
sudo service nfs-kernel-server restart
diff --git a/docs/en/user/evaluate_service.md b/docs/en/user/evaluate_service.md
deleted file mode 100644
index 54902d69..00000000
--- a/docs/en/user/evaluate_service.md
+++ /dev/null
@@ -1,323 +0,0 @@
-# Evaluate Service
-
-## 1. Introduction
-
-The model evaluation service is used to evaluate the performance of a model on a specific hardware device, such as the accuracy, model size, and latency of a pruned and quantized model on the Atlas 200 DK.
-
-Currently, the evaluation service supports Davincit inference chips (Atlas 200 DK, ATLAS300, and development board environment Evb) and mobile phones. More devices will be supported in the future.
-
-The evaluation service uses the CS architecture. The evaluation service is deployed on the server. The client sends an evaluation request to the server through the `REST` interface and obtains the result. Vega can use the evaluation service to detect model performance in real time during network architecture search. After a candidate network is generated in the search phase, the network model can be sent to the evaluation service. After the model evaluation is complete, the evaluation service returns the evaluation result to Vega. Vega performs subsequent search based on the evaluation result. This real-time evaluation on the actual device helps to search for a network structure that is more friendly to the actual hardware.
-
-## 2. spec
-
-Supported Models and Hardware Devices:
-
-| Algorithm | Model | Atlas 200 DK |Atlas 300 | Bolt |
-| :--: | :--: | :--: | :--: | :--: |
-| Prune-EA | ResNetGeneral | √ | √ | √ |
-| ESR-EA | ESRN | | √ | √ |
-| SR-EA | MtMSR | | √ | √ |
-| Backbone-nas | ResNet| √| √ | |
-| CARS | CARSDartsNetwork | | √ | |
-| Quant-EA | ResNetGeneral | √ | √ | √ |
-| CycleSR | CycleSRModel | | | |
-| Adlaide-EA | AdelaideFastNAS | | √ | |
-| Auto-Lane | ResNetVariantDet | | |
-| Auto-Lane | ResNeXtVariantDet | | |
-
-## 3. Evaluation Service Deployment
-
-### 3.1 Environment Installation and Configuration (Optional)
-
-Configure the hardware (Atlas 200 DK, Atlas 300, or mobile phone) by following the instructions provided in the following sections.
-
-### 3.1.1 Installing the Atlas 200 DK Environment (Optional)
-
-#### 3.1.1.1 Preparations
-
-1. An 8 GB or larger SD card and a card reader are available.
-2. A server where Ubuntu 16.04.3 has been installed.
-3. Download the system image: [ubuntu-16.04.3-server-arm64.iso](http://old-releases.ubuntu.com/releases/16.04.3/ubuntu-16.04.3-server-arm64.iso)
-4. Download the make_sd_card.py and make_ubuntu_sd.sh from .
-5. Download the developer running package mini_developerkit-1.3.T34.B891.rar from .
-6. Decompress the developer package and upload it to the user directory.
-
-#### 3.1.1.2 Installing and Configuring the Atlas200 DK
-
-1. Insert the SD card into the card reader and connect the card reader to the USB port on the Ubuntu server.
-2. Install dependencies on the Ubuntu server:
-
- ```bash
- apt-get install qemu-user-static binfmt-support python3-yaml gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
- ```
-
-3. Run the following command to query the name of the USB device where the SD card is located:
-
- ```bash
- fdisk -l
- ```
-
-4. Run the SD card making script to make a card. The USB device name is the name obtained in the previous step.
-
- ```bash
- python3 make_sd_card.py local USB Device Name
- ```
-
-5. After the card is created, remove the SD card from the card reader, insert the SD card into the card slot of the Atlas 200 DK developer board, and power on the Atlas 200 DK developer board.
-
-#### 3.1.1.3 Installing and Configuring the Evaluation Server Environment
-
-1. Downloading and Installing the DDK Package and Synchronizing the Library
-
- Download address:
- For details about the installation procedure, see the official document:
-
-2. Configuring the Cross Compilation Environment
- To install the compilation environment required by the Atlas 200 DK on the evaluation server, run the following command:
-
- ```bash
- sudo apt-get install g++-aarch64-linux-gnu
- ```
-
-3. Configure the following environment variables in `/etc/profile` of the server. The value of `/home/` in the file must be a specific path.
-
- ```bash
- export DDK_PATH=/home//huawei/ddk
- export PYTHONPATH=$DDK_PATH/site-packages/te-0.4.0.egg:$DDK_PATH/site-packages/topi-0.4.0.egg
- export LD_LIBRARY_PATH=$DDK_PATH/uihost/lib:$DDK_PATH/lib/x86_64-linux-gcc5.4
- export PATH=$PATH:$DDK_PATH/toolchains/ccec-linux/bin:$DDK_PATH/uihost/bin
- export TVM_AICPU_LIBRARY_PATH=$DDK_PATH/uihost/lib/:$DDK_PATH/uihost/toolchains/ccec-linux/aicpu_lib
- export TVM_AICPU_INCLUDE_PATH=$DDK_PATH/include/inc/tensor_engine
- export TVM_AICPU_OS_SYSROOT=/home//tools/sysroot/aarch64_Ubuntu16.04.3
- export NPU_HOST_LIB=/home//tools/1.32.0.B080/RC/host-aarch64_Ubuntu16.04.3/lib
- export NPU_DEV_LIB=/home//tools/1.32.0.B080/RC/host-aarch64_Ubuntu16.04.3/lib
- ```
-
-4. Configuring SSH Mutual Trust
- File transfer and remote command execution are required between the evaluation server and the Atlas 200 DK. Therefore, you need to configure SSH mutual trust in the two environments to ensure that the script can be automatically executed.
-
- a. Install the SSH. `sudo apt-get install ssh`
- b. Generate a key. The `ssh-keygen -t rsa` command generates the id_rsa and id_rsa.pub files in the ~/.ssh/ directory. id_rsa.pub is the public key.
- c. Check the authorized_keys file in the directory. If the file does not exist, create it and run the `chmod 600 ~/.ssh/authorized_keys` command to change the permission.
- d. Copy the public key. Copy the content of the public key id_rsa.pub to the authorized_keys file on another host.
- **Note**: Perform the preceding steps on the evaluation server and Atlas 200 DK separately to ensure SSH trust between the two servers.
-
-### 3.1.2 Installing and Configuring the Atlas 300 Environment (Optional)
-
-For details, see the Huawei official tutorial at .
-
-Note: The preceding documents may be updated. Please follow the released updates or obtain the corresponding guide documents. After the environment is installed, you need to set environment variables. For details, see the preceding guide. To facilitate environment configuration, we provide the environment variable configuration template [env_atlas300.sh](https://github.com/huawei-noah/vega/blob/master/evaluate_service/hardwares/davinci/env/env_atlas300.sh) for your reference. The actual environment prevails.
-
-The installation of the Atlas300 environment is complex. To ensure that the environment is correctly installed, please run [check_atlas300.sh](https://github.com/huawei-noah/vega/blob/master/evaluate_service/hardwares/davinci/env/check_atlas300.sh).
-
-### 3.1.3 Installing and Configuring the Mobile Phone Environment (Optional)
-
-#### 3.1.3.1 Preparations
-
-1. Prepare a Kirin 980 mobile phone. Nova 5 is recommended.
-2. A server where Ubuntu 16.04.3 has been installed.
-
-#### 3.1.3.2 Installing and Configuring the Evaluation Server and Mobile Phone
-
-1. Install the adb tool on the Linux server.
-
- ```bash
- apt install adb
- ```
-
-2. Connect the mobile phone to the evaluation server through the USB port, enable the developer option, and run the following command on the evaluation server:
-
- ```bash
- adb devices
- ```
-
- If the following information is displayed, the connection is successful:
-
- ```text
- List of devices attached
- E5B0119506000260 device
- ```
-
-#### 3.1.3.3 Handling Device Connection Failures
-
-If you cannot obtain the device by running the `adb devices` command on the server, perform the following steps to connect to the device:
-
-1. Run the `lsusb` command on the evaluation server. The device list is displayed. Find the device ID.
-
-2. Edit the 51-android.rules file.
-
- ```bash
- sudo vim /etc/udev/rules.d/51-android.rules
- ```
-
- Write the following content:
-
- ```text
- SUBSYSTEM=="usb", ATTR{idVendor}=="12d1", ATTR{idProduct}=="107e", MODE="0666"
- ```
-
- Note: 12d1 and 107e are the IDs queried in the previous step.
-
-3. Edit the adb_usb.ini file.
-
- ```bash
- vim -/.android/adb_usb.ini
- ```
-
- Write the following content:
-
- ```text
- 0x12d1
- ```
-
- Note: 12d1 is the ID queried in step 5.1.
-
-4. Restart the ADB service.
-
- ```bash
- sudo adb kill-server
- sudo adb start-server
- ```
-
-5. Run the `adb devices` command again to check whether the connection is successful.
-
-
-### 3.1.4 Installing and Configuring the NPU Environment for Kirin 990 Mobile Phones (Optional)
-3.1.4.1 Preparations
-1. Prepare a Kirin 990 phone. The Mate30 Pro is recommended.
-2. A server on which ubuntu 16.04.3 has been installed.
-
-3.1.4.2 Installation and Deployment
-1 Download the HUAWEI HiAI DDK from https://developer.huawei.com/consumer/cn/doc/development/hiai-Library/ddk-download-0000001053590180, Download hwhiai-ddk-100.500.010.010.zip, and decompress it to the /data/tools/ directory. The directory structure is "/data/tools/hwhiai-ddk-100.500.010.010/".
-2 Copy the dependent files to the mobile phone.
-Copy all contents in the tools_sysdbg directory to the /data/local/tmp directory on the mobile phone.
-```bash
-adb push /data/tools/hwhiai-ddk-100.500.010.010/tools/tools_sysdbg/* /data/local/tmp/
-```
-3 Log in to the mobile phone, set environment variables, and add the file execution permission.
-```bash
-adb shell
-export LD_LIBRARY_PATH=/data/local/tmp/
-chmod +x /data/local/tmp/model_run_tool
-chmod +x /data/local/tmp/data_proc_tool
-```
-4 Installing the ADB Debug Tool
-Reference to section 3.1.3.2.
-
-### 3.2 Installing and Starting the Evaluation Service
-
-1 Installation: Install the vega on the evaluation server, and add the `--no-dependencies` parameter during installation. Do not install dependencies.
-2 Start: Run the `vega-evaluate_service-service -i {your_ip_adress} -w {your_work_path}` command. The `-i` parameter specifies the IP address of the current server and
-the `-w` parameter specifies the working path, please use absolute path. The intermediate files generated during program running are stored in this directory.
-For details about other optional parameters, see the help information of this command. Generally, the default values are recommended.
-
-## 4. Use evaluate service
-
-To use evaluate service, you only need to configure a few lines in the configuration file, as shown in the following example.
-
-```yaml
-evaluator:
- type: Evaluator
- device_evaluator:
- type: DeviceEvaluator
- hardware: "Davinci"
- remote_host: "http://192.168.0.2:8888"
-```
-
-The configuration of `evaluator` is at the same level as your configuration of `trainer`. Two parameters need to be configured. `hardware` indicates the hardware device to be evaluated. Currently, `Davinci` and `Bolt` are supported. `remote_host` indicates the IP address and port number of the evaluation server to be deployed.
-
-## 5. Customizing the Evaluation Service (Optional)
-
-Evaluate service supports devices such as Davinci inference chips and mobile phones. However, new hardware devices are emerging. Therefore, Vega provides customized scalability.
-
-The process of the evaluate service is as follows:
-
-1. obtaining input information
-2. Instantiate a specific hardware instance according to the hardware to be evaluated
-3. Model conversion
-4. inference
-5. Return the inference result
-
-Steps 3 and 4 may be different for different hardware. Therefore, when new hardware needs to be added, perform the two steps based on the hardware usage. Specifically, the procedure is as follows:
-
-Add a hardware class to the hardwares directory and implement the `convert_model` and `inference` interfaces as follows:
-
- ```python
-from class_factory import ClassFactory
-
-@ClassFactory.register()
-class MyHardware(object):
-
- def __init__(self, optional_params):
- pass
-
- def convert_model(self, backend, model, weight, **kwargs):
- pass
-
- def inference(self, converted_model, input_data, **kwargs):
-
- return latency, output
-```
-
-In the preceding example, the `MyHardware` class is defined and registered through `@ClassFactory.register()`.
-
-The class implements the `convert_model` and `inference` interfaces, `backend` indicates the training framework through which the model is saved, for example, `pytorch` and `tensorflow`, which provide necessary auxiliary information for model parsing. `model` and `weight` indicate the training framework through which the model is saved, respectively.
-
-Model and weight to be converted. The value of weight is optional and may be empty. `converted_model` and `input_data` indicate the converted model and input data, respectively.
-
-Add the class to `__init__.py` of the hardware.
-
-```python
-from .my_hardware import MyHardware
-```
-
-## 6. FAQ
-
-### 6.1 Convert pytorch model to caffe model
-
-If you need to convert the pytorch model to caffe model, download [PytorchToCaffe](https://github.com/xxradon/PytorchToCaffe) and store it in the `./third_party` directory (the third_party directory and vega directory are at the same directory level).
-
-Note: The third-party open-source software does not support pytorch1.1. If you use the model in the native torchvisoin and the torchvision version is later than 0.2.0, you need to make the following additional modifications:
-Add the following content to the `pytorch_to_caffe.py` file:
-
-```python
-
-def _flatten(raw , input, * args):
- x = raw(input, *args)
- if not NET_INITTED:
- return x
- layer_name=log.add_layer(name='flatten')
- top_blobs=log.add_blobs([x],name='flatten_blob')
- layer=caffe_net.Layer_param(name=layer_name,type='Reshape',
- bottom=[log.blobs(input)],top=top_blobs)
- start_dim = args[0]
- end_dim = len(x.shape)
- if len(args) > 1:
- end_dim = args[1]
- dims = []
- for i in range(start_dim):
- dims.append(x.shape[i])
- cum = 1
- for i in range(start_dim, end_dim):
- cum = cum * x.shape[i]
- dims.append(cum)
- if end_dim != len(x.shape):
- cum = 1
- for i in range(end_dim, len(x.shape)):
- cum = cum * x.shape[i]
- dims.append(cum)
- layer.param.reshape_param.shape.CopyFrom(caffe_net.pb.BlobShape(dim=dims))
- log.cnet.add_layer(layer)
- return x
-
-
-torch.flatten = Rp(torch.flatten,_flatten)
-```
-
-### 6.2 Model evaluation of Pytorch 1.2 and earlier versions
-
-If the `Pytorch` version is 1.2 or earlier, operators may not be supported when the `Pytorch` model is converted to the `onnx` model. If the `upsample_bilinear2d` operator is not supported, you can upgrade the `Pytorch` version to 1.3 or later, or you can obtain `pytorch/torch/onnx/symbolic_opset10.py`, from the `Pytorch` official code library and copy it to the `Pytorch` installation directory.
-
-### 6.3 Failed to find scripts such as model_convert.sh
-
-There are many `shell` scripts in the evaluation service. The file format must be `unix`. If you have opened a file in Windows or converted the file when downloading the code, the file format may be changed to DOS. Pay attention to the file format.
diff --git a/docs/en/user/faq.md b/docs/en/user/faq.md
index d9d45170..9cb22ae7 100644
--- a/docs/en/user/faq.md
+++ b/docs/en/user/faq.md
@@ -2,22 +2,14 @@
## 1. Exceptions
-### 1.1 Exception `ModuleNotFoundError: No module named 'mmdet'`
-
-To run algorithms such as SP-NAS, you need to install the open-source software mmdetection. For details, see the installation guide of the software.
-
-### 1.2 Exception `ModuleNotFoundError: No module named 'nasbench'`
-
-Before running the benchmark, install the open-source software NASBench. For details, see the installation guide of the software.
-
-### 1.3 Exception `Exception: Failed to create model, model desc={}`
+### 1.1 Exception `Exception: Failed to create model, model desc={}`
The possible causes are as follows:
1. The network is not registered with the Vega. Before invoking the network, you need to use `@ClassFactory.register` to register the network. For details, see .
2. The model description file of the network is incorrect. You can locate the fault based on `` in the exception information.
-### 1.5 Exception `ImportError: libgthread-2.0.so.0: cannot open shared object file: No such file or directory`
+### 1.2 Exception `ImportError: libgthread-2.0.so.0: cannot open shared object file: No such file or directory`
The opencv-python system dependency library is missing. Run the following command:
@@ -25,7 +17,7 @@ The opencv-python system dependency library is missing. Run the following comman
sudo apt install libglib2.0-0
```
-### 1.6 Exception `ModuleNotFoundError: No module named'skbuild '` or stuck in `Running setup.py bdist_wheel for opencv-python-headless...` during installation
+### 1.3 Exception `ModuleNotFoundError: No module named'skbuild '` or stuck in `Running setup.py bdist_wheel for opencv-python-headless...` during installation
The possible cause is that the PIP version is too early. Run the following command:
@@ -33,18 +25,14 @@ The possible cause is that the PIP version is too early. Run the following comma
pip3 install --user --upgrade pip
```
-### 1.7 Exception `PermissionError: [Errno 13] Permission denied: 'dask-scheduler'`, `FileNotFoundError: [Errno 2] No such file or directory: 'dask-scheduler': 'dask-scheduler'`, or `vega: command not found`
+### 1.4 Exception `PermissionError: [Errno 13] Permission denied: 'dask-scheduler'`, `FileNotFoundError: [Errno 2] No such file or directory: 'dask-scheduler': 'dask-scheduler'`, or `vega: command not found`
This type of exception is usually caused by the failure to find `dask-scheduler` in `PATH`. Generally, the file is installed in `//.local/bin`.
After the Vega is installed , `//.local/bin/` is automatically added to the `PATH` environment variable. The setting does not take effect immediately. You can run the ls command `source ~/.profile` or log in again to make the setting take effect.
If the problem persists, check whether the dask-scheduler file exists in the `//.local/bin` directory.
If the file already exists, manually add `//.local/bin` to the environment variable `PATH`.
-### 1.8 Exception During Pytorch model evaluation: `FileNotFoundError: [Errno 2] No such file or directory: '/torch2caffe.prototxt'`
-
-For details, see section 6.1 in [Evaluate Service](./evaluate_service.md).
-
-## 2. Common Configuration Problems
+## 2. Configuration Issues
### 2.1 How do I configure multi-GPU/NPU
@@ -112,29 +100,11 @@ general:
level: info # debug|info|warn|error|
```
-### 2.5 How do I view the search progress in real time
-
-Vega provides the visualized progress of the model search process. User could set `VisualCallBack` within `USER.yml` as follow,
-
-```yaml
- trainer:
- type: Trainer
- callbacks: [VisualCallBack, ]
-```
-
-The output directory of the visualized information is as follows:
-
-```text
-./tasks//visual
-```
-
-Run the `tensorboard --logdir PATH` command on the active node to start the service and view the progress in the browser. For details, see TensorBoard commands and instructions.
-
-### 2.6 How Do I Stop the VEGA Program Running in the Background
+### 2.5 How Do I Stop the VEGA Program Running in the Background
If only the main Vega process is killed, some processes will not be stopped in time, and the resources occupied by the processes will not be released.
-The Vega application can be terminated using the following command:
+In safe mode, the Vega application can be terminated using the following command:
```bash
# Query the process ID of the running Vega main program.
@@ -149,19 +119,56 @@ vega-kill -a
vega-kill -f
```
-### 2.6 How Do I Stop the Vega Program Running in the Background?
+In common mode, run the following command:
-In the multi-GPU/NPU scenario, Vega starts the dask scheduler, dask worker, and trainer. If only the main Vega process is killed, some processes are not stopped in time and the resources occupied by these processes are not released.
+```bash
+vega-kill -s -l
+vega-kill -s -p
+vega-kill -s -a
+vega-kill -s -f
+```
-Run the following command to stop the Vega application:
+### 2.6 How Do I Query the Running Vega Program
+
+In safe mode, run the following command to query the running Vega applications:
```bash
-# Query the process ID of the running Vega main program.
-vega-kill -l
-# Stop a Vega main program and related processes.
-vega-kill -p
-# Or stop all Vega processes at a time.
-vega-kill -a
-# If the main program is closed normally and there are still residual processes, you can forcibly clear the process.
-vega-kill -f
+vega-process
+```
+
+In common mode, you can run the following command to query:
+
+```bash
+vega-process -s
+```
+
+### 2.7 How Do I Query the Vega Program Running Progress
+
+In safe mode, you can run the following command to query the running progress of the Vega program:
+
+```bash
+vega-progress -t -r
```
+
+In common mode, you can run the following command to query:
+
+```bash
+vega-progress -s -t -r
+```
+
+### 2.8 How to Perform Model Inference Using the Vega Program
+
+Classification model inference can be performed with the command `vega-inference`, and detection model inference can be performed with the command `vega-inference-det`.
+
+Run the following command to query the command parameters:
+
+```bash
+vega-inference --help
+vega-inference-det --help
+```
+
+## 3. Precautions
+
+### 3.1 Reserve Sufficient Disk Space
+
+During Vega running, there is a model that caches each searched network. When the number of searched networks is large, a large amount of storage space is required. Reserve sufficient disk space based on the number of search network models for each search algorithm.
diff --git a/docs/en/user/install.md b/docs/en/user/install.md
index 7e8a16f8..059159d1 100644
--- a/docs/en/user/install.md
+++ b/docs/en/user/install.md
@@ -6,8 +6,8 @@ The host where the Vega is installed has a GPU and meets the following requireme
1. Ubuntu 18.04 or EulerOS 2.0 SP8
2. CUDA 10.0 or CANN 20.1
-3. Python 3.7
-4. pip3
+3. Python 3.7 or later
+4. PyTorch, TensorFlow (>1.14, <2.0), or MindSpore
## 2. Installing Vega
diff --git a/docs/en/user/security_configure.md b/docs/en/user/security_configure.md
new file mode 100644
index 00000000..33df6d02
--- /dev/null
+++ b/docs/en/user/security_configure.md
@@ -0,0 +1,260 @@
+# VEGA security configuration
+
+The security configuration of the Vega includes the following steps:
+
+1. Install OpenSSL
+2. Generate the CA root certificate
+3. Generate the certificate for evaluate_services
+4. Generate the certificate for dask
+5. Encrypt the private key password
+6. Configure security-related configuration files
+7. Configure the evaluation service daemon service
+8. Install dask and distributed
+9. Configuring the HCCL trustlist
+10. Precautions
+
+## 1. Install OpenSSL
+
+You need to install OpenSSL 1.1.1, either by compiling and installing it from the source code or by directly installing a precompiled release package.
+
+Install the Python interface of the OpenSSL as follows:
+
+```shell
+pip3 install --user pyOpenSSL==19.0.0
+```
+
+## 2. Generate the CA Certificate
+
+Run the following command to generate a CA certificate:
+
+```shell
+openssl genrsa -out ca.key 4096
+openssl req -new -x509 -key ca.key -out ca.crt -subj "/C=/ST=/L=/O=/OU=/CN="
+```
+
+Note: The values of the `C`, `ST`, `L`, `O`, `OU`, and `CN` fields in `-subj` should be set based on the actual situation. The same applies to the other certificate configurations in this document.
+In addition, the CA configuration must be different from other configurations.
+
+## 3. Generate the Certificate for Evaluate_service
+
+The evaluation service supports encryption certificates and common certificates.
+
+1. If an encryption certificate is used, install Huawei KMC security components. For details, see section "Generating an Encryption Certificate."
+2. If a common certificate is used, see section "Generating a Common Certificate."
+
+### 3.1 Generating the Encryption Certificate
+
+Run the following commands to generate the encryption private key for the server of evaluate_service. When you run this command, the system prompts you to enter the encryption password. The password strength requirements are as follows:
+
+1. The password must contain at least eight characters.
+2. The password must contain at least one uppercase letter.
+3. The password must contain at least one lowercase letter.
+4. The password must contain at least one digit.
+
+```shell
+openssl genrsa -aes-256-ofb -out server.key 4096
+```
+
+Run the following commands to generate a certificate and delete the temporary file:
+
+```shell
+openssl req -new -key server.key -out server.csr -extensions v3_ca -subj "/C=/ST=/L=/O=/OU=/CN="
+openssl x509 -req -in server.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out server.crt
+rm server.csr
+```
+
+Run the following commands to generate the encryption private key of the certificate for the client of evaluate_service. When you run this command, the system prompts you to enter the encryption password. The password strength must be the same as that of the server private key and is different from that of the server private key. Record the password and use it later.
+
+```shell
+openssl genrsa -aes-256-ofb -out client.key 4096
+```
+
+Run the following commands to generate a certificate and delete the temporary file:
+
+```shell
+openssl req -new -key client.key -out client.csr -extensions v3_ca -subj "/C=/ST=/L=/O=/OU=/CN="
+openssl x509 -req -in client.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out client.crt
+rm client.csr
+```
+
+### 3.2 Generating the Common Certificate
+
+Run the following commands to generate the private key and certificate for server and client of evaluate_service:
+
+```shell
+openssl genrsa -out server.key 4096
+openssl req -new -key server.key -out server.csr -extensions v3_ca -subj "/C=/ST=/L=/O=/OU=/CN="
+openssl x509 -req -in server.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out server.crt
+rm server.csr
+
+openssl genrsa -out client.key 4096
+openssl req -new -key client.key -out client.csr -extensions v3_ca -subj "/C=/ST=/L=/O=/OU=/CN="
+openssl x509 -req -in client.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out client.crt
+rm client.csr
+```
+
+## 4. Generate the Certificate for Dask
+
+Run the following commands to generate the private key and certificate for server and client of dask:
+
+```shell
+openssl genrsa -out server_dask.key 4096
+openssl req -new -key server_dask.key -out server_dask.csr -extensions v3_ca -subj "/C=/ST=/L=/O=/OU=/CN="
+openssl x509 -req -in server_dask.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out server_dask.crt
+rm server_dask.csr
+
+openssl genrsa -out client_dask.key 4096
+openssl req -new -key client_dask.key -out client_dask.csr -extensions v3_ca -subj "/C=/ST=/L=/O=/OU=/CN="
+openssl x509 -req -in client_dask.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out client_dask.crt
+rm client_dask.csr
+```
+
+Run the following command to delete the CA private key:
+
+```shell
+rm ca.key
+```
+
+## 5. Encrypting the Private Key Password
+
+If the encryption certificate is used, perform the rest of this section. If the common certificate is used, skip this section.
+
+To encrypt the private key passwords of the server and client for evaluate_service, you need to install Huawei KMC security component and add the directory where the dynamic link library of the security component is located to `LD_LIBRARY_PATH`.
+
+```shell
+export LD_LIBRARY_PATH=<KMC dynamic library path>:$LD_LIBRARY_PATH
+```
+
+Install Vega and use the password encryption tool to encrypt the password.
+When running the following command, enter the password entered during private key generation. This command will generate an encrypted password. Save the two encrypted passwords in the configuration file:
+
+```shell
+vega-encrypt_key --cert=server.crt --key=server.key --key_component_1=ksmaster_server.dat --key_component_2=ksstandby_server.dat
+vega-encrypt_key --cert=client.crt --key=client.key --key_component_1=ksmaster_client.dat --key_component_2=ksstandby_client.dat
+```
+
+## 6. Configure Security-related Configuration Files
+
+Create the `.vega` directory in the home directory of the current user, copy the generated keys, certificates, and encryption materials to this directory, and change the permission.
+
+```shell
+mkdir ~/.vega
+mv * ~/.vega/
+chmod -R 600 ~/.vega
+```
+
+Description:
+
+1. The preceding keys, certificates, and encryption materials can also be stored in other directories. The access permission must be set to 600 and the file location must be changed in subsequent configuration files.
+2. In the train cluster, reserve `ca.crt`, `client.key`, `client.crt`, `ksmaster_client.dat`, `ksstandby_client.dat`, `server_dask.key`, `server_dask.crt`, `client_dask.key`, and `client_dask.crt`, and delete other files.
+3. In the evaluate service, reserve `ca.crt`, `server.key`, `server.crt`, `ksmaster_server.dat`, and `ksstandby_server.dat` files, and delete other files.
+
+Create `server.ini` and `client.ini` in the `~/.vega` directory.
+
+In the train cluster, configure `~/.vega/server.ini` and `~/.vega/client.ini`.
+
+server.ini:
+
+```ini
+[security]
+ca_cert=<~/.vega/ca.crt>
+server_cert_dask=<~/.vega/server_dask.crt>
+server_secret_key_dask=<~/.vega/server_dask.key>
+client_cert_dask=<~/.vega/client_dask.crt>
+client_secret_key_dask=<~/.vega/client_dask.key>
+```
+
+client.ini:
+
+```ini
+[security]
+ca_cert=<~/.vega/ca.crt>
+client_cert=<~/.vega/client.crt>
+client_secret_key=<~/.vega/client.key>
+encrypted_password=<encrypted password> # If a common certificate is used, leave this parameter blank.
+key_component_1=<~/.vega/ksmaster_client.dat> # If a common certificate is used, leave this parameter blank.
+key_component_2=<~/.vega/ksstandby_client.dat> # If a common certificate is used, leave this parameter blank.
+```
+
+On the evaluation server, configure `~/.vega/vega.ini`.
+
+```ini
+[security]
+ca_cert=<~/.vega/ca.crt>
+server_cert=<~/.vega/server.crt>
+server_secret_key=<~/.vega/server.key>
+encrypted_password=<encrypted password> # If a common certificate is used, leave this parameter blank.
+key_component_1=<~/.vega/ksmaster_server.dat> # If a common certificate is used, leave this parameter blank.
+key_component_2=<~/.vega/ksstandby_server.dat> # If a common certificate is used, leave this parameter blank.
+```
+
+## 7. Configuring the Evaluation Service Daemon Service
+
+The systemd is used to manage the evaluation service process. If the process becomes abnormal, systemd restarts it automatically to ensure the continuity of the evaluation service.
+
+Create a script `run_evaluate_service.sh` for starting the evaluation service. Replace `<ip>` and `<work path>` with the actual IP address and directory.
+
+```shell
+vega-evaluate_service-service -i <ip> -w <work path>
+```
+
+Create a daemon service file `evaluate-service`. The script content is as follows. Replace `<script path>` with the actual location of the startup script.
+
+```ini
+[Unit]
+Description=Vega Evaluate Service Daemon
+[Service]
+Type=forking
+ExecStart=<script path>/run_evaluate_service.sh
+Restart=always
+RestartSec=60
+[Install]
+WantedBy=multi-user.target
+```
+
+Copy `evaluate-service` to the `/usr/lib/systemd/system` directory and start the service.
+
+```shell
+sudo cp evaluate-service /usr/lib/systemd/system/
+sudo systemctl daemon-reload
+sudo systemctl start evaluate-service
+```
+
+## 8. Install Dask and Distributed
+
+When Vega is installed, the latest versions of Dask and Distributed are automatically installed. In the current version, a bug exists when the dashboard is disabled in Distributed. You need to run the following commands to install the two components of the following versions:
+
+```shell
+pip3 install --user dask==2.11.0
+pip3 install --user distributed==2.11.0
+```
+
+## 9. Configuring the HCCL Trustlist
+
+For details, see the [Configuration Guide](https://support.huawei.com/enterprise/en/doc/EDOC1100206669/8e964064) provided by the Ascend.
+
+## 10. Precautions
+
+### 10.1 Model Risks
+
+For an AI framework, a model is a program. A model may read and write files and send network data. For example, TensorFlow provides the local operation API tf.read_file, tf.write_file. The return value is an operation that can be directly executed by TensorFlow.
+Therefore, exercise caution when using a model with unknown sources. Before using the model, check whether malicious operations exist in the model to eliminate security risks.
+
+### 10.2 Risks of Running Scripts
+
+The script_runner function provided by Vega can invoke external scripts to perform hyperparameter optimization. Check the script source and ensure that no malicious operation exists. Exercise caution when running scripts from unknown sources.
+
+### 10.3 Do Not Use KMC Components By Different Users At The Same Time
+
+If the KMC component is used to encrypt the private key password, note that different users cannot use the KMC component at the same time.
+To switch user, run the following command as the root user to query the current semaphore:
+
+```bash
+ipcs
+```
+
+Run the following command to delete the corresponding semaphore by its key:
+
+```bash
+ipcrm -S '<semaphore key>'
+```
diff --git a/docs/images/fine_grained_space.png b/docs/images/fine_grained_space.png
deleted file mode 100644
index 8e8362a3..00000000
Binary files a/docs/images/fine_grained_space.png and /dev/null differ
diff --git a/docs/images/search_space_classes.png b/docs/images/search_space_classes.png
deleted file mode 100644
index 8365a438..00000000
Binary files a/docs/images/search_space_classes.png and /dev/null differ
diff --git a/docs/images/search_space_flow.png b/docs/images/search_space_flow.png
deleted file mode 100644
index da923083..00000000
Binary files a/docs/images/search_space_flow.png and /dev/null differ
diff --git a/evaluate_service/LICENSE b/evaluate_service/LICENSE
new file mode 100644
index 00000000..0bb898dc
--- /dev/null
+++ b/evaluate_service/LICENSE
@@ -0,0 +1,188 @@
+ Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+Apache License, Version 2.0
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
diff --git a/evaluate_service/MANIFEST.in b/evaluate_service/MANIFEST.in
new file mode 100644
index 00000000..33f2919e
--- /dev/null
+++ b/evaluate_service/MANIFEST.in
@@ -0,0 +1,9 @@
+#dispatch files to site-packages
+recursive-include docs *
+recursive-include evaluate_service *
+include LICENSE
+include MANIFEST.in
+include README.cn.md
+include README.md
+include RELEASE.md
+include setup.py
diff --git a/evaluate_service/README.cn.md b/evaluate_service/README.cn.md
new file mode 100644
index 00000000..6e72cfba
--- /dev/null
+++ b/evaluate_service/README.cn.md
@@ -0,0 +1,142 @@
+# Vega 评估服务
+
+**中文 | [English](./README.md)**
+
+---
+
+## 1. 简介
+
+模型评估服务是用于评估模型在特定硬件设备上的性能,如评估剪枝和量化后的模型在Atlas200 DK、Atlas300上的准确率、模型大小和时延等。
+
+评估服务目前支持的硬件设备为Davinci推理芯片(Atlas200 DK、ATLAS300产品和开发板环境Evb)和手机,后继会扩展支持更多的设备。
+
+评估服务为CS架构, 评估服务在服务端部署, 客户端通过`REST`接口向服务端发送评估请求和获取结果。Vega在进行网络架构搜索时,可以利用评估服务进行实时检测模型性能。在搜索阶段产生备选网络后,可以将该网络模型发送给评估服务,评估服务完成模型评估后,返回评估结果给Vega,Vega根据评估结果,进行后继的搜索。这种实时的在实际的设备上的评估,有利于搜索出对实际硬件更加友好的网络结构。
+
+## 2. 规格
+
+支持的模型和硬件设备
+
+| 算法 | 模型 | Atlas 200 DK |Atlas 300 | Bolt |
+| :--: | :--: | :--: | :--: | :--: |
+| Prune-EA | ResNetGeneral | √ | √ | √|
+| ESR-EA | ESRN | | √ | √ |
+| SR-EA | MtMSR | | √ | √ |
+| Backbone-nas | ResNet | √ | √ | |
+| CARS | CARSDartsNetwork | | √ | |
+| Quant-EA | ResNetGeneral | √ | √ | √ |
+| CycleSR | CycleSRModel | | | |
+| Adelaide-EA | AdelaideFastNAS | | √ | |
+| Auto-Lane | ResNetVariantDet | | | |
+| Auto-Lane | ResNeXtVariantDet | | | |
+
+## 3. 评估服务部署
+
+以下介绍Atlas 300评估服务的部署过程,若需要部署Atlas 200DK或者ARM芯片手机,请联系我们。
+
+### 3.1 安装配置Atlas300环境
+
+首先需要配置Ascend 300环境,请参考[配置文档](./ascend_310.md)。
+
+然后请安装评估服务,请执行如下命令安装:
+
+```bash
+pip3 install --user --upgrade evaluate-service
+```
+
+安装完成后,将`~/.local/lib/python3.7/site-packages/evaluate_service/hardwares/davinci/samples/atlas300`拷贝到当前目录,执行如下操作,检查环境是否配置正确:
+
+```bash
+echo "[INFO] start check the environment..."
+python3 -c "import te" && echo "[INFO] check te success"
+python3 -c "import topi" && echo "[INFO] check topi success"
+atc --version && echo "[INFO] check atc success"
+echo "[INFO] start compile the example..."
+cd ./atlas300/
+mkdir -p build/intermediates/host
+cd build/intermediates/host
+cmake ../../src -DCMAKE_CXX_COMPILER=g++ -DCMAKE_SKIP_RPATH=TRUE
+make && echo "[INFO] check the env success!"
+```
+
+### 3.2 启动评估服务
+
+使用如下命令启动评估服务:
+
+```shell
+vega-evaluate_service-service -i {your_ip_address} -p {port} -w {your_work_path}
+```
+
+其中:
+
+- `-i`参数指定当前使用的服务器的ip地址
+- `-p`参数指定当前使用的服务器的的监听端口,默认值8888
+- `-w`参数指定工作路径, 程序运行时的中间文件将存储在该目录下,请使用绝对路径
+
+注意:
+
+以上启动命令会启动安全模式,需要预先进行安全配置,请参考[安全配置](https://github.com/huawei-noah/vega/tree/master/docs/cn/user/security_configure.md)。
+
+也可以使用`-s`参数,启用普通模式,不需要如上配置,命令如下:
+
+```shell
+vega-evaluate_service-service -s -i {your_ip_address} -w {your_work_path}
+```
+
+## 4. 使用评估服务
+
+使用评估服务时, 需要在Vega调用的配置文件中做如下配置:
+
+```yaml
+evaluator:
+ type: Evaluator
+ device_evaluator:
+ type: DeviceEvaluator
+ hardware: "Davinci"
+ remote_host: "https://<ip>:<port>"
+```
+
+其中:
+
+- `evaluator`的配置和`trainer`配置处于同一层级。
+- `hardware`为评估的硬件设备,当前支持`Davinci`和`Bolt`两种。
+- `remote_host`为评估服务器的ip和端口号,对于普通模式,请设置为:`http://<ip>:<port>`
+
+## 5. 自定义评估服务
+
+vega评估服务当前已经支持Davinci推理芯片和手机等端侧设备的评估, 但新的硬件设备是层出不穷的, 因此评估服务提供了可自定义的扩展能力。
+
+评估服务的流程是:
+
+1. 获取输入信息
+2. 根据需要评估的硬件实例化一个具体的硬件实例
+3. 模型转换
+4. 推理
+5. 返回推理结果
+
+对于不同的硬件, 步骤3和4可能是不同的。 因此当需要添加新的硬件时, 需要根据具体硬件的用法实现这2个步骤。具体来说, 分以下几个步骤:
+
+在hardwares目录下添加一个硬件类, 并实现`convert_model`和`inference`两个接口 如下:
+
+```python
+from class_factory import ClassFactory
+@ClassFactory.register()
+class MyHardware(object):
+
+ def __init__(self, optional_params):
+ pass
+
+ def convert_model(self, backend, model, weight, **kwargs):
+ pass
+
+ def inference(self, converted_model, input_data, **kwargs):
+
+ return latency, output
+```
+
+上面的示例中定义了`MyHardware`类, 并通过`@ClassFactory.register()`进行注册。 类中实现了`convert_model`和`inference`两个接口, `backend`表示模型是通过何种训练框架保存的, 如`pytorch`, `tensorflow`等, 为模型解析提供必要的辅助信息,`model`和`weight`分别表示需要转换的模型和权重,`weight`是非必须的,其值可能为空。`converted_model`和`input_data`分别表示转换之后的模型和输入数据。
+
+然后在hardware的`__init__.py`中加入自定义的类。
+
+```python
+from .my_hardware import MyHardware
+```
diff --git a/evaluate_service/README.md b/evaluate_service/README.md
new file mode 100644
index 00000000..08342e00
--- /dev/null
+++ b/evaluate_service/README.md
@@ -0,0 +1,142 @@
+# Vega Evaluate Service
+
+**English | [中文](./README.cn.md)**
+
+---
+
+## 1. Introduction
+
+The model evaluation service is used to evaluate the performance of a model on a specific hardware device, such as the accuracy, model size, and latency of a pruned and quantized model on the Atlas 200 DK.
+
+Currently, the evaluation service supports Davinci inference chips (Atlas 200 DK, ATLAS300, and development board environment Evb) and mobile phones. More devices will be supported in the future.
+
+The evaluation service uses the CS architecture. The evaluation service is deployed on the server. The client sends an evaluation request to the server through the `REST` interface and obtains the result. Vega can use the evaluation service to detect model performance in real time during network architecture search. After a candidate network is generated in the search phase, the network model can be sent to the evaluation service. After the model evaluation is complete, the evaluation service returns the evaluation result to Vega. Vega performs subsequent search based on the evaluation result. This real-time evaluation on the actual device helps to search for a network structure that is more friendly to the actual hardware.
+
+## 2. Specifications
+
+Supported Models and Hardware Devices:
+
+| Algorithm | Model | Atlas 200 DK |Atlas 300 | Bolt |
+| :--: | :--: | :--: | :--: | :--: |
+| Prune-EA | ResNetGeneral | √ | √ | √ |
+| ESR-EA | ESRN | | √ | √ |
+| SR-EA | MtMSR | | √ | √ |
+| Backbone-nas | ResNet| √| √ | |
+| CARS | CARSDartsNetwork | | √ | |
+| Quant-EA | ResNetGeneral | √ | √ | √ |
+| CycleSR | CycleSRModel | | | |
+| Adelaide-EA | AdelaideFastNAS | | √ | |
+| Auto-Lane | ResNetVariantDet | | | |
+| Auto-Lane | ResNeXtVariantDet | | | |
+
+## 3. Evaluation Service Deployment
+
+### 3.1 Environment installation and configuration (Optional)
+
+Configure the hardware (Atlas 200 DK, Atlas 300, or mobile phone) by following the instructions provided in the following sections.
+
+### 3.1.1 Install the Atlas 200DK environment (Optional)
+
+Please contact us.
+
+### 3.1.2 Install and configure the Atlas 300 Environment (Optional)
+
+For details, see the Huawei official tutorial on the Ascend documentation website.
+
+Note: The preceding documents may be updated. Please follow the released updates or obtain the corresponding guide documents. After the environment is installed, you need to set environment variables. For details, see the preceding guide. To facilitate environment configuration, we provide the environment variable configuration template [env_atlas300.sh](https://github.com/huawei-noah/vega/blob/master/evaluate_service/hardwares/davinci/env/env_atlas300.sh) for your reference. The actual environment prevails.
+
+The installation of the Atlas300 environment is complex. To ensure that the environment is correctly installed, please run [check_atlas300.sh](https://github.com/huawei-noah/vega/blob/master/evaluate_service/hardwares/davinci/env/check_atlas300.sh).
+
+### 3.1.3 Install and configure the mobile environment (Optional)
+
+Please contact us.
+
+### 3.1.4 Install and configure the NPU environment for Kirin 990 mobile (Optional)
+
+Please contact us.
+
+
+### 3.2 Start the evaluation service
+
+Run the following command to start the evaluate service:
+```shell
+vega-evaluate_service-service -i {your_ip_address} -p {port} -w {your_work_path}
+```
+
+where:
+- `-i` indicates the IP of the server
+- `-p` indicates the listening port, default is 8888
+- `-w` indicates the work dir, please use the absolute path
+
+Note:
+The above command will run in security mode, the security configurations need to be performed in advance.
+please refer to [security configure](https://github.com/huawei-noah/vega/tree/master/docs/cn/user/security_configure.md).
+
+You can also use the `-s` parameter to enable the common mode. The security configuration is not required. The command is as follows:
+```shell
+vega-evaluate_service-service -s -i {your_ip_address} -w {your_work_path}
+```
+
+## 4. Use evaluate service
+
+To use evaluate service, you only need to configure a few lines in the configuration file, as shown in the following example.
+
+```yaml
+evaluator:
+ type: Evaluator
+ device_evaluator:
+ type: DeviceEvaluator
+ hardware: "Davinci"
+ remote_host: "https://<ip>:<port>"
+```
+
+where:
+- `evaluator` is at the same level as your configuration of `trainer`.
+- `hardware` indicates the hardware device to be evaluated. Currently, `Davinci` and `Bolt` are supported.
+- `remote_host` indicates the IP address and port of the evaluation server. For common mode, please set as
+`http://<ip>:<port>`
+
+## 5. Customizing the Evaluation Service (Optional)
+
+Evaluate service supports devices such as Davinci inference chips and mobile phones. However, new hardware devices are emerging. Therefore, Vega provides customized scalability.
+
+The process of the evaluate service is as follows:
+
+1. obtaining input information
+2. Instantiate a specific hardware instance according to the hardware to be evaluated
+3. Model conversion
+4. inference
+5. Return the inference result
+
+Steps 3 and 4 may be different for different hardware. Therefore, when new hardware needs to be added, perform the two steps based on the hardware usage. Specifically, the procedure is as follows:
+
+Add a hardware class to the hardwares directory and implement the `convert_model` and `inference` interfaces as follows:
+
+ ```python
+from class_factory import ClassFactory
+
+@ClassFactory.register()
+class MyHardware(object):
+
+ def __init__(self, optional_params):
+ pass
+
+ def convert_model(self, backend, model, weight, **kwargs):
+ pass
+
+ def inference(self, converted_model, input_data, **kwargs):
+
+ return latency, output
+```
+
+In the preceding example, the `MyHardware` class is defined and registered through `@ClassFactory.register()`.
+
+The class implements the `convert_model` and `inference` interfaces. `backend` indicates the training framework through which the model is saved, for example, `pytorch` or `tensorflow`, which provides necessary auxiliary information for model parsing. `model` and `weight` indicate the model and the weight to be converted, respectively.
+
+The value of `weight` is optional and may be empty. `converted_model` and `input_data` indicate the converted model and input data, respectively.
+
+Add the class to `__init__.py` of the hardware.
+
+```python
+from .my_hardware import MyHardware
+```
\ No newline at end of file
diff --git a/evaluate_service/RELEASE.md b/evaluate_service/RELEASE.md
new file mode 100644
index 00000000..85891f28
--- /dev/null
+++ b/evaluate_service/RELEASE.md
@@ -0,0 +1,23 @@
+**Evaluate Service ver1.8.0 released:**
+
+**Introduction**
+
+The evaluation service is a tool used to evaluate the performance of a
+model on specific hardware developed by Noah's Ark Laboratory, the main features are as follows:
+1. Multi-Backend: PyTorch, TensorFlow, MindSpore and Caffe. The input model can come from PyTorch, TensorFlow,
+MindSpore and Caffe.
+2. Multi-hardware: Mobile Phone, Ascend 310, Kirin 990, etc. The model can be evaluated on multiple types of hardware.
+3. Online real-time evaluation and offline evaluation. The evaluate service can combine with [Vega](https://github.com/huawei-noah/vega)
+to implement network architecture search with hardware in the loop. One can also use the evaluate service independently.
+4. Supports secure communication encryption. In security mode, communications are encrypted to secure the model and data.
+
+**Installation**
+
+Install the evaluate service and the open source software that the evaluate service depends on:
+
+`pip3 install --user --upgrade evaluate-service`
+
+**Cooperation and Contribution**
+
+Welcome to use evaluate-service. If you have any questions or suggestions, need help, fix bugs, contribute new algorithms,
+or improve the documentation, submit an issue in the community. We will reply to and communicate with you in a timely manner.
diff --git a/evaluate_service/docs/cn/ascend_310.md b/evaluate_service/docs/cn/ascend_310.md
new file mode 100644
index 00000000..61c140cb
--- /dev/null
+++ b/evaluate_service/docs/cn/ascend_310.md
@@ -0,0 +1,97 @@
+# 部署Ascend环境
+
+请参考Ascend官方文档部署Ascend环境,如下安装指导是安装过程中的关键步骤,若安装过程中出现问题,请以官方文档为准。
+在进行部署前,请在官方网站下载安装包。
+
+## 1 检查已安装的Driver和CANN版本
+
+若是全新的Ascend主机,需要检查是否存在`/usr/local/HiAi`目录,若存在,需要使用root账号执行如下命令卸载该目录:
+
+```bash
+/usr/local/HiAi/uninstall.sh
+```
+
+需要使用非root账号执行如下命令创建`Ascend`目录,并给该目录设置为用户`HwHiAiUser`可访问:
+
+```bash
+mkdir /usr/local/Ascend/
+sudo chown -R :HwHiAiUser /usr/local/Ascend/
+sudo chmod -R 750 /usr/local/Ascend/
+```
+
+若`/usr/local/Ascend/`已存在,则需要在安装前需要检查是否已安装了较旧的Driver和CANN包,
+请使用如下命令查询各个组件的版本号:
+
+```bash
+cat /usr/local/Ascend/driver/version.info
+cat /usr/local/Ascend/nnae/latest/ascend_nnae_install.info
+cat /usr/local/Ascend/ascend-toolkit/latest/arm64-linux/ascend_toolkit_install.info
+cat /usr/local/Ascend/nnrt/latest/arm64-linux/ascend_nnrt_install.info
+cat /usr/local/Ascend/tfplugin/latest/ascend_tfplugin_install.info
+```
+
+若版本号较低,需要使用root账号执行卸载:
+
+```bash
+/usr/local/Ascend/driver/script/uninstall.sh
+/usr/local/Ascend/nnae/latest/script/uninstall.sh
+/usr/local/Ascend/ascend-toolkit/latest/arm64-linux/script/uninstall.sh
+/usr/local/Ascend/nnrt/latest/arm64-linux/script/uninstall.sh
+/usr/local/Ascend/tfplugin/latest/script/uninstall.sh
+```
+
+若使用X86平台,请将如上命令中包含的目录中的`arm64-linux`替换为`x86_64-linux`。
+
+若nnae、ascend-toolkit、nnrt、tfplugin使用非root安装,请使用该用户卸载。
+
+## 2 安装Driver和CANN
+
+使用root用户执行如下命令安装,如下版本号供参考:
+
+```bash
+chmod +x *.run
+./A300-3000-3010-npu-driver_21.0.2_linux-aarch64.run --full
+```
+
+执行如下命令,确认安装是否成功:
+
+```bash
+npu-smi info
+```
+
+使用非root用户安装其他包,在安装前,需要将该用户设置为和`HwHiAiUser`同组:
+
+```bash
+usermod -a -G HwHiAiUser <username>
+```
+
+```bash
+./Ascend-cann-nnae_5.0.T306_linux-aarch64.run --install
+./Ascend-cann-nnrt_5.0.T306_linux-aarch64.run --install
+./Ascend-cann-tfplugin_5.0.T306_linux-aarch64.run --install
+./Ascend-cann-toolkit_5.0.T306_linux-aarch64.run --install
+```
+
+安装完成后,根据提示需要重启主机。
+
+## 3 设置环境变量
+
+请设置如下环境变量:
+
+```bash
+export ASCEND_HOME=/usr/local/Ascend
+export HOME_DIR=/home/<username>
+export PATH=$HOME_DIR/.local/bin:$PATH
+source /usr/local/Ascend/nnae/set_env.sh
+source /usr/local/Ascend/nnrt/set_env.sh
+source /usr/local/Ascend/tfplugin/set_env.sh
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+export NPU_HOST_LIB=/usr/local/Ascend/ascend-toolkit/latest/arm64-linux/atc/lib64
+export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/acllib/lib64:$PYTHONPATH
+export DDK_PATH=/usr/local/Ascend/ascend-toolkit/latest
+export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/arm64-linux/fwkacllib/lib64:/usr/local/Ascend/ascend-toolkit/latest/acllib/lib64:$LD_LIBRARY_PATH
+export PYTHONPATH=$HOME_DIR/.local/lib/python3.7/site-packages/evaluate_service/security:$PYTHONPATH
+export LD_LIBRARY_PATH=$HOME_DIR/.local/lib/python3.7/site-packages/evaluate_service/security/kmc/aarch64:$LD_LIBRARY_PATH
+```
+
+其中`<username>`为用户目录,`$NPU_HOST_LIB`为`libascendcl.so`的路径, 需要根据`libascendcl.so`实际所在的位置配置此变量。
diff --git a/evaluate_service/docs/en/ascend_310.md b/evaluate_service/docs/en/ascend_310.md
new file mode 100644
index 00000000..d376b938
--- /dev/null
+++ b/evaluate_service/docs/en/ascend_310.md
@@ -0,0 +1,104 @@
+# Deploy the Ascend environment.
+
+Deploy the Ascend environment by referring to the Ascend official document. The following installation guide
+is a key step during the installation. If an error occurs during the installation, refer to the official document.
+Before the deployment, download the installation package from the official website.
+
+## 1 Checking the Installed Driver and CANN Versions
+
+For a new Ascend host, check whether the `/usr/local/HiAi` directory exists. If yes,
+run the following command as user root to uninstall the directory:
+
+```bash
+/usr/local/HiAi/uninstall.sh
+```
+
+Run the following commands as a non-root user to create the `Ascend` directory and make the
+directory accessible to the `HwHiAiUser` user:
+
+```bash
+mkdir /usr/local/Ascend/
+sudo chown -R :HwHiAiUser /usr/local/Ascend/
+sudo chmod -R 750 /usr/local/Ascend/
+```
+
+If `/usr/local/Ascend/` exists, check if the old Driver and CANN packages have been installed
+before the installation.
+Run the following command to query the version number of each component:
+
+```bash
+cat /usr/local/Ascend/driver/version.info
+cat /usr/local/Ascend/nnae/latest/ascend_nnae_install.info
+cat /usr/local/Ascend/ascend-toolkit/latest/arm64-linux/ascend_toolkit_install.info
+cat /usr/local/Ascend/nnrt/latest/arm64-linux/ascend_nnrt_install.info
+cat /usr/local/Ascend/tfplugin/latest/ascend_tfplugin_install.info
+```
+
+
+If the version is older than expected, uninstall it as the root user:
+
+```bash
+/usr/local/Ascend/driver/script/uninstall.sh
+/usr/local/Ascend/nnae/latest/script/uninstall.sh
+/usr/local/Ascend/ascend-toolkit/latest/arm64-linux/script/uninstall.sh
+/usr/local/Ascend/nnrt/latest/arm64-linux/script/uninstall.sh
+/usr/local/Ascend/tfplugin/latest/script/uninstall.sh
+```
+
+If the platform is x86, replace `arm64-linux` in the directory contained in the preceding command with `x86_64-linux`.
+
+If nnae, ascend-toolkit, nnrt, and tfplugin are not installed by the root user, uninstall them as the user.
+
+## 2 Installing the Driver and CANN
+
+Run the following command as the root user to install the software. The following version number is for reference only:
+
+```bash
+chmod +x *.run
+./A300-3000-3010-npu-driver_21.0.2_linux-aarch64.run --full
+```
+
+Run the following command to check whether the installation is successful:
+
+```bash
+npu-smi info
+```
+
+Before installing other packages as a non-root user, set this user to the same group as `HwHiAiUser`.
+
+```bash
+usermod -a -G HwHiAiUser <username>
+```
+
+```bash
+./Ascend-cann-nnae_5.0.T306_linux-aarch64.run --install
+./Ascend-cann-nnrt_5.0.T306_linux-aarch64.run --install
+./Ascend-cann-tfplugin_5.0.T306_linux-aarch64.run --install
+./Ascend-cann-toolkit_5.0.T306_linux-aarch64.run --install
+```
+
+After the installation is complete, restart the host as prompted.
+
+## 3 Setting Environment Variables
+
+Set the following environment variables:
+
+```bash
+export ASCEND_HOME=/usr/local/Ascend
+export HOME_DIR=/home/<username>
+export PATH=$HOME_DIR/.local/bin:$PATH
+source /usr/local/Ascend/nnae/set_env.sh
+source /usr/local/Ascend/nnrt/set_env.sh
+source /usr/local/Ascend/tfplugin/set_env.sh
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+export NPU_HOST_LIB=/usr/local/Ascend/ascend-toolkit/latest/arm64-linux/atc/lib64
+export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/acllib/lib64:$PYTHONPATH
+export DDK_PATH=/usr/local/Ascend/ascend-toolkit/latest
+export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/arm64-linux/fwkacllib/lib64:/usr/local/Ascend/ascend-toolkit/latest/acllib/lib64:$LD_LIBRARY_PATH
+export PYTHONPATH=$HOME_DIR/.local/lib/python3.7/site-packages/evaluate_service/security:$PYTHONPATH
+export LD_LIBRARY_PATH=$HOME_DIR/.local/lib/python3.7/site-packages/evaluate_service/security/kmc/aarch64:$LD_LIBRARY_PATH
+```
+
+In the preceding command, `<username>` indicates the user directory,
+and `$NPU_HOST_LIB` indicates the path of `libascendcl.so`.
+Set this variable based on the actual location of `libascendcl.so`.
\ No newline at end of file
diff --git a/evaluate_service/evaluate_service/__init__.py b/evaluate_service/evaluate_service/__init__.py
new file mode 100644
index 00000000..42f0b744
--- /dev/null
+++ b/evaluate_service/evaluate_service/__init__.py
@@ -0,0 +1,19 @@
+# -*- coding:utf-8 -*-
+
+# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Evaluate service."""
+
+__version__ = "1.8.0"
diff --git a/evaluate_service/class_factory.py b/evaluate_service/evaluate_service/class_factory.py
similarity index 70%
rename from evaluate_service/class_factory.py
rename to evaluate_service/evaluate_service/class_factory.py
index 3100a741..dad504df 100644
--- a/evaluate_service/class_factory.py
+++ b/evaluate_service/evaluate_service/class_factory.py
@@ -1,12 +1,18 @@
# -*- coding:utf-8 -*-
# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the MIT License.
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# MIT License for more details.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""Management class registration and bind configuration properties, provides the type of class supported."""
diff --git a/evaluate_service/evaluate_service/hardwares/__init__.py b/evaluate_service/evaluate_service/hardwares/__init__.py
new file mode 100644
index 00000000..e1b24a54
--- /dev/null
+++ b/evaluate_service/evaluate_service/hardwares/__init__.py
@@ -0,0 +1,3 @@
+from .davinci.davinci import Davinci
+
+__all__ = ['Davinci']
diff --git a/evaluate_service/hardwares/davinci/compile_atlas300.sh b/evaluate_service/evaluate_service/hardwares/davinci/compile_atlas300.sh
similarity index 76%
rename from evaluate_service/hardwares/davinci/compile_atlas300.sh
rename to evaluate_service/evaluate_service/hardwares/davinci/compile_atlas300.sh
index 84dbbdfa..8b8c7810 100644
--- a/evaluate_service/hardwares/davinci/compile_atlas300.sh
+++ b/evaluate_service/evaluate_service/hardwares/davinci/compile_atlas300.sh
@@ -4,7 +4,7 @@ SAVE_PATH=$2
cd $EXAMPLE_DIR/
mkdir -p build/intermediates/host
cd build/intermediates/host
-cmake ../../../src -DCMAKE_CXX_COMPILER=g++ -DCMAKE_SKIP_RPATH=TRUE -DCMAKE_CXX_FLAGS="-s" -DCMAKE_C_FLAGS="-s"
+cmake ../../../src -DCMAKE_CXX_COMPILER=g++ -DCMAKE_SKIP_RPATH=TRUE -DCMAKE_CXX_FLAGS="-s" -DCMAKE_C_FLAGS="-s" -DCMAKE_FORTIFY_SOURCE=2
make
cd ../../../out
diff --git a/evaluate_service/hardwares/davinci/davinci.py b/evaluate_service/evaluate_service/hardwares/davinci/davinci.py
similarity index 67%
rename from evaluate_service/hardwares/davinci/davinci.py
rename to evaluate_service/evaluate_service/hardwares/davinci/davinci.py
index f8c7cecb..f95132cc 100644
--- a/evaluate_service/hardwares/davinci/davinci.py
+++ b/evaluate_service/evaluate_service/hardwares/davinci/davinci.py
@@ -1,20 +1,27 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the MIT License.
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# MIT License for more details.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""The hardware of davinci."""
-import subprocess
+import datetime
import logging
import os
-from evaluate_service.class_factory import ClassFactory
-import datetime
+import subprocess
+
import numpy as np
+from evaluate_service.class_factory import ClassFactory
@ClassFactory.register()
@@ -49,7 +56,27 @@ def convert_model(self, backend, model, weight, **kwargs):
except subprocess.CalledProcessError as exc:
logging.error("convert model to om model failed. the return message is : {}.".format(exc))
- def inference(self, converted_model, input_data, **kwargs):
+ def _get_200dk_infer_cmd(self, save_path):
+ app_dir = datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')
+ example_dir = self.current_path + "/samples/atlas200dk"
+ ddk_user_name = self.optional_params.get("ddk_user_name")
+ ddk_host_ip = self.optional_params.get("ddk_host_ip")
+ atlas_host_ip = self.optional_params.get("atlas_host_ip")
+ command_line = ["bash", self.current_path + "inference_atlas200.sh",
+ save_path, example_dir, ddk_user_name, ddk_host_ip, atlas_host_ip, app_dir]
+ return command_line
+
+ def _compile_atlas300(self, save_path):
+ # compile the Davinci program
+ example_dir = self.current_path + "/samples/atlas300"
+ command_line = ["bash", self.current_path + "/compile_atlas300.sh",
+ example_dir, save_path]
+ try:
+ subprocess.check_output(command_line)
+ except subprocess.CalledProcessError as exc:
+ logging.error("compile failed. the return message is : {}.".format(exc))
+
+ def inference(self, converted_model, input_data, is_last=False, cal_metric=False, **kwargs):
"""Inference in Davinci.
:param converted_model: converted model file
@@ -64,25 +91,11 @@ def inference(self, converted_model, input_data, **kwargs):
converted_model = os.path.join(converted_model, "davinci_model.om")
log_save_path = os.path.dirname(input_data)
if self.davinci_environment_type == "ATLAS200DK":
- task_dir = log_save_path
- app_dir = datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')
- example_dir = self.current_path + "/samples/atlas200dk"
- ddk_user_name = self.optional_params.get("ddk_user_name")
- ddk_host_ip = self.optional_params.get("ddk_host_ip")
- atlas_host_ip = self.optional_params.get("atlas_host_ip")
- command_line = ["bash", self.current_path + "/utils/atlas200_dk/inference_atlas300.sh",
- task_dir, example_dir, ddk_user_name, ddk_host_ip, atlas_host_ip, app_dir]
+ command_line = self._get_200dk_infer_cmd(save_path=log_save_path)
result_file = os.path.join(log_save_path, "result_file")
else:
if not os.path.exists(os.path.join(share_dir, "main")):
- # compile the Davinci program
- example_dir = self.current_path + "/samples/atlas300"
- command_line = ["bash", self.current_path + "/compile_atlas300.sh",
- example_dir, share_dir]
- try:
- subprocess.check_output(command_line)
- except subprocess.CalledProcessError as exc:
- logging.error("compile failed. the return message is : {}.".format(exc))
+ self._compile_atlas300()
# execute the Davinci program
command_line = ["bash", self.current_path + "/inference_atlas300.sh",
input_data, converted_model, share_dir, log_save_path]
@@ -94,7 +107,13 @@ def inference(self, converted_model, input_data, **kwargs):
logging.error("inference failed. the return message is : {}.".format(exc))
latency = self._get_latency(os.path.join(log_save_path, "ome.log"))
- output = self._get_output(result_file)
+ if cal_metric:
+ output = self._get_output(result_file)
+ else:
+ output = None
+ if is_last:
+ os.remove(input_data)
+ os.remove(converted_model)
return latency, output
def _get_latency(self, log_file):
diff --git a/evaluate_service/hardwares/davinci/get_latency_from_log.sh b/evaluate_service/evaluate_service/hardwares/davinci/get_latency_from_log.sh
similarity index 100%
rename from evaluate_service/hardwares/davinci/get_latency_from_log.sh
rename to evaluate_service/evaluate_service/hardwares/davinci/get_latency_from_log.sh
diff --git a/evaluate_service/hardwares/davinci/inference_atlas300.sh b/evaluate_service/evaluate_service/hardwares/davinci/inference_atlas300.sh
similarity index 92%
rename from evaluate_service/hardwares/davinci/inference_atlas300.sh
rename to evaluate_service/evaluate_service/hardwares/davinci/inference_atlas300.sh
index 3eb54666..d8a91aac 100644
--- a/evaluate_service/hardwares/davinci/inference_atlas300.sh
+++ b/evaluate_service/evaluate_service/hardwares/davinci/inference_atlas300.sh
@@ -10,4 +10,4 @@ cp $EXECUTE_FILE_PATH/acl.json $LOG_SAVE_PATH/
cd $LOG_SAVE_PATH/
#sudo env "LD_LIBRARY_PATH=/usr/local/Ascend/acllib/lib64:/usr/local/Ascend/add-ons:/usr/local/Ascend/driver/lib64/" ./main >$WORK_DIR/ome.log
-./main >$LOG_SAVE_PATH/ome.log
+./main >$LOG_SAVE_PATH/ome.log
\ No newline at end of file
diff --git a/evaluate_service/hardwares/davinci/model_convert.sh b/evaluate_service/evaluate_service/hardwares/davinci/model_convert.sh
similarity index 76%
rename from evaluate_service/hardwares/davinci/model_convert.sh
rename to evaluate_service/evaluate_service/hardwares/davinci/model_convert.sh
index edeb6805..97f20b70 100644
--- a/evaluate_service/hardwares/davinci/model_convert.sh
+++ b/evaluate_service/evaluate_service/hardwares/davinci/model_convert.sh
@@ -9,21 +9,29 @@ PRECISION=$8
if [ $DAVINCI_ENV_TYPE == "ATLAS200DK" ]; then
if [ $BACKEND == "tensorflow" ]; then
- omg --model=$MODEL --framework=3 --output=$OM_SAVE_PATH/davinci_model >$LOG_SAVE_PATH/omg.log 2>&1
+ omg --model=$MODEL --framework=3 --output=$OM_SAVE_PATH/davinci_model >$LOG_SAVE_PATH/omg.log 2>&1 &&
+ rm -f $MODEL
elif [ $BACKEND == "caffe" ]; then
- omg --model=$MODEL --weight=$WEIGHT --framework=0 --output=$OM_SAVE_PATH/davinci_model >$LOG_SAVE_PATH/omg.log 2>&1
+ omg --model=$MODEL --weight=$WEIGHT --framework=0 --output=$OM_SAVE_PATH/davinci_model >$LOG_SAVE_PATH/omg.log 2>&1 &&
+ rm -f $MODEL
+ rm -f $WEIGHT
else
echo "[ERROR] Davinci model convert: The backend must be tensorflow, caffe."
fi
else
if [ $BACKEND == "tensorflow" ]; then
- atc --model=$MODEL --framework=3 --input_format='NCHW' --disable_reuse_memory=1 --input_shape=$INPUT_SHAPE --output=$OM_SAVE_PATH/davinci_model --soc_version=Ascend310 --core_type=AiCore --output_type=$PRECISION >$LOG_SAVE_PATH/omg.log 2>&1
+ atc --model=$MODEL --framework=3 --input_format='NCHW' --disable_reuse_memory=1 --input_shape=$INPUT_SHAPE --output=$OM_SAVE_PATH/davinci_model --soc_version=Ascend310 --core_type=AiCore --output_type=$PRECISION >$LOG_SAVE_PATH/omg.log 2>&1 &&
+ rm -f $MODEL
elif [ $BACKEND == "caffe" ]; then
- atc --model=$MODEL --weight=$WEIGHT --framework=0 --input_format='NCHW' --disable_reuse_memory=1 --output=$OM_SAVE_PATH/davinci_model --soc_version=Ascend310 --core_type=AiCore >$LOG_SAVE_PATH/omg.log 2>&1
+ atc --model=$MODEL --weight=$WEIGHT --framework=0 --input_format='NCHW' --disable_reuse_memory=1 --output=$OM_SAVE_PATH/davinci_model --soc_version=Ascend310 --core_type=AiCore >$LOG_SAVE_PATH/omg.log 2>&1 &&
+ rm -f $MODEL
+ rm -f $WEIGHT
elif [ $BACKEND == "mindspore" ]; then
- atc --model=$MODEL --framework=1 --disable_reuse_memory=1 --output=$OM_SAVE_PATH/davinci_model --soc_version=Ascend310 --core_type=AiCore --output_type=$PRECISION >$LOG_SAVE_PATH/omg.log 2>&1
+ atc --model=$MODEL --framework=1 --disable_reuse_memory=1 --output=$OM_SAVE_PATH/davinci_model --soc_version=Ascend310 --core_type=AiCore --output_type=$PRECISION >$LOG_SAVE_PATH/omg.log 2>&1 &&
+ rm -f $MODEL
elif [ $BACKEND == "onnx" ]; then
- atc --model=$MODEL --framework=5 --output=$OM_SAVE_PATH/davinci_model --soc_version=Ascend310 --core_type=AiCore --output_type=$PRECISION >$LOG_SAVE_PATH/omg.log 2>&1
+ atc --model=$MODEL --framework=5 --output=$OM_SAVE_PATH/davinci_model --soc_version=Ascend310 --core_type=AiCore --output_type=$PRECISION >$LOG_SAVE_PATH/omg.log 2>&1 &&
+ rm -f $MODEL
else
echo "[ERROR] Davinci model convert: The backend must be tensorflow, caffe, mindspore or onnx."
fi
diff --git a/evaluate_service/hardwares/davinci/samples/atlas300/inc/model_process.h b/evaluate_service/evaluate_service/hardwares/davinci/samples/atlas300/inc/model_process.h
similarity index 100%
rename from evaluate_service/hardwares/davinci/samples/atlas300/inc/model_process.h
rename to evaluate_service/evaluate_service/hardwares/davinci/samples/atlas300/inc/model_process.h
diff --git a/evaluate_service/hardwares/davinci/samples/atlas300/inc/sample_process.h b/evaluate_service/evaluate_service/hardwares/davinci/samples/atlas300/inc/sample_process.h
similarity index 100%
rename from evaluate_service/hardwares/davinci/samples/atlas300/inc/sample_process.h
rename to evaluate_service/evaluate_service/hardwares/davinci/samples/atlas300/inc/sample_process.h
diff --git a/evaluate_service/hardwares/davinci/samples/atlas300/inc/utils.h b/evaluate_service/evaluate_service/hardwares/davinci/samples/atlas300/inc/utils.h
similarity index 100%
rename from evaluate_service/hardwares/davinci/samples/atlas300/inc/utils.h
rename to evaluate_service/evaluate_service/hardwares/davinci/samples/atlas300/inc/utils.h
diff --git a/evaluate_service/hardwares/davinci/samples/atlas300/src/CMakeLists.txt b/evaluate_service/evaluate_service/hardwares/davinci/samples/atlas300/src/CMakeLists.txt
similarity index 94%
rename from evaluate_service/hardwares/davinci/samples/atlas300/src/CMakeLists.txt
rename to evaluate_service/evaluate_service/hardwares/davinci/samples/atlas300/src/CMakeLists.txt
index 098aa7de..c2d13aca 100644
--- a/evaluate_service/hardwares/davinci/samples/atlas300/src/CMakeLists.txt
+++ b/evaluate_service/evaluate_service/hardwares/davinci/samples/atlas300/src/CMakeLists.txt
@@ -10,9 +10,9 @@ project(ACL_RESNET50)
add_compile_options(-std=c++11)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "../../../out")
-set(CMAKE_CXX_FLAGS_DEBUG "-fPIC -O0 -g -Wall")
+set(CMAKE_CXX_FLAGS_DEBUG "-fPIC -O2 -g -Wall")
set(CMAKE_CXX_FLAGS_RELEASE "-fPIC -O2 -Wall")
-
+set(CMAKE_FORTIFY_SOURCE "2")
set(INC_PATH $ENV{DDK_PATH})
if (NOT DEFINED ENV{DDK_PATH})
diff --git a/evaluate_service/hardwares/davinci/samples/atlas300/src/acl.json b/evaluate_service/evaluate_service/hardwares/davinci/samples/atlas300/src/acl.json
similarity index 100%
rename from evaluate_service/hardwares/davinci/samples/atlas300/src/acl.json
rename to evaluate_service/evaluate_service/hardwares/davinci/samples/atlas300/src/acl.json
diff --git a/evaluate_service/hardwares/davinci/samples/atlas300/src/main.cpp b/evaluate_service/evaluate_service/hardwares/davinci/samples/atlas300/src/main.cpp
similarity index 100%
rename from evaluate_service/hardwares/davinci/samples/atlas300/src/main.cpp
rename to evaluate_service/evaluate_service/hardwares/davinci/samples/atlas300/src/main.cpp
diff --git a/evaluate_service/hardwares/davinci/samples/atlas300/src/model_process.cpp b/evaluate_service/evaluate_service/hardwares/davinci/samples/atlas300/src/model_process.cpp
similarity index 95%
rename from evaluate_service/hardwares/davinci/samples/atlas300/src/model_process.cpp
rename to evaluate_service/evaluate_service/hardwares/davinci/samples/atlas300/src/model_process.cpp
index 1a76eb34..dd32b562 100644
--- a/evaluate_service/hardwares/davinci/samples/atlas300/src/model_process.cpp
+++ b/evaluate_service/evaluate_service/hardwares/davinci/samples/atlas300/src/model_process.cpp
@@ -30,21 +30,22 @@ double tick(void)
}
-double difftimeval(const struct timeval *start, const struct timeval *end)
+double eplasedtime(const struct timeval *end_time, const struct timeval *start_time)
{
- double d;
- time_t s;
- suseconds_t u;
-
- s = start->tv_sec - end->tv_sec;
- u = start->tv_usec - end->tv_usec;
- d = s;
- d *= 1000000.0;
- d += u;
- return d;
+ double time_total;
+ time_t time_second;
+ suseconds_t time_microsecond ;
+
+ time_second = end_time->tv_sec - start_time->tv_sec;
+ time_microsecond = end_time->tv_usec - start_time->tv_usec;
+ time_total = time_second;
+ time_total *= 1000000.0;
+ time_total += time_microsecond;
+ return time_total;
}
+
ModelProcess::ModelProcess() :modelId_(0), modelMemSize_(0), modelWeightSize_(0), modelMemPtr_(nullptr),
modelWeightPtr_(nullptr), loadFlag_(false), modelDesc_(nullptr), input_(nullptr), output_(nullptr)
{
@@ -327,7 +328,7 @@ Result ModelProcess::Execute()
gettimeofday(&start, NULL);
aclError ret = aclmdlExecute(modelId_, input_, output_);
gettimeofday(&end, NULL);
- cout<< "costTime "<< difftimeval(&end, &start)/1000<(end_time-start_time)/CLOCKS_PER_SEC*1000< 0:
- logging.warning("job_id {} contains invalid characters".format(job_id))
- abort(400, "job_id {} contains invalid characters".format(job_id))
- return job_id
+ if self.security_mode:
+ security.args.check_backend(self.backend)
+ security.args.check_hardware(self.hardware)
+ security.args.check_job_id(self.job_id)
+ security.args.check_input_shape(self.input_shape)
+ security.args.check_out_nodes(self.out_nodes)
+ security.args.check_repeat_times(self.repeat_times)
+ security.args.check_precision(self.precision)
def upload_files(self):
"""Upload the files from the client to the service."""
@@ -167,20 +172,28 @@ def upload_files(self):
self.upload_file_path = os.path.join(self.current_path, "out", self.now_time)
self.share_dir = os.path.join(self.current_path, "out", self.job_id)
os.makedirs(self.upload_file_path)
-
+ os.makedirs(self.share_dir)
+ patterns = [".pkl", ".pth", ".pt", ".pb", ".ckpt", ".air", '.om',
+ ".onnx", ".caffemodel", ".pbtxt", ".prototxt"]
model_file = request.files.get("model_file")
if model_file is not None:
self.model = self.upload_file_path + "/" + secure_filename(model_file.filename)
+ if os.path.splitext(self.model)[1] not in patterns:
+ raise ValueError(f'{model_file.filename} file type is not supported.')
model_file.save(self.model)
data_file = request.files.get("data_file")
if data_file is not None:
self.input_data = self.upload_file_path + "/" + secure_filename(data_file.filename)
+ if not os.path.basename(self.input_data) == 'input.bin':
+ raise ValueError(f'data {data_file.filename} file is not supported.')
data_file.save(self.input_data)
weight_file = request.files.get("weight_file")
if weight_file is not None:
self.weight = self.upload_file_path + "/" + secure_filename(weight_file.filename)
+ if os.path.splitext(self.weight)[1] not in patterns:
+ raise ValueError(f'{weight_file.filename} file type is not supported.')
weight_file.save(self.weight)
else:
self.weight = ""
@@ -190,7 +203,6 @@ def upload_files(self):
def _clean_data_path(clean_interval, work_path):
while True:
_clean_time = time.time() - clean_interval
- # _current_path = os.path.dirname(os.path.abspath(__file__))
folder_pattern = "{}/out/*".format(work_path)
folders = glob.glob(folder_pattern)
for folder in folders:
@@ -216,26 +228,29 @@ def _parse_args():
help="the user to acess ATLAS200200 DK")
parser.add_argument("-atlas_host_ip", "--atlas_host_ip", type=str, required=False, default=None,
help="the ip of ATLAS200200 DK")
-
+ parser.add_argument("-s", "--security_mode", action='store_true',
+ help="enable safe mode")
args = parser.parse_args()
return args
def run():
"""Run the evaluate service."""
- os.umask(0o027)
args = _parse_args()
ip_address = args.host_ip
listen_port = args.port
clean_interval = args.clean_interval
work_path = args.work_path
+ security_mode = args.security_mode
+ if security_mode:
+ os.umask(0o077)
optional_params = {"davinci_environment_type": args.davinci_environment_type,
"ddk_user_name": args.ddk_user_name,
"atlas_host_ip": args.atlas_host_ip
}
-
p = multiprocessing.Process(target=_clean_data_path, args=(clean_interval, work_path), daemon=True)
p.start()
- Evaluate._add_params(work_path, optional_params)
+ Evaluate._add_params(work_path, args.security_mode, optional_params)
api.add_resource(Evaluate, '/')
- run_flask(app, host=ip_address, port=listen_port)
+
+ run_flask(app, host=ip_address, port=listen_port, security_mode=security_mode)
diff --git a/evaluate_service/evaluate_service/run_flask.py b/evaluate_service/evaluate_service/run_flask.py
new file mode 100644
index 00000000..ba50dcbf
--- /dev/null
+++ b/evaluate_service/evaluate_service/run_flask.py
@@ -0,0 +1,115 @@
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Run Flask."""
+
+import configparser
+import logging
+import os
+from multiprocessing import Process
+import gevent
+from gevent import pywsgi
+from .security.utils import create_context
+from .security.verify_config import check_risky_files
+
+white_list = None
+request_frequency_limit = "100/minute"
+max_content_length = 1000 * 1000 * 1000
+
+
+def get_request_frequency_limit():
+ """Get request frequncy limit."""
+ global request_frequency_limit
+ return request_frequency_limit
+
+
+def get_max_content_length():
+ """Get max contect length."""
+ global max_content_length
+ return max_content_length
+
+
+def get_white_list():
+ """Get white list."""
+ global white_list
+ return white_list
+
+
+def load_security_setting():
+ """Load security settings."""
+ home = os.environ['HOME']
+ config_file = os.path.join(home, ".vega/vega.ini")
+ if not check_risky_files([config_file]):
+ return False
+ cfg = configparser.ConfigParser()
+ cfg.read(config_file)
+ config = dict(cfg._sections)
+ for k in config:
+ config[k] = dict(config[k])
+
+ return config
+
+
+def run_flask(app, host, port, security_mode):
+ """Run flask."""
+ if security_mode:
+ app.config['MAX_CONTENT_LENGTH'] = get_max_content_length()
+ config = load_security_setting()
+ if not config:
+ return False
+ ca_cert = config.get('security').get('ca_cert')
+ server_cert = config.get('security').get('server_cert')
+ server_secret_key = config.get('security').get('server_secret_key')
+ encrypted_password = config.get('security').get('encrypted_password')
+ key_component_1 = config.get('security').get('key_component_1')
+ key_component_2 = config.get('security').get('key_component_2')
+ if not check_risky_files((ca_cert, server_cert, server_secret_key, key_component_1, key_component_2)):
+ return
+ try:
+ if encrypted_password == "":
+ ssl_context = create_context(ca_cert, server_cert, server_secret_key)
+ else:
+ ssl_context = create_context(ca_cert, server_cert, server_secret_key,
+ encrypted_password, key_component_1, key_component_2)
+ except Exception:
+ logging.error("Fail to create context.")
+ return False
+
+ server = pywsgi.WSGIServer((host, port), app, ssl_context=ssl_context)
+ if "limit" in config:
+ global white_list
+ global request_frequency_limit
+ global max_content_length
+ if "white_list" in config["limit"]:
+ white_list = config["limit"]["white_list"].replace(" ", "").split(',')
+ if "request_frequency_limit" in config["limit"]:
+ request_frequency_limit = config["limit"]["request_frequency_limit"]
+ if "max_content_length" in config["limit"]:
+ max_content_length = int(config["limit"]["max_content_length"])
+ else:
+ server = pywsgi.WSGIServer((host, port), app)
+
+ server.init_socket()
+ server._stop_event.clear()
+
+ def _server_forever():
+ server.start_accepting()
+ logging.info("server started.")
+ server._stop_event.wait()
+ gevent.wait()
+
+ p = Process(target=_server_forever)
+ p.start()
diff --git a/evaluate_service/evaluate_service/security/__init__.py b/evaluate_service/evaluate_service/security/__init__.py
new file mode 100644
index 00000000..c014103b
--- /dev/null
+++ b/evaluate_service/evaluate_service/security/__init__.py
@@ -0,0 +1,25 @@
+# -*- coding:utf-8 -*-
+
+# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Run pipeline."""
+
+__all__ = ["load_config", "get_config", "add_args", "check_args", "check_yml", "check_msg", "post"]
+
+from .conf import ServerConfig, ClientConfig, Config
+from .args import add_args, check_args, check_yml, check_msg
+from .post import post
+from .conf import load_config, get_config
+from .verify_config import check_risky_file
diff --git a/evaluate_service/evaluate_service/security/args.py b/evaluate_service/evaluate_service/security/args.py
new file mode 100644
index 00000000..1a2245bf
--- /dev/null
+++ b/evaluate_service/evaluate_service/security/args.py
@@ -0,0 +1,120 @@
+# -*- coding:utf-8 -*-
+
+# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Security args."""
+import os
+import re
+import yaml
+
+
+def add_args(parser):
+ """Add security args."""
+ _config = parser.add_argument_group(title='security setting')
+ _config.add_argument("-s", "--security", dest='security', action='store_true',
+ help="enable safe mode")
+ return parser
+
+
+def _check_value(value, pattern):
+ if isinstance(value, str) and len(re.compile(pattern).findall(value)) > 0:
+ raise ValueError("{} contains invalid characters.".format(value))
+
+
+def _check_dict(dict_value, pattern):
+ """Check dict."""
+ if not isinstance(dict_value, dict):
+ return
+ for item in dict_value:
+ value = dict_value[item]
+ if isinstance(value, dict):
+ _check_dict(value, pattern)
+ else:
+ _check_value(value, pattern)
+
+
+def check_msg(msg):
+ """Check msg."""
+ _check_dict(msg, pattern="[^_A-Za-z0-9\\s:/.~-]")
+
+
+def check_args(args):
+ """Check args."""
+ args_dict = vars(args)
+ _check_dict(args_dict, pattern="[^_A-Za-z0-9:/.~-]")
+
+
+def check_yml(config_yaml):
+ """Check yml."""
+ if config_yaml is None:
+ raise ValueError("config path can't be None or empty")
+ if os.stat(config_yaml).st_uid != os.getuid():
+ raise ValueError(f"The file {config_yaml} not belong to the current user")
+ with open(config_yaml) as f:
+ raw_dict = yaml.safe_load(f)
+ _check_dict(raw_dict, pattern=r"[^_A-Za-z0-9\s\<\>=\[\]\(\),!\{\}:/.~-]")
+
+
+def check_job_id(job_id):
+ """Check Job id."""
+ if not isinstance(job_id, str):
+ raise TypeError('"job_id" must be str, not {}'.format(type(job_id)))
+ _check_value(job_id, pattern="[^_A-Za-z0-9]")
+
+
+def check_input_shape(input_shape):
+ """Check input shape."""
+ if not isinstance(input_shape, str):
+ raise TypeError('"input_shape" must be str, not {}'.format(type(input_shape)))
+ _check_value(input_shape, pattern="[^_A-Za-z0-9:,]")
+
+
+def check_out_nodes(out_nodes):
+ """Check out nodes."""
+ if not isinstance(out_nodes, str):
+ raise TypeError('"out_nodes" must be str, not {}'.format(type(out_nodes)))
+ _check_value(out_nodes, pattern="[^_A-Za-z0-9:/]")
+
+
+def check_backend(backend):
+ """Check backend."""
+ if backend not in ["tensorflow", "caffe", "onnx", "mindspore"]:
+ raise ValueError("The backend only support tensorflow, caffe, onnx and mindspore.")
+
+
+def check_hardware(hardware):
+ """Check hardware."""
+ if hardware not in ["Davinci", "Bolt", "Kirin990_npu"]:
+ raise ValueError("The hardware only support Davinci and Bolt.")
+
+
+def check_precision(precision):
+ """Check precision."""
+ if precision.upper() not in ["FP32", "FP16"]:
+ raise ValueError("The precision only support FP32 and FP16.")
+
+
+def check_repeat_times(repeat_times):
+ """Check repeat times."""
+ MAX_EVAL_EPOCHS = 10000
+ if not isinstance(repeat_times, int):
+ raise TypeError('"repeat_times" must be int, not {}'.format(type(repeat_times)))
+ if not 0 < repeat_times <= MAX_EVAL_EPOCHS:
+ raise ValueError("repeat_times {} is not in valid range (1-{})".format(repeat_times, MAX_EVAL_EPOCHS))
+
+
+def path_verify(path):
+ """Verify path."""
+ return re.sub(r"[^_A-Za-z0-9\/.]", "", path)
diff --git a/evaluate_service/evaluate_service/security/check_env.py b/evaluate_service/evaluate_service/security/check_env.py
new file mode 100644
index 00000000..c394a028
--- /dev/null
+++ b/evaluate_service/evaluate_service/security/check_env.py
@@ -0,0 +1,25 @@
+# -*- coding:utf-8 -*-
+
+# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Check security env."""
+
+
+__all__ = ["check_env"]
+
+
+def check_env(args) -> bool:
+ """Check security env."""
+ return True
diff --git a/evaluate_service/evaluate_service/security/conf.py b/evaluate_service/evaluate_service/security/conf.py
new file mode 100644
index 00000000..4e9fa034
--- /dev/null
+++ b/evaluate_service/evaluate_service/security/conf.py
@@ -0,0 +1,140 @@
+# -*- coding:utf-8 -*-
+
+# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Security config.
+
+~/.vega/server.ini
+
+[security]
+ ca_cert=<~/.vega/car.crt>
+ server_cert_dask=<~/.vega/server_dask.crt>
+ server_secret_key_dask=<~/.vega/server_dask.key>
+ client_cert_dask=<~/.vega/client_dask.crt>
+ client_secret_key_dask=<~/.vega/ client_dask.key>
+
+~/.vega/client.ini
+
+[security]
+ ca_cert=<~/.vega/car.crt>
+ client_cert=<~/.vega/client.crt>
+ client_secret_key=<~/.vega/client.key>
+ encrypted_password=
+ key_component_1=<~/.vega/ksmaster_client.dat>
+ key_component_2=<~/.vega/ksstandby_client.dat>
+
+"""
+
+import os
+import logging
+import configparser
+from .verify_config import check_risky_files
+
+
+class Config():
+ """Security Config."""
+
+ def load(self) -> bool:
+ """Load from config file."""
+ if not check_risky_files([self.file_name]):
+ return False
+ config = configparser.ConfigParser()
+ try:
+ config.read(self.file_name)
+ except Exception:
+ logging.error(f"Failed to read setting from {self.file_name}")
+ return False
+ if "security" not in config.sections():
+ return False
+ keys = []
+ pass_check_keys = ["encrypted_password", "white_list"]
+ for key in config["security"]:
+ if key not in self.keys:
+ return False
+ setattr(self, key, config.get("security", key))
+ if key not in pass_check_keys and not check_risky_files([config.get("security", key)]):
+ return False
+ keys.append(key)
+ if len(keys) != len(self.keys):
+ missing_keys = list(set(self.keys) - set(keys))
+ logging.error(f"setting items {missing_keys} are missing in {self.file_name}")
+ return False
+ return True
+
+
+class ServerConfig(Config):
+ """Security Config."""
+
+ def __init__(self):
+ """Initialize."""
+ self.ca_cert = None
+ self.server_cert_dask = None
+ self.server_secret_key_dask = None
+ self.client_cert_dask = None
+ self.client_secret_key_dask = None
+ self.file_name = os.path.expanduser("~/.vega/server.ini")
+ self.keys = ["ca_cert", "server_cert_dask", "server_secret_key_dask", "client_cert_dask",
+ "client_secret_key_dask"]
+
+
+class ClientConfig(Config):
+ """Security Config."""
+
+ def __init__(self):
+ """Initialize."""
+ self.ca_cert = None
+ self.client_cert = None
+ self.client_secret_key = None
+ self.encrypted_password = None
+ self.key_component_1 = None
+ self.key_component_2 = None
+ self.white_list = []
+ self.file_name = os.path.expanduser("~/.vega/client.ini")
+ self.keys = [
+ "ca_cert", "client_cert", "client_secret_key", "encrypted_password",
+ "key_component_1", "key_component_2", "white_list"]
+
+
+_server_config = ServerConfig()
+_client_config = ClientConfig()
+
+
+def load_config(_type: str) -> bool:
+ """Load security config."""
+ if _type not in ["all", "server", "client"]:
+ logging.error(f"not support security config type: {_type}")
+ return False
+ if _type in ["server", "all"]:
+ global _server_config
+ if not _server_config.load():
+ logging.error("load server security config fail.")
+ return False
+ if _type in ["client", "all"]:
+ global _client_config
+ if not _client_config.load():
+ logging.error("load client security config fail.")
+ return False
+ return True
+
+
+def get_config(_type: str) -> Config:
+ """Get config."""
+ if _type not in ["server", "client"]:
+ logging.error(f"not support security config type: {_type}")
+ return False
+ if _type == "server":
+ return _server_config
+ else:
+ return _client_config
diff --git a/evaluate_service/evaluate_service/security/kmc/encrypt_key.py b/evaluate_service/evaluate_service/security/kmc/encrypt_key.py
new file mode 100644
index 00000000..7691c1dc
--- /dev/null
+++ b/evaluate_service/evaluate_service/security/kmc/encrypt_key.py
@@ -0,0 +1,121 @@
+# -*- coding:utf-8 -*-
+
+# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Load the Certificate and encrypt the passwd."""
+
+import argparse
+import getpass
+import logging
+import subprocess
+from OpenSSL.crypto import load_certificate, FILETYPE_PEM, load_privatekey
+from . import kmc
+from .utils import check_password_rule
+
+
def encrypt_mm(origin_mm, key_component_1, key_component_2):
    """Encrypt the passwd.

    :param origin_mm: plaintext password to encrypt.
    :param key_component_1: path of the primary key-store file.
    :param key_component_2: path of the standby key-store file.
    :return: cipher text, or "" when KMC initialization fails.
    """
    # alg id 9 == AES256_GCM (see kmc.init's allowed values).
    if kmc.init(key_component_1, key_component_2, 9) is False:
        logging.error("kmc init error.")
        return ""
    cipher = kmc.encrypt(0, origin_mm)  # domain id 0
    kmc.finalize()
    return cipher
+
+
def validate_certificate(cert, key, origin_mm):
    """Validate the certificate.

    Checks that the private key decrypts with `origin_mm`, then audits the
    certificate's signature algorithm and RSA key length (warnings only).

    :param cert: path of the PEM certificate file.
    :param key: path of the PEM private key file.
    :param origin_mm: passphrase protecting the private key.
    :return: True when the key loads with the passphrase, else False.
    """
    flag = True
    with open(key, "r", encoding="utf-8") as f:
        key_value = f.read()
    try:
        load_privatekey(FILETYPE_PEM, key_value, passphrase=origin_mm.encode('utf-8'))
    except Exception:
        flag = False
        logging.error("Wrong PEM.")
        return flag

    # check signature algorithm
    with open(cert, "r", encoding="utf-8") as f:
        cert_value = f.read()
    cert_value = load_certificate(FILETYPE_PEM, cert_value)
    enc_algorithm = cert_value.get_signature_algorithm()
    # BUG FIX: the original `in b'...' b'...'` implicitly concatenated the two
    # bytes literals and performed a substring search against the joined
    # value, which could also match unintended algorithm names. Use a tuple.
    if enc_algorithm in (b'sha1WithRSAEncryption', b'md5WithRSAEncryption'):
        logging.warning("Insecure encryption algorithm: %s", enc_algorithm)

    # check key length via `openssl x509 -text` output
    p1 = subprocess.Popen(["openssl", "x509", "-in", cert, "-text", "-noout"],
                          stdout=subprocess.PIPE, shell=False)
    p2 = subprocess.Popen(["grep", "RSA Public-Key"], stdin=p1.stdout, stdout=subprocess.PIPE, shell=False)
    p3 = subprocess.Popen(["tr", "-cd", "[0-9]"], stdin=p2.stdout, stdout=subprocess.PIPE, shell=False)
    RSA_key = p3.communicate()[0]
    if not RSA_key:
        # BUG FIX: for non-RSA certs (e.g. ECC) grep finds nothing and the
        # original int('') raised ValueError; warn instead.
        logging.warning("Could not determine RSA key length of cert: %s", cert)
    elif int(RSA_key) < 2048:
        logging.warning("Insecure key length: %d", int(RSA_key))
    return flag
+
+
def import_certificate(args, origin_mm):
    """Load the certificate.

    :param args: parsed CLI namespace (cert, key, key_component_1/2).
    :param origin_mm: the plaintext password entered by the user.
    :return: True on success, False on failure (previously 0; False is
        equal to 0 so existing truthiness checks keep working).
    """
    # 1. validate private key and certification; if it fails, abort.
    if not validate_certificate(args.cert, args.key, origin_mm):
        logging.error("Validate certificate failed.")
        return False

    # 2. encrypt private key's passwd.
    encrypt = encrypt_mm(origin_mm, args.key_component_1, args.key_component_2)
    if not encrypt:
        logging.error("kmc encrypt private key error.")
        return False
    # TYPO FIX: "sucuess" -> "success".
    logging.warning(f"Encrypt success. The encrypted of your input is {encrypt}")
    logging.warning(f"The key components are {args.key_component_1} and {args.key_component_2}, please keep it safe.")

    return True
+
+
def args_parse():
    """Parse the input args.

    :return: argparse.Namespace with cert, key, key_component_1/2 paths.
    """
    parser = argparse.ArgumentParser(description='Certificate import')
    # NOTE(review): the default paths spell "sever" (not "server"); kept
    # byte-for-byte for compatibility -- confirm whether this is intended.
    option_table = (
        ("--cert", "./kmc/config/crt/sever.cert", "The path of certificate file"),
        ("--key", './kmc/config/crt/sever.key', "The path of private Key file."),
        ("--key_component_1", './kmc/config/ksf/ksmaster.dat', "key material 1."),
        ("--key_component_2", './kmc/config/ksf/ksstandby.dat', "key material 2."),
    )
    for flag, default_path, help_text in option_table:
        parser.add_argument(flag, default=default_path, type=str, help=help_text)
    return parser.parse_args()
+
+
def main():
    """Run the encrypt process.

    Prompts for a password, validates it against the password rules, and
    encrypts it with the configured key components.
    """
    args = args_parse()
    logging.info("process encrypt begin.")
    origin_mm = getpass.getpass("Please enter the password to be encrypted: ")
    if not check_password_rule(origin_mm):
        logging.info("You should re-generate your server cert/key with following rules:")
        logging.info("1. equals to or longer than 8 letters")
        logging.info("2. contains at least one digit letter")
        logging.info("3. contains at least one capital letter")
        logging.info("4. contains at least one lowercase letter")
        # BUG FIX: the original fell through and encrypted the weak password
        # anyway; abort instead of continuing.
        return

    ret = import_certificate(args, origin_mm)
    if not ret:
        logging.error("Encrypt failed.")
diff --git a/evaluate_service/evaluate_service/security/kmc/kmc.py b/evaluate_service/evaluate_service/security/kmc/kmc.py
new file mode 100644
index 00000000..2dcf5480
--- /dev/null
+++ b/evaluate_service/evaluate_service/security/kmc/kmc.py
@@ -0,0 +1,228 @@
+# -*- coding:utf-8 -*-
+
+# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Huawei KMC library."""
+
+import ctypes
+import os
+from ctypes.util import find_library
+import logging
+import platform
+
+__all__ = ["init", "encrypt", "decrypt", "check_and_update_mk", "update_root_key", "hmac", "hmac_verify", "finalize"]
+
+_kmc_dll: ctypes.CDLL = None
+_libc_dll: ctypes.CDLL = None
+ADVANCE_DAY = 3
+
+
def hmac(domain_id: int, plain_text: str) -> str:
    """Encode HMAC code.

    :param domain_id: KMC key domain to use.
    :param plain_text: text to authenticate.
    :return: the HMAC string, or "" on failure.

    BUG FIX: on a non-zero return code the original only logged and then
    dereferenced the NULL output pointer (`p_char.value.decode()` raised
    AttributeError); return "" instead, matching encrypt()'s behavior.
    """
    p_char = ctypes.c_char_p()
    hmac_len = ctypes.c_int(0)
    c_plain_text = ctypes.create_string_buffer(plain_text.encode())
    _kmc_dll.KeHmacByDomain.restype = ctypes.c_int
    _kmc_dll.KeHmacByDomain.argtypes = [
        ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.POINTER(ctypes.c_char_p), ctypes.POINTER(ctypes.c_int)]
    ret = _kmc_dll.KeHmacByDomain(
        domain_id, c_plain_text, len(plain_text), ctypes.byref(p_char), ctypes.pointer(hmac_len))
    if ret != 0:
        logging.error(f"failed to call KeHmacByDomain, code={ret}")
        return ""
    value = p_char.value.decode()
    # The library allocates the output buffer; release it with libc free().
    ret = _libc_dll.free(p_char)
    if ret != 0:
        logging.error(f"failed to free resource, code={ret}")
    return value
+
+
def hmac_verify(domain_id: int, plain_text: str, hmac_text: str) -> bool:
    """Verify HMAC code.

    :param domain_id: KMC key domain to use.
    :param plain_text: the text that was authenticated.
    :param hmac_text: the HMAC value to verify against.
    :return: the raw native return code. NOTE(review): annotated ``-> bool``
        but returns the int from KeHmacVerifyByDomain -- confirm whether 0 or
        non-zero means "verified" for this API.
    """
    c_plain_text = ctypes.create_string_buffer(plain_text.encode())
    c_hmac_text = ctypes.create_string_buffer(hmac_text.encode())
    _kmc_dll.KeHmacVerifyByDomain.restype = ctypes.c_int
    _kmc_dll.KeHmacVerifyByDomain.argtypes = [
        ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_int]
    # NOTE(review): len(c_hmac_text) counts the buffer's trailing NUL byte,
    # unlike len(plain_text) -- confirm the native API expects that length.
    ret = _kmc_dll.KeHmacVerifyByDomain(domain_id, c_plain_text, len(plain_text), c_hmac_text, len(c_hmac_text))
    return ret
+
+
def encrypt(domain_id: int, plain_text: str) -> str:
    """Encrypt `plain_text` in the given KMC domain.

    :return: the cipher text, or "" when the native call fails.
    """
    cipher_ptr = ctypes.c_char_p()
    cipher_len = ctypes.c_int(0)
    plain_buf = ctypes.create_string_buffer(plain_text.encode())

    encrypt_fn = _kmc_dll.KeEncryptByDomain
    encrypt_fn.restype = ctypes.c_int
    encrypt_fn.argtypes = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int,
                           ctypes.POINTER(ctypes.c_char_p), ctypes.POINTER(ctypes.c_int)]
    code = encrypt_fn(domain_id, plain_buf, len(plain_text),
                      ctypes.byref(cipher_ptr), ctypes.pointer(cipher_len))
    if code != 0:
        logging.error("KeEncryptByDomain failed.")
        return ""
    cipher_text = cipher_ptr.value.decode()
    # The library allocated the output buffer; release it through libc.
    code = _libc_dll.free(cipher_ptr)
    if code != 0:
        logging.error("free memory error. ret=%d" % code)
    return cipher_text
+
+
def _decrypt(domain_id: int, cipher_text: str):
    """Decrypt `cipher_text` in the given KMC domain.

    :return: the plain text, or "" when the native call fails.
    """
    plain_ptr = ctypes.c_char_p()
    plain_len = ctypes.c_int(0)
    cipher_buf = ctypes.create_string_buffer(cipher_text.encode())

    decrypt_fn = _kmc_dll.KeDecryptByDomain
    decrypt_fn.restype = ctypes.c_int
    decrypt_fn.argtypes = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int,
                           ctypes.POINTER(ctypes.c_char_p), ctypes.POINTER(ctypes.c_int)]
    code = decrypt_fn(domain_id, cipher_buf, len(cipher_text),
                      ctypes.byref(plain_ptr), ctypes.pointer(plain_len))
    if code != 0:
        logging.error("KeDecryptByDomain failed.")
        return ""
    plain_value = plain_ptr.value.decode()
    # The library allocated the output buffer; release it through libc.
    code = _libc_dll.free(plain_ptr)
    if code != 0:
        logging.error("free memory error. ret=%d" % code)
    return plain_value
+
+
def check_and_update_mk(domain_id: int, advance_day: int) -> bool:
    """Check and update mk.

    :param advance_day: rotate the master key if it expires within this many days.
    :return: True on success, False when the native call fails.
    """
    code = _kmc_dll.KeCheckAndUpdateMk(domain_id, advance_day)
    if code == 0:
        return True
    logging.error(f"failed to call KeCheckAndUpdateMk, code={code}")
    return False
+
+
def update_root_key() -> bool:
    """Update root key.

    :return: True on success, False when the native call fails.
    """
    code = _kmc_dll.KeUpdateRootKey()
    if code == 0:
        return True
    logging.error(f"failed to call KeUpdateRootKey, code={code}")
    return False
+
+
def finalize() -> None:
    """Release the KMC library's resources."""
    finalize_fn = _kmc_dll.KeFinalize
    finalize_fn.restype = ctypes.c_int
    finalize_fn.argtypes = []
    finalize_fn()
+
+
+def _get_lib_path():
+ pkg_path = os.path.dirname(__file__)
+ if platform.processor() == "x86_64":
+ return os.path.join(pkg_path, "x86_64/libkmcext.so")
+ else:
+ return os.path.join(pkg_path, "aarch64/libkmcext.so")
+
+
def _load_dll(kmc_dll_path: str) -> None:
    """Load libc and the KMC shared library exactly once (idempotent).

    BUG FIX: the original returned early if EITHER handle was already set,
    so when only one of the two libraries had been loaded the other was
    silently skipped. Load each one independently if still missing.
    """
    global _kmc_dll, _libc_dll
    if _libc_dll is None:
        _libc_dll = ctypes.CDLL(find_library("c"))
    if _kmc_dll is None:
        _kmc_dll = ctypes.CDLL(kmc_dll_path)
+
+
# ctypes callback handed to the KMC library for its log output. Defined at
# module level so the CFUNCTYPE wrapper stays referenced for the process
# lifetime (a garbage-collected callback would crash the native caller).
@ctypes.CFUNCTYPE(None, ctypes.c_int, ctypes.c_char_p)
def _logger(level: ctypes.c_int, msg: ctypes.c_char_p):
    """Forward a native KMC log record to Python logging."""
    logging.info("level:%d, msg:%s" % (level, str(msg)))
+
+
def _init_log():
    """Register the Python logging callback and set KMC log verbosity."""
    set_callback = _kmc_dll.KeSetLoggerCallback
    set_callback.restype = None
    set_callback.argtypes = [ctypes.CFUNCTYPE(None, ctypes.c_int, ctypes.c_char_p)]
    set_callback(_logger)

    set_level = _kmc_dll.KeSetLoggerLevel
    set_level.restype = None
    set_level.argtypes = [ctypes.c_int]
    # DISABLE(0),ERROR(1),WARN(2),INFO(3),DEBUG(4),TRACE(5)
    set_level(2)
+
+
class KMCConfig(ctypes.Structure):
    """ctypes mirror of the native KMCConfig struct passed to KeInitialize.

    The field order, names and sizes must match the native library exactly;
    do not reorder or resize them.
    """

    _fields_ = [
        ("primaryKeyStoreFile", ctypes.c_char * 4096),
        ("standbyKeyStoreFile", ctypes.c_char * 4096),
        ("domainCount", ctypes.c_int),
        ("role", ctypes.c_int),
        ("procLockPerm", ctypes.c_int),
        ("sdpAlgId", ctypes.c_int),
        ("hmacAlgId", ctypes.c_int),
        ("semKey", ctypes.c_int)
    ]
+
+
def _init_kmc_config(primary_key_store_file, standby_key_store_file, alg_id, domain_count):
    """Populate a KMCConfig struct and call KeInitialize.

    :return: the native KeInitialize status code (0 means success).
    """
    cfg = KMCConfig()
    cfg.primaryKeyStoreFile = primary_key_store_file.encode()
    cfg.standbyKeyStoreFile = standby_key_store_file.encode()
    cfg.domainCount = domain_count
    cfg.role = 1  # Agent 0; Master 1
    cfg.procLockPerm = 0o0600
    cfg.sdpAlgId = alg_id
    cfg.hmacAlgId = 2052  # HMAC_SHA256 2052; HMAC_SHA384 2053 HMAC_SHA512 2054
    cfg.semKey = 0x20161516
    initialize = _kmc_dll.KeInitialize
    initialize.restype = ctypes.c_int
    initialize.argtypes = [ctypes.POINTER(KMCConfig)]
    return initialize(ctypes.byref(cfg))
+
+
def init(primary_key_store_file: str, standby_key_store_file: str, alg_id: int, domain_count=3) -> bool:
    """Initialize.

    :param alg_id: SDP cipher algorithm id; must be one of 5/7/8/9.
    :return: True on success, False on an illegal alg_id or a native failure.
    """
    legal_alg_ids = (5, 7, 8, 9)  # AES128_CBC, AES256_CBC, AES128_GCM, AES256_GCM
    if alg_id not in legal_alg_ids:
        logging.error(f"alg (id={alg_id}) is not legal")
        return False
    _load_dll(_get_lib_path())
    _init_log()
    code = _init_kmc_config(primary_key_store_file, standby_key_store_file, alg_id, domain_count)
    if code != 0:
        logging.error(f"failed to call KeInitialized, code={code}")
        return False
    return True
+
+
def decrypt(cert_pem_file, secret_key_file, key_mm, key_component_1, key_component_2):
    """Decrypt the passwd.

    :param cert_pem_file: SSL certificate path (existence is checked only).
    :param secret_key_file: SSL private key path (existence is checked only).
    :param key_mm: the encrypted password to decrypt.
    :param key_component_1: primary key-store file path.
    :param key_component_2: standby key-store file path.
    :raises FileNotFoundError: when a certificate file is missing.
    :raises Exception: when KMC initialization or decryption fails.
    :return: the decrypted plaintext password.
    """
    sdp_alg_id = 9  # AES256_GCM
    # Make sure the SSL certificate files exist before touching the KMC library.
    for file in (cert_pem_file, secret_key_file):
        if not (file and os.path.exists(file)):
            logging.error("SSL Certificate files does not exist! Please check config.yaml and cert file.")
            raise FileNotFoundError

    if init(key_component_1, key_component_2, sdp_alg_id) is False:
        logging.error("kmc init error.")
        raise Exception('ERROR: kmc init failed!')
    domain_id = 0
    decrypt_mm = _decrypt(domain_id, key_mm)
    if decrypt_mm == "":
        # BUG FIX: this branch previously logged/raised "kmc init" messages
        # although init had succeeded; report the decrypt failure accurately.
        logging.error("kmc decrypt error.")
        raise Exception('ERROR: kmc decrypt failed!')
    check_and_update_mk(domain_id, ADVANCE_DAY)
    finalize()
    return decrypt_mm
diff --git a/evaluate_service/evaluate_service/security/kmc/utils.py b/evaluate_service/evaluate_service/security/kmc/utils.py
new file mode 100644
index 00000000..f99bf2f6
--- /dev/null
+++ b/evaluate_service/evaluate_service/security/kmc/utils.py
@@ -0,0 +1,44 @@
+# -*- coding:utf-8 -*-
+
+# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Some tools."""
+import re
+import logging
+
+
def check_password_rule(password):
    """Check password rule.

    A valid password is at least 8 characters long and contains at least one
    digit, one uppercase letter and one lowercase letter.

    :return: True when every rule passes; logs a warning and returns False
        at the first failing rule.
    """
    rules = (
        (len(password) >= 8, "The length must >= 8"),
        (re.search(r'\d', password) is not None, "Must contains digit letters"),
        (re.search(r'[A-Z]', password) is not None, "Must contains capital letters"),
        (re.search(r'[a-z]', password) is not None, "Must contains lowercase letters"),
    )
    for passed, warning in rules:
        if not passed:
            logging.warning(warning)
            return False
    return True
diff --git a/evaluate_service/evaluate_service/security/load_pickle.py b/evaluate_service/evaluate_service/security/load_pickle.py
new file mode 100644
index 00000000..df63f238
--- /dev/null
+++ b/evaluate_service/evaluate_service/security/load_pickle.py
@@ -0,0 +1,57 @@
+# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Load pickle."""
+
+import pickle
+
+__all__ = ["restricted_loads"]
+
+
# Top-level module names whose globals may be resolved during unpickling
# when security mode is enabled.
safe_builtins = {
    'vega',
    'torch',
    'torchvision',
    'functools',
    'timm',
    'mindspore',
    'tensorflow',
    'numpy',
    'imageio',
    'collections',
}


class RestrictedUnpickler(pickle.Unpickler):
    """Restrict unpickler."""

    def __init__(self, file, fix_imports, encoding, errors, security):
        """Initialize; `security` toggles the module allow-list check."""
        super(RestrictedUnpickler, self).__init__(file=file, fix_imports=fix_imports, encoding=encoding, errors=errors)
        self.security = security

    def find_class(self, module, name):
        """Find class.

        SECURITY FIX: the original called super().find_class() (which imports
        the module, executing its top-level code) BEFORE checking the
        allow-list, so a forbidden module's code still ran. Check first.
        """
        if self.security and module.split('.')[0] not in safe_builtins:
            raise pickle.UnpicklingError(f"global '{module}' is forbidden")
        return super().find_class(module, name)
+
+
def restricted_loads(file, fix_imports=True, encoding="ASCII", errors="strict", security=False):
    """Load obj.

    Thin wrapper that unpickles `file` through RestrictedUnpickler; with
    security=True only allow-listed modules may be referenced.
    """
    unpickler = RestrictedUnpickler(file, fix_imports=fix_imports, encoding=encoding,
                                    errors=errors, security=security)
    return unpickler.load()
diff --git a/evaluate_service/evaluate_service/security/post.py b/evaluate_service/evaluate_service/security/post.py
new file mode 100644
index 00000000..a5110e1c
--- /dev/null
+++ b/evaluate_service/evaluate_service/security/post.py
@@ -0,0 +1,57 @@
+# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Rest post operation in security mode."""
+
+import urllib
+import json
+import logging
+import requests
+from .conf import get_config
+from .utils import create_context
+from .args import check_msg
+from .verify_cert import verify_cert
+
+
def post(host, files, data):
    """Post a REST request in security mode.

    Builds a TLS context from the client security config, then sends the
    multipart request with urllib and returns the parsed JSON response.
    """
    sec_cfg = get_config('client')
    ca_file = sec_cfg.ca_cert
    cert_pem_file = sec_cfg.client_cert
    secret_key_file = sec_cfg.client_secret_key
    encrypted_password = sec_cfg.encrypted_password

    # Misconfiguration is logged (not fatal) to match existing behavior.
    if not cert_pem_file or not secret_key_file or not ca_file:
        logging.error("CERT file is not existed.")
    if not verify_cert(ca_file, cert_pem_file):
        logging.error(f"The cert {ca_file} and {cert_pem_file} are invalid, please check.")

    if encrypted_password == "":
        context = create_context(ca_file, cert_pem_file, secret_key_file)
    else:
        context = create_context(ca_file, cert_pem_file, secret_key_file, encrypted_password,
                                 sec_cfg.key_component_1, sec_cfg.key_component_2)
    if not host.lower().startswith('https'):
        raise Exception(f'The host {host} must start with https')
    # Let requests build the multipart body/headers, then send via urllib so
    # the custom SSL context is honored.
    prepped = requests.Request(method="POST", url=host, files=files, data=data).prepare()
    request = urllib.request.Request(host, data=prepped.body, method='POST')
    request.add_header("Content-Type", prepped.headers['Content-Type'])
    response = urllib.request.urlopen(request, context=context)  # nosec
    result = json.loads(response.read().decode('utf8'))
    check_msg({key: value for key, value in result.items() if key != 'error_message'})
    return result
diff --git a/evaluate_service/evaluate_service/security/run_dask.py b/evaluate_service/evaluate_service/security/run_dask.py
new file mode 100644
index 00000000..f4039540
--- /dev/null
+++ b/evaluate_service/evaluate_service/security/run_dask.py
@@ -0,0 +1,139 @@
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Run dask scheduler and worker."""
+import os
+import subprocess
+import shutil
+import logging
+import socket
+import random
+from distributed import Client
+from distributed.security import Security
+from .conf import get_config
+from .verify_cert import verify_cert
+
+
# Server-side security settings (CA/cert/key paths) shared by the dask helpers below.
sec_cfg = get_config('server')
+
+
def get_client_security(address):
    """Get client.

    Rewrites the scheduler address to TLS and returns a dask Client using
    the configured client certificate.
    """
    tls_address = address.replace("tcp", "tls")
    if not verify_cert(sec_cfg.ca_cert, sec_cfg.client_cert_dask):
        logging.error(f"The cert {sec_cfg.ca_cert} and {sec_cfg.client_cert_dask} are invalid, please check.")
    security = Security(tls_ca_file=sec_cfg.ca_cert,
                        tls_client_cert=sec_cfg.client_cert_dask,
                        tls_client_key=sec_cfg.client_secret_key_dask,
                        require_encryption=True)
    return Client(tls_address, security=security)
+
+
def get_address_security(master_host, master_port):
    """Build the TLS address string for the dask scheduler."""
    return f"tls://{master_host}:{master_port}"
+
+
def run_scheduler_security(ip, port, tmp_file):
    """Run scheduler.

    Starts a TLS-only dask-scheduler subprocess; returns the Popen handle.
    """
    if not verify_cert(sec_cfg.ca_cert, sec_cfg.server_cert_dask):
        logging.error(f"The cert {sec_cfg.ca_cert} and {sec_cfg.server_cert_dask} are invalid, please check.")
    scheduler_cmd = [
        "dask-scheduler",
        "--no-dashboard",
        "--no-show",
        f"--tls-ca-file={sec_cfg.ca_cert}",
        f"--tls-cert={sec_cfg.server_cert_dask}",
        f"--tls-key={sec_cfg.server_secret_key_dask}",
        f"--host={ip}",
        "--protocol=tls",
        f"--port={port}",
        f"--scheduler-file={tmp_file}",
        f"--local-directory={os.path.dirname(tmp_file)}",
    ]
    return subprocess.Popen(scheduler_cmd, env=os.environ)
+
+
+def _available_port(min_port, max_port) -> int:
+ _sock = socket.socket()
+ while True:
+ port = random.randint(min_port, max_port)
+ try:
+ _sock.bind(('', port))
+ _sock.close()
+ return port
+ except Exception:
+ logging.debug('Failed to get available port, continue.')
+ continue
+ return None
+
+
def run_local_worker_security(slave_ip, address, local_dir):
    """Run dask-worker on local node.

    Returns the Popen handle of the spawned worker process.
    """
    tls_address = address.replace("tcp", "tls")
    # Pick free ports first (nanny then worker, as before).
    nanny_port = _available_port(30000, 30999)
    worker_port = _available_port(29000, 29999)
    worker_cmd = [
        "dask-worker",
        tls_address,
        '--nthreads=1',
        '--nprocs=1',
        '--memory-limit=0',
        f"--local-directory={local_dir}",
        f"--tls-ca-file={sec_cfg.ca_cert}",
        f"--tls-cert={sec_cfg.client_cert_dask}",
        f"--tls-key={sec_cfg.client_secret_key_dask}",
        "--no-dashboard",
        f"--host={slave_ip}",
        "--protocol=tls",
        f"--nanny-port={nanny_port}",
        f"--worker-port={worker_port}",
    ]
    return subprocess.Popen(worker_cmd, env=os.environ)
+
+
def run_remote_worker_security(slave_ip, address, local_dir):
    """Run dask-worker on remote node.

    Launches the worker over ssh on `slave_ip`; returns the Popen handle.
    """
    tls_address = address.replace("tcp", "tls")
    # Pick free ports first (nanny then worker, as before). NOTE: ports are
    # probed locally although the worker runs remotely (pre-existing behavior).
    nanny_port = _available_port(30000, 30999)
    worker_port = _available_port(29000, 29999)
    remote_cmd = [
        "ssh",
        slave_ip,
        shutil.which("dask-worker"),
        tls_address,
        '--nthreads=1',
        '--nprocs=1',
        '--memory-limit=0',
        f"--local-directory={local_dir}",
        f"--tls-ca-file={sec_cfg.ca_cert}",
        f"--tls-cert={sec_cfg.client_cert_dask}",
        f"--tls-key={sec_cfg.client_secret_key_dask}",
        "--no-dashboard",
        f"--host={slave_ip}",
        "--protocol=tls",
        f"--nanny-port={nanny_port}",
        f"--worker-port={worker_port}",
    ]
    return subprocess.Popen(remote_cmd, env=os.environ)
diff --git a/evaluate_service/evaluate_service/security/utils.py b/evaluate_service/evaluate_service/security/utils.py
new file mode 100644
index 00000000..9b6c220e
--- /dev/null
+++ b/evaluate_service/evaluate_service/security/utils.py
@@ -0,0 +1,46 @@
+# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Context utils."""
+import ssl
+import sys
+import logging
+
+
def create_context(ca_file, cert_pem_file, secret_key_file, key_mm=None, key_component_1=None, key_component_2=None):
    """Create the SSL context.

    :param ca_file: CA certificate used to verify the peer.
    :param cert_pem_file: this endpoint's certificate chain.
    :param secret_key_file: this endpoint's private key.
    :param key_mm: KMC-encrypted key passphrase; when provided the key is
        decrypted via the KMC library using the two key components.
    :return: a configured ssl.SSLContext requiring peer certificates.
    """
    ciphers = "ECDHE-ECDSA-AES128-CCM:ECDHE-ECDSA-AES256-CCM:ECDHE-ECDSA-AES128-GCM-SHA256" \
              ":ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384" \
              ":DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:DHE-DSS-AES128-GCM-SHA256" \
              ":DHE-DSS-AES256-GCM-SHA384:DHE-RSA-AES128-CCM:DHE-RSA-AES256-CCM"
    context = ssl.SSLContext(ssl.PROTOCOL_TLS)
    # BUG FIX: ssl options are bit flags; the original used arithmetic
    # += / -= which double-counts a bit that is already set and corrupts the
    # mask when subtracting a bit that is clear. Use bitwise |= and &= ~.
    context.options |= ssl.OP_NO_TLSv1
    context.options |= ssl.OP_NO_TLSv1_1
    if sys.version_info >= (3, 7):
        # On 3.7+ disable TLS 1.2 as well (leaving TLS 1.3 only) and forbid
        # renegotiation; OP_NO_RENEGOTIATION only exists on 3.7+.
        context.options |= ssl.OP_NO_TLSv1_2
        context.options |= ssl.OP_NO_RENEGOTIATION
        # Drop OpenSSL's bug-workaround umbrella flags.
        context.options &= ~ssl.OP_ALL
    context.verify_mode = ssl.CERT_REQUIRED
    context.set_ciphers(ciphers)
    if key_mm is not None:
        from .kmc.kmc import decrypt
        logging.debug("Using encrypted key.")
        if key_component_1 is None or key_component_2 is None:
            logging.error("For encrypted key, the component must be provided.")
        decrypt_mm = decrypt(cert_pem_file, secret_key_file, key_mm, key_component_1, key_component_2)
        context.load_cert_chain(cert_pem_file, secret_key_file, password=decrypt_mm)
    else:
        context.load_cert_chain(cert_pem_file, secret_key_file)
    context.load_verify_locations(ca_file)
    return context
diff --git a/evaluate_service/evaluate_service/security/verify_cert.py b/evaluate_service/evaluate_service/security/verify_cert.py
new file mode 100644
index 00000000..cdc72389
--- /dev/null
+++ b/evaluate_service/evaluate_service/security/verify_cert.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Verify cert."""
+
+import logging
+
+
def verify_cert(ca_cert_file, cert_file):
    """Verify the cert.

    Checks that neither certificate is expired and that `cert_file` is
    signed by `ca_cert_file`.

    :return: True when both checks pass, else False.
    """
    from OpenSSL.crypto import load_certificate, FILETYPE_PEM, X509Store, X509StoreContext, X509StoreContextError
    # BUG FIX: close the file handles deterministically; the original opened
    # both files without closing them.
    with open(ca_cert_file, "r", encoding="utf-8") as f:
        ca_cert = load_certificate(FILETYPE_PEM, f.read())
    with open(cert_file, 'r', encoding="utf-8") as f:
        cert = load_certificate(FILETYPE_PEM, f.read())
    if ca_cert.has_expired() or cert.has_expired():
        logging.error("The cert is expired, please check.")
        return False
    store = X509Store()
    store.add_cert(ca_cert)
    ctx = X509StoreContext(store, cert)
    try:
        ctx.verify_certificate()
    except X509StoreContextError:
        logging.error("Certificate signature failure, ca cert file and cert file not match.")
        return False
    return True
diff --git a/evaluate_service/evaluate_service/security/verify_config.py b/evaluate_service/evaluate_service/security/verify_config.py
new file mode 100644
index 00000000..f5c910e7
--- /dev/null
+++ b/evaluate_service/evaluate_service/security/verify_config.py
@@ -0,0 +1,152 @@
+# -*- coding:utf-8 -*-
+
+# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Run pipeline."""
+
+import configparser
+import logging
+import os
+import stat
+
+
def _file_exist(path):
    """Return True when `path` exists (os.access F_OK check)."""
    return os.access(path, os.F_OK)
+
+
def _file_belong_to_current_user(path):
    """Return True when `path` is owned by the current effective user."""
    return os.stat(path).st_uid == os.getuid()
+
+
def _file_other_writable(path):
    """Return truthy (the S_IWOTH bit) when `path` is writable by others."""
    return os.stat(path).st_mode & stat.S_IWOTH
+
+
def _file_is_link(path):
    """Return True when `path` is a symbolic link."""
    return os.path.islink(path)
+
+
def _get_risky_files_by_suffix(suffixes, path):
    """Classify `path` when it ends with one of `suffixes`.

    :return: four lists -- existing risky files, files not owned by the
        current user, files writable by others, and symlinked files.
    """
    risky_files = []
    non_current_user_files = []
    others_writable_files = []
    link_files = []
    # One pass per matching suffix, as in the original (suffixes in practice
    # do not overlap, so a path is recorded at most once).
    for suffix in (s for s in suffixes if path.endswith(s)):
        abs_path = os.path.abspath(path)
        if not _file_exist(abs_path):
            continue
        risky_files.append(abs_path)
        if not _file_belong_to_current_user(abs_path):
            non_current_user_files.append(abs_path)
        if _file_other_writable(abs_path):
            others_writable_files.append(abs_path)
        if _file_is_link(abs_path):
            link_files.append(abs_path)

    return risky_files, non_current_user_files, others_writable_files, link_files
+
+
def get_risky_files(config):
    """Get contained risky file (.pth/.pth.tar/.onnx/.py).

    Recursively walks a vega ``Config`` tree and collects every string value
    that points at an executable/model file, classified by ownership,
    permissions and symlink status.

    :param config: a vega Config instance; anything else yields empty lists.
    :return: (risky_files, non_current_user_files, others_writable_files, link_files)
    """
    risky_files = []
    non_current_user_files = []
    others_writable_files = []
    link_files = []
    # Imported lazily to avoid a hard dependency at module import time.
    from vega.common.config import Config
    if not isinstance(config, Config):
        return risky_files, non_current_user_files, others_writable_files, link_files

    for value in config.values():
        # Special case: a DeepLabNetWork entry stores its script as dir+name.
        # NOTE(review): assumes such entries always carry "dir" and "name" keys.
        if isinstance(value, Config) and value.get("type") == "DeepLabNetWork":
            value = value.get("dir").rstrip("/") + "/" + value.get("name").lstrip("/") + ".py"
        if isinstance(value, str):
            temp_risky_files, temp_non_current_user_files, temp_other_writable_files, temp_link_files \
                = _get_risky_files_by_suffix([".pth", ".pth.tar", ".py"], value)
            risky_files.extend(temp_risky_files)
            non_current_user_files.extend(temp_non_current_user_files)
            others_writable_files.extend(temp_other_writable_files)
            link_files.extend(temp_link_files)
        # Recurse into nested Config values (non-Config values return empties).
        temp_risky_files, temp_non_current_user_files, temp_other_writable_files, temp_link_files \
            = get_risky_files(value)
        risky_files.extend(temp_risky_files)
        non_current_user_files.extend(temp_non_current_user_files)
        others_writable_files.extend(temp_other_writable_files)
        link_files.extend(temp_link_files)

    return risky_files, non_current_user_files, others_writable_files, link_files
+
+
def check_risky_file(args, config):
    """Check risky file (.pth/.pth.tar/.py).

    In security mode, lists every executable/model file referenced by the
    config, prints warnings for suspicious ownership/permissions/symlinks,
    and interactively asks the user whether to continue.

    :param args: CLI namespace; only ``args.security`` is read here.
    :param config: vega Config tree to scan via get_risky_files().
    :return: True to proceed (non-security mode, no risky files, or the user
        answered "yes"); False when the user answered "no".
    """
    if not args.security:
        return True
    risky_files, non_current_user_files, others_writable_files, link_files = get_risky_files(config)
    if len(risky_files) == 0:
        return True

    # "\033[1;33m" ... "\033[0m" render the warnings in bold yellow.
    print("\033[1;33m"
          "WARNING: The following executable files will be loaded:"
          "\033[0m")
    for file in risky_files:
        print(file)
    if len(non_current_user_files) > 0:
        print("\033[1;33m"
              "WARNING: The following executable files that will be loaded do not belong to the current user:"
              "\033[0m")
        for file in non_current_user_files:
            print(file)
    if len(others_writable_files) > 0:
        print("\033[1;33m"
              "WARNING: The following executable files that will be loaded have others write permission:"
              "\033[0m")
        for file in others_writable_files:
            print(file)
    if len(link_files) > 0:
        print("\033[1;33m"
              "WARNING: The following executable files that will be loaded is soft link file:"
              "\033[0m")
        for file in link_files:
            print(file)
    user_confirm = input("It is possible to construct malicious pickle data "
                         "which will execute arbitrary code during unpickling .pth/.pth.tar/.py files. "
                         "\nPlease ensure the safety and consistency of the loaded executable files. "
                         "\nDo you want to continue? (yes/no) ").strip(" ")
    # Re-prompt until a definitive answer is given, so exactly one of the
    # two branches below is always taken.
    while user_confirm != "yes" and user_confirm != "no":
        user_confirm = input("Please enter yes or no! ").strip(" ")
    if user_confirm == "yes":
        return True
    elif user_confirm == "no":
        return False
+
+
def check_risky_files(file_list):
    """Check if cert and key file are risky.

    A file is rejected when it is missing, not owned by the current user,
    a symlink, or has permission bits beyond 0600.

    :return: True only when every file in `file_list` passes all checks.
    """
    all_safe = True
    for path in file_list:
        if not os.path.exists(path):
            logging.error(f"File <{path}> does not exist")
            all_safe = False
            continue
        if not _file_belong_to_current_user(path):
            logging.error(f"File <{path}> is not owned by current user")
            all_safe = False
        if _file_is_link(path):
            logging.error(f"File <{path}> should not be soft link")
            all_safe = False
        # 0o0177 masks group/other bits and owner execute: anything beyond
        # rw for the owner (600) is flagged.
        if os.stat(path).st_mode & 0o0177:
            logging.error(f"File <{path}> permissions are not correct, cannot exceed 600")
            all_safe = False
    return all_safe
diff --git a/evaluate_service/evaluate_service/security/zmq_op.py b/evaluate_service/evaluate_service/security/zmq_op.py
new file mode 100644
index 00000000..29b89d5e
--- /dev/null
+++ b/evaluate_service/evaluate_service/security/zmq_op.py
@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""ZMQ operation."""
+import os
+import uuid
+import zmq
+import zmq.auth
+from zmq.auth.thread import ThreadAuthenticator
+
+
+def listen_security(ip, min_port, max_port, max_tries, temp_path):
+ """Listen on server."""
+ ctx = zmq.Context.instance()
+ # Start an authenticator for this context.
+ auth = ThreadAuthenticator(ctx)
+ auth.start()
+ auth.configure_curve(domain='*', location=zmq.auth.CURVE_ALLOW_ANY)
+
+ socket = ctx.socket(zmq.REP)
+ server_secret_key = os.path.join(temp_path, "server.key_secret")
+ if not os.path.exists(server_secret_key):
+ _, server_secret_key = zmq.auth.create_certificates(temp_path, "server")
+ server_public, server_secret = zmq.auth.load_certificate(server_secret_key)
+ if os.path.exists(server_secret_key):
+ os.remove(server_secret_key)
+ socket.curve_secretkey = server_secret
+ socket.curve_publickey = server_public
+ socket.curve_server = True # must come before bind
+
+ port = socket.bind_to_random_port(
+ f"tcp://{ip}", min_port=min_port, max_port=max_port, max_tries=100)
+ return socket, port
+
+
+def connect_security(ip, port, temp_path):
+ """Connect to server."""
+ ctx = zmq.Context.instance()
+ socket = ctx.socket(zmq.REQ)
+ client_name = uuid.uuid1().hex[:8]
+ client_secret_key = os.path.join(temp_path, "{}.key_secret".format(client_name))
+ if not os.path.exists(client_secret_key):
+ client_public_key, client_secret_key = zmq.auth.create_certificates(temp_path, client_name)
+ client_public, client_secret = zmq.auth.load_certificate(client_secret_key)
+ socket.curve_secretkey = client_secret
+ socket.curve_publickey = client_public
+ server_public_key = os.path.join(temp_path, "server.key")
+ if not os.path.exists(server_public_key):
+ server_public_key, _ = zmq.auth.create_certificates(temp_path, "server")
+ server_public, _ = zmq.auth.load_certificate(server_public_key)
+ socket.curve_serverkey = server_public
+ socket.connect(f"tcp://{ip}:{port}")
+ if os.path.exists(client_secret_key):
+ os.remove(client_secret_key)
+ if os.path.exists(client_public_key):
+ os.remove(client_public_key)
+ return socket
diff --git a/evaluate_service/hardwares/__init__.py b/evaluate_service/hardwares/__init__.py
deleted file mode 100644
index c388bb8e..00000000
--- a/evaluate_service/hardwares/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from .davinci.davinci import Davinci
-from .mobile.mobile import Mobile
-from .kirin990_npu.kirin990_npu import Kirin990_npu
-
-__all__ = ['Davinci', "Mobile", "Kirin990_npu"]
diff --git a/evaluate_service/hardwares/davinci/compile_atlas200.sh b/evaluate_service/hardwares/davinci/compile_atlas200.sh
deleted file mode 100644
index 5c86edf8..00000000
--- a/evaluate_service/hardwares/davinci/compile_atlas200.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-# inference for Atlas 200 DK
-WORK_DIR=$1
-EXAMPLE_DIR=$2
-DDK_USER_NAME=$3
-DDK_HOST_IP=$4
-ATLAS_HOST_IP=$5
-APP_DIR=$6
-
-CURRENT_DIR=$(pwd)
-#source env.sh
-
-# copy the example project to work dir
-mkdir $WORK_DIR/build_files/
-cp -rf $EXAMPLE_DIR/* $WORK_DIR/build_files/
-
-mkdir -p $WORK_DIR/build_files/run/out/test_data/model/
-mkdir -p $WORK_DIR/build_files/run/out/test_data/data/
-cp $WORK_DIR/*.om $WORK_DIR/build_files/run/out/test_data/model/
-cp $WORK_DIR/*.bin $WORK_DIR/build_files/run/out/test_data/data/
-
-
-# build the file
-cd $WORK_DIR/build_files/
-mkdir -p build/intermediates/device
-mkdir -p build/intermediates/host
-
-cd build/intermediates/device
-cmake ../../../src -Dtype=device -Dtarget=RC -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ -DCMAKE_CXX_FLAGS="-s" -DCMAKE_C_FLAGS="-s"
-make install
-echo "[INFO] build the device sucess"
-cd ../host
-cmake ../../../src -Dtype=host -Dtarget=RC -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ -DCMAKE_CXX_FLAGS="-s" -DCMAKE_C_FLAGS="-s"
-make install
-echo "[INFO] build the host sucess"
-
-cd $CURRENT_DIR
-
-# execute in Atlas 200 DK
-#scp /home/ly/evaluate_test/atlas_execute.sh HwHiAiUser@$ATLAS_HOST_IP:~/
-#echo "[INFO] copy the atlas_execute.sh to Atlas 200 DK."
-ssh -o "StrictHostKeyChecking no" HwHiAiUser@$ATLAS_HOST_IP "bash -s" < ./utils/atlas200_dk/atlas_execute.sh $WORK_DIR $DDK_USER_NAME $DDK_HOST_IP $APP_DIR
-echo "[INFO] execute in Atlas 200 DK finish."
-
diff --git a/evaluate_service/hardwares/davinci/env/check_atlas300.sh b/evaluate_service/hardwares/davinci/env/check_atlas300.sh
deleted file mode 100644
index 7b05e9a8..00000000
--- a/evaluate_service/hardwares/davinci/env/check_atlas300.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-echo "[INFO] start check the enviroment..."
-python3 -c "import te" && echo "[INFO] check te sucess"
-python3 -c "import topi" && echo "[INFO] check topi sucess"
-#cmake --version && echo "[INFO] check cmake sucess"
-atc --version && echo "[INFO] check atc sucess "
-
-echo "[INFO] start compile the example..."
-
-cd ../samples/atlas300/
-mkdir -p build/intermediates/host
-cd build/intermediates/host
-cmake ../../../src -DCMAKE_CXX_COMPILER=g++ -DCMAKE_SKIP_RPATH=TRUE
-make && echo "[INFO] check the env sucess!"
diff --git a/evaluate_service/hardwares/davinci/env/env_atlas200dk.sh b/evaluate_service/hardwares/davinci/env/env_atlas200dk.sh
deleted file mode 100644
index c781b54c..00000000
--- a/evaluate_service/hardwares/davinci/env/env_atlas200dk.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-export DDK_PATH={user_path}/huawei/ddk
-export PYTHONPATH=$DDK_PATH/site-packages/te-0.4.0.egg:$DDK_PATH/site-packages/topi-0.4.0.egg
-export LD_LIBRARY_PATH=$DDK_PATH/uihost/lib:$DDK_PATH/lib/x86_64-linux-gcc5.4
-export PATH=$PATH:$DDK_PATH/toolchains/ccec-linux/bin:$DDK_PATH/uihost/bin
-export TVM_AICPU_LIBRARY_PATH=$DDK_PATH/uihost/lib/:$DDK_PATH/uihost/toolchains/ccec-linux/aicpu_lib
-export TVM_AICPU_INCLUDE_PATH=$DDK_PATH/include/inc/tensor_engine
-export TVM_AICPU_OS_SYSROOT={user_path}/tools/sysroot/aarch64_Ubuntu16.04.3
-export NPU_HOST_LIB={user_path}/tools/1.32.0.B080/RC/host-aarch64_Ubuntu16.04.3/lib
-export NPU_DEV_LIB={user_path}/tools/1.32.0.B080/RC/host-aarch64_Ubuntu16.04.3/lib
-#export CPLUS_INCLUDE_PATH=$DDK_PATH/include/inc:$DDK_PATH/include/third_party
diff --git a/evaluate_service/hardwares/davinci/env/env_atlas300.sh b/evaluate_service/hardwares/davinci/env/env_atlas300.sh
deleted file mode 100644
index 3319df1c..00000000
--- a/evaluate_service/hardwares/davinci/env/env_atlas300.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-export ASCEND_HOME=/usr/local/Ascend
-export PATH=/opt/cmake-3.14.5-Linux-x86_64/bin:/usr/local/python3.7.5/bin:$ASCEND_HOME/atc/ccec_compiler/bin:$ASCEND_HOME/atc/bin:$PATH
-export LD_LIBRARY_PATH=$ASCEND_HOME/atc/python/site-packages/te.egg/lib:$ASCEND_HOME/atc/lib64:$ASCEND_HOME/acllib/lib64:$ASCEND_HOME/driver/lib64:$ASCEND_HOME/add-ons
-export PYTHONPATH=$PYTHONPATH:$ASCEND_HOME/atc/python/site-packages/te.egg:$ASCEND_HOME/atc/python/site-packages/topi.egg:$ASCEND_HOME/atc/python/site-packages/auto_tune.egg
-export ASCEND_OPP_PATH=$ASCEND_HOME/ascend-toolkit/20.2.0/x86_64-linux/opp
-export DDK_PATH=$ASCEND_HOME
-export NPU_HOST_LIB=$ASCEND_HOME/ascend-toolkit/20.2.0/x86_64-linux/acllib/lib64/stub
\ No newline at end of file
diff --git a/evaluate_service/hardwares/davinci/env/env_evb.sh b/evaluate_service/hardwares/davinci/env/env_evb.sh
deleted file mode 100644
index d1c475cf..00000000
--- a/evaluate_service/hardwares/davinci/env/env_evb.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-export ASCEND_HOME=/usr/local/Ascend
-export PATH=/usr/local/python3.7/bin:$ASCEND_HOME/atc/ccec_compiler/bin:$ASCEND_HOME/atc/bin:$PATH
-export LD_LIBRARY_PATH=$ASCEND_HOME/atc/python/site-packages/te.egg/lib:$ASCEND_HOME/acllib/lib64:$ASCEND_HOME/atc/lib64:$ASCEND_HOME/driver/lib64:$ASCEND_HOME/add-ons:/usr/local/Ascend/atc/lib64/plugin/opskernel
-export PYTHONPATH=$PYTHONPATH:$ASCEND_HOME/atc/python/site-packages/te.egg:$ASCEND_HOME/atc/python/site-packages/topi.egg:$ASCEND_HOME/atc/python/site-packages/auto_tune.egg
-export ASCEND_OPP_PATH=/usr/local/Ascend/opp
-export SLOG_PRINT_TO_STDOUT=1
-#export DUMP_GE_GRAPH=1
-#export DUMP_OP=1
diff --git a/evaluate_service/hardwares/davinci/inference_atlas200.sh b/evaluate_service/hardwares/davinci/inference_atlas200.sh
deleted file mode 100644
index 48086918..00000000
--- a/evaluate_service/hardwares/davinci/inference_atlas200.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-WORK_DIR=$1
-DDK_USER_NAME=$2
-DDK_HOST_IP=$3
-APP_DIR=$4
-
-cd ~
-mkdir -p $APP_DIR
-cd ~/$APP_DIR
-scp -r $DDK_USER_NAME@$DDK_HOST_IP:$WORK_DIR/build_files/run/out/* ./
-echo "[INFO] copy the fils to Atlas 200 Dk sucess."
-./main >ome.log
-echo "[INFO] run exe in Atlas 200 Dk sucess."
-scp ome.log $DDK_USER_NAME@$DDK_HOST_IP:$WORK_DIR/
-scp ./result_files/result_file $DDK_USER_NAME@$DDK_HOST_IP:$WORK_DIR/
-echo "[INFO] copy the result log to DDK host sucess."
-cd ../
-rm -rf ./$APP_DIR
-echo "[INFO] delete the temp files in Atlas 200 DK sucess."
diff --git a/evaluate_service/hardwares/davinci/samples/atlas200dk/inc/classify_net_ai_engine.h b/evaluate_service/hardwares/davinci/samples/atlas200dk/inc/classify_net_ai_engine.h
deleted file mode 100644
index e4bded24..00000000
--- a/evaluate_service/hardwares/davinci/samples/atlas200dk/inc/classify_net_ai_engine.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/**
-* @file classify_net_ai_engine.h
-*
-* Copyright(c)<2018>,
-*
-* @version 1.0
-*
-* @date 2018-6-7
-*/
-
-#ifndef INC_CLASSIFY_NET_AI_ENGINE_H_
-#define INC_CLASSIFY_NET_AI_ENGINE_H_
-#include
-#include
-#include
-#include
-#include