Merge branch 'main' into stop-words

Conflicts: lmdeploy/model.py
InternLM · Sep 22, 2023 · 62db2ec · 62db2ec
2 parents cdaf7f2 + 0be9e7a
commit 62db2ec
Show file tree

Hide file tree

Showing 34 changed files with 378 additions and 107 deletions.
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
@@ -26,7 +26,13 @@ jobs:
         uses: actions/checkout@v3
       - name: Check disk space
         run: |
+          df -h
+          ls /opt/hostedtoolcache
           rm -rf ${GITHUB_WORKSPACE}/.git
+          rm -rf  /opt/hostedtoolcache/go
+          rm -rf  /opt/hostedtoolcache/node
+          rm -rf  /opt/hostedtoolcache/Ruby
+          rm -rf  /opt/hostedtoolcache/CodeQL
           cat /proc/cpuinfo  | grep -ic proc
           free
           df -h

diff --git a/.readthedocs.yml b/.readthedocs.yaml
diff --git a/README.md b/README.md
@@ -20,6 +20,7 @@ ______________________________________________________________________
 
 ## News 🎉
 
+- \[2023/09\] TurboMind supports InternLM-20B
 - \[2023/09\] TurboMind supports all features of Code Llama: code completion, infilling, chat / instruct, and Python specialist. Click [here](./docs/en/supported_models/codellama.md) for the deployment guide
 - \[2023/09\] TurboMind supports Baichuan2-7B
 - \[2023/08\] TurboMind supports flash-attention2.
@@ -61,19 +62,20 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
 | :----------: | :-------------: | :--: | :-----: | :---: | :--: |
 |    Llama     |       Yes       | Yes  |   Yes   |  Yes  |  No  |
 |    Llama2    |       Yes       | Yes  |   Yes   |  Yes  |  No  |
-|   InternLM   |       Yes       | Yes  |   Yes   |  Yes  |  No  |
+| InternLM-7B  |       Yes       | Yes  |   Yes   |  Yes  |  No  |
+| InternLM-20B |       Yes       | Yes  |   Yes   |  Yes  |  No  |
 |   QWen-7B    |       Yes       | Yes  |   Yes   |  No   |  No  |
 | Baichuan-7B  |       Yes       | Yes  |   Yes   |  Yes  |  No  |
 | Baichuan2-7B |       Yes       | Yes  |   No    |  No   |  No  |
 |  Code Llama  |       Yes       | Yes  |   No    |  No   |  No  |
 
 ### Pytorch
 
-|  Models  | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
-| :------: | :-------------: | :--: | :-----: | :---: | :--: |
-|  Llama   |       Yes       | Yes  |   No    |  No   |  No  |
-|  Llama2  |       Yes       | Yes  |   No    |  No   |  No  |
-| InternLM |       Yes       | Yes  |   No    |  No   |  No  |
+|   Models    | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
+| :---------: | :-------------: | :--: | :-----: | :---: | :--: |
+|    Llama    |       Yes       | Yes  |   No    |  No   |  No  |
+|   Llama2    |       Yes       | Yes  |   No    |  No   |  No  |
+| InternLM-7B |       Yes       | Yes  |   No    |  No   |  No  |
 
 ## Performance
 

diff --git a/README_zh-CN.md b/README_zh-CN.md
@@ -20,6 +20,7 @@ ______________________________________________________________________
 
 ## 更新 🎉
 
+- \[2023/09\] TurboMind 支持 InternLM-20B 模型
 - \[2023/09\] TurboMind 支持 Code Llama 所有功能：代码续写、填空、对话、Python专项。点击[这里](./docs/zh_cn/supported_models/codellama.md)阅读部署方法
 - \[2023/09\] TurboMind 支持 Baichuan2-7B
 - \[2023/08\] TurboMind 支持 flash-attention2
@@ -62,19 +63,20 @@ LMDeploy 由 [MMDeploy](https://github.com/open-mmlab/mmdeploy) 和 [MMRazor](ht
 | :----------: | :------: | :--: | :-----: | :---: | :--: |
 |    Llama     |   Yes    | Yes  |   Yes   |  Yes  |  No  |
 |    Llama2    |   Yes    | Yes  |   Yes   |  Yes  |  No  |
-|   InternLM   |   Yes    | Yes  |   Yes   |  Yes  |  No  |
+| InternLM-7B  |   Yes    | Yes  |   Yes   |  Yes  |  No  |
+| InternLM-20B |   Yes    | Yes  |   Yes   |  Yes  |  No  |
 |   QWen-7B    |   Yes    | Yes  |   Yes   |  No   |  No  |
 | Baichuan-7B  |   Yes    | Yes  |   Yes   |  Yes  |  No  |
 | Baichuan2-7B |   Yes    | Yes  |   No    |  No   |  No  |
 |  Code Llama  |   Yes    | Yes  |   No    |  No   |  No  |
 
 ### Pytorch
 
-|   模型   | 模型并行 | FP16 | KV INT8 | W4A16 | W8A8 |
-| :------: | :------: | :--: | :-----: | :---: | :--: |
-|  Llama   |   Yes    | Yes  |   No    |  No   |  No  |
-|  Llama2  |   Yes    | Yes  |   No    |  No   |  No  |
-| InternLM |   Yes    | Yes  |   No    |  No   |  No  |
+|    模型     | 模型并行 | FP16 | KV INT8 | W4A16 | W8A8 |
+| :---------: | :------: | :--: | :-----: | :---: | :--: |
+|    Llama    |   Yes    | Yes  |   No    |  No   |  No  |
+|   Llama2    |   Yes    | Yes  |   No    |  No   |  No  |
+| InternLM-7B |   Yes    | Yes  |   No    |  No   |  No  |
 
 ## 性能
 

diff --git a/benchmark/README.md b/benchmark/README.md
@@ -23,10 +23,14 @@ python profile_throughput.py \
 
 `profile_generation.py` performs a benchmark with dummy data.
 
+```shell
+pip install nvidia-ml-py
+```
+
 ```bash
 python profile_generation.py \
- /path/to/your/model \
- --concurrency 8 --input_seqlen 0 --output_seqlen 2048
+ --model-path /path/to/your/model \
+ --concurrency 1 8 --prompt-tokens 0 512 --completion-tokens 2048 512
 ```
 
 ## profile serving

diff --git a/benchmark/profile_generation.py b/benchmark/profile_generation.py
@@ -1,12 +1,22 @@
+# Copyright (c) OpenMMLab. All rights reserved.
 # import multiprocessing as mp
+import argparse
+import csv
+import logging
+import os
 import os.path as osp
 import time
+from dataclasses import dataclass
 from queue import Queue
 from threading import Thread
 from typing import List
 
-import fire
 import numpy as np
+from pynvml import (NVMLError, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex,
+                    nvmlDeviceGetMemoryInfo, nvmlDeviceGetName,
+                    nvmlDeviceGetPowerState, nvmlDeviceGetTemperature,
+                    nvmlInit, nvmlShutdown, nvmlSystemGetDriverVersion)
+from tqdm import tqdm
 
 from lmdeploy.turbomind import Tokenizer, TurboMind
 
@@ -77,12 +87,12 @@ def _infer(model, session_id):
     print(f'end warmup, elapsed time: {round(_end - _start, 2)}s')
 
 
-def main(model_path: str,
-         concurrency: int = 1,
-         input_seqlen: int = 0,
-         output_seqlen: int = 512,
-         test_round: int = 10,
-         tp: int = 1):
+def profile_throughput(model_path: str,
+                       concurrency: int = 1,
+                       input_seqlen: int = 0,
+                       output_seqlen: int = 512,
+                       test_round: int = 10,
+                       tp: int = 1):
     tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer')
     tokenizer = Tokenizer(tokenizer_model_path)
     tm_model = TurboMind(model_path=model_path, tp=tp)
@@ -141,7 +151,176 @@ def main(model_path: str,
           f'{token_latency_min:.2f}s, {token_latency_max:.2f}s, '
           f'{token_latency_ave:.2f}s\n'
           f'throughput: {throughput:.2f} token/s\n{"-" * 50}')
+    return tm_model.model_name, throughput, tm_model.gpu_count
+
+
+class MemoryMonitor:
+    from multiprocessing import Manager
+    max_mem = Manager().Value('f', 0)  # GB
+    device_count = Manager().Value('f', 0)
+
+    @staticmethod
+    def nvidia_info():
+        # pip install nvidia-ml-py
+        nvidia_dict = {
+            'state': True,
+            'nvidia_version': '',
+            'nvidia_count': 0,
+            'gpus': []
+        }
+        try:
+            nvmlInit()
+            nvidia_dict['nvidia_version'] = nvmlSystemGetDriverVersion()
+            nvidia_dict['nvidia_count'] = nvmlDeviceGetCount()
+            for i in range(nvidia_dict['nvidia_count']):
+                handle = nvmlDeviceGetHandleByIndex(i)
+                memory_info = nvmlDeviceGetMemoryInfo(handle)
+                gpu = {
+                    'gpu_name': nvmlDeviceGetName(handle),
+                    'total': memory_info.total,
+                    'free': memory_info.free,
+                    'used': memory_info.used,
+                    'temperature': f'{nvmlDeviceGetTemperature(handle, 0)}℃',
+                    'powerStatus': nvmlDeviceGetPowerState(handle)
+                }
+                nvidia_dict['gpus'].append(gpu)
+        except NVMLError as _:  # noqa
+            nvidia_dict['state'] = False
+        except Exception as _:  # noqa
+            nvidia_dict['state'] = False
+        finally:
+            try:
+                nvmlShutdown()
+            except:  # noqa
+                pass
+        return nvidia_dict
+
+    @classmethod
+    def mem_monitor(cls):
+        info = cls.nvidia_info()
+        max_mem = 0
+        mem_start = 0
+        cls.device_count.value = len(info['gpus'])
+        for used_total in info['gpus']:
+            mem_start += used_total['used']
+        while True:
+            info = cls.nvidia_info()
+            used = 0
+            for used_total in info['gpus']:
+                used += used_total['used']
+            if used > max_mem:
+                max_mem = used
+                cls.max_mem.value = (max_mem - mem_start) / (1 << 30)
+
+    @classmethod
+    def start(cls):
+        cls._running = True
+        from multiprocessing import Process
+        cls.proc = Process(target=cls.mem_monitor)
+        cls.proc.start()
+
+    @classmethod
+    def terminate(cls) -> float:
+        """Terminate the subprocess and return maximum memory."""
+        cls.proc.kill()
+        return cls.max_mem.value
+
+
+@dataclass
+class ProfileResult:
+    model_name: str
+    batch: int
+    prompt_tokens: int
+    completion_tokens: int
+    throughput_per_proc: float
+    throughput_per_node: float
+    mem_per_proc: float
+    mem_per_gpu: float
+    mem_per_node: float
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Regression Test')
+    parser.add_argument('--model-path',
+                        type=str,
+                        help='benchmark test model path')
+    parser.add_argument('--concurrency',
+                        nargs='+',
+                        type=int,
+                        help='how many requests launched concurrently',
+                        default=[1, 8, 16, 32])
+    parser.add_argument(
+        '--prompt-tokens',
+        nargs='+',
+        type=int,
+        help='how many requests launched concurrently. One-to-one'
+        'correspondence with completion-tokens',
+        default=[64, 512, 512, 1024])
+    parser.add_argument('--completion-tokens',
+                        nargs='+',
+                        type=int,
+                        help='how many tokens to be generated. One-to-one'
+                        'correspondence with prompt-tokens',
+                        default=[512, 512, 1024, 1024])
+    parser.add_argument('--tp', type=int, help='Tensor parallel', default=1)
+    parser.add_argument('--dst-csv',
+                        type=str,
+                        help='Where to save the result.',
+                        default='profile_generation.csv')
+    parser.add_argument('--log-level',
+                        help='set log level',
+                        default='INFO',
+                        choices=list(logging._nameToLevel.keys()))
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    os.environ['TM_LOG_LEVEL'] = args.log_level
+    results: List[ProfileResult] = []
+    for batch in tqdm(args.concurrency):
+        for prompt_tokens, completion_tokens in tqdm(
+                zip(args.prompt_tokens, args.completion_tokens)):
+            MemoryMonitor.start()
+            from functools import partial
+            from multiprocessing import Pool
+            profile_target = partial(profile_throughput,
+                                     concurrency=batch,
+                                     input_seqlen=prompt_tokens,
+                                     output_seqlen=completion_tokens,
+                                     tp=args.tp)
+            output = Pool(1).map(profile_target, (args.model_path, ))
+            model_name, throughput_per_proc, tp = output[0]
+            time.sleep(5)  # wait a while for releasing GPU mem
+            memory = MemoryMonitor.terminate()
+            device_count = MemoryMonitor.device_count.value
+            results.append(
+                ProfileResult(model_name=model_name,
+                              batch=batch,
+                              prompt_tokens=prompt_tokens,
+                              completion_tokens=completion_tokens,
+                              throughput_per_proc=throughput_per_proc,
+                              throughput_per_node=throughput_per_proc / tp *
+                              device_count,
+                              mem_per_proc=memory,
+                              mem_per_gpu=memory / tp,
+                              mem_per_node=memory / tp * device_count))
+    with open(args.dst_csv, 'w') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow([
+            'batch', 'prompt_tokens', 'completion_tokens',
+            'throughput_per_proc(token/s)', 'throughput_per_node(token/s)',
+            'mem_per_proc(GB)', 'mem_per_gpu(GB)', 'mem_per_node(GB)'
+        ])
+        for re in results:
+            writer.writerow([
+                re.batch, re.prompt_tokens, re.completion_tokens,
+                f'{re.throughput_per_proc:.2f}',
+                f'{re.throughput_per_node:.2f}', f'{re.mem_per_proc:.2f}',
+                f'{re.mem_per_gpu:.2f}', f'{re.mem_per_node:.2f}'
+            ])
 
 
 if __name__ == '__main__':
-    fire.Fire(main)
+    main()
diff --git a/builder/manywheel/build_wheel.sh b/builder/manywheel/build_wheel.sh
@@ -6,8 +6,6 @@ PLAT_NAME="$2"
 DOCKER_TAG="$3"
 OUTPUT_DIR="$4"
 
-GIT_REMOTE=${GIT_REMOTE:-https://github.com/InternLM/lmdeploy}
-GIT_BRANCH=${GIT_BRANCH:-main}
 DOCKER_IMAGE="openmmlab/lmdeploy-builder:${DOCKER_TAG}"
 export USERID=$(id -u)
 export GROUPID=$(id -g)
@@ -20,8 +18,7 @@ docker run --rm -it \
     --env PLAT_NAME="${PLAT_NAME}" \
     --env USERID="${USERID}" \
     --env GROUPID="${GROUPID}" \
-    --env GIT_BRANCH="${GIT_BRANCH}" \
-    --env GIT_REMOTE="${GIT_REMOTE}" \
+    --volume "$(pwd)/../../:/lmdeploy" \
     --volume "$(pwd)/${OUTPUT_DIR}:/lmdeploy_build" \
     --volume "$(pwd)/entrypoint_build.sh:/entrypoint_build.sh" \
     --entrypoint /entrypoint_build.sh \

diff --git a/builder/manywheel/entrypoint_build.sh b/builder/manywheel/entrypoint_build.sh
@@ -7,12 +7,9 @@ export USERID=${USERID}
 export GROUPID=${GROUPID}
 export CUDAVER=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\).*$/\1/p')
 
-export GIT_REMOTE=${GIT_REMOTE:-https://github.com/InternLM/lmdeploy}
-export GIT_BRANCH=${GIT_BRANCH:-main}
 source /opt/conda/bin/activate
 conda activate $PYTHON_VERSION
 
-git clone -b ${GIT_BRANCH} ${GIT_REMOTE}
 cd lmdeploy
 mkdir build && cd build
 bash ../generate.sh

diff --git a/docs/en/build.md b/docs/en/build.md
@@ -18,4 +18,5 @@
   ```shell
   mkdir build && cd build
   sh ../generate.sh
+  make -j$(nproc) && make install
   ```
diff --git a/docs/zh_cn/build.md b/docs/zh_cn/build.md
@@ -18,4 +18,5 @@
   ```shell
   mkdir build && cd build
   sh ../generate.sh
+  make -j$(nproc) && make install
   ```