Support Ascend NPU
zhangsibo1129 committed Sep 16, 2023
1 parent aa153d5 commit 5419ac1
Showing 8 changed files with 61 additions and 2 deletions.
41 changes: 41 additions & 0 deletions README.md
@@ -157,6 +157,18 @@ python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.3 --device xpu
```
Vicuna-7B can run on an Intel Arc A770 16GB.

#### Ascend NPU (Huawei AI Processor)
Install the [Ascend PyTorch Adapter](https://github.com/Ascend/pytorch). Set the CANN environment variables:
```
source /usr/local/Ascend/ascend-toolkit/set_env.sh
```

Use `--device npu` to enable NPU acceleration.
```
python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.3 --device npu
```
Vicuna-7B/13B can run on a single Ascend 910B NPU (60GB).
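
If the device is not picked up, a quick sanity check is to ask PyTorch whether the NPU backend is available (a minimal sketch, assuming `torch_npu` is installed, exposes the usual `torch.npu` helpers, and the CANN environment above has been sourced):
```
# Expected to print True once the driver, CANN toolkit, and torch_npu are set up
python3 -c "import torch; import torch_npu; print(torch.npu.is_available())"
```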

#### Not Enough Memory
If you do not have enough memory, you can enable 8-bit compression by adding `--load-8bit` to the commands above.
This can reduce memory usage by around half, with slightly degraded model quality.
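
For example, 8-bit compression combines with the NPU setup above (the same flag works with the other devices as well):
```
python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.3 --device npu --load-8bit
```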
@@ -301,6 +313,35 @@ Tips:
- If you run into out-of-memory errors due to "FSDP Warning: When using FSDP, it is efficient and recommended... ", see the solutions [here](https://github.com/huggingface/transformers/issues/24724#issuecomment-1645189539).
- If you run into out-of-memory errors during model saving, see the solutions [here](https://github.com/pytorch/pytorch/issues/98823).

### Fine-tuning Vicuna-7B with Local NPUs

You can use the following command to train Vicuna-7B with 8 x Ascend 910B NPUs (60GB each). Use `--nproc_per_node` to specify the number of NPUs.
```bash
torchrun --nproc_per_node=8 --master_port=20001 fastchat/train/train.py \
    --model_name_or_path ~/vicuna-7b-v1.5-16k \
    --data_path data/dummy_conversation.json \
    --fp16 True \
    --output_dir output_vicuna \
    --num_train_epochs 3 \
    --per_device_train_batch_size 8 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 1200 \
    --save_total_limit 10 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --fsdp "full_shard auto_wrap" \
    --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --lazy_preprocess True
```
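
Before launching `torchrun`, it is worth confirming that the CANN environment is loaded in the current shell and that all eight NPUs are visible (a minimal pre-flight sketch, assuming `torch_npu` is installed and exposes `torch.npu.device_count`):
```bash
# Load the CANN environment (toolkit path from the setup section above)
# and confirm that torch_npu reports 8 devices before starting distributed training.
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python3 -c "import torch; import torch_npu; print(torch.npu.device_count())"
```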

### Other models and LoRA support
More instructions to train other models (e.g., FastChat-T5) and use LoRA are in [docs/training.md](docs/training.md).

2 changes: 2 additions & 0 deletions fastchat/model/compression.py
@@ -193,6 +193,8 @@ def load_compress_model(model_path, device, torch_dtype, use_fast, revision="mai
torch.cuda.empty_cache()
if device == "xpu":
torch.xpu.empty_cache()
if device == "npu":
torch.npu.empty_cache()

for name in model.state_dict():
if name not in linear_weights:
10 changes: 9 additions & 1 deletion fastchat/model/model_adapter.py
@@ -206,6 +206,13 @@ def load_model(
warnings.warn(
"Intel Extension for PyTorch is not installed, but is required for xpu inference."
)
elif device == "npu":
kwargs = {"torch_dtype": torch.float16}
# Try to load torch_npu; although it looks unused, importing it adds Ascend NPU support to torch
try:
import torch_npu
except ImportError:
warnings.warn("Ascend Extension for PyTorch is not installed.")
else:
raise ValueError(f"Invalid device: {device}")

@@ -288,6 +295,7 @@ def load_model(
if (device == "cuda" and num_gpus == 1 and not cpu_offloading) or device in (
"mps",
"xpu",
"npu",
):
model.to(device)

@@ -369,7 +377,7 @@ def add_model_args(parser):
parser.add_argument(
"--device",
type=str,
choices=["cpu", "cuda", "mps", "xpu"],
choices=["cpu", "cuda", "mps", "xpu", "npu"],
default="cuda",
help="The device type",
)
2 changes: 2 additions & 0 deletions fastchat/model/model_codet5p.py
@@ -104,3 +104,5 @@ def __call__(
torch.cuda.empty_cache()
if device == "xpu":
torch.xpu.empty_cache()
if device == "npu":
torch.npu.empty_cache()
2 changes: 2 additions & 0 deletions fastchat/model/model_falcon.py
@@ -136,3 +136,5 @@ def generate_stream_falcon(
torch.cuda.empty_cache()
if device == "xpu":
torch.xpu.empty_cache()
if device == "npu":
torch.npu.empty_cache()
2 changes: 2 additions & 0 deletions fastchat/serve/inference.py
@@ -263,6 +263,8 @@ def generate_stream(
torch.cuda.empty_cache()
if device == "xpu":
torch.xpu.empty_cache()
if device == "npu":
torch.npu.empty_cache()


class ChatIO(abc.ABC):
2 changes: 1 addition & 1 deletion fastchat/serve/launch_all_serve.py
@@ -66,7 +66,7 @@
parser.add_argument(
"--device",
type=str,
choices=["cpu", "cuda", "mps", "xpu"],
choices=["cpu", "cuda", "mps", "xpu", "npu"],
default="cuda",
help="The device type",
)
2 changes: 2 additions & 0 deletions fastchat/serve/model_worker.py
@@ -370,6 +370,8 @@ def get_embeddings(self, params):
torch.cuda.empty_cache()
if self.device == "xpu":
torch.xpu.empty_cache()
if self.device == "npu":
torch.npu.empty_cache()
except torch.cuda.OutOfMemoryError as e:
ret = {
"text": f"{SERVER_ERROR_MSG}\n\n({e})",
