test-roberta.py
from transformers import pipeline
import transformers
import deepspeed
import torch
import os
from transformers.models.roberta.modeling_roberta import RobertaLayer
local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '4'))
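# LOCAL_RANK and WORLD_SIZE are set by the DeepSpeed launcher, e.g.
# (illustrative command; adjust --num_gpus to the GPUs available):
#   deepspeed --num_gpus 4 test-roberta.py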
pipe = pipeline('fill-mask', model="roberta-large", device=local_rank)
# The injection_policy tells DeepSpeed two things:
# 1. which layer module to inject Tensor-Parallelism into (RobertaLayer here)
# 2. the names of the linear layers whose outputs must be all-reduced across
#    model-parallel ranks (here the layer's output projection, 'output.dense')
pipe.model = deepspeed.init_inference(
    pipe.model,
    mp_size=world_size,
    dtype=torch.float,
    injection_policy={RobertaLayer: ('output.dense', )}
)
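# Point the pipeline at this process's GPU so inputs are moved to the right device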
pipe.device = torch.device(f'cuda:{local_rank}')
output = pipe("The invention of the <mask> revolutionized the way we communicate with each other.")
if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
    print(output)