-
Notifications
You must be signed in to change notification settings - Fork 59
/
encode_lang_batch.py
76 lines (62 loc) · 2.6 KB
/
encode_lang_batch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
import json
import torch
import yaml
from tqdm import tqdm
from models.multimodal_encoder.t5_encoder import T5Embedder
# Index of the CUDA device to run the encoder on (used as f"cuda:{GPU}").
GPU = 0
# Model identifier handed to T5Embedder as `from_pretrained`.
MODEL_PATH = "google/t5-v1_1-xxl"
# YAML config file; config["dataset"]["tokenizer_max_length"] is read from it.
CONFIG_PATH = "configs/base.yaml"
# Modify the TARGET_DIR to your dataset path.
# The script scans TARGET_DIR/<sub_dir>/<task_dir>/ and expects each task
# directory to contain 'expanded_instruction_gpt-4-turbo.json'.
TARGET_DIR = "data/datasets/agilex/tfrecords/"
# Note: if your GPU VRAM is less than 24GB,
# it is recommended to enable offloading by specifying an offload directory.
OFFLOAD_DIR = None # Specify your offload directory here, ensuring the directory exists.
def _collect_task_paths(target_dir):
    """Return all task directories found at target_dir/<sub_dir>/<task_dir>."""
    task_paths = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # processing order (and therefore the progress bar) reproducible.
    for sub_dir in sorted(os.listdir(target_dir)):
        middle_dir = os.path.join(target_dir, sub_dir)
        if not os.path.isdir(middle_dir):
            continue
        for task_dir in sorted(os.listdir(middle_dir)):
            task_path = os.path.join(middle_dir, task_dir)
            if os.path.isdir(task_path):
                task_paths.append(task_path)
    return task_paths


def _load_instructions(task_path):
    """Return a task's instructions: original first, then simplified, then expanded."""
    instr_file = os.path.join(task_path, 'expanded_instruction_gpt-4-turbo.json')
    # JSON is UTF-8 by specification; do not rely on the locale's default
    # encoding, which may mis-decode non-ASCII instructions.
    with open(instr_file, 'r', encoding='utf-8') as f_instr:
        instruction_dict = json.load(f_instr)
    return (
        [instruction_dict['instruction']]
        + instruction_dict['simplified_instruction']
        + instruction_dict['expanded_instruction']
    )


def main():
    """Pre-compute language embeddings for every task under TARGET_DIR.

    Loads the YAML config at CONFIG_PATH, builds a T5Embedder from MODEL_PATH
    on ``cuda:GPU`` (passing OFFLOAD_DIR as its offload folder), then for each
    task directory encodes all of its instructions in a single batch and
    writes one ``lang_embed_{i}.pt`` file per instruction into that directory,
    where ``i`` is the instruction's position in the list (0 is the original
    instruction).

    Raises:
        FileNotFoundError: If the config file or a task's instruction JSON
            does not exist.
        KeyError: If an expected key is missing from the config or from an
            instruction JSON.
    """
    with open(CONFIG_PATH, "r", encoding="utf-8") as fp:
        config = yaml.safe_load(fp)

    device = torch.device(f"cuda:{GPU}")
    text_embedder = T5Embedder(
        from_pretrained=MODEL_PATH,
        model_max_length=config["dataset"]["tokenizer_max_length"],
        device=device,
        use_offload_folder=OFFLOAD_DIR
    )
    tokenizer, text_encoder = text_embedder.tokenizer, text_embedder.model

    # For each task, encode the instructions
    for task_path in tqdm(_collect_task_paths(TARGET_DIR)):
        instructions = _load_instructions(task_path)

        # Tokenize the whole task as one batch, padded to its longest entry.
        # NOTE(review): truncation=True presumably caps at the
        # model_max_length given to T5Embedder — confirm in T5Embedder.
        tokenized_res = tokenizer(
            instructions, return_tensors="pt",
            padding="longest",
            truncation=True
        )
        tokens = tokenized_res["input_ids"].to(device)
        attn_mask = tokenized_res["attention_mask"].to(device)

        with torch.no_grad():
            text_embeds = text_encoder(
                input_ids=tokens,
                attention_mask=attn_mask
            )["last_hidden_state"].detach().cpu()
        attn_mask = attn_mask.cpu().bool()

        # Save the embeddings for training use. Indexing with the boolean
        # mask keeps only the positions the tokenizer marked as attended,
        # so each file holds an unpadded sequence of embeddings.
        for i, (embeds, mask) in enumerate(zip(text_embeds, attn_mask)):
            save_path = os.path.join(task_path, f"lang_embed_{i}.pt")
            torch.save(embeds[mask], save_path)
# Run the batch encoding only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()