# model_text.py
# import torch
import json
import settings
from model import *
# from typing import Any, Dict, List, Optional, cast
from llama_index_log_handler import callback_manager
from llama_index.core.llms import ChatMessage
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from typing import List
from IPython.display import Markdown, display

# TODO: Nim
# A temperature of 0 means deterministic output, while higher values make the output more random
# if settings.TEMPERATURE == 0.0:
#     torch.random.manual_seed(0)
if settings.TEXT_MODEL_BACKEND == "ollama":
    llm = Ollama(
        model=settings.TEXT_MODEL,
        request_timeout=1200.0,
        temperature=settings.TEMPERATURE,
        callback_manager=callback_manager,
    )
    llm_json = Ollama(
        model=settings.TEXT_MODEL,
        request_timeout=1200.0,
        temperature=settings.TEMPERATURE,
        callback_manager=callback_manager,
        json_mode=True,
    )
    embedding = OllamaEmbedding(
        model_name=settings.TEXT_MODEL,
        # base_url="http://localhost:11434",
        ollama_additional_kwargs={"mirostat": 2},
    )
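    # Quick embedding sanity check (illustrative sketch, not part of the pipeline;
    # `get_text_embedding` is the standard LlamaIndex embedding call). Uncomment to try:
    # if settings.DEBUG:
    #     print(len(embedding.get_text_embedding("hello world")))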
elif settings.TEXT_MODEL_BACKEND == "nim":
    import os
    from llama_index.llms.nvidia import NVIDIA
    from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
    from llama_index.core.node_parser import SentenceSplitter

    # Configure the NVIDIA LLM to connect to your local NIM server
    llm = NVIDIA(
        model=settings.TEXT_MODEL,
        base_url="http://localhost:8000",  # Adjust if your NIM server uses a different port
        # base_url="https://integrate.api.nvidia.com/v1",
        api_key=os.environ["NGC_API_KEY"],
    )
    # Set the LLM as the default for LlamaIndex
    Settings.llm = llm
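
# Connection smoke test (illustrative sketch, not part of the pipeline; with either
# backend configured above, `llm.chat` from the LlamaIndex LLM interface returns a
# ChatResponse whose text is on .message.content). Uncomment to try:
# if settings.DEBUG:
#     _pong = llm.chat([ChatMessage(role="user", content="Reply with the single word: pong")])
#     print(_pong.message.content)
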
# from llama_index.llm_predictor import LLMPredictor
# from llama_index.prompts.prompts import Prompt
# from llama_index.response.schema import RESPONSE_TYPE
# from typing import Any, Optional
# from nemoguardrails import LLMRails, RailsConfig
def display_messages(messages: List[ChatMessage]):
    """Render a list of chat messages as Markdown when DEBUG is enabled."""
    if not settings.DEBUG:
        return
    output = "---\n\n### Messages\n\n"
    for m in messages:
        output += f"#### {m.role.title()}:\n\n"
        if m.content.startswith("{"):
            # Pretty-print JSON message bodies; fall back to the raw text if parsing fails
            try:
                json_content = json.loads(m.content)
                formatted_json = json.dumps(json_content, indent=4)
                output += f"```json\n{formatted_json}\n```\n\n"
            except json.JSONDecodeError:
                output += f"```json\n{m.content}\n```\n\n"
        else:
            # Quote plain-text messages as a Markdown blockquote
            output += "\n".join(["> " + line for line in m.content.split("\n")])
            output += "\n\n"
    output += "\n---\n\n"
    display(Markdown(output))
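
# Example call (illustrative; assumes an IPython/Jupyter environment so that
# display(Markdown(...)) renders, and that settings.DEBUG is True):
# display_messages([
#     ChatMessage(role="system", content="You are a concise assistant."),
#     ChatMessage(role="user", content='{"question": "What does this module do?"}'),
# ])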
def stream_llm_response(response, progress=None):
    """Print a streaming LLM response as it arrives and return the full text."""
    line_len = 0
    for r in response:
        if progress:
            progress.value += 1
        print(r.delta, end="")
        line_len += len(r.delta)
        # Wrap the console output at roughly 120 characters
        if line_len > 120:
            print()
            line_len = 0
        elif "\n" in r.delta:
            line_len = 0
    # Each streamed chunk carries the accumulated message, so the last one holds the full response
    return r.message.content
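
# Example call (illustrative; `stream_chat` is the standard LlamaIndex streaming
# entry point on the `llm` configured above and yields the deltas consumed here):
# reply = stream_llm_response(
#     llm.stream_chat([ChatMessage(role="user", content="Summarise this project.")])
# )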
# # This is for llama
# generator = pipeline(model=settings.TEXT_MODEL, device_map=settings.DEVICE_MAP, torch_dtype=torch.bfloat16)
# def run_llm(messages: list[dict[str, str]], max_new_tokens: int = 500, temperature: float = settings.TEMPERATURE) -> str:
#     generation = generator(
#         messages,
#         do_sample=temperature > 0,
#         temperature=1.0,
#         top_p=settings.TOP_P if temperature > 0 else 1,
#         max_new_tokens=max_new_tokens,
#         pad_token_id=generator.tokenizer.eos_token_id,
#     )
#     return generation[0]['generated_text'][-1]['content'].strip()
def free_memory():
    """Free up memory after running the text model."""
    pass
    # print("Max mem allocated (GB) while doing text model:", torch.cuda.max_memory_allocated() / (1024**3))
    # from numba import cuda
    # device = cuda.get_current_device()
    # device.reset()
    # # Step 1: Delete all references to the models and pipelines
    # try:
    #     del generator
    # except:
    #     pass
    # # Step 2: Run garbage collection to free up Python memory references
    # gc.collect()
    # # Step 3: Empty the CUDA cache to free memory back to PyTorch
    # torch.cuda.empty_cache()
    # # Step 4 (Optional): Synchronize CUDA to ensure all operations are complete
    # torch.cuda.synchronize()
# TODO: Check for Phi-3.5-mini-instruct vs llama
# # Load the model
# model_text_args = {
#     "device_map": settings.DEVICE_MAP,
#     "torch_dtype": "auto",
#     "trust_remote_code": True,
#     "attn_implementation": settings.ATTN_IMPLEMENTATION,
# }
# model_text = AutoModelForCausalLM.from_pretrained(settings.TEXT_MODEL, **model_text_args)
# tokenizer_text = AutoTokenizer.from_pretrained(settings.TEXT_MODEL)
# pipe_text = pipeline(
#     "text-generation",
#     model=model_text,
#     tokenizer=tokenizer_text,
# )

# Define the function to run text generation
# def run_llm(messages: list[dict[str, str]], max_new_tokens: int = 500, temperature: float = settings.TEMPERATURE) -> str:
#     generation_args = {
#         "max_new_tokens": max_new_tokens,
#         "return_full_text": False,
#         "temperature": temperature,
#         "do_sample": True,
#         "top_p": settings.TOP_P,
#         "pad_token_id": pipe_text.tokenizer.eos_token_id,
#     }
#     if temperature == 0.0:
#         generation_args["do_sample"] = False
#         generation_args["temperature"] = None
#         generation_args["top_p"] = None
#     output = pipe_text(messages, **generation_args)
#     return output[0]['generated_text'].strip()