mps support (wip) #1

Open · wants to merge 2 commits into base: main
59 changes: 27 additions & 32 deletions kurtis/inference.py
@@ -1,5 +1,3 @@
import torch

from .utils import get_device


@@ -22,36 +20,33 @@ def inference_model(
     """

     response = None
-    try:
-        device = get_device()
-        # Ensure the model is on the correct device
-        model.eval()
-        messages = [
-            {
-                "role": "system",
-                "content": config.QA_INSTRUCTION,
-            },
-            {"role": "user", "content": input_text},
-        ]
-        input_text = tokenizer.apply_chat_template(messages, tokenize=False)
-        inputs = tokenizer.encode(f"{input_text}assistant\n", return_tensors="pt").to(
-            device
-        )
-        with torch.no_grad():
-            outputs = model.generate(
-                inputs,
-                max_new_tokens=512,
-                temperature=0.4,
-                top_p=0.9,
-                top_k=50,
-                repetition_penalty=1.0,
-                do_sample=True,
-                eos_token_id=tokenizer.eos_token_id,
-            )
-        new_tokens = outputs[0][inputs.shape[-1] :]
-        response = tokenizer.decode(new_tokens, skip_special_tokens=True)
-    except Exception as e:
-        response = f"An error occurred during inference: {str(e)}"
+    device = get_device()
+    # Ensure the model is on the correct device
+    model.to(device)
+    model.eval()
+    messages = [
+        {
+            "role": "system",
+            "content": config.QA_INSTRUCTION,
+        },
+        {"role": "user", "content": input_text},
+    ]
+    input_text = tokenizer.apply_chat_template(messages, tokenize=False)
+    inputs = tokenizer.encode(f"{input_text}assistant\n", return_tensors="pt").to(
+        device
+    )
+    outputs = model.generate(
+        inputs,
+        max_new_tokens=512,
+        temperature=0.4,
+        top_p=0.9,
+        top_k=50,
+        repetition_penalty=1.0,
+        do_sample=True,
+        eos_token_id=tokenizer.eos_token_id,
+    )
+    new_tokens = outputs[0][inputs.shape[-1] :]
+    response = tokenizer.decode(new_tokens, skip_special_tokens=True)

     fallback_response = "I'm sorry, I don't have an answer for that."
     return response.strip() if response else fallback_response
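
Two notes on this hunk: the new `model.to(device)` call is what actually moves the weights onto the selected backend (the old code had only the comment), and dropping the explicit `torch.no_grad()` should be behavior-preserving, since `generate()` already runs without gradient tracking internally in recent transformers releases. A minimal sketch to confirm tensors land on the backend the updated helper picks:

```python
import torch

from kurtis.utils import get_device

# Sanity check: "mps" on Apple Silicon, "cuda" with an NVIDIA GPU,
# otherwise "cpu" -- and tensors created with device= land there.
device = get_device()
x = torch.ones(2, 2, device=device)
print(device, x.device)
```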
2 changes: 1 addition & 1 deletion kurtis/model.py
@@ -14,7 +14,7 @@
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
     bnb_4bit_use_double_quant=True,
-    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_compute_dtype=torch_dtype,
 )
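
Switching the compute dtype from a hard-coded `torch.bfloat16` to `torch_dtype` lets the 4-bit compute path follow the active backend. The diff doesn't show how `torch_dtype` is chosen; one plausible per-backend selection looks like this (hypothetical `pick_torch_dtype` helper, for illustration only):

```python
import torch

def pick_torch_dtype(device: str) -> torch.dtype:
    # Hypothetical: prefer bfloat16 on CUDA hardware that supports it,
    # float16 on MPS, and float32 on CPU. The real selection lives in
    # kurtis/model.py and is not part of this diff.
    if device == "cuda" and torch.cuda.is_bf16_supported():
        return torch.bfloat16
    if device == "mps":
        return torch.float16
    return torch.float32
```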


2 changes: 0 additions & 2 deletions kurtis/train.py
@@ -77,8 +77,6 @@ def formatting_prompts_func(example):
         optim="paged_adamw_8bit",
         run_name=model_output,
         save_strategy="epoch",
-        use_mps_device=False,
-        no_cuda=False,
     ),
     peft_config=config.LORA_CONFIG,
     formatting_func=formatting_prompts_func,
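
Removing these flags is consistent with recent transformers releases, which resolve the accelerator automatically and deprecate manual switches such as `use_mps_device` and `no_cuda`. A quick way to see what `TrainingArguments` resolves to on a given machine (sketch; the `output_dir` value is illustrative):

```python
from transformers import TrainingArguments

# Device selection is automatic in recent transformers releases;
# no use_mps_device / no_cuda flags are needed.
args = TrainingArguments(output_dir="out")  # "out" is illustrative
print(args.device)  # e.g. device(type='mps') on Apple Silicon
```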
16 changes: 7 additions & 9 deletions kurtis/utils.py
@@ -47,16 +47,14 @@ def load_config(config_module="kurtis.config.default"):


 # https://github.com/pytorch/pytorch/issues/83015
-# def get_device():
-#     return (
-#         "mps"
-#         if torch.backends.mps.is_available()
-#         else "cuda" if torch.cuda.is_available() else "cpu"
-#     )
-
-
 def get_device():
-    return "cuda" if torch.cuda.is_available() else "cpu"
+    return (
+        "mps"
+        if torch.backends.mps.is_available()
+        else "cuda"
+        if torch.cuda.is_available()
+        else "cpu"
+    )


 def free_unused_memory():
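
The linked PyTorch issue is why MPS support was previously commented out; the new helper simply trusts the runtime check. When debugging device selection it can help to distinguish whether the binary was built with MPS support from whether MPS is usable right now (a small sketch):

```python
import torch

# is_built(): this PyTorch binary was compiled with MPS support.
# is_available(): macOS version and hardware can actually use MPS now.
print(torch.backends.mps.is_built(), torch.backends.mps.is_available())
```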
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -31,10 +31,12 @@ dependencies = [
     "tqdm>=4.67.0",
     "bitsandbytes>=0.44.1 ; platform_machine == 'x86_64'",
     "setuptools>=75.3.0",
+    "bitsandbytes ; sys_platform == 'darwin' and platform_machine == 'arm64'",
 ]

 [tool.uv.sources]
 torch = { index = "pytorch-cu124", marker = "platform_machine == 'x86_64'"}
+bitsandbytes = { git = "https://github.com/bitsandbytes-foundation/bitsandbytes.git", branch = "multi-backend-refactor", marker = "sys_platform == 'darwin' and platform_machine == 'arm64'" }

 [[tool.uv.index]]
 name = "pytorch-cu124"
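
With the marker-gated dependency and the `[tool.uv.sources]` entry, Apple Silicon machines resolve bitsandbytes from the multi-backend git branch while x86_64 keeps the PyPI wheel. A quick post-install sanity check (sketch; `__version__` formatting varies across bitsandbytes builds):

```python
import platform

# On macOS arm64 this should import the multi-backend build pulled
# from the git branch declared in [tool.uv.sources]; on x86_64 it is
# the regular PyPI wheel.
if platform.system() == "Darwin" and platform.machine() == "arm64":
    import bitsandbytes as bnb
    print("bitsandbytes", bnb.__version__)
```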
41 changes: 34 additions & 7 deletions uv.lock

Some generated files are not rendered by default.