From 870415262cf12f7144d93359317acb24629e6dfd Mon Sep 17 00:00:00 2001
From: HDCharles <charlesdavidhernandez@gmail.com>
Date: Tue, 19 Mar 2024 19:53:29 -0700
Subject: [PATCH 1/3] Summary: redoing
 https://github.com/pytorch-labs/gpt-fast/commit/5bf70c114088a5133299609694a8c17b37de69c4
 in a way that doesn't get reverted

Test Plan: sh run.sh

Reviewers:

Subscribers:

Tasks:

Tags:

[ghstack-poisoned]
---
 GPTQ.py     |  4 ++--
 model.py    |  6 ++----
 quantize.py | 32 +++++++++++++++-----------------
 run.sh      | 17 +++++++++++++++++
 4 files changed, 36 insertions(+), 23 deletions(-)
 create mode 100644 run.sh

diff --git a/GPTQ.py b/GPTQ.py
index 806ffad..e1279bd 100644
--- a/GPTQ.py
+++ b/GPTQ.py
@@ -150,9 +150,9 @@ def __init__(
         }
 
         # trace model for one input
-        one_input = [multi.values[0] for multi in inputs]
+        one_input = [multi.values[0].cpu() for multi in inputs]
         exported_model = torch._dynamo.export(
-            model, aten_graph=True, pre_dispatch=True, tracing_mode="fake"
+            model.cpu(), aten_graph=True, pre_dispatch=True, tracing_mode="fake"
         )(*one_input)
         super().__init__(exported_model.graph_module)
         self.new_state_dict = model.state_dict()
diff --git a/model.py b/model.py
index dbf24e5..e70e87a 100644
--- a/model.py
+++ b/model.py
@@ -78,10 +78,8 @@ def update(self, input_pos, k_val, v_val):
         # input_pos: [S], k_val: [B, H, S, D]
         assert input_pos.shape[0] == k_val.shape[2]
 
-        k_out = self.k_cache
-        v_out = self.v_cache
-        k_out[:, :, input_pos] = k_val
-        v_out[:, :, input_pos] = v_val
+        k_out = torch.ops.aten.index_put_(self.k_cache, [None, None, input_pos], k_val)  # in-place write, same effect as k_cache[:, :, input_pos] = k_val
+        v_out = torch.ops.aten.index_put_(self.v_cache, [None, None, input_pos], v_val)  # in-place write, same effect as v_cache[:, :, input_pos] = v_val
 
         return k_out, v_out
 
diff --git a/quantize.py b/quantize.py
index db47775..a9b3f79 100644
--- a/quantize.py
+++ b/quantize.py
@@ -365,6 +365,9 @@ def prepare_int4_weight_and_scales_and_zeros(weight_bf16, groupsize, inner_k_til
     weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(weight_int32, inner_k_tiles)
     return weight_int4pack, scales_and_zeros
 
+def _calc_padded_size(k, groupsize=1, innner_k_tiles=1):  # NOTE(review): "innner_k_tiles" is misspelled (cf. inner_k_tiles elsewhere); groupsize and innner_k_tiles are unused here
+    from model import find_multiple  # function-scope import of the project helper
+    return find_multiple(k, 1024)  # presumably rounds k up to a multiple of 1024 -- confirm against model.find_multiple
 
 def linear_forward_int4(x, weight_int4pack, scales_and_zeros, out_features, groupsize):
     origin_x_size = x.size()
@@ -378,29 +381,24 @@ def linear_forward_int4(x, weight_int4pack, scales_and_zeros, out_features, grou
 def _check_linear_int4_k(k, groupsize = 1, inner_k_tiles = 1):
     return k % groupsize == 0 and k % (inner_k_tiles * 16) == 0
 
-def replace_linear_int4(module, groupsize, inner_k_tiles, padding, use_cuda):
+def replace_linear_int4(module, groupsize, inner_k_tiles, padding_allowed, use_cuda):
     for name, child in module.named_children():
         if isinstance(child, nn.Linear):
-            if _check_linear_int4_k(child.in_features, groupsize, inner_k_tiles):
+            if _check_linear_int4_k(child.in_features, groupsize, inner_k_tiles) or padding_allowed:
                 setattr(module, name, WeightOnlyInt4Linear(
                     child.in_features, child.out_features, bias=False,
-                    groupsize=groupsize, inner_k_tiles=inner_k_tiles, padding=False, use_cuda=use_cuda
-                ))
-            elif padding:
-                setattr(module, name, WeightOnlyInt4Linear(
-                    child.in_features, child.out_features, bias=False,
-                    groupsize=groupsize, inner_k_tiles=inner_k_tiles, padding=True, use_cuda=use_cuda
+                    groupsize=groupsize, inner_k_tiles=inner_k_tiles, use_cuda=use_cuda
                 ))
         else:
-            replace_linear_int4(child, groupsize, inner_k_tiles, padding, use_cuda)
+            replace_linear_int4(child, groupsize, inner_k_tiles, padding_allowed, use_cuda)
 
 
 class WeightOnlyInt4QuantHandler:
-    def __init__(self, mod, groupsize=128, inner_k_tiles=8, padding=True):
+    def __init__(self, mod, groupsize=128, inner_k_tiles=8, padding_allowed=True):
         self.mod = mod
         self.groupsize = groupsize
         self.inner_k_tiles = inner_k_tiles
-        self.padding = padding
+        self.padding_allowed = padding_allowed
         assert groupsize in [32, 64, 128, 256]
         assert inner_k_tiles in [2, 4, 8]
 
@@ -417,7 +415,7 @@ def create_quantized_state_dict(self):
 
                 weight = mod.weight.data
                 if not _check_linear_int4_k(in_features, self.groupsize, self.inner_k_tiles):
-                    if self.padding:
+                    if self.padding_allowed:
                         from model import find_multiple
                         import torch.nn.functional as F
                         print(f"warning: {fqn} is padded to satisfy in_features % 1024 == 0")
@@ -436,7 +434,7 @@ def create_quantized_state_dict(self):
         return cur_state_dict
 
     def convert_for_runtime(self, use_cuda):
-        replace_linear_int4(self.mod, self.groupsize, self.inner_k_tiles, self.padding, use_cuda)
+        replace_linear_int4(self.mod, self.groupsize, self.inner_k_tiles, self.padding_allowed, use_cuda)
         return self.mod
 
 class WeightOnlyInt4GPTQQuantHandler(GPTQQuantHandler):
@@ -485,11 +483,11 @@ class WeightOnlyInt4Linear(torch.nn.Module):
 
     def __init__(
             self, in_features: int, out_features: int,
-            bias=True, device=None, dtype=None, groupsize: int = 128, inner_k_tiles: int = 8, padding: bool = True, use_cuda=True,
+            bias=True, device=None, dtype=None, groupsize: int = 128, inner_k_tiles: int = 8, use_cuda=True,
     ) -> None:
         super().__init__()
-        self.padding = padding
-        if padding:
+        self.padding = not _check_linear_int4_k(in_features, groupsize, inner_k_tiles)  # pad only when in_features FAILS the int4 kernel shape check
+        if self.padding:
             from model import find_multiple
             self.origin_in_features = in_features
             in_features = find_multiple(in_features, 1024)
@@ -597,7 +595,7 @@ def quantize(
 
         dir_name = checkpoint_path.parent
         base_name = checkpoint_path.name
-        new_base_name = base_name.replace('.pth', f"{label}int4-gptq.g{groupsize}.pth")
+        new_base_name = base_name.replace('.pth', f"{label}int4-gptq.g{groupsize}.{device}.pth")
     else:
         raise ValueError(f"Invalid quantization mode {mode} needs to be one of [int8, int4, int4-gpptq]")
 
diff --git a/run.sh b/run.sh
new file mode 100644
index 0000000..b772fa3
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,17 @@
+export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
+
+# python generate.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --compile # working
+# echo "base"
+
+python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4-gptq --calibration_tasks wikitext --calibration_limit 5
+python eval.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4-gptq.g32.cuda.pth --tasks wikitext --limit 5
+
+# python generate.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.pth --compile
+# echo "quant good"
+
+# python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4
+# python eval.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.cuda.pth --tasks wikitext --limit 5
+
+# ENABLE_INTRA_NODE_COMM=1 torchrun --standalone --nproc_per_node=8 generate.py --compile --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.cuda.pth
+
+# python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4-gptq --calibration_tasks wikitext --calibration_limit 5

From 749093eacb2181e02e1b79be81d94bd3119a7385 Mon Sep 17 00:00:00 2001
From: HDCharles <charlesdavidhernandez@gmail.com>
Date: Tue, 26 Mar 2024 13:06:22 -0700
Subject: [PATCH 2/3] Update on "Summary: redoing"

https://github.com/pytorch-labs/gpt-fast/commit/5bf70c114088a5133299609694a8c17b37de69c4
in a way that doesn't get reverted

Test Plan:

export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4-gptq --calibration_tasks wikitext --calibration_limit 5
python eval.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4-gptq.g32.cuda.pth --tasks wikitext --limit 5

Reviewers:

Subscribers:

Tasks:

Tags:

[ghstack-poisoned]
---
 GPTQ.py |   2 +-
 log.log | 477 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 run.sh  |   8 +-
 3 files changed, 482 insertions(+), 5 deletions(-)
 create mode 100644 log.log

diff --git a/GPTQ.py b/GPTQ.py
index e1279bd..792259e 100644
--- a/GPTQ.py
+++ b/GPTQ.py
@@ -150,7 +150,7 @@ def __init__(
         }
 
         # trace model for one input
-        one_input = [multi.values[0].cpu() for multi in inputs]
+        one_input = tuple(multi.values[0].cpu() for multi in inputs)  # first sample of each input, moved to CPU for tracing; no throwaway list
         exported_model = torch._dynamo.export(
             model.cpu(), aten_graph=True, pre_dispatch=True, tracing_mode="fake"
         )(*one_input)
diff --git a/log.log b/log.log
new file mode 100644
index 0000000..1d40c56
--- /dev/null
+++ b/log.log
@@ -0,0 +1,477 @@
+2024-03-25:17:50:41,903 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
+2024-03-25:17:50:41,904 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
+2024-03-25:17:50:41,904 INFO     [utils.py:160] NumExpr defaulting to 8 threads.
+/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
+  from pandas.core import (
+2024-03-25:17:50:49,270 INFO     [huggingface.py:148] Using device 'cuda'
+2024-03-25:17:50:54,575 WARNING  [__init__.py:194] Some tasks could not be loaded due to missing dependencies. Run with `--verbosity DEBUG` for full details.
+2024-03-25:17:50:58,714 WARNING  [__init__.py:194] Some tasks could not be loaded due to missing dependencies. Run with `--verbosity DEBUG` for full details.
+2024-03-25:17:50:58,714 WARNING  [task.py:626] [Task: wikitext] metric word_perplexity is defined, but aggregation is not. using default aggregation=weighted_perplexity
+2024-03-25:17:50:58,714 WARNING  [task.py:638] [Task: wikitext] metric word_perplexity is defined, but higher_is_better is not. using default higher_is_better=False
+2024-03-25:17:50:58,715 WARNING  [task.py:626] [Task: wikitext] metric byte_perplexity is defined, but aggregation is not. using default aggregation=weighted_perplexity
+2024-03-25:17:50:58,715 WARNING  [task.py:638] [Task: wikitext] metric byte_perplexity is defined, but higher_is_better is not. using default higher_is_better=False
+2024-03-25:17:50:58,715 WARNING  [task.py:626] [Task: wikitext] metric bits_per_byte is defined, but aggregation is not. using default aggregation=bits_per_byte
+2024-03-25:17:50:58,715 WARNING  [task.py:638] [Task: wikitext] metric bits_per_byte is defined, but higher_is_better is not. using default higher_is_better=False
+Repo card metadata block was not found. Setting CardData to empty.
+2024-03-25:17:50:59,420 WARNING  [repocard.py:107] Repo card metadata block was not found. Setting CardData to empty.
+2024-03-25:17:50:59,466 INFO     [task.py:363] Building contexts for task on rank 0...
+2024-03-25:17:50:59,475 INFO     [evaluator.py:324] Running loglikelihood_rolling requests
+Loading model ...
+Using int4 weight-only quantization!
+Time to load model: 3.75 seconds.
+  0%|          | 0/5 [00:00<?, ?it/s] 20%|██        | 1/5 [00:01<00:05,  1.26s/it] 40%|████      | 2/5 [00:06<00:09,  3.32s/it] 60%|██████    | 3/5 [00:09<00:06,  3.26s/it] 80%|████████  | 4/5 [00:17<00:05,  5.10s/it]100%|██████████| 5/5 [00:20<00:00,  4.40s/it]100%|██████████| 5/5 [00:20<00:00,  4.06s/it]
+Time to run eval: 30.53 seconds.
+For model checkpoints/meta-llama/Llama-2-7b-chat-hf/model_int4-gptq.g32.cuda.pth
+wikitext: {'word_perplexity,none': 11.232339081135366, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.6038800882234914, 'byte_perplexity_stderr,none': 'N/A', 'bits_per_byte,none': 0.6815662848152432, 'bits_per_byte_stderr,none': 'N/A', 'alias': 'wikitext'}
+2024-03-25:17:51:25,668 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
+2024-03-25:17:51:25,668 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
+2024-03-25:17:51:25,668 INFO     [utils.py:160] NumExpr defaulting to 8 threads.
+/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
+  from pandas.core import (
+Loading model ...
+Quantizing model weights for int4 weight-only affine per-channel groupwise quantization
+Prepacking model weights in cuda optimal layout
+linear: layers.0.attention.wqkv, in=4096, out=12288
+linear: layers.0.attention.wo, in=4096, out=4096
+linear: layers.0.feed_forward.w1, in=4096, out=11008
+linear: layers.0.feed_forward.w3, in=4096, out=11008
+linear: layers.0.feed_forward.w2, in=11008, out=4096
+linear: layers.1.attention.wqkv, in=4096, out=12288
+linear: layers.1.attention.wo, in=4096, out=4096
+linear: layers.1.feed_forward.w1, in=4096, out=11008
+linear: layers.1.feed_forward.w3, in=4096, out=11008
+linear: layers.1.feed_forward.w2, in=11008, out=4096
+linear: layers.2.attention.wqkv, in=4096, out=12288
+linear: layers.2.attention.wo, in=4096, out=4096
+linear: layers.2.feed_forward.w1, in=4096, out=11008
+linear: layers.2.feed_forward.w3, in=4096, out=11008
+linear: layers.2.feed_forward.w2, in=11008, out=4096
+linear: layers.3.attention.wqkv, in=4096, out=12288
+linear: layers.3.attention.wo, in=4096, out=4096
+linear: layers.3.feed_forward.w1, in=4096, out=11008
+linear: layers.3.feed_forward.w3, in=4096, out=11008
+linear: layers.3.feed_forward.w2, in=11008, out=4096
+linear: layers.4.attention.wqkv, in=4096, out=12288
+linear: layers.4.attention.wo, in=4096, out=4096
+linear: layers.4.feed_forward.w1, in=4096, out=11008
+linear: layers.4.feed_forward.w3, in=4096, out=11008
+linear: layers.4.feed_forward.w2, in=11008, out=4096
+linear: layers.5.attention.wqkv, in=4096, out=12288
+linear: layers.5.attention.wo, in=4096, out=4096
+linear: layers.5.feed_forward.w1, in=4096, out=11008
+linear: layers.5.feed_forward.w3, in=4096, out=11008
+linear: layers.5.feed_forward.w2, in=11008, out=4096
+linear: layers.6.attention.wqkv, in=4096, out=12288
+linear: layers.6.attention.wo, in=4096, out=4096
+linear: layers.6.feed_forward.w1, in=4096, out=11008
+linear: layers.6.feed_forward.w3, in=4096, out=11008
+linear: layers.6.feed_forward.w2, in=11008, out=4096
+linear: layers.7.attention.wqkv, in=4096, out=12288
+linear: layers.7.attention.wo, in=4096, out=4096
+linear: layers.7.feed_forward.w1, in=4096, out=11008
+linear: layers.7.feed_forward.w3, in=4096, out=11008
+linear: layers.7.feed_forward.w2, in=11008, out=4096
+linear: layers.8.attention.wqkv, in=4096, out=12288
+linear: layers.8.attention.wo, in=4096, out=4096
+linear: layers.8.feed_forward.w1, in=4096, out=11008
+linear: layers.8.feed_forward.w3, in=4096, out=11008
+linear: layers.8.feed_forward.w2, in=11008, out=4096
+linear: layers.9.attention.wqkv, in=4096, out=12288
+linear: layers.9.attention.wo, in=4096, out=4096
+linear: layers.9.feed_forward.w1, in=4096, out=11008
+linear: layers.9.feed_forward.w3, in=4096, out=11008
+linear: layers.9.feed_forward.w2, in=11008, out=4096
+linear: layers.10.attention.wqkv, in=4096, out=12288
+linear: layers.10.attention.wo, in=4096, out=4096
+linear: layers.10.feed_forward.w1, in=4096, out=11008
+linear: layers.10.feed_forward.w3, in=4096, out=11008
+linear: layers.10.feed_forward.w2, in=11008, out=4096
+linear: layers.11.attention.wqkv, in=4096, out=12288
+linear: layers.11.attention.wo, in=4096, out=4096
+linear: layers.11.feed_forward.w1, in=4096, out=11008
+linear: layers.11.feed_forward.w3, in=4096, out=11008
+linear: layers.11.feed_forward.w2, in=11008, out=4096
+linear: layers.12.attention.wqkv, in=4096, out=12288
+linear: layers.12.attention.wo, in=4096, out=4096
+linear: layers.12.feed_forward.w1, in=4096, out=11008
+linear: layers.12.feed_forward.w3, in=4096, out=11008
+linear: layers.12.feed_forward.w2, in=11008, out=4096
+linear: layers.13.attention.wqkv, in=4096, out=12288
+linear: layers.13.attention.wo, in=4096, out=4096
+linear: layers.13.feed_forward.w1, in=4096, out=11008
+linear: layers.13.feed_forward.w3, in=4096, out=11008
+linear: layers.13.feed_forward.w2, in=11008, out=4096
+linear: layers.14.attention.wqkv, in=4096, out=12288
+linear: layers.14.attention.wo, in=4096, out=4096
+linear: layers.14.feed_forward.w1, in=4096, out=11008
+linear: layers.14.feed_forward.w3, in=4096, out=11008
+linear: layers.14.feed_forward.w2, in=11008, out=4096
+linear: layers.15.attention.wqkv, in=4096, out=12288
+linear: layers.15.attention.wo, in=4096, out=4096
+linear: layers.15.feed_forward.w1, in=4096, out=11008
+linear: layers.15.feed_forward.w3, in=4096, out=11008
+linear: layers.15.feed_forward.w2, in=11008, out=4096
+linear: layers.16.attention.wqkv, in=4096, out=12288
+linear: layers.16.attention.wo, in=4096, out=4096
+linear: layers.16.feed_forward.w1, in=4096, out=11008
+linear: layers.16.feed_forward.w3, in=4096, out=11008
+linear: layers.16.feed_forward.w2, in=11008, out=4096
+linear: layers.17.attention.wqkv, in=4096, out=12288
+linear: layers.17.attention.wo, in=4096, out=4096
+linear: layers.17.feed_forward.w1, in=4096, out=11008
+linear: layers.17.feed_forward.w3, in=4096, out=11008
+linear: layers.17.feed_forward.w2, in=11008, out=4096
+linear: layers.18.attention.wqkv, in=4096, out=12288
+linear: layers.18.attention.wo, in=4096, out=4096
+linear: layers.18.feed_forward.w1, in=4096, out=11008
+linear: layers.18.feed_forward.w3, in=4096, out=11008
+linear: layers.18.feed_forward.w2, in=11008, out=4096
+linear: layers.19.attention.wqkv, in=4096, out=12288
+linear: layers.19.attention.wo, in=4096, out=4096
+linear: layers.19.feed_forward.w1, in=4096, out=11008
+linear: layers.19.feed_forward.w3, in=4096, out=11008
+linear: layers.19.feed_forward.w2, in=11008, out=4096
+linear: layers.20.attention.wqkv, in=4096, out=12288
+linear: layers.20.attention.wo, in=4096, out=4096
+linear: layers.20.feed_forward.w1, in=4096, out=11008
+linear: layers.20.feed_forward.w3, in=4096, out=11008
+linear: layers.20.feed_forward.w2, in=11008, out=4096
+linear: layers.21.attention.wqkv, in=4096, out=12288
+linear: layers.21.attention.wo, in=4096, out=4096
+linear: layers.21.feed_forward.w1, in=4096, out=11008
+linear: layers.21.feed_forward.w3, in=4096, out=11008
+linear: layers.21.feed_forward.w2, in=11008, out=4096
+linear: layers.22.attention.wqkv, in=4096, out=12288
+linear: layers.22.attention.wo, in=4096, out=4096
+linear: layers.22.feed_forward.w1, in=4096, out=11008
+linear: layers.22.feed_forward.w3, in=4096, out=11008
+linear: layers.22.feed_forward.w2, in=11008, out=4096
+linear: layers.23.attention.wqkv, in=4096, out=12288
+linear: layers.23.attention.wo, in=4096, out=4096
+linear: layers.23.feed_forward.w1, in=4096, out=11008
+linear: layers.23.feed_forward.w3, in=4096, out=11008
+linear: layers.23.feed_forward.w2, in=11008, out=4096
+linear: layers.24.attention.wqkv, in=4096, out=12288
+linear: layers.24.attention.wo, in=4096, out=4096
+linear: layers.24.feed_forward.w1, in=4096, out=11008
+linear: layers.24.feed_forward.w3, in=4096, out=11008
+linear: layers.24.feed_forward.w2, in=11008, out=4096
+linear: layers.25.attention.wqkv, in=4096, out=12288
+linear: layers.25.attention.wo, in=4096, out=4096
+linear: layers.25.feed_forward.w1, in=4096, out=11008
+linear: layers.25.feed_forward.w3, in=4096, out=11008
+linear: layers.25.feed_forward.w2, in=11008, out=4096
+linear: layers.26.attention.wqkv, in=4096, out=12288
+linear: layers.26.attention.wo, in=4096, out=4096
+linear: layers.26.feed_forward.w1, in=4096, out=11008
+linear: layers.26.feed_forward.w3, in=4096, out=11008
+linear: layers.26.feed_forward.w2, in=11008, out=4096
+linear: layers.27.attention.wqkv, in=4096, out=12288
+linear: layers.27.attention.wo, in=4096, out=4096
+linear: layers.27.feed_forward.w1, in=4096, out=11008
+linear: layers.27.feed_forward.w3, in=4096, out=11008
+linear: layers.27.feed_forward.w2, in=11008, out=4096
+linear: layers.28.attention.wqkv, in=4096, out=12288
+linear: layers.28.attention.wo, in=4096, out=4096
+linear: layers.28.feed_forward.w1, in=4096, out=11008
+linear: layers.28.feed_forward.w3, in=4096, out=11008
+linear: layers.28.feed_forward.w2, in=11008, out=4096
+linear: layers.29.attention.wqkv, in=4096, out=12288
+linear: layers.29.attention.wo, in=4096, out=4096
+linear: layers.29.feed_forward.w1, in=4096, out=11008
+linear: layers.29.feed_forward.w3, in=4096, out=11008
+linear: layers.29.feed_forward.w2, in=11008, out=4096
+linear: layers.30.attention.wqkv, in=4096, out=12288
+linear: layers.30.attention.wo, in=4096, out=4096
+linear: layers.30.feed_forward.w1, in=4096, out=11008
+linear: layers.30.feed_forward.w3, in=4096, out=11008
+linear: layers.30.feed_forward.w2, in=11008, out=4096
+linear: layers.31.attention.wqkv, in=4096, out=12288
+linear: layers.31.attention.wo, in=4096, out=4096
+linear: layers.31.feed_forward.w1, in=4096, out=11008
+linear: layers.31.feed_forward.w3, in=4096, out=11008
+linear: layers.31.feed_forward.w2, in=11008, out=4096
+linear: output, in=4096, out=32000
+Writing quantized weights to checkpoints/meta-llama/Llama-2-7b-chat-hf/model_int4.g32.cuda.pth
+Quantization complete took 18.33 seconds
+2024-03-25:17:51:53,532 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
+2024-03-25:17:51:53,532 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
+2024-03-25:17:51:53,532 INFO     [utils.py:160] NumExpr defaulting to 8 threads.
+/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
+  from pandas.core import (
+Loading model ...
+Using int4 weight-only quantization!
+Traceback (most recent call last):
+  File "/home/cdhernandez/local/gpt-fast/eval.py", line 268, in <module>
+    main(
+  File "/home/cdhernandez/local/gpt-fast/eval.py", line 227, in main
+    model = _load_model(checkpoint_path, device, precision, False)
+  File "/home/cdhernandez/local/gpt-fast/generate.py", line 240, in _load_model
+    model.load_state_dict(checkpoint, assign=True)
+  File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 2184, in load_state_dict
+    raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
+RuntimeError: Error(s) in loading state_dict for Transformer:
+	size mismatch for layers.0.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.0.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.1.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.1.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.2.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.2.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.3.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.3.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.4.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.4.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.5.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.5.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.6.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.6.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.7.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.7.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.8.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.8.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.9.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.9.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.10.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.10.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.11.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.11.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.12.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.12.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.13.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.13.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.14.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.14.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.15.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.15.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.16.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.16.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.17.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.17.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.18.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.18.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.19.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.19.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.20.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.20.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.21.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.21.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.22.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.22.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.23.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.23.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.24.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.24.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.25.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.25.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.26.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.26.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.27.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.27.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.28.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.28.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.29.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.29.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.30.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.30.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+	size mismatch for layers.31.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
+	size mismatch for layers.31.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
+W0325 17:52:01.365000 140417423238144 torch/distributed/run.py:757] 
+W0325 17:52:01.365000 140417423238144 torch/distributed/run.py:757] *****************************************
+W0325 17:52:01.365000 140417423238144 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W0325 17:52:01.365000 140417423238144 torch/distributed/run.py:757] *****************************************
+2024-03-25:17:52:08,088 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
+2024-03-25:17:52:08,088 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
+/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
+  from pandas.core import (
+2024-03-25:17:52:08,258 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
+2024-03-25:17:52:08,258 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
+/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
+  from pandas.core import (
+2024-03-25:17:52:08,311 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
+2024-03-25:17:52:08,311 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
+/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
+  from pandas.core import (
+2024-03-25:17:52:08,358 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
+2024-03-25:17:52:08,358 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
+/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
+  from pandas.core import (
+2024-03-25:17:52:08,427 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
+2024-03-25:17:52:08,427 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
+2024-03-25:17:52:08,429 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
+2024-03-25:17:52:08,429 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
+2024-03-25:17:52:08,433 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
+2024-03-25:17:52:08,433 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
+/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
+  from pandas.core import (
+/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
+  from pandas.core import (
+/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
+  from pandas.core import (
+2024-03-25:17:52:08,502 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
+2024-03-25:17:52:08,502 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
+/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
+  from pandas.core import (
+Traceback (most recent call last):
+  File "/home/cdhernandez/local/gpt-fast/generate.py", line 423, in <module>
+    main(
+  File "/home/cdhernandez/local/gpt-fast/generate.py", line 276, in main
+    rank = maybe_init_dist()
+  File "/home/cdhernandez/local/gpt-fast/tp.py", line 49, in maybe_init_dist
+    torch.cuda.set_device(rank)
+  File "/home/cdhernandez/local/pytorch/torch/cuda/__init__.py", line 399, in set_device
+    torch._C._cuda_setDevice(device)
+RuntimeError: CUDA error: invalid device ordinal
+CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
+Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+
+Traceback (most recent call last):
+  File "/home/cdhernandez/local/gpt-fast/generate.py", line 423, in <module>
+    main(
+  File "/home/cdhernandez/local/gpt-fast/generate.py", line 276, in main
+    rank = maybe_init_dist()
+  File "/home/cdhernandez/local/gpt-fast/tp.py", line 49, in maybe_init_dist
+    torch.cuda.set_device(rank)
+  File "/home/cdhernandez/local/pytorch/torch/cuda/__init__.py", line 399, in set_device
+    torch._C._cuda_setDevice(device)
+RuntimeError: CUDA error: invalid device ordinal
+CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
+Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+
+Traceback (most recent call last):
+  File "/home/cdhernandez/local/gpt-fast/generate.py", line 423, in <module>
+    main(
+  File "/home/cdhernandez/local/gpt-fast/generate.py", line 276, in main
+    rank = maybe_init_dist()
+  File "/home/cdhernandez/local/gpt-fast/tp.py", line 49, in maybe_init_dist
+    torch.cuda.set_device(rank)
+  File "/home/cdhernandez/local/pytorch/torch/cuda/__init__.py", line 399, in set_device
+    torch._C._cuda_setDevice(device)
+RuntimeError: CUDA error: invalid device ordinal
+CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
+Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+
+Traceback (most recent call last):
+  File "/home/cdhernandez/local/gpt-fast/generate.py", line 423, in <module>
+    main(
+  File "/home/cdhernandez/local/gpt-fast/generate.py", line 276, in main
+    rank = maybe_init_dist()
+  File "/home/cdhernandez/local/gpt-fast/tp.py", line 49, in maybe_init_dist
+    torch.cuda.set_device(rank)
+  File "/home/cdhernandez/local/pytorch/torch/cuda/__init__.py", line 399, in set_device
+    torch._C._cuda_setDevice(device)
+RuntimeError: CUDA error: invalid device ordinal
+CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
+Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+
+Traceback (most recent call last):
+  File "/home/cdhernandez/local/gpt-fast/generate.py", line 423, in <module>
+    main(
+  File "/home/cdhernandez/local/gpt-fast/generate.py", line 276, in main
+    rank = maybe_init_dist()
+  File "/home/cdhernandez/local/gpt-fast/tp.py", line 49, in maybe_init_dist
+    torch.cuda.set_device(rank)
+  File "/home/cdhernandez/local/pytorch/torch/cuda/__init__.py", line 399, in set_device
+    torch._C._cuda_setDevice(device)
+RuntimeError: CUDA error: invalid device ordinal
+CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
+Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+
+Traceback (most recent call last):
+  File "/home/cdhernandez/local/gpt-fast/generate.py", line 423, in <module>
+    main(
+  File "/home/cdhernandez/local/gpt-fast/generate.py", line 276, in main
+    rank = maybe_init_dist()
+  File "/home/cdhernandez/local/gpt-fast/tp.py", line 49, in maybe_init_dist
+    torch.cuda.set_device(rank)
+  File "/home/cdhernandez/local/pytorch/torch/cuda/__init__.py", line 399, in set_device
+    torch._C._cuda_setDevice(device)
+RuntimeError: CUDA error: invalid device ordinal
+CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
+Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+
+Traceback (most recent call last):
+  File "/home/cdhernandez/local/gpt-fast/generate.py", line 423, in <module>
+    main(
+  File "/home/cdhernandez/local/gpt-fast/generate.py", line 276, in main
+    rank = maybe_init_dist()
+  File "/home/cdhernandez/local/gpt-fast/tp.py", line 49, in maybe_init_dist
+    torch.cuda.set_device(rank)
+  File "/home/cdhernandez/local/pytorch/torch/cuda/__init__.py", line 399, in set_device
+    torch._C._cuda_setDevice(device)
+RuntimeError: CUDA error: invalid device ordinal
+CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
+Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+
+W0325 17:52:11.561000 140417423238144 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1765652 closing signal SIGTERM
+W0325 17:52:11.562000 140417423238144 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1765656 closing signal SIGTERM
+E0325 17:52:11.675000 140417423238144 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 1 (pid: 1765653) of binary: /home/cdhernandez/local/miniconda3/envs/pytorch/bin/python
+Traceback (most recent call last):
+  File "/home/cdhernandez/local/miniconda3/envs/pytorch/bin/torchrun", line 33, in <module>
+    sys.exit(load_entry_point('torch', 'console_scripts', 'torchrun')())
+  File "/home/cdhernandez/local/pytorch/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
+    return f(*args, **kwargs)
+  File "/home/cdhernandez/local/pytorch/torch/distributed/run.py", line 879, in main
+    run(args)
+  File "/home/cdhernandez/local/pytorch/torch/distributed/run.py", line 870, in run
+    elastic_launch(
+  File "/home/cdhernandez/local/pytorch/torch/distributed/launcher/api.py", line 132, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/home/cdhernandez/local/pytorch/torch/distributed/launcher/api.py", line 263, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
+============================================================
+generate.py FAILED
+------------------------------------------------------------
+Failures:
+[1]:
+  time      : 2024-03-25_17:52:11
+  host      : devgpu001.ash8.facebook.com
+  rank      : 2 (local_rank: 2)
+  exitcode  : 1 (pid: 1765654)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+[2]:
+  time      : 2024-03-25_17:52:11
+  host      : devgpu001.ash8.facebook.com
+  rank      : 3 (local_rank: 3)
+  exitcode  : 1 (pid: 1765655)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+[3]:
+  time      : 2024-03-25_17:52:11
+  host      : devgpu001.ash8.facebook.com
+  rank      : 5 (local_rank: 5)
+  exitcode  : 1 (pid: 1765657)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+[4]:
+  time      : 2024-03-25_17:52:11
+  host      : devgpu001.ash8.facebook.com
+  rank      : 6 (local_rank: 6)
+  exitcode  : 1 (pid: 1765658)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+[5]:
+  time      : 2024-03-25_17:52:11
+  host      : devgpu001.ash8.facebook.com
+  rank      : 7 (local_rank: 7)
+  exitcode  : 1 (pid: 1765659)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+------------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2024-03-25_17:52:11
+  host      : devgpu001.ash8.facebook.com
+  rank      : 1 (local_rank: 1)
+  exitcode  : 1 (pid: 1765653)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+============================================================
diff --git a/run.sh b/run.sh
index b772fa3..d657afe 100644
--- a/run.sh
+++ b/run.sh
@@ -3,15 +3,15 @@ export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
 # python generate.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --compile # working
 # echo "base"
 
-python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4-gptq --calibration_tasks wikitext --calibration_limit 5
+# python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4-gptq --calibration_tasks wikitext --calibration_limit 5
 python eval.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4-gptq.g32.cuda.pth --tasks wikitext --limit 5
 
 # python generate.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.pth --compile
 # echo "quant good"
 
-# python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4
-# python eval.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.cuda.pth --tasks wikitext --limit 5
+python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4
+python eval.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.cuda.pth --tasks wikitext --limit 5
 
-# ENABLE_INTRA_NODE_COMM=1 torchrun --standalone --nproc_per_node=8 generate.py --compile --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.cuda.pth
+ENABLE_INTRA_NODE_COMM=1 torchrun --standalone --nproc_per_node=8 generate.py --compile --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.cuda.pth
 
 # python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4-gptq --calibration_tasks wikitext --calibration_limit 5

From 2e7673714b8ffdb4062e6dd89764f8b7f16a936e Mon Sep 17 00:00:00 2001
From: HDCharles <charlesdavidhernandez@gmail.com>
Date: Tue, 26 Mar 2024 14:09:32 -0700
Subject: [PATCH 3/3] Update on "int4 gptq shape fix"

Summary: redoing
https://github.com/pytorch-labs/gpt-fast/commit/5bf70c114088a5133299609694a8c17b37de69c4
in a way that doesn't get reverted. Note: a device issue also needed to
be fixed as part of this change.

Test Plan:

export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4-gptq --calibration_tasks wikitext --calibration_limit 5
python eval.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4-gptq.g32.cuda.pth --tasks wikitext --limit 5

Reviewers:

Subscribers:

Tasks:

Tags:

[ghstack-poisoned]
---
 GPTQ.py  |   2 +-
 log.log  | 477 -------------------------------------------------------
 model.py |   6 +-
 run.sh   |  17 --
 4 files changed, 5 insertions(+), 497 deletions(-)
 delete mode 100644 log.log
 delete mode 100644 run.sh

diff --git a/GPTQ.py b/GPTQ.py
index 792259e..e1279bd 100644
--- a/GPTQ.py
+++ b/GPTQ.py
@@ -150,7 +150,7 @@ def __init__(
         }
 
         # trace model for one input
-        one_input = tuple([multi.values[0].cpu() for multi in inputs])
+        one_input = [multi.values[0].cpu() for multi in inputs]
         exported_model = torch._dynamo.export(
             model.cpu(), aten_graph=True, pre_dispatch=True, tracing_mode="fake"
         )(*one_input)
diff --git a/log.log b/log.log
deleted file mode 100644
index 1d40c56..0000000
--- a/log.log
+++ /dev/null
@@ -1,477 +0,0 @@
-2024-03-25:17:50:41,903 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
-2024-03-25:17:50:41,904 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
-2024-03-25:17:50:41,904 INFO     [utils.py:160] NumExpr defaulting to 8 threads.
-/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
-  from pandas.core import (
-2024-03-25:17:50:49,270 INFO     [huggingface.py:148] Using device 'cuda'
-2024-03-25:17:50:54,575 WARNING  [__init__.py:194] Some tasks could not be loaded due to missing dependencies. Run with `--verbosity DEBUG` for full details.
-2024-03-25:17:50:58,714 WARNING  [__init__.py:194] Some tasks could not be loaded due to missing dependencies. Run with `--verbosity DEBUG` for full details.
-2024-03-25:17:50:58,714 WARNING  [task.py:626] [Task: wikitext] metric word_perplexity is defined, but aggregation is not. using default aggregation=weighted_perplexity
-2024-03-25:17:50:58,714 WARNING  [task.py:638] [Task: wikitext] metric word_perplexity is defined, but higher_is_better is not. using default higher_is_better=False
-2024-03-25:17:50:58,715 WARNING  [task.py:626] [Task: wikitext] metric byte_perplexity is defined, but aggregation is not. using default aggregation=weighted_perplexity
-2024-03-25:17:50:58,715 WARNING  [task.py:638] [Task: wikitext] metric byte_perplexity is defined, but higher_is_better is not. using default higher_is_better=False
-2024-03-25:17:50:58,715 WARNING  [task.py:626] [Task: wikitext] metric bits_per_byte is defined, but aggregation is not. using default aggregation=bits_per_byte
-2024-03-25:17:50:58,715 WARNING  [task.py:638] [Task: wikitext] metric bits_per_byte is defined, but higher_is_better is not. using default higher_is_better=False
-Repo card metadata block was not found. Setting CardData to empty.
-2024-03-25:17:50:59,420 WARNING  [repocard.py:107] Repo card metadata block was not found. Setting CardData to empty.
-2024-03-25:17:50:59,466 INFO     [task.py:363] Building contexts for task on rank 0...
-2024-03-25:17:50:59,475 INFO     [evaluator.py:324] Running loglikelihood_rolling requests
-Loading model ...
-Using int4 weight-only quantization!
-Time to load model: 3.75 seconds.
-  0%|          | 0/5 [00:00<?, ?it/s] 20%|██        | 1/5 [00:01<00:05,  1.26s/it] 40%|████      | 2/5 [00:06<00:09,  3.32s/it] 60%|██████    | 3/5 [00:09<00:06,  3.26s/it] 80%|████████  | 4/5 [00:17<00:05,  5.10s/it]100%|██████████| 5/5 [00:20<00:00,  4.40s/it]100%|██████████| 5/5 [00:20<00:00,  4.06s/it]
-Time to run eval: 30.53 seconds.
-For model checkpoints/meta-llama/Llama-2-7b-chat-hf/model_int4-gptq.g32.cuda.pth
-wikitext: {'word_perplexity,none': 11.232339081135366, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.6038800882234914, 'byte_perplexity_stderr,none': 'N/A', 'bits_per_byte,none': 0.6815662848152432, 'bits_per_byte_stderr,none': 'N/A', 'alias': 'wikitext'}
-2024-03-25:17:51:25,668 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
-2024-03-25:17:51:25,668 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
-2024-03-25:17:51:25,668 INFO     [utils.py:160] NumExpr defaulting to 8 threads.
-/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
-  from pandas.core import (
-Loading model ...
-Quantizing model weights for int4 weight-only affine per-channel groupwise quantization
-Prepacking model weights in cuda optimal layout
-linear: layers.0.attention.wqkv, in=4096, out=12288
-linear: layers.0.attention.wo, in=4096, out=4096
-linear: layers.0.feed_forward.w1, in=4096, out=11008
-linear: layers.0.feed_forward.w3, in=4096, out=11008
-linear: layers.0.feed_forward.w2, in=11008, out=4096
-linear: layers.1.attention.wqkv, in=4096, out=12288
-linear: layers.1.attention.wo, in=4096, out=4096
-linear: layers.1.feed_forward.w1, in=4096, out=11008
-linear: layers.1.feed_forward.w3, in=4096, out=11008
-linear: layers.1.feed_forward.w2, in=11008, out=4096
-linear: layers.2.attention.wqkv, in=4096, out=12288
-linear: layers.2.attention.wo, in=4096, out=4096
-linear: layers.2.feed_forward.w1, in=4096, out=11008
-linear: layers.2.feed_forward.w3, in=4096, out=11008
-linear: layers.2.feed_forward.w2, in=11008, out=4096
-linear: layers.3.attention.wqkv, in=4096, out=12288
-linear: layers.3.attention.wo, in=4096, out=4096
-linear: layers.3.feed_forward.w1, in=4096, out=11008
-linear: layers.3.feed_forward.w3, in=4096, out=11008
-linear: layers.3.feed_forward.w2, in=11008, out=4096
-linear: layers.4.attention.wqkv, in=4096, out=12288
-linear: layers.4.attention.wo, in=4096, out=4096
-linear: layers.4.feed_forward.w1, in=4096, out=11008
-linear: layers.4.feed_forward.w3, in=4096, out=11008
-linear: layers.4.feed_forward.w2, in=11008, out=4096
-linear: layers.5.attention.wqkv, in=4096, out=12288
-linear: layers.5.attention.wo, in=4096, out=4096
-linear: layers.5.feed_forward.w1, in=4096, out=11008
-linear: layers.5.feed_forward.w3, in=4096, out=11008
-linear: layers.5.feed_forward.w2, in=11008, out=4096
-linear: layers.6.attention.wqkv, in=4096, out=12288
-linear: layers.6.attention.wo, in=4096, out=4096
-linear: layers.6.feed_forward.w1, in=4096, out=11008
-linear: layers.6.feed_forward.w3, in=4096, out=11008
-linear: layers.6.feed_forward.w2, in=11008, out=4096
-linear: layers.7.attention.wqkv, in=4096, out=12288
-linear: layers.7.attention.wo, in=4096, out=4096
-linear: layers.7.feed_forward.w1, in=4096, out=11008
-linear: layers.7.feed_forward.w3, in=4096, out=11008
-linear: layers.7.feed_forward.w2, in=11008, out=4096
-linear: layers.8.attention.wqkv, in=4096, out=12288
-linear: layers.8.attention.wo, in=4096, out=4096
-linear: layers.8.feed_forward.w1, in=4096, out=11008
-linear: layers.8.feed_forward.w3, in=4096, out=11008
-linear: layers.8.feed_forward.w2, in=11008, out=4096
-linear: layers.9.attention.wqkv, in=4096, out=12288
-linear: layers.9.attention.wo, in=4096, out=4096
-linear: layers.9.feed_forward.w1, in=4096, out=11008
-linear: layers.9.feed_forward.w3, in=4096, out=11008
-linear: layers.9.feed_forward.w2, in=11008, out=4096
-linear: layers.10.attention.wqkv, in=4096, out=12288
-linear: layers.10.attention.wo, in=4096, out=4096
-linear: layers.10.feed_forward.w1, in=4096, out=11008
-linear: layers.10.feed_forward.w3, in=4096, out=11008
-linear: layers.10.feed_forward.w2, in=11008, out=4096
-linear: layers.11.attention.wqkv, in=4096, out=12288
-linear: layers.11.attention.wo, in=4096, out=4096
-linear: layers.11.feed_forward.w1, in=4096, out=11008
-linear: layers.11.feed_forward.w3, in=4096, out=11008
-linear: layers.11.feed_forward.w2, in=11008, out=4096
-linear: layers.12.attention.wqkv, in=4096, out=12288
-linear: layers.12.attention.wo, in=4096, out=4096
-linear: layers.12.feed_forward.w1, in=4096, out=11008
-linear: layers.12.feed_forward.w3, in=4096, out=11008
-linear: layers.12.feed_forward.w2, in=11008, out=4096
-linear: layers.13.attention.wqkv, in=4096, out=12288
-linear: layers.13.attention.wo, in=4096, out=4096
-linear: layers.13.feed_forward.w1, in=4096, out=11008
-linear: layers.13.feed_forward.w3, in=4096, out=11008
-linear: layers.13.feed_forward.w2, in=11008, out=4096
-linear: layers.14.attention.wqkv, in=4096, out=12288
-linear: layers.14.attention.wo, in=4096, out=4096
-linear: layers.14.feed_forward.w1, in=4096, out=11008
-linear: layers.14.feed_forward.w3, in=4096, out=11008
-linear: layers.14.feed_forward.w2, in=11008, out=4096
-linear: layers.15.attention.wqkv, in=4096, out=12288
-linear: layers.15.attention.wo, in=4096, out=4096
-linear: layers.15.feed_forward.w1, in=4096, out=11008
-linear: layers.15.feed_forward.w3, in=4096, out=11008
-linear: layers.15.feed_forward.w2, in=11008, out=4096
-linear: layers.16.attention.wqkv, in=4096, out=12288
-linear: layers.16.attention.wo, in=4096, out=4096
-linear: layers.16.feed_forward.w1, in=4096, out=11008
-linear: layers.16.feed_forward.w3, in=4096, out=11008
-linear: layers.16.feed_forward.w2, in=11008, out=4096
-linear: layers.17.attention.wqkv, in=4096, out=12288
-linear: layers.17.attention.wo, in=4096, out=4096
-linear: layers.17.feed_forward.w1, in=4096, out=11008
-linear: layers.17.feed_forward.w3, in=4096, out=11008
-linear: layers.17.feed_forward.w2, in=11008, out=4096
-linear: layers.18.attention.wqkv, in=4096, out=12288
-linear: layers.18.attention.wo, in=4096, out=4096
-linear: layers.18.feed_forward.w1, in=4096, out=11008
-linear: layers.18.feed_forward.w3, in=4096, out=11008
-linear: layers.18.feed_forward.w2, in=11008, out=4096
-linear: layers.19.attention.wqkv, in=4096, out=12288
-linear: layers.19.attention.wo, in=4096, out=4096
-linear: layers.19.feed_forward.w1, in=4096, out=11008
-linear: layers.19.feed_forward.w3, in=4096, out=11008
-linear: layers.19.feed_forward.w2, in=11008, out=4096
-linear: layers.20.attention.wqkv, in=4096, out=12288
-linear: layers.20.attention.wo, in=4096, out=4096
-linear: layers.20.feed_forward.w1, in=4096, out=11008
-linear: layers.20.feed_forward.w3, in=4096, out=11008
-linear: layers.20.feed_forward.w2, in=11008, out=4096
-linear: layers.21.attention.wqkv, in=4096, out=12288
-linear: layers.21.attention.wo, in=4096, out=4096
-linear: layers.21.feed_forward.w1, in=4096, out=11008
-linear: layers.21.feed_forward.w3, in=4096, out=11008
-linear: layers.21.feed_forward.w2, in=11008, out=4096
-linear: layers.22.attention.wqkv, in=4096, out=12288
-linear: layers.22.attention.wo, in=4096, out=4096
-linear: layers.22.feed_forward.w1, in=4096, out=11008
-linear: layers.22.feed_forward.w3, in=4096, out=11008
-linear: layers.22.feed_forward.w2, in=11008, out=4096
-linear: layers.23.attention.wqkv, in=4096, out=12288
-linear: layers.23.attention.wo, in=4096, out=4096
-linear: layers.23.feed_forward.w1, in=4096, out=11008
-linear: layers.23.feed_forward.w3, in=4096, out=11008
-linear: layers.23.feed_forward.w2, in=11008, out=4096
-linear: layers.24.attention.wqkv, in=4096, out=12288
-linear: layers.24.attention.wo, in=4096, out=4096
-linear: layers.24.feed_forward.w1, in=4096, out=11008
-linear: layers.24.feed_forward.w3, in=4096, out=11008
-linear: layers.24.feed_forward.w2, in=11008, out=4096
-linear: layers.25.attention.wqkv, in=4096, out=12288
-linear: layers.25.attention.wo, in=4096, out=4096
-linear: layers.25.feed_forward.w1, in=4096, out=11008
-linear: layers.25.feed_forward.w3, in=4096, out=11008
-linear: layers.25.feed_forward.w2, in=11008, out=4096
-linear: layers.26.attention.wqkv, in=4096, out=12288
-linear: layers.26.attention.wo, in=4096, out=4096
-linear: layers.26.feed_forward.w1, in=4096, out=11008
-linear: layers.26.feed_forward.w3, in=4096, out=11008
-linear: layers.26.feed_forward.w2, in=11008, out=4096
-linear: layers.27.attention.wqkv, in=4096, out=12288
-linear: layers.27.attention.wo, in=4096, out=4096
-linear: layers.27.feed_forward.w1, in=4096, out=11008
-linear: layers.27.feed_forward.w3, in=4096, out=11008
-linear: layers.27.feed_forward.w2, in=11008, out=4096
-linear: layers.28.attention.wqkv, in=4096, out=12288
-linear: layers.28.attention.wo, in=4096, out=4096
-linear: layers.28.feed_forward.w1, in=4096, out=11008
-linear: layers.28.feed_forward.w3, in=4096, out=11008
-linear: layers.28.feed_forward.w2, in=11008, out=4096
-linear: layers.29.attention.wqkv, in=4096, out=12288
-linear: layers.29.attention.wo, in=4096, out=4096
-linear: layers.29.feed_forward.w1, in=4096, out=11008
-linear: layers.29.feed_forward.w3, in=4096, out=11008
-linear: layers.29.feed_forward.w2, in=11008, out=4096
-linear: layers.30.attention.wqkv, in=4096, out=12288
-linear: layers.30.attention.wo, in=4096, out=4096
-linear: layers.30.feed_forward.w1, in=4096, out=11008
-linear: layers.30.feed_forward.w3, in=4096, out=11008
-linear: layers.30.feed_forward.w2, in=11008, out=4096
-linear: layers.31.attention.wqkv, in=4096, out=12288
-linear: layers.31.attention.wo, in=4096, out=4096
-linear: layers.31.feed_forward.w1, in=4096, out=11008
-linear: layers.31.feed_forward.w3, in=4096, out=11008
-linear: layers.31.feed_forward.w2, in=11008, out=4096
-linear: output, in=4096, out=32000
-Writing quantized weights to checkpoints/meta-llama/Llama-2-7b-chat-hf/model_int4.g32.cuda.pth
-Quantization complete took 18.33 seconds
-2024-03-25:17:51:53,532 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
-2024-03-25:17:51:53,532 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
-2024-03-25:17:51:53,532 INFO     [utils.py:160] NumExpr defaulting to 8 threads.
-/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
-  from pandas.core import (
-Loading model ...
-Using int4 weight-only quantization!
-Traceback (most recent call last):
-  File "/home/cdhernandez/local/gpt-fast/eval.py", line 268, in <module>
-    main(
-  File "/home/cdhernandez/local/gpt-fast/eval.py", line 227, in main
-    model = _load_model(checkpoint_path, device, precision, False)
-  File "/home/cdhernandez/local/gpt-fast/generate.py", line 240, in _load_model
-    model.load_state_dict(checkpoint, assign=True)
-  File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 2184, in load_state_dict
-    raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
-RuntimeError: Error(s) in loading state_dict for Transformer:
-	size mismatch for layers.0.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.0.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.1.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.1.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.2.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.2.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.3.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.3.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.4.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.4.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.5.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.5.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.6.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.6.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.7.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.7.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.8.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.8.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.9.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.9.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.10.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.10.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.11.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.11.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.12.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.12.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.13.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.13.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.14.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.14.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.15.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.15.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.16.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.16.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.17.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.17.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.18.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.18.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.19.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.19.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.20.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.20.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.21.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.21.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.22.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.22.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.23.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.23.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.24.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.24.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.25.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.25.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.26.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.26.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.27.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.27.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.28.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.28.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.29.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.29.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.30.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.30.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-	size mismatch for layers.31.feed_forward.w2.weight: copying a param with shape torch.Size([512, 86, 32, 4]) from checkpoint, the shape in current model is torch.Size([512, 88, 32, 4]).
-	size mismatch for layers.31.feed_forward.w2.scales_and_zeros: copying a param with shape torch.Size([344, 4096, 2]) from checkpoint, the shape in current model is torch.Size([352, 4096, 2]).
-W0325 17:52:01.365000 140417423238144 torch/distributed/run.py:757] 
-W0325 17:52:01.365000 140417423238144 torch/distributed/run.py:757] *****************************************
-W0325 17:52:01.365000 140417423238144 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0325 17:52:01.365000 140417423238144 torch/distributed/run.py:757] *****************************************
-2024-03-25:17:52:08,088 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
-2024-03-25:17:52:08,088 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
-/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
-  from pandas.core import (
-2024-03-25:17:52:08,258 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
-2024-03-25:17:52:08,258 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
-/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
-  from pandas.core import (
-2024-03-25:17:52:08,311 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
-2024-03-25:17:52:08,311 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
-/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
-  from pandas.core import (
-2024-03-25:17:52:08,358 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
-2024-03-25:17:52:08,358 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
-/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
-  from pandas.core import (
-2024-03-25:17:52:08,427 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
-2024-03-25:17:52:08,427 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
-2024-03-25:17:52:08,429 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
-2024-03-25:17:52:08,429 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
-2024-03-25:17:52:08,433 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
-2024-03-25:17:52:08,433 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
-/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
-  from pandas.core import (
-/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
-  from pandas.core import (
-/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
-  from pandas.core import (
-2024-03-25:17:52:08,502 INFO     [utils.py:145] Note: detected 192 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
-2024-03-25:17:52:08,502 INFO     [utils.py:148] Note: NumExpr detected 192 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
-/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
-  from pandas.core import (
-Traceback (most recent call last):
-  File "/home/cdhernandez/local/gpt-fast/generate.py", line 423, in <module>
-    main(
-  File "/home/cdhernandez/local/gpt-fast/generate.py", line 276, in main
-    rank = maybe_init_dist()
-  File "/home/cdhernandez/local/gpt-fast/tp.py", line 49, in maybe_init_dist
-    torch.cuda.set_device(rank)
-  File "/home/cdhernandez/local/pytorch/torch/cuda/__init__.py", line 399, in set_device
-    torch._C._cuda_setDevice(device)
-RuntimeError: CUDA error: invalid device ordinal
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
-Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
-
-Traceback (most recent call last):
-  File "/home/cdhernandez/local/gpt-fast/generate.py", line 423, in <module>
-    main(
-  File "/home/cdhernandez/local/gpt-fast/generate.py", line 276, in main
-    rank = maybe_init_dist()
-  File "/home/cdhernandez/local/gpt-fast/tp.py", line 49, in maybe_init_dist
-    torch.cuda.set_device(rank)
-  File "/home/cdhernandez/local/pytorch/torch/cuda/__init__.py", line 399, in set_device
-    torch._C._cuda_setDevice(device)
-RuntimeError: CUDA error: invalid device ordinal
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
-Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
-
-Traceback (most recent call last):
-  File "/home/cdhernandez/local/gpt-fast/generate.py", line 423, in <module>
-    main(
-  File "/home/cdhernandez/local/gpt-fast/generate.py", line 276, in main
-    rank = maybe_init_dist()
-  File "/home/cdhernandez/local/gpt-fast/tp.py", line 49, in maybe_init_dist
-    torch.cuda.set_device(rank)
-  File "/home/cdhernandez/local/pytorch/torch/cuda/__init__.py", line 399, in set_device
-    torch._C._cuda_setDevice(device)
-RuntimeError: CUDA error: invalid device ordinal
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
-Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
-
-Traceback (most recent call last):
-  File "/home/cdhernandez/local/gpt-fast/generate.py", line 423, in <module>
-    main(
-  File "/home/cdhernandez/local/gpt-fast/generate.py", line 276, in main
-    rank = maybe_init_dist()
-  File "/home/cdhernandez/local/gpt-fast/tp.py", line 49, in maybe_init_dist
-    torch.cuda.set_device(rank)
-  File "/home/cdhernandez/local/pytorch/torch/cuda/__init__.py", line 399, in set_device
-    torch._C._cuda_setDevice(device)
-RuntimeError: CUDA error: invalid device ordinal
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
-Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
-
-Traceback (most recent call last):
-  File "/home/cdhernandez/local/gpt-fast/generate.py", line 423, in <module>
-    main(
-  File "/home/cdhernandez/local/gpt-fast/generate.py", line 276, in main
-    rank = maybe_init_dist()
-  File "/home/cdhernandez/local/gpt-fast/tp.py", line 49, in maybe_init_dist
-    torch.cuda.set_device(rank)
-  File "/home/cdhernandez/local/pytorch/torch/cuda/__init__.py", line 399, in set_device
-    torch._C._cuda_setDevice(device)
-RuntimeError: CUDA error: invalid device ordinal
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
-Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
-
-Traceback (most recent call last):
-  File "/home/cdhernandez/local/gpt-fast/generate.py", line 423, in <module>
-    main(
-  File "/home/cdhernandez/local/gpt-fast/generate.py", line 276, in main
-    rank = maybe_init_dist()
-  File "/home/cdhernandez/local/gpt-fast/tp.py", line 49, in maybe_init_dist
-    torch.cuda.set_device(rank)
-  File "/home/cdhernandez/local/pytorch/torch/cuda/__init__.py", line 399, in set_device
-    torch._C._cuda_setDevice(device)
-RuntimeError: CUDA error: invalid device ordinal
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
-Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
-
-Traceback (most recent call last):
-  File "/home/cdhernandez/local/gpt-fast/generate.py", line 423, in <module>
-    main(
-  File "/home/cdhernandez/local/gpt-fast/generate.py", line 276, in main
-    rank = maybe_init_dist()
-  File "/home/cdhernandez/local/gpt-fast/tp.py", line 49, in maybe_init_dist
-    torch.cuda.set_device(rank)
-  File "/home/cdhernandez/local/pytorch/torch/cuda/__init__.py", line 399, in set_device
-    torch._C._cuda_setDevice(device)
-RuntimeError: CUDA error: invalid device ordinal
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
-Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
-
-W0325 17:52:11.561000 140417423238144 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1765652 closing signal SIGTERM
-W0325 17:52:11.562000 140417423238144 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1765656 closing signal SIGTERM
-E0325 17:52:11.675000 140417423238144 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 1 (pid: 1765653) of binary: /home/cdhernandez/local/miniconda3/envs/pytorch/bin/python
-Traceback (most recent call last):
-  File "/home/cdhernandez/local/miniconda3/envs/pytorch/bin/torchrun", line 33, in <module>
-    sys.exit(load_entry_point('torch', 'console_scripts', 'torchrun')())
-  File "/home/cdhernandez/local/pytorch/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
-    return f(*args, **kwargs)
-  File "/home/cdhernandez/local/pytorch/torch/distributed/run.py", line 879, in main
-    run(args)
-  File "/home/cdhernandez/local/pytorch/torch/distributed/run.py", line 870, in run
-    elastic_launch(
-  File "/home/cdhernandez/local/pytorch/torch/distributed/launcher/api.py", line 132, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-  File "/home/cdhernandez/local/pytorch/torch/distributed/launcher/api.py", line 263, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
-============================================================
-generate.py FAILED
-------------------------------------------------------------
-Failures:
-[1]:
-  time      : 2024-03-25_17:52:11
-  host      : devgpu001.ash8.facebook.com
-  rank      : 2 (local_rank: 2)
-  exitcode  : 1 (pid: 1765654)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[2]:
-  time      : 2024-03-25_17:52:11
-  host      : devgpu001.ash8.facebook.com
-  rank      : 3 (local_rank: 3)
-  exitcode  : 1 (pid: 1765655)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[3]:
-  time      : 2024-03-25_17:52:11
-  host      : devgpu001.ash8.facebook.com
-  rank      : 5 (local_rank: 5)
-  exitcode  : 1 (pid: 1765657)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[4]:
-  time      : 2024-03-25_17:52:11
-  host      : devgpu001.ash8.facebook.com
-  rank      : 6 (local_rank: 6)
-  exitcode  : 1 (pid: 1765658)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[5]:
-  time      : 2024-03-25_17:52:11
-  host      : devgpu001.ash8.facebook.com
-  rank      : 7 (local_rank: 7)
-  exitcode  : 1 (pid: 1765659)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]:
-  time      : 2024-03-25_17:52:11
-  host      : devgpu001.ash8.facebook.com
-  rank      : 1 (local_rank: 1)
-  exitcode  : 1 (pid: 1765653)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-============================================================
diff --git a/model.py b/model.py
index e70e87a..dbf24e5 100644
--- a/model.py
+++ b/model.py
@@ -78,8 +78,10 @@ def update(self, input_pos, k_val, v_val):
         # input_pos: [S], k_val: [B, H, S, D]
         assert input_pos.shape[0] == k_val.shape[2]
 
-        k_out = torch.ops.aten.index_put_(self.k_cache, [None, None, input_pos], k_val)
-        v_out = torch.ops.aten.index_put_(self.v_cache, [None, None, input_pos], v_val)
+        k_out = self.k_cache
+        v_out = self.v_cache
+        k_out[:, :, input_pos] = k_val
+        v_out[:, :, input_pos] = v_val
 
         return k_out, v_out
 
diff --git a/run.sh b/run.sh
deleted file mode 100644
index d657afe..0000000
--- a/run.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
-
-# python generate.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --compile # working
-# echo "base"
-
-# python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4-gptq --calibration_tasks wikitext --calibration_limit 5
-python eval.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4-gptq.g32.cuda.pth --tasks wikitext --limit 5
-
-# python generate.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.pth --compile
-# echo "quant good"
-
-python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4
-python eval.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.cuda.pth --tasks wikitext --limit 5
-
-ENABLE_INTRA_NODE_COMM=1 torchrun --standalone --nproc_per_node=8 generate.py --compile --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.cuda.pth
-
-# python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4-gptq --calibration_tasks wikitext --calibration_limit 5