Fixes compatibility with Llama 3.3 70B #856

Open
wants to merge 5 commits into base: main
27 changes: 14 additions & 13 deletions demos/Colab_Compatibility.ipynb
@@ -16,9 +16,9 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/m3/z6c6rcdj1rbb2jh9vqpgvxg40000gn/T/ipykernel_57027/2944939757.py:18: DeprecationWarning: `magic(...)` is deprecated since IPython 0.13 (warning added in 8.1), use run_line_magic(magic_name, parameter_s).\n",
"/var/folders/pr/77j77_bs2gl2stxyrvr14x3c0000gn/T/ipykernel_60991/3507779555.py:18: DeprecationWarning: `magic(...)` is deprecated since IPython 0.13 (warning added in 8.1), use run_line_magic(magic_name, parameter_s).\n",
" ipython.magic(\"load_ext autoreload\")\n",
"/var/folders/m3/z6c6rcdj1rbb2jh9vqpgvxg40000gn/T/ipykernel_57027/2944939757.py:19: DeprecationWarning: `magic(...)` is deprecated since IPython 0.13 (warning added in 8.1), use run_line_magic(magic_name, parameter_s).\n",
"/var/folders/pr/77j77_bs2gl2stxyrvr14x3c0000gn/T/ipykernel_60991/3507779555.py:19: DeprecationWarning: `magic(...)` is deprecated since IPython 0.13 (warning added in 8.1), use run_line_magic(magic_name, parameter_s).\n",
" ipython.magic(\"autoreload 2\")\n"
]
}
@@ -43,7 +43,7 @@
" # Code to automatically update the HookedTransformer code as its edited without restarting the kernel\n",
" ipython.magic(\"load_ext autoreload\")\n",
" ipython.magic(\"autoreload 2\")\n",
" \n",
"\n",
"\n",
"\n",
"if IN_COLAB or IN_GITHUB:\n",
@@ -58,14 +58,14 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TransformerLens currently supports 207 models out of the box.\n"
"TransformerLens currently supports 208 models out of the box.\n"
]
}
],
@@ -96,7 +96,7 @@
"def mark_models_as_tested(model_set: List[str]) -> None:\n",
" for model in model_set:\n",
" untested_models.remove(model)\n",
" \n",
"\n",
"\n",
"def run_set(model_set: List[str], device=\"cuda\") -> None:\n",
" for model in model_set:\n",
@@ -112,12 +112,12 @@
"def run_llama_set(model_set: List[str], weight_root: str, device=\"cuda\") -> None:\n",
" for model in model_set:\n",
" print(\"Testing \" + model)\n",
" # to run this, make sure weight root is the root that contains all models with the \n",
" # to run this, make sure weight root is the root that contains all models with the\n",
" # sub directories sharing the same name as the model in the list of models\n",
" tokenizer = LlamaTokenizer.from_pretrained(weight_root + model)\n",
" hf_model = LlamaForCausalLM.from_pretrained(weight_root + model, low_cpu_mem_usage=True)\n",
" tl_model = HookedTransformer.from_pretrained_no_processing(\n",
" model, \n",
" model,\n",
" hf_model=hf_model,\n",
" device=device,\n",
" fold_ln=False,\n",
@@ -309,7 +309,7 @@
"\n",
"if IN_COLAB:\n",
" run_set(free_compatible)\n",
" \n",
"\n",
"mark_models_as_tested(free_compatible)"
]
},
@@ -357,6 +357,7 @@
" \"mistralai/Mistral-7B-Instruct-v0.1\",\n",
" \"mistralai/Mistral-7B-v0.1\",\n",
" \"mistralai/Mistral-Nemo-Base-2407\",\n",
" \"mistralai/Mistral-Small-24B-Base-2501\",\n",
" \"Qwen/Qwen-7B\",\n",
" \"Qwen/Qwen-7B-Chat\",\n",
" \"Qwen/Qwen1.5-4B\",\n",
@@ -377,7 +378,7 @@
"\n",
"if IN_COLAB:\n",
" run_set(paid_gpu_models)\n",
" \n",
"\n",
"mark_models_as_tested(paid_gpu_models)"
]
},
@@ -410,7 +411,7 @@
"\n",
"if IN_COLAB:\n",
" run_set(paid_cpu_models, \"cpu\")\n",
" \n",
"\n",
"mark_models_as_tested(paid_cpu_models)"
]
},
@@ -528,7 +529,7 @@
"# Any models listed in the cell below have not been tested. This should always remain blank. If your\n",
"# PR fails due to this notebook, most likely you need to check any new model changes to ensure that\n",
"# this notebook is up to date.\n",
"print(*untested_models, sep = '\\n')"
"print(*untested_models, sep=\"\\n\")"
]
}
],
@@ -548,7 +549,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.12.3"
}
},
"nbformat": 4,
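For context, the notebook's `run_llama_set` helper (shown in the diff above) expects a local `weight_root` whose subdirectories are named after the models in the list. A minimal, hypothetical invocation for the Llama 3.3 70B weights this PR targets might look like the sketch below; the directory layout and model name are assumptions for illustration, not part of the PR.

```python
# Hypothetical usage of the notebook's run_llama_set helper (illustrative only).
# Assumes weight_root ends with "/" and contains one subdirectory per model name,
# since the helper concatenates weight_root + model when loading weights.
llama_3_3_models = ["Llama-3.3-70B-Instruct"]  # assumed local directory name

run_llama_set(llama_3_3_models, weight_root="/path/to/llama-weights/", device="cuda")
```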
3 changes: 2 additions & 1 deletion transformer_lens/loading_from_pretrained.py
@@ -182,6 +182,7 @@
"stabilityai/stablelm-tuned-alpha-7b",
"mistralai/Mistral-7B-v0.1",
"mistralai/Mistral-7B-Instruct-v0.1",
"mistralai/Mistral-Small-24B-Base-2501",
"mistralai/Mistral-Nemo-Base-2407",
"mistralai/Mixtral-8x7B-v0.1",
"mistralai/Mixtral-8x7B-Instruct-v0.1",
@@ -979,7 +980,7 @@ def convert_hf_model_config(model_name: str, **kwargs):
"normalization_type": "RMS",
"positional_embedding_type": "rotary",
"rotary_adjacent_pairs": False,
"rotary_dim": 32,
"rotary_dim": 128,
"final_rms": True,
"gated_mlp": True,
"rotary_base": 500000.0,
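Besides adding `mistralai/Mistral-Small-24B-Base-2501` to the official model list, the substantive change in `loading_from_pretrained.py` is the `rotary_dim` bump from 32 to 128 inside `convert_hf_model_config`. The surrounding keys in the hunk (RMS norm, gated MLP, `rotary_base` 500000.0) suggest a Llama-3-style config branch, and for Llama 3.3 70B the rotary embeddings span the full head dimension, so `rotary_dim` should equal `d_head`. A rough check of that arithmetic, using values assumed from the public Hugging Face config rather than anything shown in this diff:

```python
# Sketch of the arithmetic behind rotary_dim = 128 (config values assumed from
# the public Llama 3.3 70B Hugging Face config, not taken from this diff).
d_model = 8192              # hidden_size
n_heads = 64                # num_attention_heads
d_head = d_model // n_heads

# Rotary embeddings are applied across the whole head dimension,
# so rotary_dim matches d_head (128) rather than the previous value of 32.
assert d_head == 128
```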