diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py
index ad040ed6..b569e971 100644
--- a/elk/extraction/extraction.py
+++ b/elk/extraction/extraction.py
@@ -147,6 +147,24 @@ def explode(self) -> list["Extract"]:
         ]
 
 
+def get_reusable_kv(
+    answer_ids: torch.Tensor, past_key_values: tuple[tuple[torch.Tensor]]
+) -> tuple[tuple[torch.Tensor]]:
+    reshaped_past_key_values = ()
+    answer_length = answer_ids.size(-1)
+
+    for layer_kv in past_key_values:
+        reshaped_kv = ()
+        for t in layer_kv:
+            # Tensor is (batch_size, num_heads, sequence_length, embed_size_per_head);
+            # drop the last answer_length positions so only the question prefix stays
+            reshaped_tensor = t[:, :, :-answer_length, :]
+            reshaped_kv += (reshaped_tensor,)
+        reshaped_past_key_values += (reshaped_kv,)
+
+    return reshaped_past_key_values
+
+
 @torch.inference_mode()
 def extract_hiddens(
     cfg: "Extract",
@@ -247,7 +265,7 @@ def extract_hiddens(
         # Iterate over variants
         for i, record in enumerate(example["prompts"]):
             variant_questions = []
-
+            cached_kv: tuple[tuple[Tensor]] | None = None
             # Iterate over answers
             for j, choice in enumerate(record):
                 text = choice["question"]
@@ -264,6 +282,7 @@ def extract_hiddens(
                 input_ids = assert_type(Tensor, encoding.input_ids)
                 if is_enc_dec:
                     answer = assert_type(Tensor, encoding.labels)
+                    input_ids_to_pass = input_ids
                 else:
                     encoding2 = tokenizer(
                         choice["answer"],
@@ -272,22 +291,29 @@ def extract_hiddens(
                         return_tensors="pt",
                     ).to(first_device)
                     answer = assert_type(Tensor, encoding2.input_ids)
-                    input_ids = torch.cat([input_ids, answer], dim=-1)
+                    input_ids_to_pass = torch.cat([input_ids, answer], dim=-1)
+                    # For decoders we can reuse the cached question key-values
 
                 # If this input is too long, skip it
-                if input_ids.shape[-1] > max_length:
+                if input_ids_to_pass.shape[-1] > max_length:
                     break
                 else:
                     # Record the EXACT question we fed to the model
                     variant_questions.append(text)
 
                 # Make sure we only pass the arguments that the model expects
-                inputs = dict(input_ids=input_ids.long())
+                inputs = dict(input_ids=input_ids_to_pass.long())
                 if is_enc_dec:
                     inputs["labels"] = answer
+                if cached_kv is not None and not is_enc_dec:
+                    inputs["past_key_values"] = cached_kv
+                    inputs["input_ids"] = answer.long()
 
-                outputs = model(**inputs, output_hidden_states=True)
+                outputs = model(**inputs, output_hidden_states=True, use_cache=True)
+                cached_kv = get_reusable_kv(
+                    answer_ids=answer, past_key_values=outputs.past_key_values
+                )
 
                 # Compute the log probability of the answer tokens if available
                 if has_lm_preds:
                     answer_len = answer.shape[-1]
@@ -320,7 +346,6 @@ def extract_hiddens(
                 for layer_idx, hidden in zip(layer_indices, hiddens):
                     hidden_dict[f"hidden_{layer_idx}"][i, j] = float_to_int16(hidden)
 
-
             # We skipped a pseudolabel because it was too long; break out of this whole
             # example and move on to the next one
             if len(variant_questions) != num_choices:
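For context, here is a minimal standalone sketch of the prefix-reuse pattern the diff implements, written directly against the Hugging Face API rather than elk's internals. The checkpoint and prompt strings are illustrative only; the slicing mirrors `get_reusable_kv`, and the final `allclose` check is a hypothetical way to confirm the cached pass reproduces a full forward pass. Newer transformers versions return a `Cache` object instead of the legacy tuple-of-tuples the diff assumes, hence the `to_legacy_cache` guard.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")
model.eval()

question = tokenizer("Is this review positive?", return_tensors="pt").input_ids
answer_a = tokenizer(" Yes", return_tensors="pt").input_ids
answer_b = tokenizer(" No", return_tensors="pt").input_ids

with torch.inference_mode():
    # First answer: run question + answer once and keep the KV cache.
    out_a = model(torch.cat([question, answer_a], dim=-1), use_cache=True)

    pkv = out_a.past_key_values
    if hasattr(pkv, "to_legacy_cache"):
        # Newer transformers return a Cache object; convert to the legacy
        # tuple-of-tuples format for parity with the diff.
        pkv = pkv.to_legacy_cache()

    # Trim the answer tokens off every layer's (key, value) pair so the cache
    # covers only the question prefix -- the same slicing as get_reusable_kv.
    prefix_kv = tuple(
        tuple(t[:, :, : -answer_a.size(-1), :] for t in layer) for layer in pkv
    )

    # Second answer: only the new tokens are fed; the question comes from the cache.
    out_b = model(answer_b, past_key_values=prefix_kv, use_cache=True)

    # Hypothetical sanity check: the cached pass should reproduce the logits of a
    # full forward pass over question + answer_b, up to float tolerance.
    full = model(torch.cat([question, answer_b], dim=-1)).logits
    assert torch.allclose(full[:, -answer_b.size(-1) :], out_b.logits, atol=1e-4)
```

The payoff is that the question prefix, which dominates the sequence length, is computed once per variant instead of once per answer. The trimmed cache is only meaningful for decoder-only models, which is why the diff guards the reuse path with `not is_enc_dec`.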
diff --git a/tests/test_smoke_eval.py b/tests/test_smoke_eval.py
index 683e718a..ac1f1f1c 100644
--- a/tests/test_smoke_eval.py
+++ b/tests/test_smoke_eval.py
@@ -2,10 +2,11 @@
 
 import pandas as pd
 
-from elk import Extract
+from elk import Extract, extract_hiddens
 from elk.evaluation import Eval
 from elk.training import CcsReporterConfig, EigenReporterConfig
 from elk.training.train import Elicit
+from elk.utils.multi_gpu import ModelDevices
 
 EVAL_EXPECTED_FILES = [
     "cfg.yaml",
@@ -19,7 +20,7 @@ def setup_elicit(
     tmp_path: Path,
     dataset_name="imdb",
     model_path="sshleifer/tiny-gpt2",
-    min_mem=10 * 1024 ** 2,
+    min_mem=10 * 1024**2,
     is_ccs: bool = True,
 ) -> Elicit:
     """Setup elicit config for testing, execute elicit, and save output to tmp_path.
@@ -96,6 +97,22 @@ def test_smoke_tfr_eval_run_tiny_gpt2_ccs(tmp_path: Path):
     eval_assert_files_created(elicit, transfer_datasets=transfer_datasets)
 
 
+def test_extract():
+    for row in extract_hiddens(
+        cfg=Extract(
+            model="sshleifer/tiny-gpt2",
+            datasets=("imdb",),
+            max_examples=(10, 10),
+            # no layers specified: run on all layers; tiny-gpt2 only has 2
+        ),
+        devices=ModelDevices(first_device="cpu", other_devices=[]),
+        split_type="train",
+        rank=0,
+        world_size=1,
+    ):
+        print(row)
+
+
 def test_smoke_eval_run_tiny_gpt2_eigen(tmp_path: Path):
     elicit = setup_elicit(tmp_path, is_ccs=False)
     transfer_datasets = ("christykoh/imdb_pt",)
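The new `test_extract` drives the full extraction loop on CPU with tiny-gpt2, so it exercises both the uncached first answer and the cached later answers end to end. To run it in isolation and see the printed rows (pytest captures stdout unless `-s` is passed):

    pytest tests/test_smoke_eval.py::test_extract -s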