diff --git a/.github/workflows/eval_test.yaml b/.github/workflows/ci_eval.yaml
similarity index 95%
rename from .github/workflows/eval_test.yaml
rename to .github/workflows/ci_eval.yaml
index ec3a0f3ca..d3681d95a 100644
--- a/.github/workflows/eval_test.yaml
+++ b/.github/workflows/ci_eval.yaml
@@ -55,7 +55,7 @@ jobs:
           pip install --no-compile -r pytorch-cpu-requirements.txt
           pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
             -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
-          pip install --no-compile -r requirements.txt -e sharktank/
+          pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/
 
       - name: Run perplexity test
         run: pytest sharktank/tests/evaluate/perplexity_test.py --longrun
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 8b3f50944..a005acf14 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -86,7 +86,7 @@ jobs:
           pip install --no-compile -r pytorch-rocm-requirements.txt
           pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
             -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
-          pip install --no-compile -r requirements.txt -e sharktank/ shortfin/
+          pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ shortfin/
 
       - name: Run punet tests
         run: |
diff --git a/sharktank/conftest.py b/sharktank/conftest.py
index 21498d728..040775409 100644
--- a/sharktank/conftest.py
+++ b/sharktank/conftest.py
@@ -128,14 +128,6 @@ def pytest_addoption(parser):
         help="Llama3.1 8B & 405B model baseline perplexity scores json",
     )
 
-    parser.addoption(
-        "--current-perplexity-scores-json",
-        type=Path,
-        action="store",
-        default="sharktank/tests/evaluate/current_perplexity_scores.json",
-        help="Llama3.1 8B & 405B model current perplexity scores json",
-    )
-
 
 def set_fixture_from_cli_option(
     request: FixtureRequest,
@@ -200,7 +192,4 @@ def get_model_path(request: FixtureRequest):
     model_path["baseline_perplexity_score_json"] = set_fixture_from_cli_option(
         request, "--baseline-perplexity-score-json", "baseline_perplexity_score_json"
     )
-    model_path["current_perplexity_scores_json"] = set_fixture_from_cli_option(
-        request, "--current-perplexity-scores-json", "current_perplexity_scores_json"
-    )
     return model_path
diff --git a/sharktank/tests/evaluate/perplexity_test.py b/sharktank/tests/evaluate/perplexity_test.py
index a9287ea3b..faf3a263f 100644
--- a/sharktank/tests/evaluate/perplexity_test.py
+++ b/sharktank/tests/evaluate/perplexity_test.py
@@ -23,11 +23,6 @@ def setUp(self):
         with open(self.baseline_perplexity_score_json, "r") as f:
             self.baseline_perplexity = json.load(f)
 
-    def save_perplexity(self, model_name: str, current_perplexity: dict):
-        self.current_perplexity_all = {model_name: current_perplexity}
-        with open(self.current_perplexity_scores_json, "w") as f:
-            json.dump(self.current_perplexity_all, f)
-
     @longrun
     def test_llama3_8B_f16_decomposed(self):
@@ -43,8 +38,6 @@ def test_llama3_8B_f16_decomposed(self):
             ]
         )
 
-        self.save_perplexity(model_name, current_perplexity)
-
         self.assertAlmostEqual(
             baseline_perplexity["mean_perplexity"],
             current_perplexity["mean_perplexity"],
@@ -71,8 +64,6 @@ def test_llama3_8B_f16_non_decomposed(self):
             ]
         )
 
-        self.save_perplexity(model_name, current_perplexity)
-
         self.assertAlmostEqual(
             baseline_perplexity["mean_perplexity"],
             current_perplexity["mean_perplexity"],
@@ -98,8 +89,6 @@ def test_llama3_8B_fp8_decomposed(self):
             ]
        )
 
-        self.save_perplexity(model_name, current_perplexity)
-
         self.assertAlmostEqual(
             baseline_perplexity["mean_perplexity"],
             current_perplexity["mean_perplexity"],
@@ -126,8 +115,6 @@ def test_llama3_8B_fp8_non_decomposed(self):
             ]
         )
 
-        self.save_perplexity(model_name, current_perplexity)
-
         self.assertAlmostEqual(
             baseline_perplexity["mean_perplexity"],
             current_perplexity["mean_perplexity"],
@@ -151,8 +138,6 @@ def test_llama3_405B_f16_decomposed(self):
             ]
         )
 
-        self.save_perplexity(model_name, current_perplexity)
-
         self.assertAlmostEqual(
             baseline_perplexity["mean_perplexity"],
             current_perplexity["mean_perplexity"],
@@ -180,8 +165,6 @@ def test_llama3_405B_f16_non_decomposed(self):
             ]
         )
 
-        self.save_perplexity(model_name, current_perplexity)
-
         self.assertAlmostEqual(
             baseline_perplexity["mean_perplexity"],
             current_perplexity["mean_perplexity"],
@@ -208,8 +191,6 @@ def test_llama3_405B_fp8_decomposed(self):
             ]
         )
 
-        self.save_perplexity(model_name, current_perplexity)
-
         self.assertAlmostEqual(
             baseline_perplexity["mean_perplexity"],
             current_perplexity["mean_perplexity"],
@@ -237,8 +218,6 @@ def test_llama3_405B_fp8_non_decomposed(self):
             ]
        )
 
-        self.save_perplexity(model_name, current_perplexity)
-
         self.assertAlmostEqual(
             baseline_perplexity["mean_perplexity"],
             current_perplexity["mean_perplexity"],
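
After this diff, the perplexity tests are baseline-only: each test computes perplexity, asserts the mean against the checked-in baseline JSON, and writes nothing back to disk. A minimal sketch of the retained test shape follows; it is not the repository's exact code. The run_perplexity helper, the model key, the baseline filename, and the delta are illustrative stand-ins (the real tests drive perplexity_vmfb.main([...]) with per-model arguments, and the diff above truncates before the actual delta values).

    # Minimal sketch of the baseline-only comparison pattern, assuming
    # stand-in names; see sharktank/tests/evaluate/perplexity_test.py
    # for the real tests.
    import json
    import unittest


    def run_perplexity(model_name: str) -> dict:
        # Stand-in for perplexity_vmfb.main([...]); the real call returns
        # a dict containing a "mean_perplexity" entry.
        return {"mean_perplexity": 6.0}


    class PerplexityTestSketch(unittest.TestCase):
        def setUp(self):
            # Baselines come from the JSON passed via
            # --baseline-perplexity-score-json; the --current-perplexity-scores-json
            # option and the save_perplexity() helper are gone after this diff,
            # so the test no longer writes current scores anywhere.
            with open("baseline_perplexity_scores.json", "r") as f:
                self.baseline_perplexity = json.load(f)

        def test_llama3_8B_f16_decomposed(self):
            model_name = "llama3_8B_f16_decomposed"  # hypothetical key
            baseline = self.baseline_perplexity[model_name]
            current = run_perplexity(model_name)
            # Compare mean perplexity only, within a tolerance.
            self.assertAlmostEqual(
                baseline["mean_perplexity"],
                current["mean_perplexity"],
                delta=0.5,  # illustrative tolerance, not the repo's value
            )

Dropping the write-back also explains the conftest.py changes: with no current-scores file produced, the CLI option and fixture plumbing for it can be deleted in the same change.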