[TLM] Update max_tokens for TLM to match provider limits #324

Merged · 5 commits · Oct 1, 2024
1 change: 1 addition & 0 deletions .github/workflows/ci-rerun-failed-test-tlm.yml
@@ -47,6 +47,7 @@ jobs:
         run: |
           pytest tests/tlm/test_properties.py --last-failed --verbose
       - name: Cache pytest results
+        if: always() && github.event.comment.body == '/rerun-failed-test-tlm' # Ensure this runs even if tests above fail
         uses: actions/cache@v2
         with:
           path: .pytest_cache
1 change: 1 addition & 0 deletions .github/workflows/ci-test-tlm.yml
@@ -88,6 +88,7 @@ jobs:
         run: |
           pytest -n auto tests/tlm/test_properties.py --verbose
       - name: Cache pytest results
+        if: always() && github.event.comment.body == '/test-tlm' # Ensure this runs even if tests above fail
         uses: actions/cache@v2
         with:
           path: .pytest_cache
12 changes: 10 additions & 2 deletions cleanlab_studio/internal/constants.py
@@ -1,4 +1,4 @@
-from typing import List, Set, Tuple
+from typing import Dict, List, Set, Tuple
 
 # TLM constants
 # prepend constants with _ so that they don't show up in help.cleanlab.ai docs
@@ -14,7 +14,15 @@
 ]
 _TLM_DEFAULT_MODEL: str = "gpt-4o-mini"
 _TLM_MAX_RETRIES: int = 3  # TODO: finalize this number
-TLM_MAX_TOKEN_RANGE: Tuple[int, int] = (64, 512)  # (min, max)
+_TLM_MAX_TOKEN_RANGE: Dict[str, Tuple[int, int]] = {  # model: (min, max)
+    "gpt-3.5-turbo-16k": (64, 4096),
+    "gpt-4": (64, 4096),
+    "gpt-4o": (64, 4096),
+    "gpt-4o-mini": (64, 4096),
+    "claude-3-haiku": (64, 512),
+    "claude-3-sonnet": (64, 512),
+    "claude-3.5-sonnet": (64, 512),
+}
 TLM_NUM_CANDIDATE_RESPONSES_RANGE: Tuple[int, int] = (1, 20)  # (min, max)
 TLM_NUM_CONSISTENCY_SAMPLES_RANGE: Tuple[int, int] = (0, 20)  # (min, max)
 TLM_VALID_LOG_OPTIONS: Set[str] = {"perplexity", "explanation"}
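The new mapping turns the token limit into a per-model lookup rather than a single global range. A minimal sketch of reading it (imports and values taken from the diff above):

    from cleanlab_studio.internal.constants import _TLM_DEFAULT_MODEL, _TLM_MAX_TOKEN_RANGE

    # OpenAI models now allow up to 4096 output tokens ...
    print(_TLM_MAX_TOKEN_RANGE["gpt-4o"])            # (64, 4096)
    # ... while Claude models keep the previous 512-token cap.
    print(_TLM_MAX_TOKEN_RANGE["claude-3-haiku"])    # (64, 512)
    # When no model is specified, the default model's limits apply.
    print(_TLM_MAX_TOKEN_RANGE[_TLM_DEFAULT_MODEL])  # (64, 4096) for "gpt-4o-mini"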
8 changes: 5 additions & 3 deletions cleanlab_studio/internal/tlm/validation.py
@@ -3,8 +3,9 @@
 
 from cleanlab_studio.errors import ValidationError
 from cleanlab_studio.internal.constants import (
+    _TLM_DEFAULT_MODEL,
+    _TLM_MAX_TOKEN_RANGE,
     _VALID_TLM_MODELS,
-    TLM_MAX_TOKEN_RANGE,
     TLM_NUM_CANDIDATE_RESPONSES_RANGE,
     TLM_NUM_CONSISTENCY_SAMPLES_RANGE,
     TLM_VALID_GET_TRUSTWORTHINESS_SCORE_KWARGS,
@@ -137,9 +138,10 @@ def validate_tlm_options(options: Any) -> None:
             if not isinstance(val, int):
                 raise ValidationError(f"Invalid type {type(val)}, max_tokens must be an integer")
 
-            if val < TLM_MAX_TOKEN_RANGE[0] or val > TLM_MAX_TOKEN_RANGE[1]:
+            model = options.get("model", _TLM_DEFAULT_MODEL)
+            if val < _TLM_MAX_TOKEN_RANGE[model][0] or val > _TLM_MAX_TOKEN_RANGE[model][1]:
                 raise ValidationError(
-                    f"Invalid value {val}, max_tokens must be in the range {TLM_MAX_TOKEN_RANGE}"
+                    f"Invalid value {val}, max_tokens for {model} must be in the range {_TLM_MAX_TOKEN_RANGE[model]}"
                 )
 
         elif option == "model":
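With this change, the same max_tokens value can pass or fail validation depending on which model the options dict names. A sketch of the expected behavior after this PR (module paths as in the diff above):

    from cleanlab_studio.errors import ValidationError
    from cleanlab_studio.internal.tlm.validation import validate_tlm_options

    # 1024 is within the (64, 4096) range for gpt-4o, so this passes silently.
    validate_tlm_options({"model": "gpt-4o", "max_tokens": 1024})

    # The same value exceeds the (64, 512) cap for Claude models.
    try:
        validate_tlm_options({"model": "claude-3-haiku", "max_tokens": 1024})
    except ValidationError as err:
        print(err)  # Invalid value 1024, max_tokens for claude-3-haiku must be in the range (64, 512)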
2 changes: 1 addition & 1 deletion cleanlab_studio/studio/trustworthy_language_model.py
@@ -760,7 +760,7 @@ class TLMOptions(TypedDict):
         that can be generated internally within the TLM (to estimate the trustworthiness score).
         Higher values here can produce better (more reliable) TLM responses and trustworthiness scores, but at higher costs/runtimes.
         If you are experiencing token limit errors while using the TLM (especially on higher quality presets), consider lowering this number.
-        This parameter must be between 64 and 512.
+        For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512.
 
         num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated by TLM.
         TLM scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
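From the user's side, this means OpenAI-backed models can now be asked for much longer outputs via TLMOptions. A hedged usage sketch (the API key is a placeholder; TLM construction follows the cleanlab_studio docs):

    from cleanlab_studio import Studio

    studio = Studio("<your_api_key>")  # placeholder key

    # 1024 output tokens: rejected before this PR, now valid for OpenAI models.
    tlm = studio.TLM(options={"model": "gpt-4o", "max_tokens": 1024})
    response = tlm.prompt("Summarize the plot of Hamlet in one paragraph.")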
5 changes: 4 additions & 1 deletion tests/tlm/conftest.py
@@ -8,6 +8,8 @@
 
 from cleanlab_studio import Studio
 from cleanlab_studio.internal.constants import (
+    _TLM_DEFAULT_MODEL,
+    _TLM_MAX_TOKEN_RANGE,
     _VALID_TLM_MODELS,
     _VALID_TLM_QUALITY_PRESETS,
 )
@@ -87,7 +89,8 @@ def _get_options_dictionary(model: Optional[str]) -> dict:
     add_log_perplexity_score = np.random.choice([True, False])
 
     if add_max_tokens:
-        options["max_tokens"] = int(np.random.randint(64, 512))
+        max_tokens = _TLM_MAX_TOKEN_RANGE[options.get("model", _TLM_DEFAULT_MODEL)][1]
+        options["max_tokens"] = int(np.random.randint(64, max_tokens))
     if add_use_self_reflection:
         options["use_self_reflection"] = random.choice([True, False])
     if add_num_candidate_responses:
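One detail worth noting in this fixture: np.random.randint excludes its upper bound, so the sampled max_tokens stays strictly below each model's cap and always satisfies the validation above. A condensed sketch of the draw (options dict is hypothetical):

    import numpy as np
    from cleanlab_studio.internal.constants import _TLM_DEFAULT_MODEL, _TLM_MAX_TOKEN_RANGE

    options = {"model": "claude-3.5-sonnet"}  # hypothetical options under test
    cap = _TLM_MAX_TOKEN_RANGE[options.get("model", _TLM_DEFAULT_MODEL)][1]  # 512 here
    options["max_tokens"] = int(np.random.randint(64, cap))  # draws from [64, 511]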