diff --git a/.github/workflows/ci-rerun-failed-test-tlm.yml b/.github/workflows/ci-rerun-failed-test-tlm.yml
index 9dd9c52c..9b90417d 100644
--- a/.github/workflows/ci-rerun-failed-test-tlm.yml
+++ b/.github/workflows/ci-rerun-failed-test-tlm.yml
@@ -47,6 +47,7 @@ jobs:
         run: |
           pytest tests/tlm/test_properties.py --last-failed --verbose
       - name: Cache pytest results
+        if: always() && github.event.comment.body == '/rerun-failed-test-tlm' # Ensure this runs even if tests above fail
         uses: actions/cache@v2
         with:
           path: .pytest_cache
diff --git a/.github/workflows/ci-test-tlm.yml b/.github/workflows/ci-test-tlm.yml
index 4da03fd1..deb87d2e 100644
--- a/.github/workflows/ci-test-tlm.yml
+++ b/.github/workflows/ci-test-tlm.yml
@@ -88,6 +88,7 @@ jobs:
         run: |
           pytest -n auto tests/tlm/test_properties.py --verbose
       - name: Cache pytest results
+        if: always() && github.event.comment.body == '/test-tlm' # Ensure this runs even if tests above fail
        uses: actions/cache@v2
         with:
           path: .pytest_cache
diff --git a/cleanlab_studio/internal/constants.py b/cleanlab_studio/internal/constants.py
index 594aa89c..97c4afe7 100644
--- a/cleanlab_studio/internal/constants.py
+++ b/cleanlab_studio/internal/constants.py
@@ -1,4 +1,4 @@
-from typing import List, Set, Tuple
+from typing import Dict, List, Set, Tuple
 
 # TLM constants
 # prepend constants with _ so that they don't show up in help.cleanlab.ai docs
@@ -14,7 +14,15 @@
 ]
 _TLM_DEFAULT_MODEL: str = "gpt-4o-mini"
 _TLM_MAX_RETRIES: int = 3  # TODO: finalize this number
-TLM_MAX_TOKEN_RANGE: Tuple[int, int] = (64, 512)  # (min, max)
+_TLM_MAX_TOKEN_RANGE: Dict[str, Tuple[int, int]] = {  # model: (min, max)
+    "gpt-3.5-turbo-16k": (64, 4096),
+    "gpt-4": (64, 4096),
+    "gpt-4o": (64, 4096),
+    "gpt-4o-mini": (64, 4096),
+    "claude-3-haiku": (64, 512),
+    "claude-3-sonnet": (64, 512),
+    "claude-3.5-sonnet": (64, 512),
+}
 TLM_NUM_CANDIDATE_RESPONSES_RANGE: Tuple[int, int] = (1, 20)  # (min, max)
 TLM_NUM_CONSISTENCY_SAMPLES_RANGE: Tuple[int, int] = (0, 20)  # (min, max)
 TLM_VALID_LOG_OPTIONS: Set[str] = {"perplexity", "explanation"}
diff --git a/cleanlab_studio/internal/tlm/validation.py b/cleanlab_studio/internal/tlm/validation.py
index cb09d4d0..74819d1b 100644
--- a/cleanlab_studio/internal/tlm/validation.py
+++ b/cleanlab_studio/internal/tlm/validation.py
@@ -3,8 +3,9 @@
 
 from cleanlab_studio.errors import ValidationError
 from cleanlab_studio.internal.constants import (
+    _TLM_DEFAULT_MODEL,
+    _TLM_MAX_TOKEN_RANGE,
     _VALID_TLM_MODELS,
-    TLM_MAX_TOKEN_RANGE,
     TLM_NUM_CANDIDATE_RESPONSES_RANGE,
     TLM_NUM_CONSISTENCY_SAMPLES_RANGE,
     TLM_VALID_GET_TRUSTWORTHINESS_SCORE_KWARGS,
@@ -137,9 +138,10 @@ def validate_tlm_options(options: Any) -> None:
             if not isinstance(val, int):
                 raise ValidationError(f"Invalid type {type(val)}, max_tokens must be an integer")
 
-            if val < TLM_MAX_TOKEN_RANGE[0] or val > TLM_MAX_TOKEN_RANGE[1]:
+            model = options.get("model", _TLM_DEFAULT_MODEL)
+            if val < _TLM_MAX_TOKEN_RANGE[model][0] or val > _TLM_MAX_TOKEN_RANGE[model][1]:
                 raise ValidationError(
-                    f"Invalid value {val}, max_tokens must be in the range {TLM_MAX_TOKEN_RANGE}"
+                    f"Invalid value {val}, max_tokens for {model} must be in the range {_TLM_MAX_TOKEN_RANGE[model]}"
                 )
 
         elif option == "model":
diff --git a/cleanlab_studio/studio/trustworthy_language_model.py b/cleanlab_studio/studio/trustworthy_language_model.py
index f41c4cb2..2294b6b2 100644
--- a/cleanlab_studio/studio/trustworthy_language_model.py
+++ b/cleanlab_studio/studio/trustworthy_language_model.py
@@ -760,7 +760,7 @@ class TLMOptions(TypedDict):
             that can be generated internally within the TLM (to estimate the trustworthiness score).
             Higher values here can produce better (more reliable) TLM responses and trustworthiness scores, but at higher costs/runtimes.
             If you are experiencing token limit errors while using the TLM (especially on higher quality presets), consider lowering this number.
-            This parameter must be between 64 and 512.
+            For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512.
 
         num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated by TLM.
             TLM scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
diff --git a/tests/tlm/conftest.py b/tests/tlm/conftest.py
index 3f3a0209..a0a514e7 100644
--- a/tests/tlm/conftest.py
+++ b/tests/tlm/conftest.py
@@ -8,6 +8,8 @@
 
 from cleanlab_studio import Studio
 from cleanlab_studio.internal.constants import (
+    _TLM_DEFAULT_MODEL,
+    _TLM_MAX_TOKEN_RANGE,
     _VALID_TLM_MODELS,
     _VALID_TLM_QUALITY_PRESETS,
 )
@@ -87,7 +89,8 @@ def _get_options_dictionary(model: Optional[str]) -> dict:
     add_log_perplexity_score = np.random.choice([True, False])
 
     if add_max_tokens:
-        options["max_tokens"] = int(np.random.randint(64, 512))
+        max_tokens = _TLM_MAX_TOKEN_RANGE[options.get("model", _TLM_DEFAULT_MODEL)][1]
+        options["max_tokens"] = int(np.random.randint(64, max_tokens))
     if add_use_self_reflection:
         options["use_self_reflection"] = random.choice([True, False])
     if add_num_candidate_responses:
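To make the new per-model bound concrete, below is a minimal sketch (not part of the diff) of how `validate_tlm_options` is expected to behave after this change. It assumes the function can be called directly with a plain options dict, as its signature in the hunk above suggests, and that the model names in `_TLM_MAX_TOKEN_RANGE` are also accepted by the `model` validation branch; the quoted error text comes from the diff.

```python
# Sketch of the expected per-model max_tokens validation (assumptions noted above).
from cleanlab_studio.errors import ValidationError
from cleanlab_studio.internal.tlm.validation import validate_tlm_options

# OpenAI models now accept up to 4096 tokens.
validate_tlm_options({"model": "gpt-4o", "max_tokens": 4096})

# Claude models keep the old 512-token cap, so this should raise a ValidationError.
try:
    validate_tlm_options({"model": "claude-3-haiku", "max_tokens": 4096})
except ValidationError as err:
    print(err)  # "... max_tokens for claude-3-haiku must be in the range (64, 512)"

# With no "model" key, the default model's range applies (gpt-4o-mini -> (64, 4096)).
validate_tlm_options({"max_tokens": 1024})
```

Keying the range on the requested model (falling back to `_TLM_DEFAULT_MODEL` when none is given) preserves the previous 512-token ceiling for Claude models while letting the OpenAI models use their larger limit.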