From 0f350b6bbbcc41f29dc204a7f67262d0c6e009b0 Mon Sep 17 00:00:00 2001 From: huiwengoh <45724323+huiwengoh@users.noreply.github.com> Date: Fri, 12 Apr 2024 19:21:59 -0400 Subject: [PATCH 01/13] improve tlm docs --- cleanlab_studio/studio/studio.py | 18 ++++--- .../studio/trustworthy_language_model.py | 53 ++++++++++++++----- 2 files changed, 52 insertions(+), 19 deletions(-) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 4e7122e2..61d02844 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -399,12 +399,18 @@ def TLM( Args: quality_preset (TLMQualityPreset): quality preset to use for TLM queries, which will determine the quality of the output responses and trustworthiness scores. - Supported presets include "best", "high", "medium", "low", "base". - The "best" and "high" presets will improve the LLM responses themselves, with "best" also returning the most reliable trustworthiness scores. - The "medium" and "low" presets will return standard LLM responses along with associated confidence scores, - with "medium" producing more reliable trustworthiness scores than low. - The "base" preset will not return any confidence score, just a standard LLM output response, this option is similar to using your favorite LLM API. - Higher presets have increased runtime and cost. + TLMQualityPreset is a string specifying either of the supported presets, including "best", "high", "medium", "low", "base". + + The "best" and "high" presets will improve the LLM responses themselves, alongside providing reliable trustworthiness scores. + The "medium" and "low" presets will return standard LLM responses along with associated trustworthiness scores, + with "medium" producing more reliable trustworthiness scores than low. + The "base" preset will not return any trustworthiness score, just a standard LLM output response, this option is similar to using your favorite LLM API. 
+ + Higher presets have increased runtime and cost. For more information about the details of each present, + view the documentation for [TLMOptions](../trustworthy_language_model#class-tlmoptions). + Note that it is recommended to avoid using "best" or "high" presets if you mostly care about evaluating the LLM outputs using trustworthiness scores + (and not about improving the LLM responses), as these presets have higher runtimes/costs and are optimized to return more accurate LLM outputs, + but not necessarily more reliable trustworthiness scores. options (TLMOptions, optional): a typed dict of advanced configuration options. Options that can be passed in include "model", "max_tokens", "num_candidate_responses", "num_consistency_samples", "use_self_reflection". diff --git a/cleanlab_studio/studio/trustworthy_language_model.py b/cleanlab_studio/studio/trustworthy_language_model.py index f442a337..b21ad190 100644 --- a/cleanlab_studio/studio/trustworthy_language_model.py +++ b/cleanlab_studio/studio/trustworthy_language_model.py @@ -36,7 +36,10 @@ class TLM: """Represents a Trustworthy Language Model (TLM) instance, bound to a Cleanlab Studio account. - TLM should be configured and instantiated using the [`Studio.TLM()`](../studio/#method-tlm) method. Then, using the TLM object, you can [`prompt()`](#method-prompt) the language model, etc. + ** The TLM object is not meant to be constructed directly.** Instead, use the [`Studio.TLM()`](../studio/#method-tlm) + method to configure and instantiate a TLM object. + After you've instantiated the TLM object using [`Studio.TLM()`](../studio/#method-tlm), you can use the instance methods below, + such as [`prompt()`](#method-prompt) and [`get_trustworthiness_score()`](#method-get_trustworthiness_score). """ def __init__( @@ -50,7 +53,8 @@ def __init__( ) -> None: """Initializes a Trustworthy Language Model. 
- **Objects of this class are not meant to be constructed directly.** Instead, use [`Studio.TLM()`](../studio/#method-tlm), whose documentation also explains the different configuration options.""" + lazydocs: ignore + """ self._api_key = api_key if quality_preset not in _VALID_TLM_QUALITY_PRESETS: @@ -129,7 +133,7 @@ async def _batch_get_trustworthiness_score( responses: Sequence[str], capture_exceptions: bool = False, ) -> Union[List[float], List[Optional[float]]]: - """Run batch of TLM get confidence score. + """Run batch of TLM get trustworthiness score. capture_exceptions behavior: - If true, the list will contain None in place of the response for any errors or timeout processing some inputs. @@ -140,19 +144,19 @@ async def _batch_get_trustworthiness_score( - If false, a single timeout is applied to the entire batch (i.e. all queries will fail if the timeout is reached) Args: - prompts (Sequence[str]): list of prompts to run get confidence score for - responses (Sequence[str]): list of responses to run get confidence score for + prompts (Sequence[str]): list of prompts to run get trustworthiness score for + responses (Sequence[str]): list of responses to run get trustworthiness score for capture_exceptions (bool): if should return None in place of the response for any errors or timeout processing some inputs Returns: - Union[List[float], List[Optional[float]]]: TLM confidence score for each prompt (in supplied order) + Union[List[float], List[Optional[float]]]: TLM trustworthiness score for each prompt (in supplied order) """ if capture_exceptions: per_query_timeout, per_batch_timeout = self._timeout, None else: per_query_timeout, per_batch_timeout = None, self._timeout - # run batch of TLM get confidence score + # run batch of TLM get trustworthiness score tlm_responses = await self._batch_async( [ self._get_trustworthiness_score_async( @@ -180,7 +184,7 @@ async def _batch_async( """Runs batch of TLM queries. 
Args: - tlm_coroutines (List[Coroutine[None, None, Union[TLMResponse, float, None]]]): list of query coroutines to run, returning TLM responses or confidence scores (or None if capture_exceptions is True) + tlm_coroutines (List[Coroutine[None, None, Union[TLMResponse, float, None]]]): list of query coroutines to run, returning TLM responses or trustworthiness scores (or None if capture_exceptions is True) batch_timeout (Optional[float], optional): timeout (in seconds) to run all queries, defaults to None (no timeout) Returns: @@ -266,8 +270,8 @@ def try_prompt( The list returned will have the same length as the input list, if there are any failures (errors or timeout) processing some inputs, the list will contain None in place of the response. - If there are any failures (errors or timeouts) processing some inputs, the list returned will have - the same length as the input list. In case of failure, the list will contain None in place of the response. + This is the recommended way to get TLM responses and trustworthiness scores for big datasets, + where some individual responses within the dataset may fail, as it will ensure partial results are not lost. Args: prompt (Sequence[str]): list of multiple prompts for the TLM @@ -411,10 +415,14 @@ def try_get_trustworthiness_score( response: Sequence[str], ) -> List[Optional[float]]: """Gets trustworthiness score for prompt-response pairs. + The list returned will have the same length as the input list, if there are any failures (errors or timeout) processing some inputs, the list will contain None in place of the response. + This is the recommended way to get TLM trustworthiness scores for big datasets, + where some individual responses within the dataset may fail, as it will ensure partial results are not lost. 
+ Args: prompt (Sequence[str]): list of prompts for the TLM to evaluate response (Sequence[str]): list of responses corresponding to the input prompts @@ -495,7 +503,7 @@ async def _get_trustworthiness_score_async( """ if self._quality_preset == "base": raise ValidationError( - "Cannot get confidence score with `base` quality_preset -- choose a higher preset." + "Cannot get trustworthiness score with `base` quality_preset -- choose a higher preset." ) try: @@ -543,11 +551,30 @@ class TLMOptions(TypedDict): (see the arguments in the TLM [initialization method](../studio#method-tlm) to learn more about the various quality presets), but specifying custom values here will override any default values from the quality preset. + For all options described below, higher/more expensive settings will lead to longer runtimes and may consume more tokens internally. + The high token cost might make it such that you are not able to run long prompts (or prompts with long responses) in your account, + unless your token limits are increased. If you are running into issue with token limits, try using lower/less expensive settings + to be able to run longer prompts. 
+ + The default values for the various quality presets (specified when instantiating [`Studio.TLM`](../studio/#method-tlm)) are as below: + - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, `use_self_reflection` = True, this quality preset will return improved LLM responses + - **high:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, `use_self_reflection` = True, this quality preset will return improved LLM responses + - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, `use_self_reflection` = True + - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, `use_self_reflection` = True + - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, `use_self_reflection` = False, this quality preset is equivalent to a regular LLM call + + By default, the TLM is set to the "medium" quality preset. The default `model` used is "gpt-3.5-turbo-16k", and `max_tokens` is 512 for all quality presets. + You can set custom values for these arguments regardless of the quality preset specified. + Args: model (str, default = "gpt-3.5-turbo-16k"): underlying LLM to use (better models will yield better results). Models currently supported include "gpt-3.5-turbo-16k", "gpt-4". max_tokens (int, default = 512): the maximum number of tokens to generate in the TLM response. + This number will impact the maximum number of tokens you will see in the output response, and also the number of tokens + that can be generated for internal calls (to estimate the trustworthiness score). + Higher values here produce better (more reliable) TLM responses and trustworthiness scores, but at higher costs/runtimes. + If you are experiencing token limits while using the TLM (especially on higher quality presets), consider lowering this number. The minimum value for this parameter is 64, and the maximum is 512. 
num_candidate_responses (int, default = 1): this controls how many candidate responses are internally generated. @@ -555,10 +582,10 @@ class TLMOptions(TypedDict): Higher values here can produce better (more accurate) responses from the TLM, but at higher costs/runtimes. The minimum value for this parameter is 1, and the maximum is 20. - num_consistency_samples (int, default = 5): this controls how many samples are internally generated to evaluate the LLM-response-consistency. + num_consistency_samples (int, default = 4): this controls how many samples are internally generated to evaluate the LLM-response-consistency. This is a big part of the returned trustworthiness_score, in particular to evaluate strange input prompts or prompts that are too open-ended to receive a clearly defined 'good' response. - Higher values here produce better (more reliable) TLM confidence scores, but at higher costs/runtimes. + Higher values here produce better (more reliable) TLM trustworthiness scores, but at higher costs/runtimes. The minimum value for this parameter is 0, and the maximum is 20. use_self_reflection (bool, default = `True`): this controls whether self-reflection is used to have the LLM reflect upon the response it is From d49c0d83eb9fe6b6df6e53ca0a348029430434f0 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Fri, 12 Apr 2024 16:57:40 -0700 Subject: [PATCH 02/13] language improvement --- cleanlab_studio/studio/trustworthy_language_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab_studio/studio/trustworthy_language_model.py b/cleanlab_studio/studio/trustworthy_language_model.py index b21ad190..865c4647 100644 --- a/cleanlab_studio/studio/trustworthy_language_model.py +++ b/cleanlab_studio/studio/trustworthy_language_model.py @@ -51,7 +51,7 @@ def __init__( timeout: Optional[float] = None, verbose: Optional[bool] = None, ) -> None: - """Initializes a Trustworthy Language Model. 
+ """Use `Studio.TLM()` instead of this method to initialize a TLM. lazydocs: ignore """ From f599dae52ff978f74346c4a603d99754557a5b51 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Fri, 12 Apr 2024 17:10:52 -0700 Subject: [PATCH 03/13] jonas edits on studio.py docs --- cleanlab_studio/studio/studio.py | 36 ++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 61d02844..4a314b7c 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -391,34 +391,38 @@ def TLM( timeout: Optional[float] = None, verbose: Optional[bool] = None, ) -> trustworthy_language_model.TLM: - """Gets a configured instance of Trustworthy Language Model (TLM). + """Instantiates a configured Trustworthy Language Model (TLM) instance. - The returned TLM object can then be used as a drop-in replacement for an LLM, for estimating trustworthiness scores for LLM prompt/response pairs, and more. See the documentation for the [TLM](../trustworthy_language_model#class-TLM) class for more on what you can do with TLM. + The TLM object can be used as a drop-in replacement for an LLM, or for estimating trustworthiness scores for arbitrary text prompt/response pairs, and more (see the [TLM documentation](../trustworthy_language_model#class-TLM)). - For advanced use cases, TLM supports a number of configuration options. The documentation below summarizes the options, and the [TLM tutorial](/tutorials/tlm) explains the tradeoffs in more detail. + For advanced use, TLM offers configuration options. The documentation below summarizes these options, and more details are explained in the [TLM tutorial](/tutorials/tlm). Args: - quality_preset (TLMQualityPreset): quality preset to use for TLM queries, which will determine the quality of the output responses and trustworthiness scores. 
- TLMQualityPreset is a string specifying either of the supported presets, including "best", "high", "medium", "low", "base". + quality_preset (TLMQualityPreset): An optional preset to control the quality of TLM responses and trustworthiness scores vs. runtimes/costs. + TLMQualityPreset is a string specifying one of the supported presets: "best", "high", "medium", "low", "base". - The "best" and "high" presets will improve the LLM responses themselves, alongside providing reliable trustworthiness scores. - The "medium" and "low" presets will return standard LLM responses along with associated trustworthiness scores, + The "best" and "high" presets improve the LLM responses themselves, + with "best" returning more reliable trustworthiness scores than "high". + The "medium" and "low" presets return standard LLM responses along with associated trustworthiness scores, with "medium" producing more reliable trustworthiness scores than low. - The "base" preset will not return any trustworthiness score, just a standard LLM output response, this option is similar to using your favorite LLM API. + The "base" preset will not return any trustworthiness score, just a standard LLM response, and is similar to directly using your favorite LLM API. - Higher presets have increased runtime and cost. For more information about the details of each present, - view the documentation for [TLMOptions](../trustworthy_language_model#class-tlmoptions). - Note that it is recommended to avoid using "best" or "high" presets if you mostly care about evaluating the LLM outputs using trustworthiness scores - (and not about improving the LLM responses), as these presets have higher runtimes/costs and are optimized to return more accurate LLM outputs, - but not necessarily more reliable trustworthiness scores. + Higher presets have increased runtime and cost (and may internally consume more tokens). + Reduce your preset if you see token-limit errors. 
Details about each preset are in the documentation for [TLMOptions](../trustworthy_language_model#class-tlmoptions). + Avoid using "best" or "high" presets if you primarily want to get trustworthiness scores, and are less concerned with improving LLM responses. + These presets have higher runtime/cost and are optimized to return more accurate LLM outputs, but not necessarily more reliable trustworthiness scores. options (TLMOptions, optional): a typed dict of advanced configuration options. - Options that can be passed in include "model", "max_tokens", "num_candidate_responses", "num_consistency_samples", "use_self_reflection". + Available options (keys in this dict) include: "model", "max_tokens", "num_candidate_responses", "num_consistency_samples", "use_self_reflection". For more details about the options, see the documentation for [TLMOptions](../trustworthy_language_model#class-tlmoptions). + If specified, these override any settings from the choice of `quality_preset`. - timeout (float, optional): timeout (in seconds) to apply to each method call. If a result is not produced within the timeout, a TimeoutError will be raised. Defaults to None, which does not apply a timeout. + timeout (float, optional): timeout (in seconds) to apply to each method call. + If a result is not produced within the timeout, a TimeoutError will be raised. Defaults to None, which does not apply a timeout. - verbose (bool, optional): whether to run in verbose mode, i.e., whether to show a tqdm progress bar when TLM is prompted with batches of data. If None, this will be determined automatically based on whether the code is running in an interactive environment such as a notebook. + verbose (bool, optional): whether to print outputs during execution, i.e., whether to show a progress bar when TLM is prompted with batches of data. + If None, this will be determined automatically based on whether the code is running in an interactive environment such as a Jupyter notebook. 
Returns: TLM: the [Trustworthy Language Model](../trustworthy_language_model#class-tlm) object From 21b14d8b1d82879bc9be0f516062030105c391e0 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Fri, 12 Apr 2024 17:36:10 -0700 Subject: [PATCH 04/13] writing improvements --- .../studio/trustworthy_language_model.py | 90 +++++++++---------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/cleanlab_studio/studio/trustworthy_language_model.py b/cleanlab_studio/studio/trustworthy_language_model.py index 865c4647..8154a3f7 100644 --- a/cleanlab_studio/studio/trustworthy_language_model.py +++ b/cleanlab_studio/studio/trustworthy_language_model.py @@ -1,7 +1,7 @@ """ Cleanlab's Trustworthy Language Model (TLM) is a large language model that gives more reliable answers and quantifies its uncertainty in these answers. -**This module is not meant to be imported and used directly.** Instead, use [`Studio.TLM()`](/reference/python/studio/#method-tlm) to instantiate a [TLM](#class-TLM) object, and then you can use the methods like [`prompt()`](#method-prompt) and [`get_trustworthiness_score()`](#method-get_trustworthiness_score) documented in this page. +**This module is not meant to be imported and used directly.** Instead, use [`Studio.TLM()`](/reference/python/studio/#method-tlm) to instantiate a [TLM](#class-TLM) object, and then you can use the methods like [`prompt()`](#method-prompt) and [`get_trustworthiness_score()`](#method-get_trustworthiness_score) documented on this page. The [Trustworthy Language Model tutorial](/tutorials/tlm/) further explains TLM and its use cases. """ @@ -34,12 +34,11 @@ class TLM: - """Represents a Trustworthy Language Model (TLM) instance, bound to a Cleanlab Studio account. + """Represents a Trustworthy Language Model (TLM) instance, which is bound to a Cleanlab Studio account. 
** The TLM object is not meant to be constructed directly.** Instead, use the [`Studio.TLM()`](../studio/#method-tlm) method to configure and instantiate a TLM object. - After you've instantiated the TLM object using [`Studio.TLM()`](../studio/#method-tlm), you can use the instance methods below, - such as [`prompt()`](#method-prompt) and [`get_trustworthiness_score()`](#method-get_trustworthiness_score). + After you've instantiated the TLM object using [`Studio.TLM()`](../studio/#method-tlm), you can use the instance methods documented on this page. """ def __init__( @@ -91,17 +90,15 @@ async def _batch_prompt( prompts: Sequence[str], capture_exceptions: bool = False, ) -> Union[List[TLMResponse], List[Optional[TLMResponse]]]: - """Run batch of TLM prompts. The list returned will have the same length as the input list. - - If capture_exceptions is True, the list will contain None in place of the response for any errors or timeout processing some inputs. - Otherwise, the method will raise an exception for any errors or timeout processing some inputs. + """Run a batch of prompts through TLM and get responses/scores for each prompt in the batch. The list returned will have the same length as the input list. Args: prompts (List[str]): list of prompts to run - capture_exceptions (bool): if should return None in place of the response for any errors or timeout processing some inputs + capture_exceptions (bool): if ``True``, the returned list will contain ``None`` in place of the response for any errors or timeout when processing a particular prompt from the batch. + If ``False``, this entire method will raise an exception if TLM fails to produce a result for any prompt in the batch. 
Returns: - Union[List[TLMResponse], List[Optional[TLMResponse]]]: TLM responses for each prompt (in supplied order) + Union[List[TLMResponse], List[Optional[TLMResponse]]]: TLM responses/scores for each prompt (in supplied order) """ if capture_exceptions: per_query_timeout, per_batch_timeout = self._timeout, None @@ -230,16 +227,18 @@ def prompt( but also provides trustworthiness scores quantifying the quality of the output. Args: - prompt (str | Sequence[str]): prompt (or list of multiple prompts) for the language model + prompt (str | Sequence[str]): prompt (or list of multiple prompts) for the language model. + Providing a batch of many prompts here will be faster than calling this method on each prompt separately. Returns: TLMResponse | List[TLMResponse]: [TLMResponse](#class-tlmresponse) object containing the response and trustworthiness score. If multiple prompts were provided in a list, then a list of such objects is returned, one for each prompt. - This method will raise an exception if any errors occur or if you hit a timeout (given a timeout is specified), - and is suitable if strict error handling and immediate notification of any exceptions/timeouts is preferred. - However, you could lose any partial results if an exception is raised. - If saving partial results is important to you, you can call this method on smaller chunks of data at a time - (and save intermediate results as desired); you can also consider using the more advanced - [`try_prompt()`](#method-try_prompt) method instead. + This method will raise an exception if any errors occur or if you hit a timeout (given a timeout is specified). + Use it if you want strict error handling and immediate notification of any exceptions/timeouts. + + If running this method on a big batch of prompts: you might lose partially completed results if TLM fails on any one of them. 
+ To avoid losing partial results for the prompts that TLM did not fail on, + you can either call this method on smaller batches of prompts at a time + (and save intermediate results between batches), or use the [`try_prompt()`](#method-try_prompt) method instead. """ validate_tlm_prompt(prompt) @@ -264,26 +263,26 @@ def try_prompt( /, ) -> List[Optional[TLMResponse]]: """ - Gets response and trustworthiness score for any text input, + Gets response and trustworthiness score for any batch of prompts, handling any failures (errors of timeouts) by returning None in place of the failures. The list returned will have the same length as the input list, if there are any failures (errors or timeout) processing some inputs, the list will contain None in place of the response. - This is the recommended way to get TLM responses and trustworthiness scores for big datasets, - where some individual responses within the dataset may fail, as it will ensure partial results are not lost. + This is the recommended way to get TLM responses and trustworthiness scores for big datasets of many prompts, + where some individual TLM responses within the dataset may fail. It ensures partial results are not lost. Args: prompt (Sequence[str]): list of multiple prompts for the TLM Returns: List[Optional[TLMResponse]]: list of [TLMResponse](#class-tlmresponse) objects containing the response and trustworthiness score. The returned list will always have the same length as the input list. - In case of failure on any prompt (due to timeouts or other erros), - the return list will contain None in place of the TLM response. - This method is suitable if you prioritize obtaining results for as many inputs as possible, - however you might miss out on certain error messages. - If you would prefer to be notified immediately about any errors or timeouts that might occur, - consider using the [`prompt()`](#method-prompt) method instead. 
+ In case of TLM failure on any prompt (due to timeouts or other errors), + the return list will contain None in place of the TLM response for that failed prompt. + Use this to obtain TLM results for as many prompts as possible, + but you might miss out on certain error messages. + If you prefer to be notified immediately about any errors or timeouts when running many prompts, + use the [`prompt()`](#method-prompt) method instead. """ validate_tlm_try_prompt(prompt) @@ -374,21 +373,22 @@ def get_trustworthiness_score( prompt: Union[str, Sequence[str]], response: Union[str, Sequence[str]], ) -> Union[float, List[float]]: - """Gets trustworthiness score for prompt-response pairs. + """Computes trustworthiness score for arbitrary given prompt-response pairs. Args: prompt (str | Sequence[str]): prompt (or list of prompts) for the TLM to evaluate - response (str | Sequence[str]): response (or list of responses) corresponding to the input prompts + response (str | Sequence[str]): existing response (or list of responses) associated with the input prompts. + These can be from any LLM or human-written responses. Returns: float | List[float]: float or list of floats (if multiple prompt-responses were provided) corresponding to the TLM's trustworthiness score. The score quantifies how confident TLM is that the given response is good for the given prompt. - This method will raise an exception if any errors occur or if you hit a timeout (given a timeout is specified), - and is suitable if strict error handling and immediate notification of any exceptions/timeouts is preferred. - However, you could lose any partial results if an exception is raised. - If saving partial results is important to you, you can call this method on smaller chunks of data at a time - (and save intermediate results as desired); you can also consider using the more advanced - [`try_get_trustworthiness_score()`](#method-try_get_trustworthiness_score) method instead. 
+ If running on many prompt-response pairs simultaneously: + this method will raise an exception if any TLM errors or timeouts occur. + Use it if strict error handling and immediate notification of any exceptions/timeouts is preferred. + You will lose any partial results if an exception is raised. + If saving partial results is important, you can call this method on smaller batches of prompt-response pairs at a time + (and save intermediate results) or use the [`try_get_trustworthiness_score()`](#method-try_get_trustworthiness_score) method instead. """ validate_tlm_prompt_response(prompt, response) @@ -414,28 +414,28 @@ def try_get_trustworthiness_score( prompt: Sequence[str], response: Sequence[str], ) -> List[Optional[float]]: - """Gets trustworthiness score for prompt-response pairs. + """Gets trustworthiness score for batches of many prompt-response pairs. - The list returned will have the same length as the input list, if there are any - failures (errors or timeout) processing some inputs, the list will contain None - in place of the response. + The list returned will have the same length as the input list, if TLM hits any + errors or timeout processing certain inputs, the list will contain None + in place of the TLM score for this failed input. This is the recommended way to get TLM trustworthiness scores for big datasets, - where some individual responses within the dataset may fail, as it will ensure partial results are not lost. + where some individual TLM calls within the dataset may fail. It will ensure partial results are not lost. Args: prompt (Sequence[str]): list of prompts for the TLM to evaluate - response (Sequence[str]): list of responses corresponding to the input prompts + response (Sequence[str]): list of existing responses corresponding to the input prompts (from any LLM or human-written) Returns: List[float]: list of floats corresponding to the TLM's trustworthiness score. 
The score quantifies how confident TLM is that the given response is good for the given prompt. The returned list will always have the same length as the input list. - In case of failure on any prompt-response pair (due to timeouts or other erros), - the return list will contain None in place of the trustworthiness score. - This method is suitable if you prioritize obtaining results for as many inputs as possible, + In case of TLM error or timeout on any prompt-response pair, + the returned list will contain None in place of the trustworthiness score. + Use this method if you prioritize obtaining results for as many inputs as possible, however you might miss out on certain error messages. - If you would prefer to be notified immediately about any errors or timeouts that might occur, - consider using the [`get_trustworthiness_score()`](#method-get_trustworthiness_score) method instead. + If you prefer to be notified immediately about any errors or timeouts, + use the [`get_trustworthiness_score()`](#method-get_trustworthiness_score) method instead. """ validate_try_tlm_prompt_response(prompt, response) From aab3802ce7d8be158c8107c9cb3199cf7bbc7a2e Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Fri, 12 Apr 2024 17:49:31 -0700 Subject: [PATCH 05/13] tlmoptions explanations --- .../studio/trustworthy_language_model.py | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/cleanlab_studio/studio/trustworthy_language_model.py b/cleanlab_studio/studio/trustworthy_language_model.py index 8154a3f7..05993375 100644 --- a/cleanlab_studio/studio/trustworthy_language_model.py +++ b/cleanlab_studio/studio/trustworthy_language_model.py @@ -547,19 +547,19 @@ class TLMResponse(TypedDict): class TLMOptions(TypedDict): """Typed dict containing advanced configuration options for the Trustworthy Language Model. 
- Many of these arguments are automatically determined by the quality preset selected - (see the arguments in the TLM [initialization method](../studio#method-tlm) to learn more about the various quality presets), - but specifying custom values here will override any default values from the quality preset. + Many of these configurations are automatically determined by the quality preset selected + (see the arguments in the TLM [initialization method](../studio#method-tlm) to learn more about quality presets). + Specifying custom values here will override any default values from the quality preset. For all options described below, higher/more expensive settings will lead to longer runtimes and may consume more tokens internally. - The high token cost might make it such that you are not able to run long prompts (or prompts with long responses) in your account, - unless your token limits are increased. If you are running into issue with token limits, try using lower/less expensive settings - to be able to run longer prompts. + The high token cost might make it such that you are not able to run long prompts (or prompts with long responses) in your account, + unless your token limits are increased. If you hit token limit issues, try using lower/less expensive settings + to be able to run longer prompts/responses. 
- The default values for the various quality presets (specified when instantiating [`Studio.TLM`](../studio/#method-tlm)) are as below: - - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, `use_self_reflection` = True, this quality preset will return improved LLM responses - - **high:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, `use_self_reflection` = True, this quality preset will return improved LLM responses - - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, `use_self_reflection` = True + The default values corresponding to each quality preset (specified when instantiating [`Studio.TLM()`](../studio/#method-tlm)) are: + - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, `use_self_reflection` = True. This preset will improve LLM responses. + - **high:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, `use_self_reflection` = True. This preset will improve LLM responses. + - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, `use_self_reflection` = True - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, `use_self_reflection` = True - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, `use_self_reflection` = False, this quality preset is equivalent to a regular LLM call @@ -572,26 +572,26 @@ class TLMOptions(TypedDict): max_tokens (int, default = 512): the maximum number of tokens to generate in the TLM response. This number will impact the maximum number of tokens you will see in the output response, and also the number of tokens - that can be generated for internal calls (to estimate the trustworthiness score). - Higher values here produce better (more reliable) TLM responses and trustworthiness scores, but at higher costs/runtimes. - If you are experiencing token limits while using the TLM (especially on higher quality presets), consider lowering this number. 
- The minimum value for this parameter is 64, and the maximum is 512. + that can be generated internally within the TLM (to estimate the trustworthiness score). + Higher values here can produce better (more reliable) TLM responses and trustworthiness scores, but at higher costs/runtimes. + If you are experiencing token limit errors while using the TLM (especially on higher quality presets), consider lowering this number. + This parameter must be between 64 and 512. - num_candidate_responses (int, default = 1): this controls how many candidate responses are internally generated. + num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated by TLM. TLM scores the trustworthiness of each candidate response, and then returns the most trustworthy one. - Higher values here can produce better (more accurate) responses from the TLM, but at higher costs/runtimes. - The minimum value for this parameter is 1, and the maximum is 20. + Higher values here can produce better (more accurate) responses from the TLM, but at higher costs/runtimes (and internally consumes more tokens). + This parameter must be between 1 and 20. - num_consistency_samples (int, default = 4): this controls how many samples are internally generated to evaluate the LLM-response-consistency. - This is a big part of the returned trustworthiness_score, in particular to evaluate strange input prompts or prompts that are too open-ended - to receive a clearly defined 'good' response. + num_consistency_samples (int, default = 8): the amount of internal sampling to evaluate LLM-response-consistency. + This consistency forms a big part of the returned trustworthiness score, helping quantify the epistemic uncertainty associated with + strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. Higher values here produce better (more reliable) TLM trustworthiness scores, but at higher costs/runtimes. 
- The minimum value for this parameter is 0, and the maximum is 20. + This parameter must be between 0 and 20. - use_self_reflection (bool, default = `True`): this controls whether self-reflection is used to have the LLM reflect upon the response it is - generating and explicitly self-evaluate the accuracy of that response. - This is a big part of the trustworthiness score, in particular for evaluating responses that are obviously incorrect/bad for a - standard prompt (with well-defined answers) that LLMs should be able to handle. + use_self_reflection (bool, default = `True`): whether the LLM is asked to self-reflect upon the response it + generated and self-evaluate this response. + This self-reflection forms a big part of the trustworthiness score, helping quantify aleatoric uncertainty associated with challenging prompts + and helping catch answers that are obviously incorrect/bad for a prompt asking for a well-defined answer that LLMs should be able to handle. Setting this to False disables the use of self-reflection and may produce worse TLM trustworthiness scores, but will reduce costs/runtimes. """ From fac499df2ab900c43d702613f70c38a1eaa04dfe Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Fri, 12 Apr 2024 17:52:01 -0700 Subject: [PATCH 06/13] clariciation on candidate responses = 1 --- cleanlab_studio/studio/trustworthy_language_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cleanlab_studio/studio/trustworthy_language_model.py b/cleanlab_studio/studio/trustworthy_language_model.py index 05993375..8bf978b9 100644 --- a/cleanlab_studio/studio/trustworthy_language_model.py +++ b/cleanlab_studio/studio/trustworthy_language_model.py @@ -581,6 +581,7 @@ class TLMOptions(TypedDict): TLM scores the trustworthiness of each candidate response, and then returns the most trustworthy one. 
Higher values here can produce better (more accurate) responses from the TLM, but at higher costs/runtimes (and internally consumes more tokens). This parameter must be between 1 and 20. + When it is 1, TLM simply returns a standard LLM response and does not attempt to improve it. num_consistency_samples (int, default = 8): the amount of internal sampling to evaluate LLM-response-consistency. This consistency forms a big part of the returned trustworthiness score, helping quantify the epistemic uncertainty associated with From 4c91f5d2826213cd15606f4ecd30f879cad138f7 Mon Sep 17 00:00:00 2001 From: huiwengoh <45724323+huiwengoh@users.noreply.github.com> Date: Sat, 13 Apr 2024 01:27:27 -0400 Subject: [PATCH 07/13] minor changes --- cleanlab_studio/studio/studio.py | 4 ++-- .../studio/trustworthy_language_model.py | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 4a314b7c..fe9a5768 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -399,7 +399,7 @@ def TLM( Args: quality_preset (TLMQualityPreset): An optional preset to control the quality of TLM responses and trustworthiness scores vs. runtimes/costs. - TLMQualityPreset is a string specifying one of the supported presets: "best", "high", "medium", "low", "base". + TLMQualityPreset is a string specifying one of the supported presets, including "best", "high", "medium", "low", "base". The "best" and "high" presets improve the LLM responses themselves, with "best" returning more reliable trustworthiness scores than "high". @@ -414,7 +414,7 @@ def TLM( These presets have higher runtime/cost and are optimized to return more accurate LLM outputs, but not necessarily more reliable trustworthiness scores. options (TLMOptions, optional): a typed dict of advanced configuration options. 
- Avaialable options (keys in this dict) include: "model", "max_tokens", "num_candidate_responses", "num_consistency_samples", "use_self_reflection".
+ Available options (keys in this dict) include "model", "max_tokens", "num_candidate_responses", "num_consistency_samples", "use_self_reflection".
For more details about the options, see the documentation for [TLMOptions](../trustworthy_language_model#class-tlmoptions).
If specified, these override any settings from the choice of `quality_preset`.

diff --git a/cleanlab_studio/studio/trustworthy_language_model.py b/cleanlab_studio/studio/trustworthy_language_model.py
index 8bf978b9..dd75bf86 100644
--- a/cleanlab_studio/studio/trustworthy_language_model.py
+++ b/cleanlab_studio/studio/trustworthy_language_model.py
@@ -234,7 +234,7 @@ def prompt(
If multiple prompts were provided in a list, then a list of such objects is returned, one for each prompt.
This method will raise an exception if any errors occur or if you hit a timeout (given a timeout is specified).
Use it if you want strict error handling and immediate notification of any exceptions/timeouts.
-
+
If running this method on a big batch of prompts: you might lose partially completed results if TLM fails
on any one of them. To avoid losing partial results for the prompts that TLM did not fail on,
you can either call this method on smaller batches of prompts at a time
@@ -383,7 +383,7 @@ def get_trustworthiness_score(
float | List[float]: float or list of floats (if multiple prompt-responses were provided) corresponding to the TLM's trustworthiness score.
The score quantifies how confident TLM is that the given response is good for the given prompt.
- If running on many prompt-response pairs simultaneously: 
+ If running on many prompt-response pairs simultaneously:
this method will raise an exception if any TLM errors or timeouts occur.
Use it if strict error handling and immediate notification of any exceptions/timeouts is preferred. 
You will lose any partial results if an exception is raised. @@ -558,10 +558,10 @@ class TLMOptions(TypedDict): The default values corresponding to each quality preset (specified when instantiating [`Studio.TLM()`](../studio/#method-tlm)) are: - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, `use_self_reflection` = True. This preset will improve LLM responses. - - **high:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, `use_self_reflection` = True. This preset will improve LLM responses. - - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, `use_self_reflection` = True - - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, `use_self_reflection` = True - - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, `use_self_reflection` = False, this quality preset is equivalent to a regular LLM call + - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, `use_self_reflection` = True. This preset will improve LLM responses. + - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, `use_self_reflection` = True. + - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, `use_self_reflection` = True. + - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, `use_self_reflection` = False. This preset is equivalent to a regular LLM call. By default, the TLM is set to the "medium" quality preset. The default `model` used is "gpt-3.5-turbo-16k", and `max_tokens` is 512 for all quality presets. You can set custom values for these arguments regardless of the quality preset specified. @@ -589,7 +589,7 @@ class TLMOptions(TypedDict): Higher values here produce better (more reliable) TLM trustworthiness scores, but at higher costs/runtimes. This parameter must be between 0 and 20. 
- use_self_reflection (bool, default = `True`): whether the LLM is asked to self-reflect upon the response it + use_self_reflection (bool, default = `True`): whether the LLM is asked to self-reflect upon the response it generated and self-evaluate this response. This self-reflection forms a big part of the trustworthiness score, helping quantify aleatoric uncertainty associated with challenging prompts and helping catch answers that are obviously incorrect/bad for a prompt asking for a well-defined answer that LLMs should be able to handle. From 6b26c6c5e40bb3074e19e4875f3997b30dc8b4cb Mon Sep 17 00:00:00 2001 From: huiwengoh <45724323+huiwengoh@users.noreply.github.com> Date: Sat, 13 Apr 2024 01:49:06 -0400 Subject: [PATCH 08/13] add note about timeout --- cleanlab_studio/studio/studio.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index fe9a5768..830c1a4a 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -418,7 +418,8 @@ def TLM( For more details about the options, see the documentation for [TLMOptions](../trustworthy_language_model#class-tlmoptions). If specified, these override any settings from the choice of `quality_preset`. - timeout (float, optional): timeout (in seconds) to apply to each method call. + timeout (float, optional): timeout (in seconds) to apply to each TLM prompt. + If a batch of data is passed in, the timeout will be applied to each individual item in the batch. If a result is not produced within the timeout, a TimeoutError will be raised. Defaults to None, which does not apply a timeout. verbose (bool, optional): whether to print outputs during execution, i.e., whether to show a progress bar when TLM is prompted with batches of data. 
From 2806d47391737f850b8a99f5b8e12c8226d141c4 Mon Sep 17 00:00:00 2001 From: huiwengoh <45724323+huiwengoh@users.noreply.github.com> Date: Sat, 13 Apr 2024 01:52:23 -0400 Subject: [PATCH 09/13] add note about async methods --- cleanlab_studio/studio/trustworthy_language_model.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cleanlab_studio/studio/trustworthy_language_model.py b/cleanlab_studio/studio/trustworthy_language_model.py index dd75bf86..246871d4 100644 --- a/cleanlab_studio/studio/trustworthy_language_model.py +++ b/cleanlab_studio/studio/trustworthy_language_model.py @@ -300,7 +300,8 @@ async def prompt_async( ) -> Union[TLMResponse, List[TLMResponse]]: """ Asynchronously get response and trustworthiness score for any text input from TLM. - This method is similar to the [`prompt()`](#method-prompt) method but operates asynchronously. + This method is similar to the [`prompt()`](#method-prompt) method but operates asynchronously, + allowing for non-blocking concurrent operations. Args: prompt (str | Sequence[str]): prompt (or list of multiple prompts) for the TLM @@ -452,7 +453,8 @@ async def get_trustworthiness_score_async( response: Union[str, Sequence[str]], ) -> Union[float, List[float]]: """Asynchronously gets trustworthiness score for prompt-response pairs. - This method is similar to the [`get_trustworthiness_score()`](#method-get_trustworthiness_score) method but operates asynchronously. + This method is similar to the [`get_trustworthiness_score()`](#method-get_trustworthiness_score) method but operates asynchronously, + allowing for non-blocking concurrent operations. 
Args: prompt (str | Sequence[str]): prompt (or list of prompts) for the TLM to evaluate From f2f6ed48dde8be497739b80b4a40db10a79be592 Mon Sep 17 00:00:00 2001 From: huiwengoh <45724323+huiwengoh@users.noreply.github.com> Date: Mon, 15 Apr 2024 17:23:00 -0400 Subject: [PATCH 10/13] add async note --- cleanlab_studio/studio/trustworthy_language_model.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cleanlab_studio/studio/trustworthy_language_model.py b/cleanlab_studio/studio/trustworthy_language_model.py index 246871d4..e944ccbc 100644 --- a/cleanlab_studio/studio/trustworthy_language_model.py +++ b/cleanlab_studio/studio/trustworthy_language_model.py @@ -303,6 +303,10 @@ async def prompt_async( This method is similar to the [`prompt()`](#method-prompt) method but operates asynchronously, allowing for non-blocking concurrent operations. + Use this method if the prompts are streaming in one at a time, and you want to returs the results + for each input as quickly as possible, without the execution of any one TLM prompt blocking the execution of other TLM prompts. + Note that asynchronous methods do not block until completion, so you will need to fetch the results yourself later. + Args: prompt (str | Sequence[str]): prompt (or list of multiple prompts) for the TLM Returns: @@ -456,6 +460,10 @@ async def get_trustworthiness_score_async( This method is similar to the [`get_trustworthiness_score()`](#method-get_trustworthiness_score) method but operates asynchronously, allowing for non-blocking concurrent operations. + Use this method if the prompts-response pairs are streaming in one at a time, and you want to returs the results + for each input as quickly as possible, without the execution of any one TLM call blocking the execution of other TLM calls. + Note that asynchronous methods do not block until completion, so you will need to fetch the results yourself later. 
+ Args: prompt (str | Sequence[str]): prompt (or list of prompts) for the TLM to evaluate response (str | Sequence[str]): response (or list of responses) corresponding to the input prompts From a9421184c2710372c9fe74eb7fdbed07add09a52 Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Mon, 15 Apr 2024 21:02:35 -0700 Subject: [PATCH 11/13] shorten Co-authored-by: Ulyana --- cleanlab_studio/studio/studio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 830c1a4a..3558b4ed 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -393,7 +393,7 @@ def TLM( ) -> trustworthy_language_model.TLM: """Instantiates a configured Trustworthy Language Model (TLM) instance. - The TLM object can be used as a drop-in replacement for an LLM, or for estimating trustworthiness scores for arbitrary text prompt/response pairs, and more (see the [TLM documentation](../trustworthy_language_model#class-TLM)). + The TLM object can be used as a drop-in replacement for an LLM, or, for estimating trustworthiness scores for arbitrary text prompt/response pairs, and more (see the [TLM documentation](../trustworthy_language_model#class-TLM)). For advanced use, TLM offers configuration options. The documentation below summarizes these options, and more details are explained in the [TLM tutorial](/tutorials/tlm). 
From 97685623aea014ac183d59f9fb8bb2bdecc16d2a Mon Sep 17 00:00:00 2001 From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> Date: Mon, 15 Apr 2024 21:03:13 -0700 Subject: [PATCH 12/13] better language Co-authored-by: Ulyana --- cleanlab_studio/studio/studio.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 3558b4ed..9a7531a0 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -401,8 +401,8 @@ def TLM( quality_preset (TLMQualityPreset): An optional preset to control the quality of TLM responses and trustworthiness scores vs. runtimes/costs. TLMQualityPreset is a string specifying one of the supported presets, including "best", "high", "medium", "low", "base". - The "best" and "high" presets improve the LLM responses themselves, - with "best" returning more reliable trustworthiness scores than "high". + The "best" and "high" presets return improved LLM responses, + with "best" also returning more reliable trustworthiness scores than "high". The "medium" and "low" presets return standard LLM responses along with associated trustworthiness scores, with "medium" producing more reliable trustworthiness scores than low. The "base" preset will not return any trustworthiness score, just a standard LLM response, and is similar to directly using your favorite LLM API. 
From 75e2b8f019b2f875ff03e0d99f39504756fe8de3 Mon Sep 17 00:00:00 2001 From: Hui Wen <45724323+huiwengoh@users.noreply.github.com> Date: Tue, 16 Apr 2024 01:31:11 -0400 Subject: [PATCH 13/13] fix typos Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab_studio/studio/trustworthy_language_model.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cleanlab_studio/studio/trustworthy_language_model.py b/cleanlab_studio/studio/trustworthy_language_model.py index e944ccbc..8dd1718a 100644 --- a/cleanlab_studio/studio/trustworthy_language_model.py +++ b/cleanlab_studio/studio/trustworthy_language_model.py @@ -303,9 +303,9 @@ async def prompt_async( This method is similar to the [`prompt()`](#method-prompt) method but operates asynchronously, allowing for non-blocking concurrent operations. - Use this method if the prompts are streaming in one at a time, and you want to returs the results - for each input as quickly as possible, without the execution of any one TLM prompt blocking the execution of other TLM prompts. - Note that asynchronous methods do not block until completion, so you will need to fetch the results yourself later. + Use this method if prompts are streaming in one at a time, and you want to return results + for each one as quickly as possible, without the TLM execution of any one prompt blocking the execution of the others. + Asynchronous methods do not block until completion, so you will need to fetch the results yourself. Args: prompt (str | Sequence[str]): prompt (or list of multiple prompts) for the TLM @@ -460,9 +460,9 @@ async def get_trustworthiness_score_async( This method is similar to the [`get_trustworthiness_score()`](#method-get_trustworthiness_score) method but operates asynchronously, allowing for non-blocking concurrent operations. 
- Use this method if the prompts-response pairs are streaming in one at a time, and you want to returs the results - for each input as quickly as possible, without the execution of any one TLM call blocking the execution of other TLM calls. - Note that asynchronous methods do not block until completion, so you will need to fetch the results yourself later. + Use this method if prompt-response pairs are streaming in, and you want to return TLM scores + for each pair as quickly as possible, without the TLM scoring of any one pair blocking the scoring of the others. + Asynchronous methods do not block until completion, so you will need to fetch the results yourself. Args: prompt (str | Sequence[str]): prompt (or list of prompts) for the TLM to evaluate