Commit

Apply text improvement suggestions from code review
Co-authored-by: Jonas Mueller <[email protected]>
Co-authored-by: Matthew Turk <[email protected]>
3 people authored Apr 30, 2024
1 parent 87f8b71 commit 6e9786a
Showing 3 changed files with 16 additions and 7 deletions.
2 changes: 1 addition & 1 deletion cleanlab_studio/internal/util.py
@@ -14,7 +14,7 @@
 
 from cleanlab_studio.internal.api import api
 from cleanlab_studio.internal.settings import CleanlabSettings
-from cleanlab_studio.errors import InvalidDatasetError, HandledError, ValidationError
+from cleanlab_studio.errors import InvalidDatasetError, HandledError
 
 try:
     import snowflake.snowpark as snowpark
1 change: 0 additions & 1 deletion cleanlab_studio/studio/studio.py
@@ -2,7 +2,6 @@
 Python API for Cleanlab Studio.
 """
 
-import re
 from typing import Any, List, Literal, Optional, Union
 from types import FunctionType
 import warnings
20 changes: 15 additions & 5 deletions cleanlab_studio/utils/data_enrichment/enrich.py
@@ -24,26 +24,36 @@ def enrich_data(
     **kwargs,
 ) -> pd.DataFrame:
     """
-    This method takes in a Studio client object, prompt template and a DataFrame and enriches the dataframe with the results of the prompting and associated trustworthiness scores.
+    Generate a column of arbitrary metadata for your DataFrame, reliably at scale with Generative AI.
+    The metadata is separately generated for each row of your DataFrame, based on a prompt that specifies what information you need and which existing columns' data it should be derived from.
+    Each row of generated metadata is accompanied by a trustworthiness score, which helps you discover which metadata is most/least reliable.
+    You can optionally apply regular expressions to further reformat your metadata beyond raw LLM outputs, or specify that each row of the metadata must be constrained to a particular set of values.
 
     Args:
         studio: Cleanlab Studio client object, which you must instantiate before calling this method.
         prompt: Formatted f-string that contains both the prompt and the names of columns to embed.
             **Example:** "Is this a numeric value, answer Yes or No only. Value: {column_name}"
-        regex: One or more expressions will be passed into re.compile or a list of already compiled regular expressions.
-            If a list is proivded, the regexes are applied in order and first succesfull match is returned.
+        regex: One or more expressions that will be passed into ``re.compile()``, or a list of already compiled regular expressions.
+            The regex will be applied to the raw LLM outputs from your prompt, enabling additional control over the final column values returned.
+            If a list is provided, the regexes are applied in order and the first successful match is returned.
             **Note:** Regex patterns should each specify exactly 1 match group using parentheses, like so: '.*(<desired match group pattern>)'.
             **Example:** `r'.*(Bird|[Rr]abbit).*'` will match any string containing 'Bird', 'Rabbit' or 'rabbit' into group 1.
         return_values: List of all possible values for the `metadata` column.
             If specified, every entry in the `metadata` column will exactly match one of these values (for less open-ended data enrichment tasks). If None, the `metadata` column can contain arbitrary values (for more open-ended data enrichment tasks).
             After your regex is applied, there may be additional transformations applied to ensure the returned value is one of these.
-        subset_indices: What subset of the supplied data to run this for. Can be either a list of unique indicies or a range. If None, we run on all the data.
+        subset_indices: What subset of the supplied data rows to generate metadata for. If None, we run on all of the data.
+            This can be either a list of unique indices or a range. These indices are passed into pandas' ``.iloc`` method, so they should be integers based on row order, as opposed to row-index labels pointing to `df.index`.
+            We advise against collecting results for all of your data at first. First collect results for a smaller data subset, and use this subset to experiment with different values of the `prompt` or `regex` arguments. Only once the results look good for your subset should you run on the full dataset.
         column_name_prefix: Optional prefix appended to all column names that are returned.
 
     Returns:
         A DataFrame that now contains additional `metadata` and `trustworthiness` columns related to the prompt. Columns will have `column_name_prefix_` prepended to them if specified.
         `metadata` column = responses to the prompt, with additional data mutations applied if `regex` or `return_values` is specified.
         `trustworthiness` column = trustworthiness of the prompt responses (which ignores the data mutations).
-        **Note**: If any data mutations were made to the original response from the prompt, an additional `log` column will be added to the DataFrame that contains the raw output before the mutations were applied.
+        **Note**: If you specified the `regex` or `return_values` arguments, some additional transformations may be applied to raw LLM outputs to produce the returned values. In these cases, an additional `log` column will be added to the returned DataFrame that records the raw LLM outputs.
     """
     subset_data = extract_df_subset(data, subset_indices)
     outputs = get_prompt_outputs(studio, prompt, subset_data, **kwargs)
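The regex convention in the updated docstring (each pattern captures exactly one group, patterns are tried in order, and the first successful match's group is returned) can be sketched as follows. This is an illustrative sketch of the described behavior, not the library's actual implementation; `apply_regex` is a hypothetical helper name.

```python
import re

def apply_regex(raw_output, patterns):
    # A sketch (assumption, not cleanlab_studio's code) of the post-processing
    # described in the enrich_data docstring: strings are compiled with
    # re.compile(), patterns are tried in order, and the first successful
    # match's group(1) becomes the final column value.
    compiled = [re.compile(p) if isinstance(p, str) else p for p in patterns]
    for pattern in compiled:
        match = pattern.match(raw_output)
        if match:
            return match.group(1)
    return None  # no pattern matched; the library may handle this differently

# The docstring's example pattern extracts 'Bird', 'Rabbit' or 'rabbit':
answer = apply_regex("I believe it is a Rabbit, not a bird.", [r".*(Bird|[Rr]abbit).*"])
# A hypothetical end-to-end call (names assumed from the docstring; needs an API key):
# enriched = enrich_data(studio, prompt="Is this numeric, Yes or No? Value: {value}",
#                        data=df, regex=r".*(Yes|No).*")
```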
