
Commit

add periods
axl1313 committed Jul 17, 2023
1 parent ad01269 commit d077d5a
Showing 1 changed file with 36 additions and 41 deletions.
77 changes: 36 additions & 41 deletions cleanlab_studio/studio/studio.py
@@ -23,11 +23,7 @@


 class Studio:
-    """Used to interact with Cleanlab Studio
-    Attributes:
-        _api_key: API key for user interacting with Cleanlab Studio
-    """
+    """Used to interact with Cleanlab Studio."""

     _api_key: str

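For orientation (not part of this commit), a minimal sketch of constructing the client whose docstrings this diff edits. The import path and the API-key constructor argument are assumptions suggested by the `_api_key` attribute; the `__init__` itself is not shown in this diff.

    from cleanlab_studio import Studio

    # Assumed constructor: pass your Cleanlab Studio API key (placeholder below).
    studio = Studio("<YOUR_API_KEY>")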
@@ -59,17 +55,17 @@ def upload_dataset(
         id_column: Optional[str] = None,
     ) -> str:
         """
-        Uploads a dataset to Cleanlab Studio
+        Uploads a dataset to Cleanlab Studio.
         Args:
-            dataset: Object representing the dataset to upload. Currently supported formats include a `str` path to your dataset, a pandas DataFrame, a pyspark DataFrame
-            dataset_name: Name for your dataset in Cleanlab Studio (optional if uploading from filepath)
-            schema_overrides: Optional dictionary of overrides you would like to make to the schema of your dataset. If not provided, schema will be inferred
-            modality: Optional parameter to override the modality of your dataset. If not provided, modality will be inferred
-            id_column: Optional parameter to override the ID column of your dataset. If not provided, a monotonically increasing ID column will be generated
+            dataset: Object representing the dataset to upload. Currently supported formats include a `str` path to your dataset, a pandas DataFrame, a pyspark DataFrame.
+            dataset_name: Name for your dataset in Cleanlab Studio (optional if uploading from filepath).
+            schema_overrides: Optional dictionary of overrides you would like to make to the schema of your dataset. If not provided, schema will be inferred.
+            modality: Optional parameter to override the modality of your dataset. If not provided, modality will be inferred.
+            id_column: Optional parameter to override the ID column of your dataset. If not provided, a monotonically increasing ID column will be generated.
         Returns:
-            ID of uploaded dataset
+            ID of uploaded dataset.
         """
         ds = init_dataset_source(dataset, dataset_name)
         return upload.upload_dataset(
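For illustration (not part of this commit), a hedged sketch of calling `upload_dataset` as the revised docstring describes; the DataFrame contents and dataset name are made-up placeholders.

    import pandas as pd

    # Hypothetical toy dataset; per the docstring, a str filepath or a pyspark DataFrame would also work.
    df = pd.DataFrame({"text": ["great product", "terrible service"], "label": ["pos", "neg"]})
    dataset_id = studio.upload_dataset(df, dataset_name="reviews-demo")  # returns the ID of the uploaded dataset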
@@ -87,15 +83,14 @@ def download_cleanlab_columns(
         to_spark: bool = False,
     ) -> Any:
         """
-        Downloads Cleanlab columns for a cleanset
+        Downloads Cleanlab columns for a cleanset.
         Args:
-            cleanset_id: ID of cleanset to download columns from
-            include_action: Whether to include a column with any actions taken on the cleanset in the downloaded columns
+            cleanset_id: ID of cleanset to download columns from.
+            include_action: Whether to include a column with any actions taken on the cleanset in the downloaded columns.
         Returns:
-            A pandas or pyspark DataFrame
-            Type Any because don't want to rely on pyspark being installed
+            A pandas or pyspark DataFrame. Type is `Any` to avoid requiring pyspark installation.
         """
         rows_df = api.download_cleanlab_columns(
             self._api_key, cleanset_id, all=True, to_spark=to_spark
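A possible call to `download_cleanlab_columns`, sketched from the docstring above (not part of this commit); the cleanset ID is a placeholder, and the second variant assumes pyspark is installed.

    # Download Cleanlab columns for an existing cleanset as a pandas DataFrame.
    cleanlab_cols = studio.download_cleanlab_columns("<CLEANSET_ID>")

    # Or, using the to_spark flag, as a pyspark DataFrame:
    # cleanlab_cols = studio.download_cleanlab_columns("<CLEANSET_ID>", to_spark=True)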
@@ -109,15 +104,15 @@ def download_cleanlab_columns(

     def apply_corrections(self, cleanset_id: str, dataset: Any, keep_excluded: bool = False) -> Any:
         """
-        Applies corrections from a Cleanlab Studio cleanset to your dataset. Corrections can be made by viewing your project in the Cleanlab Studio webapp
+        Applies corrections from a Cleanlab Studio cleanset to your dataset. Corrections can be made by viewing your project in the Cleanlab Studio webapp.
         Args:
-            cleanset_id: ID of cleanset to apply corrections from
-            dataset: Dataset to apply corrections to. Supported formats include pandas DataFrame and pyspark DataFrame. Dataset should have the same number of rows as the dataset used to create the project. It should also contain a label column with the same name as the label column for the project
-            keep_excluded: Whether to retain rows with an "exclude" action. By default these rows will be removed from the dataset
+            cleanset_id: ID of cleanset to apply corrections from.
+            dataset: Dataset to apply corrections to. Supported formats include pandas DataFrame and pyspark DataFrame. Dataset should have the same number of rows as the dataset used to create the project. It should also contain a label column with the same name as the label column for the project.
+            keep_excluded: Whether to retain rows with an "exclude" action. By default these rows will be removed from the dataset.
         Returns:
-            A copy of the dataset with corrections applied
+            A copy of the dataset with corrections applied.
         """
         project_id = api.get_project_of_cleanset(self._api_key, cleanset_id)
         label_column = api.get_label_column_of_project(self._api_key, project_id)
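A hedged usage sketch for `apply_corrections` (not from this commit): it assumes `df` is the same dataset the project was created from, containing the project's label column, as the docstring requires.

    # Apply corrections made in the Cleanlab Studio webapp back onto the original dataset.
    fixed_df = studio.apply_corrections("<CLEANSET_ID>", df)

    # keep_excluded=True would retain rows marked "exclude"; by default those rows are dropped.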
@@ -201,20 +196,20 @@ def create_project(
         text_column: Optional[str] = None,
     ) -> str:
         """
-        Creates a Cleanlab Studio project
+        Creates a Cleanlab Studio project.
         Args:
-            dataset_id: ID of dataset to create project for
-            project_name: name for resulting project
-            modality: modality of project (i.e. text, tabular, image)
-            task_type: type of classification to perform (i.e. multi-class, multi-label)
-            model_type: type of model to train (i.e. fast, regular)
-            label_column: name of column in dataset containing labels (if not supplied, we'll make our best guess)
-            feature_columns: list of columns to use as features when training tabular modality project (if not supplied and modality is "tabular" we'll use all valid feature columns)
-            text_column: name of column containing the text to train text modality project on (if not supplied and modality is "text" we'll make our best guess)
+            dataset_id: ID of dataset to create project for.
+            project_name: name for resulting project.
+            modality: modality of project (i.e. text, tabular, image).
+            task_type: type of classification to perform (i.e. multi-class, multi-label).
+            model_type: type of model to train (i.e. fast, regular).
+            label_column: name of column in dataset containing labels (if not supplied, we'll make our best guess).
+            feature_columns: list of columns to use as features when training tabular modality project (if not supplied and modality is "tabular" we'll use all valid feature columns).
+            text_column: name of column containing the text to train text modality project on (if not supplied and modality is "text" we'll make our best guess).
         Returns:
-            ID of created project
+            ID of created project.
         """
         dataset_details = api.get_dataset_details(self._api_key, dataset_id)

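A sketch of `create_project` built only from the argument list documented above (not part of this commit); the project name and column names are hypothetical, and keyword arguments are used to avoid assuming any positional order beyond what the docstring lists.

    project_id = studio.create_project(
        dataset_id=dataset_id,
        project_name="reviews-demo-project",
        modality="text",          # i.e. text, tabular, image
        label_column="label",     # placeholder; omit to let Cleanlab Studio guess
        text_column="text",       # placeholder; omit to let Cleanlab Studio guess
    )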
@@ -262,35 +257,35 @@ def create_project(

     def poll_cleanset_status(self, cleanset_id: str, timeout: Optional[int] = None) -> bool:
         """
-        Polls for cleanset status. Blocks until cleanset is ready, there is a cleanset error, or `timeout` is exceeded
+        Polls for cleanset status. Blocks until cleanset is ready, there is a cleanset error, or `timeout` is exceeded.
         Args:
-            cleanset_id: ID of cleanset to check status of
-            timeout: Optional timeout after which to stop polling for progress. If not provided, will block until cleanset is ready
+            cleanset_id: ID of cleanset to check status of.
+            timeout: Optional timeout after which to stop polling for progress. If not provided, will block until cleanset is ready.
         Returns:
-            `True` if cleanset is ready, `False` otherwise
+            `True` if cleanset is ready, `False` otherwise.
         """
         return clean.poll_cleanset_status(self._api_key, cleanset_id, timeout)

     def get_latest_cleanset_id(self, project_id: str) -> str:
         """
-        Gets latest cleanset ID for a project
+        Gets latest cleanset ID for a project.
         Args:
-            project_id: ID of project
+            project_id: ID of project.
         Returns:
-            ID of latest associated cleanset
+            ID of latest associated cleanset.
         """
         return api.get_latest_cleanset_id(self._api_key, project_id)

     def delete_project(self, project_id: str) -> None:
         """
-        Deletes a project from Cleanlab Studio
+        Deletes a project from Cleanlab Studio.
         Args:
-            project_id: ID of project to delete
+            project_id: ID of project to delete.
         """
         api.delete_project(self._api_key, project_id)
         print(f"Successfully deleted project: {project_id}")
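Finally, a hedged end-to-end sketch (not part of this commit) tying together the three methods above: wait for a project's latest cleanset, fetch its Cleanlab columns, then optionally delete the project. The IDs are placeholders and the one-hour timeout is arbitrary.

    cleanset_id = studio.get_latest_cleanset_id(project_id)

    # Block until the cleanset is ready, errors out, or the (assumed) 3600-second timeout elapses.
    if studio.poll_cleanset_status(cleanset_id, timeout=3600):
        cleanlab_cols = studio.download_cleanlab_columns(cleanset_id)

    # Irreversible; shown only to illustrate the call signature.
    # studio.delete_project(project_id)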