diff --git a/cleanlab_studio/studio/clean.py b/cleanlab_studio/internal/clean_helpers.py
similarity index 100%
rename from cleanlab_studio/studio/clean.py
rename to cleanlab_studio/internal/clean_helpers.py
diff --git a/cleanlab_studio/internal/upload_helpers.py b/cleanlab_studio/internal/upload_helpers.py
index 42b1e042..08274b8a 100644
--- a/cleanlab_studio/internal/upload_helpers.py
+++ b/cleanlab_studio/internal/upload_helpers.py
@@ -10,7 +10,40 @@
 
 from .api import api
 from .dataset_source import DatasetSource
-from .types import JSONDict
+from .types import FieldSchemaDict, JSONDict
+
+
+def upload_dataset(
+    api_key: str,
+    dataset_source: DatasetSource,
+    *,
+    schema_overrides: Optional[FieldSchemaDict] = None,
+    modality: Optional[str] = None,
+    id_column: Optional[str] = None,
+) -> str:
+    upload_id = upload_dataset_file(api_key, dataset_source)
+    schema = get_proposed_schema(api_key, upload_id)
+
+    if (schema is None or schema.get("immutable", False)) and (
+        schema_overrides is not None or modality is not None or id_column is not None
+    ):
+        raise ValueError(
+            "Schema_overrides, modality, and id_column parameters cannot be provided for simple zip uploads"
+        )
+
+    if schema is not None and not schema.get("immutable", False):
+        schema["metadata"]["name"] = dataset_source.dataset_name
+        if schema_overrides is not None:
+            for field in schema_overrides:
+                schema["fields"][field] = schema_overrides[field]
+        if modality is not None:
+            schema["metadata"]["modality"] = modality
+        if id_column is not None:
+            schema["metadata"]["id_column"] = id_column
+
+    api.confirm_schema(api_key, schema, upload_id)
+    dataset_id = get_ingestion_result(api_key, upload_id)
+    return dataset_id
 
 
 async def _upload_file_chunk_async(
@@ -58,7 +91,10 @@ def upload_file_parts(
 
 def upload_dataset_file(api_key: str, dataset_source: DatasetSource) -> str:
     upload_id, part_sizes, presigned_posts = api.initialize_upload(
-        api_key, dataset_source.get_filename(), dataset_source.file_type, dataset_source.file_size
+        api_key,
+        dataset_source.get_filename(),
+        dataset_source.file_type,
+        dataset_source.file_size,
     )
     upload_parts = upload_file_parts(dataset_source, part_sizes, presigned_posts)
     api.complete_file_upload(api_key, upload_id, upload_parts)
diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py
index a506ed82..d4ccd9a2 100644
--- a/cleanlab_studio/studio/studio.py
+++ b/cleanlab_studio/studio/studio.py
@@ -7,7 +7,8 @@
 import numpy.typing as npt
 import pandas as pd
 
-from . import clean, upload, inference
+from . import inference
+from cleanlab_studio.internal import clean_helpers, upload_helpers
 from cleanlab_studio.internal.api import api
 from cleanlab_studio.internal.util import (
     init_dataset_source,
@@ -68,7 +69,7 @@ def upload_dataset(
             ID of uploaded dataset.
         """
         ds = init_dataset_source(dataset, dataset_name)
-        return upload.upload_dataset(
+        return upload_helpers.upload_dataset(
            self._api_key,
             ds,
             schema_overrides=schema_overrides,
@@ -266,7 +267,7 @@ def poll_cleanset_status(self, cleanset_id: str, timeout: Optional[int] = None)
         Returns:
             After cleanset is done being generated, returns `True` if cleanset is ready to use, `False` otherwise.
         """
-        return clean.poll_cleanset_status(self._api_key, cleanset_id, timeout)
+        return clean_helpers.poll_cleanset_status(self._api_key, cleanset_id, timeout)
 
     def get_latest_cleanset_id(self, project_id: str) -> str:
         """
@@ -298,7 +299,7 @@ def get_model(self, model_id: str) -> inference.Model:
             model_id: ID of model to get. This ID should be fetched in the deployments page of the app UI.
 
         Returns:
-            Model object with methods to run predictions on new input data
+            [Model](inference#class-model) object with methods to run predictions on new input data.
         """
         return inference.Model(self._api_key, model_id)
 
diff --git a/cleanlab_studio/studio/upload.py b/cleanlab_studio/studio/upload.py
deleted file mode 100644
index 28946b90..00000000
--- a/cleanlab_studio/studio/upload.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from typing import Optional
-
-from cleanlab_studio.internal.api import api
-from cleanlab_studio.internal.dataset_source import DatasetSource
-from cleanlab_studio.internal.types import FieldSchemaDict
-from cleanlab_studio.internal.upload_helpers import (
-    get_ingestion_result,
-    get_proposed_schema,
-    upload_dataset_file,
-)
-
-
-def upload_dataset(
-    api_key: str,
-    dataset_source: DatasetSource,
-    *,
-    schema_overrides: Optional[FieldSchemaDict] = None,
-    modality: Optional[str] = None,
-    id_column: Optional[str] = None,
-) -> str:
-    upload_id = upload_dataset_file(api_key, dataset_source)
-    schema = get_proposed_schema(api_key, upload_id)
-
-    if (schema is None or schema.get("immutable", False)) and (
-        schema_overrides is not None or modality is not None or id_column is not None
-    ):
-        raise ValueError(
-            "Schema_overrides, modality, and id_column parameters cannot be provided for simple zip uploads"
-        )
-
-    if schema is not None and not schema.get("immutable", False):
-        schema["metadata"]["name"] = dataset_source.dataset_name
-        if schema_overrides is not None:
-            for field in schema_overrides:
-                schema["fields"][field] = schema_overrides[field]
-        if modality is not None:
-            schema["metadata"]["modality"] = modality
-        if id_column is not None:
-            schema["metadata"]["id_column"] = id_column
-
-    api.confirm_schema(api_key, schema, upload_id)
-    dataset_id = get_ingestion_result(api_key, upload_id)
-    return dataset_id
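Note: the public entry point is unchanged by this refactor; callers still go through Studio.upload_dataset, which now delegates to cleanlab_studio.internal.upload_helpers.upload_dataset instead of cleanlab_studio.studio.upload. A minimal sketch of that call path (the API key, file path, dataset name, and column name below are placeholders, not values from this PR):

    from cleanlab_studio import Studio

    # Placeholder API key; obtain yours from the Cleanlab Studio app.
    studio = Studio("<YOUR_API_KEY>")

    # Placeholder local file; a pandas DataFrame can also be passed.
    dataset_id = studio.upload_dataset(
        "transactions.csv",
        dataset_name="transactions",
        modality="tabular",
        id_column="transaction_id",  # placeholder column name
    )
    print(dataset_id)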