From cafec6184ad355855d95a2fa7472eaa700494514 Mon Sep 17 00:00:00 2001 From: ryansingman Date: Wed, 12 Jul 2023 16:23:11 -0600 Subject: [PATCH 01/42] adds inference API (v0) --- cleanlab_studio/internal/api/api.py | 53 ++++++++++++++++++- cleanlab_studio/studio/inference.py | 81 +++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+), 1 deletion(-) create mode 100644 cleanlab_studio/studio/inference.py diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index fde0f230..ae9157fe 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -1,8 +1,10 @@ +import io import os import time -from typing import Callable, List, Optional, Tuple, Union, Any +from typing import Callable, List, Optional, Tuple, Dict, Union, Any from cleanlab_studio.errors import APIError +import aiohttp import requests from tqdm import tqdm import pandas as pd @@ -25,6 +27,7 @@ dataset_base_url = f"{base_url}/datasets" project_base_url = f"{base_url}/projects" cleanset_base_url = f"{base_url}/cleansets" +model_base_url = f"{base_url}/models" def _construct_headers( @@ -329,3 +332,51 @@ def poll_progress( res = request_function(progress_id) pbar.update(float(1) - pbar.n) return res + + +async def upload_predict_batch(api_key: str, model_id: str, batch: io.StringIO) -> str: + """Uploads prediction batch and returns query ID.""" + async with aiohttp.ClientSession() as session: + async with session.post( + f"{model_base_url}/{model_id}/upload", + headers=_construct_headers(api_key), + ) as resp: + resp_json = await resp.json() + handle_api_error_from_json(resp_json) + + query_id: str = resp_json["query_id"] + upload_url: str = resp_json["upload_url"] + + session.put(upload_url, data=batch) + + return query_id + + +async def start_prediction(api_key: str, model_id: str, query_id: str) -> None: + """Starts prediction for query.""" + async with aiohttp.ClientSession() as session: + async with session.post( + f"{model_base_url}/{model_id}/predict/{query_id}", + headers=_construct_headers(api_key), + ) as resp: + handle_api_error_from_json(await resp.json()) + + +async def get_prediction_status(api_key: str, model_id: str, query_id: str) -> Dict[str, str]: + """Gets status of model prediction query.""" + async with aiohttp.ClientSession() as session: + async with session.get( + f"{model_base_url}/{model_id}/predict/{query_id}", + headers=_construct_headers(api_key), + ) as resp: + resp_json = await resp.json() + handle_api_error_from_json(resp_json) + + return resp_json + + +async def download_prediction_results(result_url: str) -> io.StringIO: + """Downloads prediction results from presigned URL.""" + async with aiohttp.ClientSession() as session: + async with session.get(result_url) as resp: + return io.StringIO(await resp.text()) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py new file mode 100644 index 00000000..9dc64957 --- /dev/null +++ b/cleanlab_studio/studio/inference.py @@ -0,0 +1,81 @@ +import abc +import io +from typing import Any, Awaitable, Tuple + +import numpy as np +import numpy.typing as npt +import pandas as pd + +from cleanlab_studio.internal.api import api + + +Predictions = npt.NDArray[np.int_] | npt.NDArray[np.str_] +ClassProbablities = pd.DataFrame + + +class Model(abc.ABC): + """Base class for deployed model inference.""" + + def __init__(self, api_key: str, model_id: str): + """Initializes model class w/ API key and model ID.""" + self._api_key = api_key + self._model_id = model_id + + @abc.abstractmethod + def predict( + self, batch: Any, return_pred_proba: bool = False + ) -> Predictions | Tuple[Predictions, ClassProbablities]: + """Gets predictions for batch of examples, optionally returning class probabilities. + + :param batch: batch of example to predict classes for + :param return_pred_proba: if should return class probabilities, defaults to False + :return: predictions + class probabilities, if requested + """ + raise NotImplementedError + + def _predict( + self, batch: io.StringIO, return_pred_proba: bool + ) -> Predictions | Tuple[Predictions, ClassProbablities]: + """Gets predictions for batch of examples, optionally returning class probabilities. + + :param batch: batch of example to predict classes for, as in-memory CSV file + :param return_pred_proba: if should return class probabilities + :return: predictions + class probabilities, if requested + """ + return asyncio.run(self._predict_async(batch, return_pred_proba)) + + @abc.abstractmethod + async def predict_async( + self, batch: Any, return_pred_proba: bool = False + ) -> Awaitable[Predictions] | Awaitable[Tuple[Predictions, ClassProbablities]]: + """Asynchronously gets predictions for batch of examples, optionally returning class probabilities. + + :param batch: batch of example to predict classes for + :param return_pred_proba: if should return class probabilities, defaults to False + :return: predictions + class probabilities, if requested + """ + raise NotImplementedError + + async def _predict_async( + self, batch: io.StringIO, return_pred_proba: bool + ) -> Predictions | Tuple[Predictions, ClassProbablities]: + """Asynchronously gets predictions for batch of examples, optionally returning class probabilities. + + :param batch: batch of example to predict classes for, as in-memory CSV file + :param return_pred_proba: if should return class probabilities, defaults to False + :return: predictions + class probabilities, if requested + """ + query_id: str = await api.upload_predict_batch(self._api_key, self._model_id, batch) + await api.start_prediction(self._api_key, self._model_id, query_id) + + status: str | None = None + result_url: str = "" + while status != "done": + status, result_url = await api.get_prediction_status( + self._api_key, self._model_id, query_id + ) + + # TODO handle get pred proba case + return pd.read_csv( + await api.download_prediction_results(result_url), + ).values From 55f93a28e21ff0b2db30372232f9cb2328888c2a Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Thu, 20 Jul 2023 01:27:33 +0900 Subject: [PATCH 02/42] modify client to fit backend api endpoints --- cleanlab_studio/internal/api/api.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index ae9157fe..7d38b8f0 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -27,7 +27,7 @@ dataset_base_url = f"{base_url}/datasets" project_base_url = f"{base_url}/projects" cleanset_base_url = f"{base_url}/cleansets" -model_base_url = f"{base_url}/models" +model_base_url = f"{base_url}/v1/deployment" def _construct_headers( @@ -344,29 +344,36 @@ async def upload_predict_batch(api_key: str, model_id: str, batch: io.StringIO) resp_json = await resp.json() handle_api_error_from_json(resp_json) - query_id: str = resp_json["query_id"] upload_url: str = resp_json["upload_url"] - session.put(upload_url, data=batch) + session.put(upload_url["url"], data=upload_url["fields"], files=batch) - return query_id + return upload_url["fields"]["key"] -async def start_prediction(api_key: str, model_id: str, query_id: str) -> None: +async def start_prediction(api_key: str, model_id: str, s3_key: str) -> None: """Starts prediction for query.""" async with aiohttp.ClientSession() as session: async with session.post( - f"{model_base_url}/{model_id}/predict/{query_id}", + f"{model_base_url}/{model_id}/predict", headers=_construct_headers(api_key), + data={ + "s3_key": s3_key, + } ) as resp: - handle_api_error_from_json(await resp.json()) + resp_json = await resp.json() + handle_api_error_from_json(resp_json) + + query_id: str = resp_json["id"] + + return query_id -async def get_prediction_status(api_key: str, model_id: str, query_id: str) -> Dict[str, str]: +async def get_prediction_status(api_key: str, query_id: str) -> Dict[str, str]: """Gets status of model prediction query.""" async with aiohttp.ClientSession() as session: async with session.get( - f"{model_base_url}/{model_id}/predict/{query_id}", + f"{model_base_url}/predict/{query_id}", headers=_construct_headers(api_key), ) as resp: resp_json = await resp.json() From 9257172d12cd61669bd97a82d8d99e2b09f0abca Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Sat, 22 Jul 2023 01:01:07 +0900 Subject: [PATCH 03/42] modify cli to make model prediction work --- cleanlab_studio/internal/api/api.py | 79 +++++++++++++++-------------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index 7d38b8f0..0b0ede71 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -21,6 +21,8 @@ from cleanlab_studio.internal.types import JSONDict from cleanlab_studio.version import __version__ +os.environ["CLEANLAB_API_BASE_URL"] = "http://localhost:8500/api" + base_url = os.environ.get("CLEANLAB_API_BASE_URL", "https://api.cleanlab.ai/api") cli_base_url = f"{base_url}/cli/v0" upload_base_url = f"{base_url}/upload/v0" @@ -334,56 +336,59 @@ def poll_progress( return res -async def upload_predict_batch(api_key: str, model_id: str, batch: io.StringIO) -> str: +def upload_predict_batch(api_key: str, model_id: str, batch: io.StringIO) -> str: """Uploads prediction batch and returns query ID.""" - async with aiohttp.ClientSession() as session: - async with session.post( - f"{model_base_url}/{model_id}/upload", - headers=_construct_headers(api_key), - ) as resp: - resp_json = await resp.json() - handle_api_error_from_json(resp_json) + res = requests.post( + f"{model_base_url}/{model_id}/upload", + headers=_construct_headers(api_key), + ) - upload_url: str = resp_json["upload_url"] + handle_api_error(res) + presigned_url = res.json()["upload_url"] - session.put(upload_url["url"], data=upload_url["fields"], files=batch) + requests.post(presigned_url["url"], data=presigned_url["fields"], files={"file": batch}) - return upload_url["fields"]["key"] + return presigned_url["fields"]["key"] -async def start_prediction(api_key: str, model_id: str, s3_key: str) -> None: +def start_prediction(api_key: str, model_id: str, s3_key: str) -> str: """Starts prediction for query.""" - async with aiohttp.ClientSession() as session: - async with session.post( - f"{model_base_url}/{model_id}/predict", - headers=_construct_headers(api_key), - data={ - "s3_key": s3_key, - } - ) as resp: - resp_json = await resp.json() - handle_api_error_from_json(resp_json) + res = requests.post( + f"{model_base_url}/{model_id}/predict", + headers=_construct_headers(api_key), + json={ + "s3_key": s3_key, + } + ) - query_id: str = resp_json["id"] + handle_api_error(res) + query_id: str = res.json()["id"] - return query_id + return query_id -async def get_prediction_status(api_key: str, query_id: str) -> Dict[str, str]: +def get_prediction_status(api_key: str, query_id: str) -> Dict[str, str]: """Gets status of model prediction query.""" - async with aiohttp.ClientSession() as session: - async with session.get( - f"{model_base_url}/predict/{query_id}", - headers=_construct_headers(api_key), - ) as resp: - resp_json = await resp.json() - handle_api_error_from_json(resp_json) + res = requests.get( + f"{model_base_url}/predict/{query_id}", + headers=_construct_headers(api_key), + ) + handle_api_error(res) + + prediction_results = res.json() + status = prediction_results["status"] + result_url = prediction_results["results"] + error_msg = prediction_results["error_msg"] - return resp_json + if prediction_results["status"] == "COMPLETE": + return {"status": status, "result_url": result_url} + elif prediction_results["status"] == "FAILED": + return {"status": status, "error_msg": error_msg} + else: + return {"status": status} -async def download_prediction_results(result_url: str) -> io.StringIO: +def download_prediction_results(result_url: str) -> io.StringIO: """Downloads prediction results from presigned URL.""" - async with aiohttp.ClientSession() as session: - async with session.get(result_url) as resp: - return io.StringIO(await resp.text()) + res = requests.get(result_url) + return io.StringIO(res.text) From 79b5cde1eac9297c00b98fc984a22f5470dd8043 Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Sat, 22 Jul 2023 01:16:43 +0900 Subject: [PATCH 04/42] black --- cleanlab_studio/internal/api/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index 0b0ede71..28ef7ae8 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -358,7 +358,7 @@ def start_prediction(api_key: str, model_id: str, s3_key: str) -> str: headers=_construct_headers(api_key), json={ "s3_key": s3_key, - } + }, ) handle_api_error(res) From 92d973cd33efc0c85e9e2f33b659cd6310ba84b3 Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Mon, 24 Jul 2023 10:14:38 -0700 Subject: [PATCH 05/42] modify cli after testing api endpoints --- cleanlab_studio/internal/api/api.py | 13 ++++--------- cleanlab_studio/studio/inference.py | 16 ++++++++-------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index 28ef7ae8..573e5cfb 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -345,26 +345,21 @@ def upload_predict_batch(api_key: str, model_id: str, batch: io.StringIO) -> str handle_api_error(res) presigned_url = res.json()["upload_url"] + query_id = res.json()["query_id"] requests.post(presigned_url["url"], data=presigned_url["fields"], files={"file": batch}) - return presigned_url["fields"]["key"] + return query_id -def start_prediction(api_key: str, model_id: str, s3_key: str) -> str: +def start_prediction(api_key: str, model_id: str, query_id: str) -> None: """Starts prediction for query.""" res = requests.post( - f"{model_base_url}/{model_id}/predict", + f"{model_base_url}/{model_id}/predict/{query_id}", headers=_construct_headers(api_key), - json={ - "s3_key": s3_key, - }, ) handle_api_error(res) - query_id: str = res.json()["id"] - - return query_id def get_prediction_status(api_key: str, query_id: str) -> Dict[str, str]: diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index 9dc64957..d1947531 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -42,10 +42,10 @@ def _predict( :param return_pred_proba: if should return class probabilities :return: predictions + class probabilities, if requested """ - return asyncio.run(self._predict_async(batch, return_pred_proba)) + return self._predict_async(batch, return_pred_proba) @abc.abstractmethod - async def predict_async( + def predict_async( self, batch: Any, return_pred_proba: bool = False ) -> Awaitable[Predictions] | Awaitable[Tuple[Predictions, ClassProbablities]]: """Asynchronously gets predictions for batch of examples, optionally returning class probabilities. @@ -56,7 +56,7 @@ async def predict_async( """ raise NotImplementedError - async def _predict_async( + def _predict_async( self, batch: io.StringIO, return_pred_proba: bool ) -> Predictions | Tuple[Predictions, ClassProbablities]: """Asynchronously gets predictions for batch of examples, optionally returning class probabilities. @@ -65,17 +65,17 @@ async def _predict_async( :param return_pred_proba: if should return class probabilities, defaults to False :return: predictions + class probabilities, if requested """ - query_id: str = await api.upload_predict_batch(self._api_key, self._model_id, batch) - await api.start_prediction(self._api_key, self._model_id, query_id) + query_id: str = api.upload_predict_batch(self._api_key, self._model_id, batch) + api.start_prediction(self._api_key, self._model_id, query_id) status: str | None = None result_url: str = "" while status != "done": - status, result_url = await api.get_prediction_status( - self._api_key, self._model_id, query_id + status, result_url = api.get_prediction_status( + self._api_key, query_id ) # TODO handle get pred proba case return pd.read_csv( - await api.download_prediction_results(result_url), + api.download_prediction_results(result_url), ).values From 967722b45e8161bc917ece7916a1f234c586863b Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Mon, 24 Jul 2023 10:34:04 -0700 Subject: [PATCH 06/42] black --- cleanlab_studio/studio/inference.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index d1947531..45158167 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -71,9 +71,7 @@ def _predict_async( status: str | None = None result_url: str = "" while status != "done": - status, result_url = api.get_prediction_status( - self._api_key, query_id - ) + status, result_url = api.get_prediction_status(self._api_key, query_id) # TODO handle get pred proba case return pd.read_csv( From 3d63153033a0c12587588d756565a6a860697efb Mon Sep 17 00:00:00 2001 From: ryansingman Date: Mon, 24 Jul 2023 14:07:03 -0600 Subject: [PATCH 07/42] integrate into Studio class --- cleanlab_studio/studio/inference.py | 85 ++++++++++++++++------------- cleanlab_studio/studio/studio.py | 6 +- 2 files changed, 51 insertions(+), 40 deletions(-) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index 45158167..f00bf940 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -1,6 +1,8 @@ import abc +import csv +import functools import io -from typing import Any, Awaitable, Tuple +from typing import List import numpy as np import numpy.typing as npt @@ -9,6 +11,10 @@ from cleanlab_studio.internal.api import api +TextBatch = List[str] | npt.NDArray[np.str_] | pd.Series +TabularBatch = pd.DataFrame +Batch = TextBatch | TabularBatch + Predictions = npt.NDArray[np.int_] | npt.NDArray[np.str_] ClassProbablities = pd.DataFrame @@ -21,49 +27,23 @@ def __init__(self, api_key: str, model_id: str): self._api_key = api_key self._model_id = model_id - @abc.abstractmethod def predict( - self, batch: Any, return_pred_proba: bool = False - ) -> Predictions | Tuple[Predictions, ClassProbablities]: - """Gets predictions for batch of examples, optionally returning class probabilities. - - :param batch: batch of example to predict classes for - :param return_pred_proba: if should return class probabilities, defaults to False - :return: predictions + class probabilities, if requested - """ - raise NotImplementedError - - def _predict( - self, batch: io.StringIO, return_pred_proba: bool - ) -> Predictions | Tuple[Predictions, ClassProbablities]: - """Gets predictions for batch of examples, optionally returning class probabilities. - - :param batch: batch of example to predict classes for, as in-memory CSV file - :param return_pred_proba: if should return class probabilities - :return: predictions + class probabilities, if requested - """ - return self._predict_async(batch, return_pred_proba) - - @abc.abstractmethod - def predict_async( - self, batch: Any, return_pred_proba: bool = False - ) -> Awaitable[Predictions] | Awaitable[Tuple[Predictions, ClassProbablities]]: - """Asynchronously gets predictions for batch of examples, optionally returning class probabilities. + self, + batch: Batch, + ) -> Predictions: + """Gets predictions for batch of examples. :param batch: batch of example to predict classes for - :param return_pred_proba: if should return class probabilities, defaults to False - :return: predictions + class probabilities, if requested + :return: predictions from batch """ - raise NotImplementedError + csv_batch = self._convert_batch_to_csv(batch) + return self._predict(csv_batch) - def _predict_async( - self, batch: io.StringIO, return_pred_proba: bool - ) -> Predictions | Tuple[Predictions, ClassProbablities]: - """Asynchronously gets predictions for batch of examples, optionally returning class probabilities. + def _predict(self, batch: io.StringIO) -> Predictions: + """Gets predictions for batch of examples. :param batch: batch of example to predict classes for, as in-memory CSV file - :param return_pred_proba: if should return class probabilities, defaults to False - :return: predictions + class probabilities, if requested + :return: predictions from batch """ query_id: str = api.upload_predict_batch(self._api_key, self._model_id, batch) api.start_prediction(self._api_key, self._model_id, query_id) @@ -71,9 +51,36 @@ def _predict_async( status: str | None = None result_url: str = "" while status != "done": - status, result_url = api.get_prediction_status(self._api_key, query_id) + resp = api.get_prediction_status(self._api_key, query_id) + status = resp["status"] + result_url = resp["result_url"] - # TODO handle get pred proba case return pd.read_csv( api.download_prediction_results(result_url), ).values + + @functools.singledispatchmethod + def _convert_batch_to_csv(self, batch: Batch) -> io.StringIO: + """Converts batch object to CSV string IO.""" + sio = io.StringIO() + + # handle text batches + if isinstance(batch, (list, np.ndarray, pd.Series)): + writer = csv.writer(sio) + + # write header + writer.writerow(["label"]) + + # write labels to CSV + for label in batch: + writer.writerow([label]) + + # handle tabular batches + elif isinstance(batch, pd.DataFrame): + batch.to_csv(sio) + + else: + raise TypeError(f"Invalid type of batch: {type(batch)}") + + sio.seek(0) + return sio diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index c0eacc11..56d38a8e 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -4,7 +4,7 @@ import numpy.typing as npt import pandas as pd -from . import clean, upload +from . import clean, upload, inference from cleanlab_studio.internal.api import api from cleanlab_studio.internal.util import init_dataset_source, check_none, check_not_none from cleanlab_studio.internal.settings import CleanlabSettings @@ -231,6 +231,10 @@ def delete_project(self, project_id: str) -> None: api.delete_project(self._api_key, project_id) print(f"Successfully deleted project: {project_id}") + def get_model(self, model_id: str) -> inference.Model: + """Creates model object from model ID, to use for inference.""" + return inference.Model(self._api_key, model_id) + class Experimental: def __init__(self, outer): # type: ignore self._outer = outer From d81b2096afc3042f73a2047a3e76ae3f491da0bb Mon Sep 17 00:00:00 2001 From: ryansingman Date: Mon, 24 Jul 2023 14:23:50 -0600 Subject: [PATCH 08/42] remove base url override --- cleanlab_studio/internal/api/api.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index 573e5cfb..ce565c36 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -21,8 +21,6 @@ from cleanlab_studio.internal.types import JSONDict from cleanlab_studio.version import __version__ -os.environ["CLEANLAB_API_BASE_URL"] = "http://localhost:8500/api" - base_url = os.environ.get("CLEANLAB_API_BASE_URL", "https://api.cleanlab.ai/api") cli_base_url = f"{base_url}/cli/v0" upload_base_url = f"{base_url}/upload/v0" @@ -338,8 +336,10 @@ def poll_progress( def upload_predict_batch(api_key: str, model_id: str, batch: io.StringIO) -> str: """Uploads prediction batch and returns query ID.""" + url = f"{model_base_url}/{model_id}/upload" + print(f"upload {url=}") res = requests.post( - f"{model_base_url}/{model_id}/upload", + url, headers=_construct_headers(api_key), ) From 865cf032928373d70a5404e4fc400e98fc8eec88 Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Mon, 24 Jul 2023 14:00:43 -0700 Subject: [PATCH 09/42] fix api endpoint for client to work --- cleanlab_studio/internal/api/api.py | 11 ++++++----- cleanlab_studio/studio/inference.py | 7 ++++--- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index ce565c36..8ba68fe6 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -21,6 +21,7 @@ from cleanlab_studio.internal.types import JSONDict from cleanlab_studio.version import __version__ + base_url = os.environ.get("CLEANLAB_API_BASE_URL", "https://api.cleanlab.ai/api") cli_base_url = f"{base_url}/cli/v0" upload_base_url = f"{base_url}/upload/v0" @@ -375,12 +376,12 @@ def get_prediction_status(api_key: str, query_id: str) -> Dict[str, str]: result_url = prediction_results["results"] error_msg = prediction_results["error_msg"] - if prediction_results["status"] == "COMPLETE": - return {"status": status, "result_url": result_url} - elif prediction_results["status"] == "FAILED": - return {"status": status, "error_msg": error_msg} + if status == "COMPLETE": + return {"status": "done", "result_url": result_url} + elif status == "FAILED": + return {"status": "error", "error_msg": error_msg} else: - return {"status": status} + return {"status": "running"} def download_prediction_results(result_url: str) -> io.StringIO: diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index f00bf940..ce07dc9d 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -48,12 +48,13 @@ def _predict(self, batch: io.StringIO) -> Predictions: query_id: str = api.upload_predict_batch(self._api_key, self._model_id, batch) api.start_prediction(self._api_key, self._model_id, query_id) - status: str | None = None - result_url: str = "" + resp = api.get_prediction_status(self._api_key, query_id) + status: str | None = resp["status"] while status != "done": resp = api.get_prediction_status(self._api_key, query_id) status = resp["status"] - result_url = resp["result_url"] + + result_url = resp["result_url"] return pd.read_csv( api.download_prediction_results(result_url), From c88f90b945e3d7dcf1c6e2a8c3417f1bfd93114e Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Mon, 24 Jul 2023 15:08:25 -0700 Subject: [PATCH 10/42] modify code to support text files without headers --- cleanlab_studio/internal/api/api.py | 16 +++++++++++++-- cleanlab_studio/studio/inference.py | 21 ++++++++++--------- tests/models/test_prediction.py | 31 +++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 13 deletions(-) create mode 100644 tests/models/test_prediction.py diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index 8ba68fe6..c1504516 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -1,6 +1,8 @@ import io import os import time +from itertools import chain +from shutil import copyfileobj from typing import Callable, List, Optional, Tuple, Dict, Union, Any from cleanlab_studio.errors import APIError @@ -338,7 +340,6 @@ def poll_progress( def upload_predict_batch(api_key: str, model_id: str, batch: io.StringIO) -> str: """Uploads prediction batch and returns query ID.""" url = f"{model_base_url}/{model_id}/upload" - print(f"upload {url=}") res = requests.post( url, headers=_construct_headers(api_key), @@ -347,8 +348,19 @@ def upload_predict_batch(api_key: str, model_id: str, batch: io.StringIO) -> str handle_api_error(res) presigned_url = res.json()["upload_url"] query_id = res.json()["query_id"] + header = res.json()["header"] + if header: + batch_header = batch.readline() + if batch_header == header: + input_batch = batch + else: + header_io = io.StringIO(header) + batch_header_io = io.StringIO(batch_header) + input_batch = io.StringIO("\n".join(chain(header_io, batch_header_io, batch))) + else: + input_batch = batch - requests.post(presigned_url["url"], data=presigned_url["fields"], files={"file": batch}) + requests.post(presigned_url["url"], data=presigned_url["fields"], files={"file": input_batch}) return query_id diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index ce07dc9d..6cf1832a 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -50,15 +50,17 @@ def _predict(self, batch: io.StringIO) -> Predictions: resp = api.get_prediction_status(self._api_key, query_id) status: str | None = resp["status"] - while status != "done": + while status == "running": resp = api.get_prediction_status(self._api_key, query_id) status = resp["status"] - result_url = resp["result_url"] - - return pd.read_csv( - api.download_prediction_results(result_url), - ).values + if status == "error": + return resp["error_msg"] + else: + result_url = resp["result_url"] + return pd.read_csv( + api.download_prediction_results(result_url), + ).values @functools.singledispatchmethod def _convert_batch_to_csv(self, batch: Batch) -> io.StringIO: @@ -69,12 +71,9 @@ def _convert_batch_to_csv(self, batch: Batch) -> io.StringIO: if isinstance(batch, (list, np.ndarray, pd.Series)): writer = csv.writer(sio) - # write header - writer.writerow(["label"]) - # write labels to CSV - for label in batch: - writer.writerow([label]) + for input_data in batch: + writer.writerow([input_data]) # handle tabular batches elif isinstance(batch, pd.DataFrame): diff --git a/tests/models/test_prediction.py b/tests/models/test_prediction.py new file mode 100644 index 00000000..bd60c50a --- /dev/null +++ b/tests/models/test_prediction.py @@ -0,0 +1,31 @@ +import os + +os.environ["CLEANLAB_API_BASE_URL"] = "https://api.dev-bc26qf4m.cleanlab.ai/api" +# os.environ["CLEANLAB_API_BASE_URL"] = "http://localhost:8500/api" + +from cleanlab_studio import Studio +import pandas as pd + + +API_KEY = "350b3ee6fbe64d21a6012ea281ce0ca1" +MODEL_ID = "cea761848e5f449b85e34fe347696b53" +# API_KEY = "75f2ab8c962c40169917136756c5d937" +# MODEL_ID = "750dbdfb6549470192573b9646be40e9" +BATCH = pd.read_csv("/Users/tony/test_files/text_amazon_reviews_test_small.csv") +TEXT_BATCH = [ + "This magazine was great for the times but as with all other technology magazines the new stuff isn't as good a lot of advertisments and reviews seem biased.", + "We ordered this magazine for our grandson (then 7 going on 30) who was/is deploy into technology. He really enjoyed every issue.", + "I didn't receive a full year. I only receive the magazine twice. It's a good magazine, I just didn't receive it as promised.", + "I was hoping for more technical than what was there. it seems to be more like 'look how cool this is' than a technical publication. It's like sport compact car, but for computers.", + "I only received one copy of the mag so I couldn't really find out if it was good reading or not", + "This magazine is just ok. I ended up subscribing to pc world instead. They are more for the technician and not just the cusumer.", + "There articles are alright, but they screw you on the amount you get as i only got 10 of the 12 months subcription. so be carefull unless you are on the auto renew.", + "Excellent product! I love reading through the magazine and learning about the cool new products out there and the cool programs!", + "I ordered this hoping to learn more about the latest gadgets, and I did learn some things but in over my head over all. I do not enjoy this reading at all.", + "Love the magazine. The price through Amazon is well worth it for the knowledge recieved and the subscription process is painless", + "I bought this subscription for my son. He is presently building a computer. He said it has lots of good and useful information in it.", +] +studio = Studio(API_KEY) +model = studio.get_model(MODEL_ID) +results = model.predict(TEXT_BATCH) +print(results) From 7db17d80b0a7d0c82c7cba62c9a903321552d17f Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Mon, 24 Jul 2023 15:09:32 -0700 Subject: [PATCH 11/42] remove test file for local testing --- tests/models/test_prediction.py | 31 ------------------------------- 1 file changed, 31 deletions(-) delete mode 100644 tests/models/test_prediction.py diff --git a/tests/models/test_prediction.py b/tests/models/test_prediction.py deleted file mode 100644 index bd60c50a..00000000 --- a/tests/models/test_prediction.py +++ /dev/null @@ -1,31 +0,0 @@ -import os - -os.environ["CLEANLAB_API_BASE_URL"] = "https://api.dev-bc26qf4m.cleanlab.ai/api" -# os.environ["CLEANLAB_API_BASE_URL"] = "http://localhost:8500/api" - -from cleanlab_studio import Studio -import pandas as pd - - -API_KEY = "350b3ee6fbe64d21a6012ea281ce0ca1" -MODEL_ID = "cea761848e5f449b85e34fe347696b53" -# API_KEY = "75f2ab8c962c40169917136756c5d937" -# MODEL_ID = "750dbdfb6549470192573b9646be40e9" -BATCH = pd.read_csv("/Users/tony/test_files/text_amazon_reviews_test_small.csv") -TEXT_BATCH = [ - "This magazine was great for the times but as with all other technology magazines the new stuff isn't as good a lot of advertisments and reviews seem biased.", - "We ordered this magazine for our grandson (then 7 going on 30) who was/is deploy into technology. He really enjoyed every issue.", - "I didn't receive a full year. I only receive the magazine twice. It's a good magazine, I just didn't receive it as promised.", - "I was hoping for more technical than what was there. it seems to be more like 'look how cool this is' than a technical publication. It's like sport compact car, but for computers.", - "I only received one copy of the mag so I couldn't really find out if it was good reading or not", - "This magazine is just ok. I ended up subscribing to pc world instead. They are more for the technician and not just the cusumer.", - "There articles are alright, but they screw you on the amount you get as i only got 10 of the 12 months subcription. so be carefull unless you are on the auto renew.", - "Excellent product! I love reading through the magazine and learning about the cool new products out there and the cool programs!", - "I ordered this hoping to learn more about the latest gadgets, and I did learn some things but in over my head over all. I do not enjoy this reading at all.", - "Love the magazine. The price through Amazon is well worth it for the knowledge recieved and the subscription process is painless", - "I bought this subscription for my son. He is presently building a computer. He said it has lots of good and useful information in it.", -] -studio = Studio(API_KEY) -model = studio.get_model(MODEL_ID) -results = model.predict(TEXT_BATCH) -print(results) From 2d27b3c24c536e09af8defba3c1ad791c2b5d490 Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Mon, 24 Jul 2023 15:40:32 -0700 Subject: [PATCH 12/42] change response for upload api and remove logic for comparing headers of text input files with actual text data columns used for prediction --- cleanlab_studio/internal/api/api.py | 12 ++++------ tests/models/test_prediction.py | 34 +++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 8 deletions(-) create mode 100644 tests/models/test_prediction.py diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index c1504516..df1c9ae7 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -348,15 +348,11 @@ def upload_predict_batch(api_key: str, model_id: str, batch: io.StringIO) -> str handle_api_error(res) presigned_url = res.json()["upload_url"] query_id = res.json()["query_id"] + modality = res.json()["modality"] header = res.json()["header"] - if header: - batch_header = batch.readline() - if batch_header == header: - input_batch = batch - else: - header_io = io.StringIO(header) - batch_header_io = io.StringIO(batch_header) - input_batch = io.StringIO("\n".join(chain(header_io, batch_header_io, batch))) + if modality == "text": + header_io = io.StringIO(header) + input_batch = io.StringIO("\n".join(chain(header_io, batch))) else: input_batch = batch diff --git a/tests/models/test_prediction.py b/tests/models/test_prediction.py new file mode 100644 index 00000000..3474b413 --- /dev/null +++ b/tests/models/test_prediction.py @@ -0,0 +1,34 @@ +import os + +os.environ["CLEANLAB_API_BASE_URL"] = "https://api.dev-bc26qf4m.cleanlab.ai/api" +# os.environ["CLEANLAB_API_BASE_URL"] = "http://localhost:8500/api" + +from cleanlab_studio import Studio +import pandas as pd + + +API_KEY = "350b3ee6fbe64d21a6012ea281ce0ca1" +MODEL_ID = "cea761848e5f449b85e34fe347696b53" +# API_KEY = "75f2ab8c962c40169917136756c5d937" +# MODEL_ID = "750dbdfb6549470192573b9646be40e9" +BATCH = pd.read_csv( + "/Users/tony/test_files/text_amazon_reviews_test_small.csv", index_col=False, header=0 +).loc[0, :] +print(BATCH) +# TEXT_BATCH = pd.Series([ +# "This magazine was great for the times but as with all other technology magazines the new stuff isn't as good a lot of advertisments and reviews seem biased.", +# "We ordered this magazine for our grandson (then 7 going on 30) who was/is deploy into technology. He really enjoyed every issue.", +# "I didn't receive a full year. I only receive the magazine twice. It's a good magazine, I just didn't receive it as promised.", +# "I was hoping for more technical than what was there. it seems to be more like 'look how cool this is' than a technical publication. It's like sport compact car, but for computers.", +# "I only received one copy of the mag so I couldn't really find out if it was good reading or not", +# "This magazine is just ok. I ended up subscribing to pc world instead. They are more for the technician and not just the cusumer.", +# "There articles are alright, but they screw you on the amount you get as i only got 10 of the 12 months subcription. so be carefull unless you are on the auto renew.", +# "Excellent product! I love reading through the magazine and learning about the cool new products out there and the cool programs!", +# "I ordered this hoping to learn more about the latest gadgets, and I did learn some things but in over my head over all. I do not enjoy this reading at all.", +# "Love the magazine. The price through Amazon is well worth it for the knowledge recieved and the subscription process is painless", +# "I bought this subscription for my son. He is presently building a computer. He said it has lots of good and useful information in it.", +# ], name="review_text") +studio = Studio(API_KEY) +model = studio.get_model(MODEL_ID) +results = model.predict(BATCH) +print(results) From 25d33acbb1910d1ad1fc64f16cb76e2e33e8a970 Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Mon, 24 Jul 2023 16:12:29 -0700 Subject: [PATCH 13/42] modify invoke lambda api to send only query_id as param --- cleanlab_studio/internal/api/api.py | 4 ++-- cleanlab_studio/studio/inference.py | 2 +- tests/models/test_prediction.py | 17 +++++++---------- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index df1c9ae7..d36eb358 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -361,10 +361,10 @@ def upload_predict_batch(api_key: str, model_id: str, batch: io.StringIO) -> str return query_id -def start_prediction(api_key: str, model_id: str, query_id: str) -> None: +def start_prediction(api_key: str, query_id: str) -> None: """Starts prediction for query.""" res = requests.post( - f"{model_base_url}/{model_id}/predict/{query_id}", + f"{model_base_url}/predict/{query_id}", headers=_construct_headers(api_key), ) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index 6cf1832a..ec6109da 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -46,7 +46,7 @@ def _predict(self, batch: io.StringIO) -> Predictions: :return: predictions from batch """ query_id: str = api.upload_predict_batch(self._api_key, self._model_id, batch) - api.start_prediction(self._api_key, self._model_id, query_id) + api.start_prediction(self._api_key, query_id) resp = api.get_prediction_status(self._api_key, query_id) status: str | None = resp["status"] diff --git a/tests/models/test_prediction.py b/tests/models/test_prediction.py index 3474b413..a54d1d77 100644 --- a/tests/models/test_prediction.py +++ b/tests/models/test_prediction.py @@ -1,20 +1,17 @@ import os -os.environ["CLEANLAB_API_BASE_URL"] = "https://api.dev-bc26qf4m.cleanlab.ai/api" -# os.environ["CLEANLAB_API_BASE_URL"] = "http://localhost:8500/api" +# os.environ["CLEANLAB_API_BASE_URL"] = "https://api.dev-bc26qf4m.cleanlab.ai/api" +os.environ["CLEANLAB_API_BASE_URL"] = "http://localhost:8500/api" from cleanlab_studio import Studio import pandas as pd -API_KEY = "350b3ee6fbe64d21a6012ea281ce0ca1" -MODEL_ID = "cea761848e5f449b85e34fe347696b53" -# API_KEY = "75f2ab8c962c40169917136756c5d937" -# MODEL_ID = "750dbdfb6549470192573b9646be40e9" -BATCH = pd.read_csv( - "/Users/tony/test_files/text_amazon_reviews_test_small.csv", index_col=False, header=0 -).loc[0, :] -print(BATCH) +# API_KEY = "350b3ee6fbe64d21a6012ea281ce0ca1" +# MODEL_ID = "cea761848e5f449b85e34fe347696b53" +API_KEY = "75f2ab8c962c40169917136756c5d937" +MODEL_ID = "750dbdfb6549470192573b9646be40e9" +BATCH = pd.read_csv("/Users/tony/test_files/tabular_grades_test_small.csv") # TEXT_BATCH = pd.Series([ # "This magazine was great for the times but as with all other technology magazines the new stuff isn't as good a lot of advertisments and reviews seem biased.", # "We ordered this magazine for our grandson (then 7 going on 30) who was/is deploy into technology. He really enjoyed every issue.", From ba0a5f92ae7a289a9571c77fc937a9337ef7014d Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Tue, 25 Jul 2023 09:49:53 -0700 Subject: [PATCH 14/42] remove test file --- cleanlab_studio/internal/api/api.py | 2 -- tests/models/test_prediction.py | 31 ----------------------------- 2 files changed, 33 deletions(-) delete mode 100644 tests/models/test_prediction.py diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index d36eb358..e945a906 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -2,11 +2,9 @@ import os import time from itertools import chain -from shutil import copyfileobj from typing import Callable, List, Optional, Tuple, Dict, Union, Any from cleanlab_studio.errors import APIError -import aiohttp import requests from tqdm import tqdm import pandas as pd diff --git a/tests/models/test_prediction.py b/tests/models/test_prediction.py deleted file mode 100644 index a54d1d77..00000000 --- a/tests/models/test_prediction.py +++ /dev/null @@ -1,31 +0,0 @@ -import os - -# os.environ["CLEANLAB_API_BASE_URL"] = "https://api.dev-bc26qf4m.cleanlab.ai/api" -os.environ["CLEANLAB_API_BASE_URL"] = "http://localhost:8500/api" - -from cleanlab_studio import Studio -import pandas as pd - - -# API_KEY = "350b3ee6fbe64d21a6012ea281ce0ca1" -# MODEL_ID = "cea761848e5f449b85e34fe347696b53" -API_KEY = "75f2ab8c962c40169917136756c5d937" -MODEL_ID = "750dbdfb6549470192573b9646be40e9" -BATCH = pd.read_csv("/Users/tony/test_files/tabular_grades_test_small.csv") -# TEXT_BATCH = pd.Series([ -# "This magazine was great for the times but as with all other technology magazines the new stuff isn't as good a lot of advertisments and reviews seem biased.", -# "We ordered this magazine for our grandson (then 7 going on 30) who was/is deploy into technology. He really enjoyed every issue.", -# "I didn't receive a full year. I only receive the magazine twice. It's a good magazine, I just didn't receive it as promised.", -# "I was hoping for more technical than what was there. it seems to be more like 'look how cool this is' than a technical publication. It's like sport compact car, but for computers.", -# "I only received one copy of the mag so I couldn't really find out if it was good reading or not", -# "This magazine is just ok. I ended up subscribing to pc world instead. They are more for the technician and not just the cusumer.", -# "There articles are alright, but they screw you on the amount you get as i only got 10 of the 12 months subcription. so be carefull unless you are on the auto renew.", -# "Excellent product! I love reading through the magazine and learning about the cool new products out there and the cool programs!", -# "I ordered this hoping to learn more about the latest gadgets, and I did learn some things but in over my head over all. I do not enjoy this reading at all.", -# "Love the magazine. The price through Amazon is well worth it for the knowledge recieved and the subscription process is painless", -# "I bought this subscription for my son. He is presently building a computer. He said it has lots of good and useful information in it.", -# ], name="review_text") -studio = Studio(API_KEY) -model = studio.get_model(MODEL_ID) -results = model.predict(BATCH) -print(results) From 9ab9d283357c24007c408e19fa7f77c16fe0dddb Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Tue, 25 Jul 2023 10:34:11 -0700 Subject: [PATCH 15/42] fix mypy errors --- cleanlab_studio/internal/api/api.py | 3 ++- cleanlab_studio/studio/inference.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index e945a906..1e3e2f3c 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -345,7 +345,7 @@ def upload_predict_batch(api_key: str, model_id: str, batch: io.StringIO) -> str handle_api_error(res) presigned_url = res.json()["upload_url"] - query_id = res.json()["query_id"] + query_id: str = res.json()["query_id"] modality = res.json()["modality"] header = res.json()["header"] if modality == "text": @@ -393,4 +393,5 @@ def get_prediction_status(api_key: str, query_id: str) -> Dict[str, str]: def download_prediction_results(result_url: str) -> io.StringIO: """Downloads prediction results from presigned URL.""" res = requests.get(result_url) + print(res.text) return io.StringIO(res.text) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index ec6109da..73fd2ce3 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -2,7 +2,7 @@ import csv import functools import io -from typing import List +from typing import List, TypeAlias import numpy as np import numpy.typing as npt @@ -11,9 +11,9 @@ from cleanlab_studio.internal.api import api -TextBatch = List[str] | npt.NDArray[np.str_] | pd.Series -TabularBatch = pd.DataFrame -Batch = TextBatch | TabularBatch +TextBatch: TypeAlias = List[str] | npt.NDArray[np.str_] | pd.Series +TabularBatch: TypeAlias = pd.DataFrame +Batch: TypeAlias = TextBatch | TabularBatch Predictions = npt.NDArray[np.int_] | npt.NDArray[np.str_] ClassProbablities = pd.DataFrame @@ -30,7 +30,7 @@ def __init__(self, api_key: str, model_id: str): def predict( self, batch: Batch, - ) -> Predictions: + ) -> str | Predictions: """Gets predictions for batch of examples. :param batch: batch of example to predict classes for @@ -39,7 +39,7 @@ def predict( csv_batch = self._convert_batch_to_csv(batch) return self._predict(csv_batch) - def _predict(self, batch: io.StringIO) -> Predictions: + def _predict(self, batch: io.StringIO) -> str | Predictions: """Gets predictions for batch of examples. :param batch: batch of example to predict classes for, as in-memory CSV file @@ -58,9 +58,9 @@ def _predict(self, batch: io.StringIO) -> Predictions: return resp["error_msg"] else: result_url = resp["result_url"] - return pd.read_csv( - api.download_prediction_results(result_url), - ).values + results: io.StringIO = api.download_prediction_results(result_url) + results_converted: Predictions = pd.read_csv(results).to_numpy() + return results_converted @functools.singledispatchmethod def _convert_batch_to_csv(self, batch: Batch) -> io.StringIO: From 7db2438ee77cd1ebb04410d210c1459c96caac7c Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Tue, 25 Jul 2023 10:40:36 -0700 Subject: [PATCH 16/42] remove TypeAlias --- cleanlab_studio/studio/inference.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index 73fd2ce3..fd999629 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -2,7 +2,7 @@ import csv import functools import io -from typing import List, TypeAlias +from typing import List import numpy as np import numpy.typing as npt @@ -11,9 +11,9 @@ from cleanlab_studio.internal.api import api -TextBatch: TypeAlias = List[str] | npt.NDArray[np.str_] | pd.Series -TabularBatch: TypeAlias = pd.DataFrame -Batch: TypeAlias = TextBatch | TabularBatch +TextBatch = List[str] | npt.NDArray[np.str_] | pd.Series +TabularBatch = pd.DataFrame +Batch = TextBatch | TabularBatch Predictions = npt.NDArray[np.int_] | npt.NDArray[np.str_] ClassProbablities = pd.DataFrame From 7b1a48aa700ee2429db1e6ac58b7b54772f321d1 Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Tue, 25 Jul 2023 10:44:16 -0700 Subject: [PATCH 17/42] fix mypy for Batch type --- cleanlab_studio/studio/inference.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index fd999629..d492795c 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -2,7 +2,7 @@ import csv import functools import io -from typing import List +from typing import List, Union import numpy as np import numpy.typing as npt @@ -11,9 +11,9 @@ from cleanlab_studio.internal.api import api -TextBatch = List[str] | npt.NDArray[np.str_] | pd.Series -TabularBatch = pd.DataFrame -Batch = TextBatch | TabularBatch +TextBatch = Union[List[str], npt.NDArray[np.str_], pd.Series] +TabularBatch = Union[pd.DataFrame] +Batch = Union[TextBatch, TabularBatch] Predictions = npt.NDArray[np.int_] | npt.NDArray[np.str_] ClassProbablities = pd.DataFrame From b0abe970a677e487c7a6824c02eb1ded4858784f Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Tue, 25 Jul 2023 10:47:52 -0700 Subject: [PATCH 18/42] User Union instead of | for multi generic typing --- cleanlab_studio/studio/inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index d492795c..dc75dec4 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -15,7 +15,7 @@ TabularBatch = Union[pd.DataFrame] Batch = Union[TextBatch, TabularBatch] -Predictions = npt.NDArray[np.int_] | npt.NDArray[np.str_] +Predictions = Union[npt.NDArray[np.int_], npt.NDArray[np.str_]] ClassProbablities = pd.DataFrame From d5fb4d4be948d977f85ddc4c7a3202594e494297 Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Tue, 25 Jul 2023 10:55:50 -0700 Subject: [PATCH 19/42] more typing fixes and timeout in prediction --- cleanlab_studio/studio/inference.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index dc75dec4..864baee3 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -2,7 +2,8 @@ import csv import functools import io -from typing import List, Union +import time +from typing import List, Union, Optional import numpy as np import numpy.typing as npt @@ -30,7 +31,7 @@ def __init__(self, api_key: str, model_id: str): def predict( self, batch: Batch, - ) -> str | Predictions: + ) -> Union[str, Predictions]: """Gets predictions for batch of examples. :param batch: batch of example to predict classes for @@ -39,7 +40,7 @@ def predict( csv_batch = self._convert_batch_to_csv(batch) return self._predict(csv_batch) - def _predict(self, batch: io.StringIO) -> str | Predictions: + def _predict(self, batch: io.StringIO) -> Union[str, Predictions]: """Gets predictions for batch of examples. :param batch: batch of example to predict classes for, as in-memory CSV file @@ -49,8 +50,11 @@ def _predict(self, batch: io.StringIO) -> str | Predictions: api.start_prediction(self._api_key, query_id) resp = api.get_prediction_status(self._api_key, query_id) - status: str | None = resp["status"] - while status == "running": + status: Optional[str] = resp["status"] + # Set timeout to 10 minutes as inference won't take longer than 10 minutes typically and + # to prevent users from getting stuck in this loop indefinitely when there is a failure + timeout = time.time() + 60 * 10 + while status == "running" or time.time() < timeout: resp = api.get_prediction_status(self._api_key, query_id) status = resp["status"] From 0b2a73c81020e3a57a4c75f65506c47292c36f45 Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Tue, 25 Jul 2023 11:23:20 -0700 Subject: [PATCH 20/42] change timeout to adn --- cleanlab_studio/studio/inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index 864baee3..a485f895 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -54,7 +54,7 @@ def _predict(self, batch: io.StringIO) -> Union[str, Predictions]: # Set timeout to 10 minutes as inference won't take longer than 10 minutes typically and # to prevent users from getting stuck in this loop indefinitely when there is a failure timeout = time.time() + 60 * 10 - while status == "running" or time.time() < timeout: + while status == "running" and time.time() < timeout: resp = api.get_prediction_status(self._api_key, query_id) status = resp["status"] From 4846bf3adb407c94ff21c56269c38023ee78f231 Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Tue, 25 Jul 2023 11:24:20 -0700 Subject: [PATCH 21/42] remove print statement --- cleanlab_studio/internal/api/api.py | 1 - tests/models/test_prediction.py | 31 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 tests/models/test_prediction.py diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index 1e3e2f3c..a3577cfa 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -393,5 +393,4 @@ def get_prediction_status(api_key: str, query_id: str) -> Dict[str, str]: def download_prediction_results(result_url: str) -> io.StringIO: """Downloads prediction results from presigned URL.""" res = requests.get(result_url) - print(res.text) return io.StringIO(res.text) diff --git a/tests/models/test_prediction.py b/tests/models/test_prediction.py new file mode 100644 index 00000000..0bf74a3d --- /dev/null +++ b/tests/models/test_prediction.py @@ -0,0 +1,31 @@ +import os + +# os.environ["CLEANLAB_API_BASE_URL"] = "https://api.dev-bc26qf4m.cleanlab.ai/api" +os.environ["CLEANLAB_API_BASE_URL"] = "http://localhost:8500/api" + +from cleanlab_studio import Studio +import pandas as pd + + +# API_KEY = "350b3ee6fbe64d21a6012ea281ce0ca1" +# MODEL_ID = "cea761848e5f449b85e34fe347696b53" +API_KEY = "75f2ab8c962c40169917136756c5d937" +MODEL_ID = "750dbdfb6549470192573b9646be40e9" +BATCH = pd.read_csv("/Users/tony/test_files/tabular_grades_test_small.csv") +TEXT_BATCH = [ + "This magazine was great for the times but as with all other technology magazines the new stuff isn't as good a lot of advertisments and reviews seem biased.", + "We ordered this magazine for our grandson (then 7 going on 30) who was/is deploy into technology. He really enjoyed every issue.", + "I didn't receive a full year. I only receive the magazine twice. It's a good magazine, I just didn't receive it as promised.", + "I was hoping for more technical than what was there. it seems to be more like 'look how cool this is' than a technical publication. It's like sport compact car, but for computers.", + "I only received one copy of the mag so I couldn't really find out if it was good reading or not", + "This magazine is just ok. I ended up subscribing to pc world instead. They are more for the technician and not just the cusumer.", + "There articles are alright, but they screw you on the amount you get as i only got 10 of the 12 months subcription. so be carefull unless you are on the auto renew.", + "Excellent product! I love reading through the magazine and learning about the cool new products out there and the cool programs!", + "I ordered this hoping to learn more about the latest gadgets, and I did learn some things but in over my head over all. I do not enjoy this reading at all.", + "Love the magazine. The price through Amazon is well worth it for the knowledge recieved and the subscription process is painless", + "I bought this subscription for my son. He is presently building a computer. He said it has lots of good and useful information in it.", +] +studio = Studio(API_KEY) +model = studio.get_model(MODEL_ID) +results = model.predict(BATCH) +print(results) From af6ce97a9210955f052e0bd9d6bd598a3a21e3fd Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Tue, 25 Jul 2023 11:43:08 -0700 Subject: [PATCH 22/42] remove test files again --- tests/models/test_prediction.py | 31 ------------------------------- 1 file changed, 31 deletions(-) delete mode 100644 tests/models/test_prediction.py diff --git a/tests/models/test_prediction.py b/tests/models/test_prediction.py deleted file mode 100644 index 0bf74a3d..00000000 --- a/tests/models/test_prediction.py +++ /dev/null @@ -1,31 +0,0 @@ -import os - -# os.environ["CLEANLAB_API_BASE_URL"] = "https://api.dev-bc26qf4m.cleanlab.ai/api" -os.environ["CLEANLAB_API_BASE_URL"] = "http://localhost:8500/api" - -from cleanlab_studio import Studio -import pandas as pd - - -# API_KEY = "350b3ee6fbe64d21a6012ea281ce0ca1" -# MODEL_ID = "cea761848e5f449b85e34fe347696b53" -API_KEY = "75f2ab8c962c40169917136756c5d937" -MODEL_ID = "750dbdfb6549470192573b9646be40e9" -BATCH = pd.read_csv("/Users/tony/test_files/tabular_grades_test_small.csv") -TEXT_BATCH = [ - "This magazine was great for the times but as with all other technology magazines the new stuff isn't as good a lot of advertisments and reviews seem biased.", - "We ordered this magazine for our grandson (then 7 going on 30) who was/is deploy into technology. He really enjoyed every issue.", - "I didn't receive a full year. I only receive the magazine twice. It's a good magazine, I just didn't receive it as promised.", - "I was hoping for more technical than what was there. it seems to be more like 'look how cool this is' than a technical publication. It's like sport compact car, but for computers.", - "I only received one copy of the mag so I couldn't really find out if it was good reading or not", - "This magazine is just ok. I ended up subscribing to pc world instead. They are more for the technician and not just the cusumer.", - "There articles are alright, but they screw you on the amount you get as i only got 10 of the 12 months subcription. so be carefull unless you are on the auto renew.", - "Excellent product! I love reading through the magazine and learning about the cool new products out there and the cool programs!", - "I ordered this hoping to learn more about the latest gadgets, and I did learn some things but in over my head over all. I do not enjoy this reading at all.", - "Love the magazine. The price through Amazon is well worth it for the knowledge recieved and the subscription process is painless", - "I bought this subscription for my son. He is presently building a computer. He said it has lots of good and useful information in it.", -] -studio = Studio(API_KEY) -model = studio.get_model(MODEL_ID) -results = model.predict(BATCH) -print(results) From a2a466c7f8cac6174e4b8af5d5fab6c1310070ab Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Tue, 25 Jul 2023 16:08:14 -0700 Subject: [PATCH 23/42] fix code review comments --- cleanlab_studio/internal/api/api.py | 2 +- cleanlab_studio/studio/inference.py | 22 +++++++++++--------- tests/models/test_prediction.py | 31 +++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 10 deletions(-) create mode 100644 tests/models/test_prediction.py diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index a3577cfa..4d117efa 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -393,4 +393,4 @@ def get_prediction_status(api_key: str, query_id: str) -> Dict[str, str]: def download_prediction_results(result_url: str) -> io.StringIO: """Downloads prediction results from presigned URL.""" res = requests.get(result_url) - return io.StringIO(res.text) + return io.StringIO(res.raw) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index a485f895..24312171 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -9,6 +9,7 @@ import numpy.typing as npt import pandas as pd +from cleanlab_studio.errors import APIError from cleanlab_studio.internal.api import api @@ -31,16 +32,18 @@ def __init__(self, api_key: str, model_id: str): def predict( self, batch: Batch, + timeout: int = 600, ) -> Union[str, Predictions]: """Gets predictions for batch of examples. :param batch: batch of example to predict classes for + :param timeout: optional parameter to set timeout for predictions in seconds :return: predictions from batch """ csv_batch = self._convert_batch_to_csv(batch) - return self._predict(csv_batch) + return self._predict(csv_batch, timeout) - def _predict(self, batch: io.StringIO) -> Union[str, Predictions]: + def _predict(self, batch: io.StringIO, timeout: int) -> Union[str, Predictions]: """Gets predictions for batch of examples. :param batch: batch of example to predict classes for, as in-memory CSV file @@ -51,23 +54,24 @@ def _predict(self, batch: io.StringIO) -> Union[str, Predictions]: resp = api.get_prediction_status(self._api_key, query_id) status: Optional[str] = resp["status"] - # Set timeout to 10 minutes as inference won't take longer than 10 minutes typically and - # to prevent users from getting stuck in this loop indefinitely when there is a failure - timeout = time.time() + 60 * 10 + # Set timeout to prevent users from getting stuck indefinitely when there is a failure + timeout = time.time() + timeout while status == "running" and time.time() < timeout: resp = api.get_prediction_status(self._api_key, query_id) status = resp["status"] + # Set time.sleep so that the while loop doesn't flood backend with api calls + time.sleep(3) if status == "error": - return resp["error_msg"] + raise APIError(resp["error_msg"]) else: result_url = resp["result_url"] - results: io.StringIO = api.download_prediction_results(result_url) + results = api.download_prediction_results(result_url) results_converted: Predictions = pd.read_csv(results).to_numpy() return results_converted - @functools.singledispatchmethod - def _convert_batch_to_csv(self, batch: Batch) -> io.StringIO: + @staticmethod + def _convert_batch_to_csv(batch: Batch) -> io.StringIO: """Converts batch object to CSV string IO.""" sio = io.StringIO() diff --git a/tests/models/test_prediction.py b/tests/models/test_prediction.py new file mode 100644 index 00000000..0bf74a3d --- /dev/null +++ b/tests/models/test_prediction.py @@ -0,0 +1,31 @@ +import os + +# os.environ["CLEANLAB_API_BASE_URL"] = "https://api.dev-bc26qf4m.cleanlab.ai/api" +os.environ["CLEANLAB_API_BASE_URL"] = "http://localhost:8500/api" + +from cleanlab_studio import Studio +import pandas as pd + + +# API_KEY = "350b3ee6fbe64d21a6012ea281ce0ca1" +# MODEL_ID = "cea761848e5f449b85e34fe347696b53" +API_KEY = "75f2ab8c962c40169917136756c5d937" +MODEL_ID = "750dbdfb6549470192573b9646be40e9" +BATCH = pd.read_csv("/Users/tony/test_files/tabular_grades_test_small.csv") +TEXT_BATCH = [ + "This magazine was great for the times but as with all other technology magazines the new stuff isn't as good a lot of advertisments and reviews seem biased.", + "We ordered this magazine for our grandson (then 7 going on 30) who was/is deploy into technology. He really enjoyed every issue.", + "I didn't receive a full year. I only receive the magazine twice. It's a good magazine, I just didn't receive it as promised.", + "I was hoping for more technical than what was there. it seems to be more like 'look how cool this is' than a technical publication. It's like sport compact car, but for computers.", + "I only received one copy of the mag so I couldn't really find out if it was good reading or not", + "This magazine is just ok. I ended up subscribing to pc world instead. They are more for the technician and not just the cusumer.", + "There articles are alright, but they screw you on the amount you get as i only got 10 of the 12 months subcription. so be carefull unless you are on the auto renew.", + "Excellent product! I love reading through the magazine and learning about the cool new products out there and the cool programs!", + "I ordered this hoping to learn more about the latest gadgets, and I did learn some things but in over my head over all. I do not enjoy this reading at all.", + "Love the magazine. The price through Amazon is well worth it for the knowledge recieved and the subscription process is painless", + "I bought this subscription for my son. He is presently building a computer. He said it has lots of good and useful information in it.", +] +studio = Studio(API_KEY) +model = studio.get_model(MODEL_ID) +results = model.predict(BATCH) +print(results) From a3ce52f061e7137b0a32f9ca1b42ed01e5d05914 Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Tue, 25 Jul 2023 16:11:53 -0700 Subject: [PATCH 24/42] remove test files --- tests/models/test_prediction.py | 31 ------------------------------- 1 file changed, 31 deletions(-) delete mode 100644 tests/models/test_prediction.py diff --git a/tests/models/test_prediction.py b/tests/models/test_prediction.py deleted file mode 100644 index 0bf74a3d..00000000 --- a/tests/models/test_prediction.py +++ /dev/null @@ -1,31 +0,0 @@ -import os - -# os.environ["CLEANLAB_API_BASE_URL"] = "https://api.dev-bc26qf4m.cleanlab.ai/api" -os.environ["CLEANLAB_API_BASE_URL"] = "http://localhost:8500/api" - -from cleanlab_studio import Studio -import pandas as pd - - -# API_KEY = "350b3ee6fbe64d21a6012ea281ce0ca1" -# MODEL_ID = "cea761848e5f449b85e34fe347696b53" -API_KEY = "75f2ab8c962c40169917136756c5d937" -MODEL_ID = "750dbdfb6549470192573b9646be40e9" -BATCH = pd.read_csv("/Users/tony/test_files/tabular_grades_test_small.csv") -TEXT_BATCH = [ - "This magazine was great for the times but as with all other technology magazines the new stuff isn't as good a lot of advertisments and reviews seem biased.", - "We ordered this magazine for our grandson (then 7 going on 30) who was/is deploy into technology. He really enjoyed every issue.", - "I didn't receive a full year. I only receive the magazine twice. It's a good magazine, I just didn't receive it as promised.", - "I was hoping for more technical than what was there. it seems to be more like 'look how cool this is' than a technical publication. It's like sport compact car, but for computers.", - "I only received one copy of the mag so I couldn't really find out if it was good reading or not", - "This magazine is just ok. I ended up subscribing to pc world instead. They are more for the technician and not just the cusumer.", - "There articles are alright, but they screw you on the amount you get as i only got 10 of the 12 months subcription. so be carefull unless you are on the auto renew.", - "Excellent product! I love reading through the magazine and learning about the cool new products out there and the cool programs!", - "I ordered this hoping to learn more about the latest gadgets, and I did learn some things but in over my head over all. I do not enjoy this reading at all.", - "Love the magazine. The price through Amazon is well worth it for the knowledge recieved and the subscription process is painless", - "I bought this subscription for my son. He is presently building a computer. He said it has lots of good and useful information in it.", -] -studio = Studio(API_KEY) -model = studio.get_model(MODEL_ID) -results = model.predict(BATCH) -print(results) From 9bcb15f25c34460cfe3e89e46236af762eb603e7 Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Tue, 25 Jul 2023 16:31:56 -0700 Subject: [PATCH 25/42] for updating pr --- cleanlab_studio/studio/inference.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index 24312171..fc20b272 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -56,6 +56,7 @@ def _predict(self, batch: io.StringIO, timeout: int) -> Union[str, Predictions]: status: Optional[str] = resp["status"] # Set timeout to prevent users from getting stuck indefinitely when there is a failure timeout = time.time() + timeout + while status == "running" and time.time() < timeout: resp = api.get_prediction_status(self._api_key, query_id) status = resp["status"] From 8de5c4a8c83a7681287c8c816a02fa78a4e2f6bc Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Tue, 25 Jul 2023 16:39:23 -0700 Subject: [PATCH 26/42] change timeout to new var name --- cleanlab_studio/studio/inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index fc20b272..45af2fb3 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -55,9 +55,9 @@ def _predict(self, batch: io.StringIO, timeout: int) -> Union[str, Predictions]: resp = api.get_prediction_status(self._api_key, query_id) status: Optional[str] = resp["status"] # Set timeout to prevent users from getting stuck indefinitely when there is a failure - timeout = time.time() + timeout + timeout_limit = time.time() + timeout - while status == "running" and time.time() < timeout: + while status == "running" and time.time() < timeout_limit: resp = api.get_prediction_status(self._api_key, query_id) status = resp["status"] # Set time.sleep so that the while loop doesn't flood backend with api calls From fd758ea718b1529e0e9f8bbfc3b79b81fc51a005 Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Wed, 26 Jul 2023 12:33:15 -0700 Subject: [PATCH 27/42] remove header replace logic --- cleanlab_studio/internal/api/api.py | 9 +-------- cleanlab_studio/studio/inference.py | 3 +++ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index 4d117efa..68506c28 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -346,15 +346,8 @@ def upload_predict_batch(api_key: str, model_id: str, batch: io.StringIO) -> str handle_api_error(res) presigned_url = res.json()["upload_url"] query_id: str = res.json()["query_id"] - modality = res.json()["modality"] - header = res.json()["header"] - if modality == "text": - header_io = io.StringIO(header) - input_batch = io.StringIO("\n".join(chain(header_io, batch))) - else: - input_batch = batch - requests.post(presigned_url["url"], data=presigned_url["fields"], files={"file": input_batch}) + requests.post(presigned_url["url"], data=presigned_url["fields"], files={"file": batch}) return query_id diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index 45af2fb3..39a2aedb 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -80,6 +80,9 @@ def _convert_batch_to_csv(batch: Batch) -> io.StringIO: if isinstance(batch, (list, np.ndarray, pd.Series)): writer = csv.writer(sio) + # write header + writer.writerow(["text"]) + # write labels to CSV for input_data in batch: writer.writerow([input_data]) From 658b5faa0da055cac47cb53cec8dcb1d88fc34a7 Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Wed, 26 Jul 2023 13:59:47 -0700 Subject: [PATCH 28/42] modify predict function to take care of text inputs --- cleanlab_studio/internal/api/api.py | 6 +++--- cleanlab_studio/studio/inference.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index 68506c28..387b26e4 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -352,10 +352,10 @@ def upload_predict_batch(api_key: str, model_id: str, batch: io.StringIO) -> str return query_id -def start_prediction(api_key: str, query_id: str) -> None: +def start_prediction(api_key: str, model_id: str, query_id: str) -> None: """Starts prediction for query.""" res = requests.post( - f"{model_base_url}/predict/{query_id}", + f"{model_base_url}/{model_id}/predict/{query_id}", headers=_construct_headers(api_key), ) @@ -386,4 +386,4 @@ def get_prediction_status(api_key: str, query_id: str) -> Dict[str, str]: def download_prediction_results(result_url: str) -> io.StringIO: """Downloads prediction results from presigned URL.""" res = requests.get(result_url) - return io.StringIO(res.raw) + return io.StringIO(res.text) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index 39a2aedb..1e8f8d91 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -50,7 +50,7 @@ def _predict(self, batch: io.StringIO, timeout: int) -> Union[str, Predictions]: :return: predictions from batch """ query_id: str = api.upload_predict_batch(self._api_key, self._model_id, batch) - api.start_prediction(self._api_key, query_id) + api.start_prediction(self._api_key, self._model_id, query_id) resp = api.get_prediction_status(self._api_key, query_id) status: Optional[str] = resp["status"] From 612a522b716c4efd352451aca174922e1bf300ba Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Thu, 27 Jul 2023 10:59:59 -0700 Subject: [PATCH 29/42] remove download api endpoint and supply url directly to pandas --- cleanlab_studio/internal/api/api.py | 6 ------ cleanlab_studio/studio/inference.py | 3 +-- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index 387b26e4..c488c288 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -381,9 +381,3 @@ def get_prediction_status(api_key: str, query_id: str) -> Dict[str, str]: return {"status": "error", "error_msg": error_msg} else: return {"status": "running"} - - -def download_prediction_results(result_url: str) -> io.StringIO: - """Downloads prediction results from presigned URL.""" - res = requests.get(result_url) - return io.StringIO(res.text) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index 1e8f8d91..04a10a22 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -67,8 +67,7 @@ def _predict(self, batch: io.StringIO, timeout: int) -> Union[str, Predictions]: raise APIError(resp["error_msg"]) else: result_url = resp["result_url"] - results = api.download_prediction_results(result_url) - results_converted: Predictions = pd.read_csv(results).to_numpy() + results_converted: Predictions = pd.read_csv(result_url).to_numpy() return results_converted @staticmethod From 40bffbf360e77da840f87eb4b2d3eef3f8b76b05 Mon Sep 17 00:00:00 2001 From: taekang1618 Date: Fri, 28 Jul 2023 12:41:24 -0700 Subject: [PATCH 30/42] update doctring to match documentation format --- cleanlab_studio/studio/inference.py | 12 ++++++++---- cleanlab_studio/studio/studio.py | 10 +++++++++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index 04a10a22..cc039407 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -34,11 +34,15 @@ def predict( batch: Batch, timeout: int = 600, ) -> Union[str, Predictions]: - """Gets predictions for batch of examples. + """ + Gets predictions for batch of examples. - :param batch: batch of example to predict classes for - :param timeout: optional parameter to set timeout for predictions in seconds - :return: predictions from batch + Args: + batch: batch of example to predict classes for + timeout: optional parameter to set timeout for predictions in seconds + + Returns: + predictions from batch as a numpy array or an error message if predictions fail """ csv_batch = self._convert_batch_to_csv(batch) return self._predict(csv_batch, timeout) diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 7c0e9cb3..3485b889 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -291,7 +291,15 @@ def delete_project(self, project_id: str) -> None: print(f"Successfully deleted project: {project_id}") def get_model(self, model_id: str) -> inference.Model: - """Creates model object from model ID, to use for inference.""" + """ + Gets a model deployed by Cleanlab Studio. + + Args: + model_id: ID of model to get. This ID should be fetched in the deployments page of the app UI. + + Returns: + Model object with methods run predictions on new input data + """ return inference.Model(self._api_key, model_id) class Experimental: From d83e5e1d5b360c5f655aafe6c667b844243edcb3 Mon Sep 17 00:00:00 2001 From: ryansingman Date: Mon, 31 Jul 2023 14:23:28 -0600 Subject: [PATCH 31/42] fix predict timeout --- cleanlab_studio/studio/inference.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index cc039407..bffd5988 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -14,7 +14,7 @@ TextBatch = Union[List[str], npt.NDArray[np.str_], pd.Series] -TabularBatch = Union[pd.DataFrame] +TabularBatch = pd.DataFrame Batch = Union[TextBatch, TabularBatch] Predictions = Union[npt.NDArray[np.int_], npt.NDArray[np.str_]] @@ -62,13 +62,15 @@ def _predict(self, batch: io.StringIO, timeout: int) -> Union[str, Predictions]: timeout_limit = time.time() + timeout while status == "running" and time.time() < timeout_limit: + time.sleep(1) + resp = api.get_prediction_status(self._api_key, query_id) status = resp["status"] - # Set time.sleep so that the while loop doesn't flood backend with api calls - time.sleep(3) if status == "error": raise APIError(resp["error_msg"]) + elif status == "running": + raise TimeoutError("Timeout of {timeout}s expired while waiting for prediction") else: result_url = resp["result_url"] results_converted: Predictions = pd.read_csv(result_url).to_numpy() From e2665188217c321203f2c73a15083314b1622be6 Mon Sep 17 00:00:00 2001 From: ryansingman Date: Mon, 31 Jul 2023 14:28:27 -0600 Subject: [PATCH 32/42] mypy fix --- cleanlab_studio/studio/inference.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index bffd5988..f6e51fbd 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -1,9 +1,8 @@ import abc import csv -import functools import io import time -from typing import List, Union, Optional +from typing import List, TypeAlias, Union, Optional import numpy as np import numpy.typing as npt @@ -14,7 +13,7 @@ TextBatch = Union[List[str], npt.NDArray[np.str_], pd.Series] -TabularBatch = pd.DataFrame +TabularBatch: TypeAlias = pd.DataFrame Batch = Union[TextBatch, TabularBatch] Predictions = Union[npt.NDArray[np.int_], npt.NDArray[np.str_]] From 8679aade3678603aa2382fbaac2d1e33eb1385e2 Mon Sep 17 00:00:00 2001 From: ryansingman Date: Mon, 31 Jul 2023 14:30:08 -0600 Subject: [PATCH 33/42] mypy fix --- cleanlab_studio/studio/inference.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index f6e51fbd..df7ebe62 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -2,7 +2,8 @@ import csv import io import time -from typing import List, TypeAlias, Union, Optional +from typing import List, Union, Optional +from typing_extensions import TypeAlias import numpy as np import numpy.typing as npt From c871c8c2eba7565e924e689f983c099be098434d Mon Sep 17 00:00:00 2001 From: ryansingman Date: Mon, 31 Jul 2023 14:34:55 -0600 Subject: [PATCH 34/42] add typing extensions req --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 3dadb41e..873e81b4 100644 --- a/setup.py +++ b/setup.py @@ -58,6 +58,7 @@ "jsonstreams>=0.6.0", "semver>=2.13.0,<3.0.0", "Pillow>=9.2.0", + "typing_extensions==4.2.0", "openpyxl==3.0.10", "validators>=0.20.0", ], From 13b4c8caf938037d591ac15be86157f19eb3f00b Mon Sep 17 00:00:00 2001 From: ryansingman Date: Mon, 31 Jul 2023 14:53:36 -0600 Subject: [PATCH 35/42] fix incorrect return types, return predictions separate from class probs --- cleanlab_studio/studio/inference.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index df7ebe62..99e780de 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -2,7 +2,7 @@ import csv import io import time -from typing import List, Union, Optional +from typing import List, Optional, Tuple, Union from typing_extensions import TypeAlias import numpy as np @@ -32,26 +32,33 @@ def __init__(self, api_key: str, model_id: str): def predict( self, batch: Batch, + return_pred_proba: bool = False, timeout: int = 600, - ) -> Union[str, Predictions]: + ) -> Union[Predictions, Tuple[Predictions, ClassProbablities]]: """ Gets predictions for batch of examples. Args: batch: batch of example to predict classes for + return_pred_proba: if should return class probabilities for each example timeout: optional parameter to set timeout for predictions in seconds Returns: - predictions from batch as a numpy array or an error message if predictions fail + predictions from batch as a numpy array """ csv_batch = self._convert_batch_to_csv(batch) - return self._predict(csv_batch, timeout) + predictions, class_probabilities = self._predict(csv_batch, timeout) - def _predict(self, batch: io.StringIO, timeout: int) -> Union[str, Predictions]: + if return_pred_proba: + return predictions, class_probabilities + + return predictions + + def _predict(self, batch: io.StringIO, timeout: int) -> Tuple[Predictions, ClassProbablities]: """Gets predictions for batch of examples. :param batch: batch of example to predict classes for, as in-memory CSV file - :return: predictions from batch + :return: predictions from batch, class probabilities """ query_id: str = api.upload_predict_batch(self._api_key, self._model_id, batch) api.start_prediction(self._api_key, self._model_id, query_id) @@ -70,11 +77,12 @@ def _predict(self, batch: io.StringIO, timeout: int) -> Union[str, Predictions]: if status == "error": raise APIError(resp["error_msg"]) elif status == "running": - raise TimeoutError("Timeout of {timeout}s expired while waiting for prediction") + raise TimeoutError(f"Timeout of {timeout}s expired while waiting for prediction") else: result_url = resp["result_url"] - results_converted: Predictions = pd.read_csv(result_url).to_numpy() - return results_converted + results: pd.DataFrame = pd.read_csv(result_url) + + return results.pop("Suggested Label").to_numpy(), results @staticmethod def _convert_batch_to_csv(batch: Batch) -> io.StringIO: From 8b7b8deb0dee4bdecd1df8d19e918afbb402f3c4 Mon Sep 17 00:00:00 2001 From: ryansingman Date: Mon, 31 Jul 2023 14:57:19 -0600 Subject: [PATCH 36/42] mypy fix --- cleanlab_studio/studio/inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index 99e780de..1660c3b5 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -18,7 +18,7 @@ Batch = Union[TextBatch, TabularBatch] Predictions = Union[npt.NDArray[np.int_], npt.NDArray[np.str_]] -ClassProbablities = pd.DataFrame +ClassProbablities: TypeAlias = pd.DataFrame class Model(abc.ABC): From 4261e5a2880004d57dcb93bf4bb9035347dc7bd0 Mon Sep 17 00:00:00 2001 From: ryansingman Date: Tue, 1 Aug 2023 11:51:18 -0600 Subject: [PATCH 37/42] clean up polling interface, angelas comments --- cleanlab_studio/internal/api/api.py | 14 ++------------ cleanlab_studio/studio/inference.py | 30 ++++++++++++----------------- cleanlab_studio/studio/studio.py | 2 +- 3 files changed, 15 insertions(+), 31 deletions(-) diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index b4329997..064a9927 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -364,21 +364,11 @@ def start_prediction(api_key: str, model_id: str, query_id: str) -> None: def get_prediction_status(api_key: str, query_id: str) -> Dict[str, str]: - """Gets status of model prediction query.""" + """Gets status of model prediction query. Returns status, and optionally the result_url or error message.""" res = requests.get( f"{model_base_url}/predict/{query_id}", headers=_construct_headers(api_key), ) handle_api_error(res) - prediction_results = res.json() - status = prediction_results["status"] - result_url = prediction_results["results"] - error_msg = prediction_results["error_msg"] - - if status == "COMPLETE": - return {"status": "done", "result_url": result_url} - elif status == "FAILED": - return {"status": "error", "error_msg": error_msg} - else: - return {"status": "running"} + return res.json() diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index 1660c3b5..d895b967 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -39,22 +39,24 @@ def predict( Gets predictions for batch of examples. Args: - batch: batch of example to predict classes for + batch: batch of examples to predict classes for return_pred_proba: if should return class probabilities for each example timeout: optional parameter to set timeout for predictions in seconds Returns: - predictions from batch as a numpy array + predictions from batch as a numpy array, optionally also pandas dataframe of class probabilties """ csv_batch = self._convert_batch_to_csv(batch) - predictions, class_probabilities = self._predict(csv_batch, timeout) + predictions, class_probabilities = self._predict_from_csv(csv_batch, timeout) if return_pred_proba: return predictions, class_probabilities return predictions - def _predict(self, batch: io.StringIO, timeout: int) -> Tuple[Predictions, ClassProbablities]: + def _predict_from_csv( + self, batch: io.StringIO, timeout: int + ) -> Tuple[Predictions, ClassProbablities]: """Gets predictions for batch of examples. :param batch: batch of example to predict classes for, as in-memory CSV file @@ -63,26 +65,18 @@ def _predict(self, batch: io.StringIO, timeout: int) -> Tuple[Predictions, Class query_id: str = api.upload_predict_batch(self._api_key, self._model_id, batch) api.start_prediction(self._api_key, self._model_id, query_id) - resp = api.get_prediction_status(self._api_key, query_id) - status: Optional[str] = resp["status"] # Set timeout to prevent users from getting stuck indefinitely when there is a failure timeout_limit = time.time() + timeout - - while status == "running" and time.time() < timeout_limit: + while time.time() < timeout_limit: + resp = api.get_prediction_status(self._api_key, query_id) time.sleep(1) - resp = api.get_prediction_status(self._api_key, query_id) - status = resp["status"] + if result_url := resp.get("result_url"): + results: pd.DataFrame = pd.read_csv(result_url) + return results.pop("Suggested Label").to_numpy(), results - if status == "error": - raise APIError(resp["error_msg"]) - elif status == "running": - raise TimeoutError(f"Timeout of {timeout}s expired while waiting for prediction") else: - result_url = resp["result_url"] - results: pd.DataFrame = pd.read_csv(result_url) - - return results.pop("Suggested Label").to_numpy(), results + raise TimeoutError(f"Timeout of {timeout}s expired while waiting for prediction") @staticmethod def _convert_batch_to_csv(batch: Batch) -> io.StringIO: diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py index 3485b889..a506ed82 100644 --- a/cleanlab_studio/studio/studio.py +++ b/cleanlab_studio/studio/studio.py @@ -298,7 +298,7 @@ def get_model(self, model_id: str) -> inference.Model: model_id: ID of model to get. This ID should be fetched in the deployments page of the app UI. Returns: - Model object with methods run predictions on new input data + Model object with methods to run predictions on new input data """ return inference.Model(self._api_key, model_id) From 24b47339672c16d83f47fe60fdc8682616b202b8 Mon Sep 17 00:00:00 2001 From: ryansingman Date: Tue, 1 Aug 2023 11:52:53 -0600 Subject: [PATCH 38/42] fix sleep placement in poll loop --- cleanlab_studio/studio/inference.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index d895b967..8114d0b2 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -69,12 +69,13 @@ def _predict_from_csv( timeout_limit = time.time() + timeout while time.time() < timeout_limit: resp = api.get_prediction_status(self._api_key, query_id) - time.sleep(1) if result_url := resp.get("result_url"): results: pd.DataFrame = pd.read_csv(result_url) return results.pop("Suggested Label").to_numpy(), results + time.sleep(1) + else: raise TimeoutError(f"Timeout of {timeout}s expired while waiting for prediction") From 260386827101eb2b69cc95facfd55ece9aa04996 Mon Sep 17 00:00:00 2001 From: ryansingman Date: Tue, 1 Aug 2023 11:56:26 -0600 Subject: [PATCH 39/42] mypy fix --- cleanlab_studio/internal/api/api.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index 064a9927..72699eec 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -1,8 +1,7 @@ import io import os import time -from itertools import chain -from typing import Callable, List, Optional, Tuple, Dict, Union, Any +from typing import Callable, cast, List, Optional, Tuple, Dict, Union, Any from cleanlab_studio.errors import APIError import requests @@ -371,4 +370,4 @@ def get_prediction_status(api_key: str, query_id: str) -> Dict[str, str]: ) handle_api_error(res) - return res.json() + return cast(dict, res.json()) From f7cdefab0431d487e6344299db694535e54457a3 Mon Sep 17 00:00:00 2001 From: ryansingman Date: Tue, 1 Aug 2023 11:58:05 -0600 Subject: [PATCH 40/42] mypy fix --- cleanlab_studio/internal/api/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index 72699eec..aa8177c9 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -370,4 +370,4 @@ def get_prediction_status(api_key: str, query_id: str) -> Dict[str, str]: ) handle_api_error(res) - return cast(dict, res.json()) + return cast(Dict[str, str], res.json()) From f1d5102e497eebeef39029663f2c78b5b84ac1b6 Mon Sep 17 00:00:00 2001 From: ryansingman Date: Tue, 1 Aug 2023 21:11:40 -0600 Subject: [PATCH 41/42] fix results name --- cleanlab_studio/studio/inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index 8114d0b2..c12495ec 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -70,7 +70,7 @@ def _predict_from_csv( while time.time() < timeout_limit: resp = api.get_prediction_status(self._api_key, query_id) - if result_url := resp.get("result_url"): + if result_url := resp.get("results"): results: pd.DataFrame = pd.read_csv(result_url) return results.pop("Suggested Label").to_numpy(), results From af4d02fda5d2f0ecf002983981d9d516f35a4147 Mon Sep 17 00:00:00 2001 From: ryansingman Date: Wed, 2 Aug 2023 11:43:34 -0600 Subject: [PATCH 42/42] fix nits --- cleanlab_studio/internal/api/api.py | 2 +- cleanlab_studio/studio/inference.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cleanlab_studio/internal/api/api.py b/cleanlab_studio/internal/api/api.py index aa8177c9..918c0257 100644 --- a/cleanlab_studio/internal/api/api.py +++ b/cleanlab_studio/internal/api/api.py @@ -363,7 +363,7 @@ def start_prediction(api_key: str, model_id: str, query_id: str) -> None: def get_prediction_status(api_key: str, query_id: str) -> Dict[str, str]: - """Gets status of model prediction query. Returns status, and optionally the result_url or error message.""" + """Gets status of model prediction query. Returns status, and optionally the result url or error message.""" res = requests.get( f"{model_base_url}/predict/{query_id}", headers=_construct_headers(api_key), diff --git a/cleanlab_studio/studio/inference.py b/cleanlab_studio/studio/inference.py index c12495ec..7f84fc0c 100644 --- a/cleanlab_studio/studio/inference.py +++ b/cleanlab_studio/studio/inference.py @@ -2,14 +2,13 @@ import csv import io import time -from typing import List, Optional, Tuple, Union +from typing import List, Tuple, Union from typing_extensions import TypeAlias import numpy as np import numpy.typing as npt import pandas as pd -from cleanlab_studio.errors import APIError from cleanlab_studio.internal.api import api