diff --git a/BENCHMARKS.txt b/BENCHMARKS.txt
index 9ad0f40..5284006 100644
--- a/BENCHMARKS.txt
+++ b/BENCHMARKS.txt
@@ -16,6 +16,7 @@ Scoring CPU intensive model 100 times synchronously (1 job at a time)
 
 Direct scoring: 110 sec
 Scoring on the pool (1 worker): 103 sec
+Scoring on the pool asyncio (1 worker): 109 sec  # just making sure it matches the sync path
 
 The MLPool logic doesn't seem to introduce any overhead, it even seems to speed
 things up slightly (interesting)
diff --git a/README.md b/README.md
index 1a2e767..d28e767 100644
--- a/README.md
+++ b/README.md
@@ -1,38 +1,120 @@
 ### Use case / why
 
-Cloud Run as of November 2022 does not support GPU machines. Assuming we have a model,
-which is CPU hungry and takes time to score, which we want to serve through an API/web sockets while:
+Say we have an ML model that is CPU hungry and takes time to score, and we want to
+serve it through an API / Web Sockets while:
 
 - keeping overall application complexity low (no dedicated service to serve ML models)
 - minimising latency
-- maximasing throughput (the number of requests a single Cloud Run instance can handle)
+- maximising throughput (the number of requests a single Cloud Run instance / container can handle)
+- avoiding blocking calls that would block coroutines / web sockets
 
-
-A typical simple API serving ML model(s) instantiates a model right in the main process, which
-results in the model consuming the process's memory and CPU cycles for scoring. This directly
-affects the API performance. Ideally, we want to move the model scoring / feature engineering bit
-to dedicated cores. This would let us:
+A typical simple API serving ML model(s) instantiates the models right in the main process, which
+results in the models consuming the API process's RAM and CPU cycles for scoring. This directly
+affects the API performance in terms of latency and throughput. It would help if we could move
+model scoring away from the API process onto other cores. This would let us:
 
 - use CPU resources more efficiently (loading more cores instead of a single one provided that the
 container has 1+ cores at its disposal)
-- avoid hurting API performance by moving model scoring / feature engineering bits that require memory and CPU away from the API process
+- avoid hurting API performance by moving the memory- and CPU-hungry model scoring away from the API process
 - decrease latency
 - increase throughput
 
-
-On top of that, the solution seems to work well with the Cloud Run autoscaling strategy. When
+On top of that, the solution seems to integrate well with the Cloud Run autoscaling strategy. When
 the load is low, the workers serving the models are pretty much idle. As the load increases,
 the workers get busy, which in turn increases overall Cloud Run instance CPU usage. Once
 it reaches a certain threshold, Cloud Run will spin up another instance to handle the load.
+
+---
+
+### How to use / Examples
+
+To use MLPool, for every ML model the user wants to run on the pool, they need to provide
+a callable that loads the model into memory and prepares it. Under the hood, MLPool ensures
+that every worker in the pool (each running in a dedicated process) loads the model(s) into memory.
+
+Consider these functions that load the models:
+
+```python
+def load_iris(model_path: str) -> HungryIris:
+    # TODO: Loading and preparing the ML model goes here
+    return HungryIris(model_path)
+
+
+def load_diabetes_classifier(model_path: str) -> HungryDiabetesClassifier:
+    # TODO: Loading and preparing the ML model goes here
+    return HungryDiabetesClassifier(model_path)
+```
+
+When instantiating MLPool, the user needs to provide a dictionary (models_to_load) where the keys are
+the model names MLPool will serve, and the values are the callables loading the models (the functions above).
+In addition, the user can specify the number of workers they want, the scoring results TTL, etc.
+
+```python
+if __name__ == "__main__":
+    with MLPool(
+        models_to_load={
+            "iris": partial(load_iris, "../models/iris_xgb.json"),
+            "diabetes_classifier": partial(
+                load_diabetes_classifier, "../models/diabetes_xgb.json"
+            )
+        },
+        nb_workers=5,
+    ) as pool:
+        ...
+```
+
+When it comes to scoring, MLPool provides the ability to:
+
+- Create a scoring job on the pool
+- Get results of the scoring job
+- Cancel a scoring job
+
+To execute a model on the pool, use the `create_job` method or its asyncio friendly counterpart
+`create_job_async`. Both expect a callable that accepts an ML model (passed by MLPool) as its first
+parameter, followed by `*args` and `**kwargs`.
+
+Consider an example:
+
+```python
+def score_iris(model: HungryIris, features):
+    # TODO: Feature engineering etc goes here
+
+    return model.predict(features)
+```
+
+```python
+@app.post("/iris")
+async def serve_iris(request: Request) -> Response:
+    features = request.features
+    logger.info(f"/iris request, features: {features}")
+
+    job_id = await pool.create_job_async(
+        score_model_function=score_iris,
+        model_name="iris",
+        kwargs={"features": features},
+    )
+    result = await pool.get_result_async(job_id)
+
+    return Response(prediction=result)
+```
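+
+Outside of async code (scripts, background workers), the same flow can be written with the
+synchronous methods. The snippet below is only a sketch: it assumes `create_job` accepts the same
+arguments as `create_job_async` and that a blocking `get_result` counterpart of `get_result_async`
+exists; adjust the names to the actual API if they differ.
+
+```python
+def predict_iris_sync(pool: MLPool, features: list):
+    # Schedule a scoring job on the pool (same scoring callable as in the async example)
+    job_id = pool.create_job(
+        score_model_function=score_iris,
+        model_name="iris",
+        args=(features,),
+    )
+    # NOTE: get_result is assumed here to be the blocking counterpart of get_result_async
+    return pool.get_result(job_id)
+```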
 
 ---
 
 ### Gotchas:
 
 - IPC in Python is done via pickling / unpickling objects - transferring large objects such as images
-could be expensive
+could be expensive.
 
-- If your model is light and takes very little time to score, then adding the MLPool will
+- If your model is light and takes very little time to score (tens of milliseconds), then introducing MLPool will
 only slow things down. In this case it makes sense to score the model directly in the API process.
 If there is feature engineering work associated with pulling/preprocessing features, then it might
 make sense, depends.
@@ -42,14 +124,6 @@ gunicorn etc. Be careful when using MLPool in such configuration as you could ov
 Cloud Run to scale up lol. In our case each container runs only a single instance of the API app,
 spinning up more instances within the same container won't help as the bottleneck is CPU hungry
 model scoring.
 
----
-
-
-### How to use / Examples
-
-
-TBA
-
 ---
diff --git a/examples/estimate_latency_mlpool_adds.py b/examples/estimate_latency_mlpool_adds.py
index 2cdcc36..56c5ab6 100644
--- a/examples/estimate_latency_mlpool_adds.py
+++ b/examples/estimate_latency_mlpool_adds.py
@@ -1,4 +1,5 @@
 import sys
+import asyncio
 
 sys.path.append("..")
 from functools import partial
@@ -8,6 +9,22 @@ from examples.models import HungryIris
+
+"""
+The point is to estimate how much latency MLPool adds compared to running the
+model directly.
+
+Under the hood MLPool does a bunch of things:
+- runs some checks
+- creates a Job object
+- puts the Job in the queue
+- an MLWorker then gets it from the queue
+- processes the job
+- puts the result in the shared dictionary
+
+All of these steps take time, but how much?
+"""
+
+
 NB_SCORES = 100
@@ -42,8 +59,17 @@ def score_on_pool():
     print("Pool scoring took:", time.perf_counter() - start)
 
 
-def main():
-    score_directly()
+# Async variant, just to make sure the numbers match the sync path
+async def score_on_pool_async():
+    start = time.perf_counter()
+    for i in range(NB_SCORES):
+        job_id = await pool.create_job_async(
+            score_iris, model_name="iris", args=([3.0, 2.0, 1.0, 0.2],)
+        )
+        _ = await pool.get_result_async(job_id)
+        print(f"Scored {i + 1} times")
+
+    print("Async pool scoring took:", time.perf_counter() - start)
 
 
 if __name__ == "__main__":
@@ -52,5 +78,6 @@ def main():
         nb_workers=1,
     ) as pool:
         score_on_pool()
+        asyncio.run(score_on_pool_async())
         score_directly()