From bdd422df6aa16a2427943a14873847403a8d5582 Mon Sep 17 00:00:00 2001 From: Baptiste Date: Thu, 21 Nov 2024 20:33:38 +0000 Subject: [PATCH 1/6] fix(tests): fix broken GPT2 integration test --- Makefile | 6 +- .../integration-tests/conftest.py | 208 ++++++++++++++---- .../integration-tests/test_gpt2.py | 19 +- .../server/build-requirements.txt | 2 +- 4 files changed, 178 insertions(+), 57 deletions(-) diff --git a/Makefile b/Makefile index 7091ac00..97df108b 100644 --- a/Makefile +++ b/Makefile @@ -42,12 +42,14 @@ clean: rm -rf dist deps make -C text-generation-inference/server/ clean +# ulimit nofile=100000:100000 is required for TPUs +# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#privileged-mode tpu-tgi: docker build --rm -f text-generation-inference/docker/Dockerfile \ --build-arg VERSION=$(VERSION) \ --build-arg TGI_VERSION=$(TGI_VERSION) \ - --ulimit nofile=100000:100000 \ - -t huggingface/optimum-tpu:$(VERSION)-tgi . + --ulimit nofile=100000:100000 \ + -t huggingface/optimum-tpu:$(VERSION)-tgi . docker tag huggingface/optimum-tpu:$(VERSION)-tgi huggingface/optimum-tpu:latest tpu-tgi-ie: diff --git a/text-generation-inference/integration-tests/conftest.py b/text-generation-inference/integration-tests/conftest.py index 435b55eb..a6b23210 100644 --- a/text-generation-inference/integration-tests/conftest.py +++ b/text-generation-inference/integration-tests/conftest.py @@ -4,7 +4,10 @@ import shlex import subprocess import sys +import threading import time +import signal +import logging from tempfile import TemporaryDirectory from typing import List @@ -16,30 +19,81 @@ from text_generation.types import Response -DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tpu-tgi:latest") -HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None) +DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "huggingface/optimum-tpu:latest") +HF_TOKEN = os.getenv("HF_TOKEN", None) DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data") +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + + +def cleanup_handler(signum, frame): + logger.info("\nCleaning up containers due to shutdown, please wait...") + try: + client = docker.from_env() + containers = client.containers.list(filters={"name": "tgi-tests-"}) + for container in containers: + try: + container.stop() + container.remove() + logger.info(f"Successfully cleaned up container {container.name}") + except Exception as e: + logger.error(f"Error cleaning up container {container.name}: {str(e)}") + except Exception as e: + logger.error(f"Error during cleanup: {str(e)}") + sys.exit(1) + +signal.signal(signal.SIGINT, cleanup_handler) +signal.signal(signal.SIGTERM, cleanup_handler) + +def stream_container_logs(container): + """Stream container logs in a separate thread.""" + try: + for log in container.logs(stream=True, follow=True): + print("[TGI Server Logs] " + log.decode("utf-8"), end="", file=sys.stderr, flush=True) + except Exception as e: + logger.error(f"Error streaming container logs: {str(e)}") + class LauncherHandle: def __init__(self, port: int): - self.client = AsyncClient(f"http://localhost:{port}") + self.client = AsyncClient(f"http://localhost:{port}", timeout=600) + self.logger = logging.getLogger(self.__class__.__name__) def _inner_health(self): raise NotImplementedError async def health(self, timeout: int = 60): assert timeout > 0 - for _ in range(timeout): + start_time = time.time() + self.logger.info(f"Starting health check 
with timeout of {timeout}s") + + for attempt in range(timeout): if not self._inner_health(): + self.logger.error("Launcher crashed during health check") raise RuntimeError("Launcher crashed") try: await self.client.generate("test") + elapsed = time.time() - start_time + self.logger.info(f"Health check passed after {elapsed:.1f}s") return - except (ClientConnectorError, ClientOSError, ServerDisconnectedError): + except (ClientConnectorError, ClientOSError, ServerDisconnectedError) as e: + if attempt == timeout - 1: + self.logger.error(f"Health check failed after {timeout}s: {str(e)}") + raise RuntimeError(f"Health check failed: {str(e)}") + self.logger.debug(f"Connection attempt {attempt+1}/{timeout} failed: {str(e)}") time.sleep(1) - raise RuntimeError("Health check failed") + except Exception as e: + self.logger.error(f"Unexpected error during health check: {str(e)}") + # Get full traceback for debugging + import traceback + self.logger.error(f"Full traceback:\n{traceback.format_exc()}") + raise class ContainerLauncherHandle(LauncherHandle): @@ -49,8 +103,18 @@ def __init__(self, docker_client, container_name, port: int): self.container_name = container_name def _inner_health(self) -> bool: - container = self.docker_client.containers.get(self.container_name) - return container.status in ["running", "created"] + try: + container = self.docker_client.containers.get(self.container_name) + status = container.status + if status not in ["running", "created"]: + self.logger.warning(f"Container status is {status}") + # Get container logs for debugging + logs = container.logs().decode("utf-8") + self.logger.debug(f"Container logs:\n{logs}") + return status in ["running", "created"] + except Exception as e: + self.logger.error(f"Error checking container health: {str(e)}") + return False class ProcessLauncherHandle(LauncherHandle): @@ -73,8 +137,11 @@ def event_loop(): def data_volume(): tmpdir = TemporaryDirectory() yield tmpdir.name - # Cleanup the temporary directory using sudo as it contains root files created by the container - subprocess.run(shlex.split(f"sudo rm -rf {tmpdir.name}")) + try: + # Cleanup the temporary directory using sudo as it contains root files created by the container + subprocess.run(shlex.split(f"sudo rm -rf {tmpdir.name}"), check=True) + except subprocess.CalledProcessError as e: + logger.error(f"Error cleaning up temporary directory: {str(e)}") @pytest.fixture(scope="module") @@ -84,11 +151,12 @@ def docker_launcher( model_id: str, trust_remote_code: bool = False, ): + logger.info(f"Starting docker launcher for model {model_id}") # TODO: consider finding out how to forward a port in the container instead of leaving it to 80. # For now this is necessary because TPU dockers require to run with net=host and privileged mode. 
port = 80 - args = ["--model-id", model_id, "--env"] + args = ["--env"] if trust_remote_code: args.append("--trust-remote-code") @@ -99,15 +167,25 @@ def docker_launcher( try: container = client.containers.get(container_name) + logger.info(f"Stopping existing container {container_name}") container.stop() container.wait() except NotFound: pass + except Exception as e: + logger.error(f"Error handling existing container: {str(e)}") - env = {"LOG_LEVEL": "info,text_generation_router=debug"} + env = { + "LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug", + "MAX_BATCH_SIZE": "4", + "HF_HUB_ENABLE_HF_TRANSFER": "0", + "JETSTREAM_PT": "1", + "SKIP_WARMUP": "1", + "MODEL_ID": model_id, + } - if HUGGING_FACE_HUB_TOKEN is not None: - env["HUGGING_FACE_HUB_TOKEN"] = HUGGING_FACE_HUB_TOKEN + if HF_TOKEN is not None: + env["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN for var in ["MAX_BATCH_SIZE", "HF_SEQUENCE_LENGTH"]: if var in os.environ: @@ -115,31 +193,68 @@ def docker_launcher( volumes = [f"{data_volume}:/data"] - container = client.containers.run( - DOCKER_IMAGE, - command=args, - name=container_name, - environment=env, - auto_remove=False, - detach=True, - volumes=volumes, - shm_size="1G", - privileged=True, - network_mode="host", - ) - - yield ContainerLauncherHandle(client, container.name, port) - try: - container.stop() - container.wait() - except NotFound: - pass - - container_output = container.logs().decode("utf-8") - print(container_output, file=sys.stderr) - - container.remove() + # Add debug logging before container creation + logger.debug(f"Creating container with image {DOCKER_IMAGE}") + logger.debug(f"Container environment: {env}") + logger.debug(f"Container volumes: {volumes}") + + container = client.containers.run( + DOCKER_IMAGE, + command=args, + name=container_name, + environment=env, + auto_remove=False, + detach=True, + volumes=volumes, + shm_size="16G", + privileged=True, + ipc_mode="host", + ) + logger.info(f"Container {container_name} started successfully") + + # Start log streaming in a background thread + log_thread = threading.Thread( + target=stream_container_logs, + args=(container,), + daemon=True # This ensures the thread will be killed when the main program exits + ) + log_thread.start() + + # Add a small delay to allow container to initialize + time.sleep(2) + + # Check container status after creation + status = container.status + logger.debug(f"Initial container status: {status}") + if status not in ["running", "created"]: + logs = container.logs().decode("utf-8") + logger.error(f"Container failed to start properly. 
Logs:\n{logs}") + + yield ContainerLauncherHandle(client, container.name, port) + + except Exception as e: + logger.error(f"Error starting container: {str(e)}") + # Get full traceback for debugging + import traceback + logger.error(f"Full traceback:\n{traceback.format_exc()}") + raise + finally: + try: + container = client.containers.get(container_name) + logger.info(f"Stopping container {container_name}") + container.stop() + container.wait() + + container_output = container.logs().decode("utf-8") + print(container_output, file=sys.stderr) + + container.remove() + logger.info(f"Container {container_name} removed successfully") + except NotFound: + pass + except Exception as e: + logger.error(f"Error cleaning up container: {str(e)}") return docker_launcher @@ -147,10 +262,17 @@ def docker_launcher( @pytest.fixture(scope="module") def generate_load(): async def generate_load_inner(client: AsyncClient, prompt: str, max_new_tokens: int, n: int) -> List[Response]: - futures = [ - client.generate(prompt, max_new_tokens=max_new_tokens, decoder_input_details=True) for _ in range(n) - ] - - return await asyncio.gather(*futures) + try: + futures = [ + client.generate( + prompt, + max_new_tokens=max_new_tokens, + decoder_input_details=True, + ) for _ in range(n) + ] + return await asyncio.gather(*futures) + except Exception as e: + logger.error(f"Error generating load: {str(e)}") + raise return generate_load_inner diff --git a/text-generation-inference/integration-tests/test_gpt2.py b/text-generation-inference/integration-tests/test_gpt2.py index d200bd5d..402c9956 100644 --- a/text-generation-inference/integration-tests/test_gpt2.py +++ b/text-generation-inference/integration-tests/test_gpt2.py @@ -1,13 +1,10 @@ import os - import Levenshtein import pytest - MODEL_ID = "openai-community/gpt2" SEQUENCE_LENGTH = 1024 - @pytest.fixture(scope="module") def model_name_or_path(): os.environ["HF_SEQUENCE_LENGTH"] = str(SEQUENCE_LENGTH) @@ -22,14 +19,13 @@ def tgi_service(launcher, model_name_or_path): @pytest.fixture(scope="module") async def tgi_client(tgi_service): - await tgi_service.health(300) + await tgi_service.health(1000) return tgi_service.client - @pytest.mark.asyncio async def test_model_single_request(tgi_client): - # Greedy bounded without input + # Bounded greedy decoding without input response = await tgi_client.generate( "What is Deep Learning?", max_new_tokens=17, @@ -37,10 +33,10 @@ async def test_model_single_request(tgi_client): ) assert response.details.generated_tokens == 17 assert ( - response.generated_text == "\n\nDeep learning is a technique that allows you to learn something from a set of" + response.generated_text == "\n\nDeep learning is a new field of research that has been around for a while" ) - # Greedy bounded with input + # Bounded greedy decoding with input response = await tgi_client.generate( "What is Deep Learning?", max_new_tokens=17, @@ -50,7 +46,7 @@ async def test_model_single_request(tgi_client): assert response.details.generated_tokens == 17 assert ( response.generated_text - == "What is Deep Learning?\n\nDeep learning is a technique that allows you to learn something from a set of" + == "What is Deep Learning?\n\nDeep learning is a new field of research that has been around for a while" ) # Sampling @@ -64,8 +60,9 @@ async def test_model_single_request(tgi_client): seed=42, decoder_input_details=True, ) + assert ( - 'The deep neural networks that we create are essentially "miniature" neural networks that can easily be trained' + 'A lot of researchers have 
tried to make a "deep learning" approach that focuses only on what is being shown' in response.generated_text ) @@ -81,7 +78,7 @@ async def test_model_multiple_requests(tgi_client, generate_load): ) assert len(responses) == 4 - expected = "\n\nDeep learning is a technique that allows you to learn something from a set of" + expected = "\n\nDeep learning is a technique that allows you to learn something from a single source" for r in responses: assert r.details.generated_tokens == 17 # Compute the similarity with the expectation using the levenshtein distance diff --git a/text-generation-inference/server/build-requirements.txt b/text-generation-inference/server/build-requirements.txt index 5307dc5d..64b2b6e9 100644 --- a/text-generation-inference/server/build-requirements.txt +++ b/text-generation-inference/server/build-requirements.txt @@ -1,3 +1,3 @@ build grpcio-tools==1.62.1 -mypy-protobuf==3.2.0 +mypy-protobuf==3.2.0 \ No newline at end of file From 9e63ff23e66e3e86bad53b4622b1ff9f78b87fa0 Mon Sep 17 00:00:00 2001 From: Baptiste Date: Mon, 25 Nov 2024 12:24:08 +0000 Subject: [PATCH 2/6] feat(tests): add Gemma integration test --- .../integration-tests/test_gemma.py | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 text-generation-inference/integration-tests/test_gemma.py diff --git a/text-generation-inference/integration-tests/test_gemma.py b/text-generation-inference/integration-tests/test_gemma.py new file mode 100644 index 00000000..b52b1d6a --- /dev/null +++ b/text-generation-inference/integration-tests/test_gemma.py @@ -0,0 +1,87 @@ +import os +import Levenshtein +import pytest + +MODEL_ID = "google/gemma-2b-it" +SEQUENCE_LENGTH = 1024 + +@pytest.fixture(scope="module") +def model_name_or_path(): + os.environ["HF_SEQUENCE_LENGTH"] = str(SEQUENCE_LENGTH) + yield MODEL_ID + + +@pytest.fixture(scope="module") +def tgi_service(launcher, model_name_or_path): + with launcher(model_name_or_path) as tgi_service: + yield tgi_service + + +@pytest.fixture(scope="module") +async def tgi_client(tgi_service): + await tgi_service.health(1000) + return tgi_service.client + +@pytest.mark.asyncio +async def test_model_single_request(tgi_client): + + # Bounded greedy decoding without input + response = await tgi_client.generate( + "What is Deep Learning?", + max_new_tokens=17, + decoder_input_details=True, + ) + assert response.details.generated_tokens == 17 + assert ( + response.generated_text == "\n\nDeep learning is a subfield of machine learning that allows computers to learn from data" + ) + + # Bounded greedy decoding with input + response = await tgi_client.generate( + "What is Deep Learning?", + max_new_tokens=17, + return_full_text=True, + decoder_input_details=True, + ) + assert response.details.generated_tokens == 17 + assert ( + response.generated_text + == "What is Deep Learning?\n\nDeep learning is a subfield of machine learning that allows computers to learn from data" + ) + + # Sampling + response = await tgi_client.generate( + "What is Deep Learning?", + do_sample=True, + top_k=50, + top_p=0.9, + repetition_penalty=1.2, + max_new_tokens=100, + seed=42, + decoder_input_details=True, + ) + print(f"\nGot sampling output with seed=42: {response.generated_text}") + + assert ( + 'Deep learning is a subfield of machine learning that focuses on mimicking the structure and function of the human brain' + in response.generated_text + ) + + +@pytest.mark.asyncio +async def test_model_multiple_requests(tgi_client, generate_load): + num_requests = 4 + responses = await 
generate_load( + tgi_client, + "What is Deep Learning?", + max_new_tokens=17, + n=num_requests, + ) + + assert len(responses) == 4 + expected = "\n\nDeep learning is a subfield of machine learning that uses artificial neural networks to learn" + for r in responses: + assert r.details.generated_tokens == 17 + # Compute the similarity with the expectation using the levenshtein distance + # We should not have more than two substitutions or additions + assert Levenshtein.distance(r.generated_text, expected) < 3 From c822676d2ee37ac2120dd41af3cce2f4c2c66457 Mon Sep 17 00:00:00 2001 From: Baptiste Date: Mon, 2 Dec 2024 18:34:07 +0000 Subject: [PATCH 3/6] refactor(logging): migrate to Loguru --- .../integration-tests/conftest.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/text-generation-inference/integration-tests/conftest.py b/text-generation-inference/integration-tests/conftest.py index a6b23210..bbbf14d7 100644 --- a/text-generation-inference/integration-tests/conftest.py +++ b/text-generation-inference/integration-tests/conftest.py @@ -7,7 +7,6 @@ import threading import time import signal -import logging from tempfile import TemporaryDirectory from typing import List @@ -17,18 +16,20 @@ from docker.errors import NotFound from text_generation import AsyncClient from text_generation.types import Response +from loguru import logger DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "huggingface/optimum-tpu:latest") HF_TOKEN = os.getenv("HF_TOKEN", None) DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data") -# Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +# Configure loguru logger +logger.remove() # Remove default handler +logger.add( + sys.stderr, + format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}", + level="INFO" ) -logger = logging.getLogger(__name__) def cleanup_handler(signum, frame): @@ -62,7 +63,6 @@ def stream_container_logs(container): class LauncherHandle: def __init__(self, port: int): self.client = AsyncClient(f"http://localhost:{port}", timeout=600) - self.logger = logging.getLogger(self.__class__.__name__) def _inner_health(self): raise NotImplementedError @@ -70,29 +70,29 @@ def _inner_health(self): async def health(self, timeout: int = 60): assert timeout > 0 start_time = time.time() - self.logger.info(f"Starting health check with timeout of {timeout}s") + logger.info(f"Starting health check with timeout of {timeout}s") for attempt in range(timeout): if not self._inner_health(): - self.logger.error("Launcher crashed during health check") + logger.error("Launcher crashed during health check") raise RuntimeError("Launcher crashed") try: await self.client.generate("test") elapsed = time.time() - start_time - self.logger.info(f"Health check passed after {elapsed:.1f}s") + logger.info(f"Health check passed after {elapsed:.1f}s") return except (ClientConnectorError, ClientOSError, ServerDisconnectedError) as e: if attempt == timeout - 1: - self.logger.error(f"Health check failed after {timeout}s: {str(e)}") + logger.error(f"Health check failed after {timeout}s: {str(e)}") raise RuntimeError(f"Health check failed: {str(e)}") - self.logger.debug(f"Connection attempt {attempt+1}/{timeout} failed: {str(e)}") + logger.debug(f"Connection attempt {attempt+1}/{timeout} failed: {str(e)}") time.sleep(1) except Exception as e: - self.logger.error(f"Unexpected error during health check: {str(e)}") + logger.error(f"Unexpected error during health 
check: {str(e)}") # Get full traceback for debugging import traceback - self.logger.error(f"Full traceback:\n{traceback.format_exc()}") + logger.error(f"Full traceback:\n{traceback.format_exc()}") raise @@ -107,13 +107,13 @@ def _inner_health(self) -> bool: container = self.docker_client.containers.get(self.container_name) status = container.status if status not in ["running", "created"]: - self.logger.warning(f"Container status is {status}") + logger.warning(f"Container status is {status}") # Get container logs for debugging logs = container.logs().decode("utf-8") - self.logger.debug(f"Container logs:\n{logs}") + logger.debug(f"Container logs:\n{logs}") return status in ["running", "created"] except Exception as e: - self.logger.error(f"Error checking container health: {str(e)}") + logger.error(f"Error checking container health: {str(e)}") return False From 1af9edc15cc31aa913b7170033d68b86b0380098 Mon Sep 17 00:00:00 2001 From: Baptiste Date: Wed, 4 Dec 2024 09:59:20 +0000 Subject: [PATCH 4/6] fix(tests): fix broken connection to docker container --- Makefile | 1 - .../integration-tests/conftest.py | 57 +++++---- .../integration-tests/test_gemma.py | 87 ------------- .../integration-tests/test_gpt2.py | 86 ------------- .../integration-tests/test_model.py | 117 ++++++++++++++++++ 5 files changed, 145 insertions(+), 203 deletions(-) delete mode 100644 text-generation-inference/integration-tests/test_gemma.py delete mode 100644 text-generation-inference/integration-tests/test_gpt2.py create mode 100644 text-generation-inference/integration-tests/test_model.py diff --git a/Makefile b/Makefile index 97df108b..0220201d 100644 --- a/Makefile +++ b/Makefile @@ -66,7 +66,6 @@ style_check: ruff check . style: - ruff check . --fix # Utilities to release to PyPi build_dist_install_tools: diff --git a/text-generation-inference/integration-tests/conftest.py b/text-generation-inference/integration-tests/conftest.py index bbbf14d7..1ef7e4c7 100644 --- a/text-generation-inference/integration-tests/conftest.py +++ b/text-generation-inference/integration-tests/conftest.py @@ -2,11 +2,11 @@ import contextlib import os import shlex +import signal import subprocess import sys import threading import time -import signal from tempfile import TemporaryDirectory from typing import List @@ -14,17 +14,16 @@ import pytest from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError from docker.errors import NotFound +from loguru import logger +from test_model import MODEL_CONFIGS from text_generation import AsyncClient from text_generation.types import Response -from loguru import logger DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "huggingface/optimum-tpu:latest") HF_TOKEN = os.getenv("HF_TOKEN", None) DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data") -# Configure loguru logger -logger.remove() # Remove default handler logger.add( sys.stderr, format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}", @@ -62,7 +61,7 @@ def stream_container_logs(container): class LauncherHandle: def __init__(self, port: int): - self.client = AsyncClient(f"http://localhost:{port}", timeout=600) + self.client = AsyncClient(f"http://localhost:{port}", timeout=3600) def _inner_health(self): raise NotImplementedError @@ -71,7 +70,7 @@ async def health(self, timeout: int = 60): assert timeout > 0 start_time = time.time() logger.info(f"Starting health check with timeout of {timeout}s") - + for attempt in range(timeout): if not self._inner_health(): logger.error("Launcher crashed during health 
check") @@ -126,13 +125,6 @@ def _inner_health(self) -> bool: return self.process.poll() is None -@pytest.fixture(scope="module") -def event_loop(): - loop = asyncio.get_event_loop() - yield loop - loop.close() - - @pytest.fixture(scope="module") def data_volume(): tmpdir = TemporaryDirectory() @@ -145,18 +137,21 @@ def data_volume(): @pytest.fixture(scope="module") -def launcher(event_loop, data_volume): +def launcher(data_volume): @contextlib.contextmanager def docker_launcher( model_id: str, trust_remote_code: bool = False, ): logger.info(f"Starting docker launcher for model {model_id}") - # TODO: consider finding out how to forward a port in the container instead of leaving it to 80. - # For now this is necessary because TPU dockers require to run with net=host and privileged mode. - port = 80 + port = 8080 - args = ["--env"] + args = [ + "--max-input-length", "512", + "--max-total-tokens", "1024", + "--max-batch-prefill-tokens", "512", + "--max-batch-total-tokens", "1024" + ] if trust_remote_code: args.append("--trust-remote-code") @@ -175,17 +170,14 @@ def docker_launcher( except Exception as e: logger.error(f"Error handling existing container: {str(e)}") - env = { - "LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug", - "MAX_BATCH_SIZE": "4", - "HF_HUB_ENABLE_HF_TRANSFER": "0", - "JETSTREAM_PT": "1", - "SKIP_WARMUP": "1", - "MODEL_ID": model_id, - } + model_name = next(name for name, cfg in MODEL_CONFIGS.items() if cfg["model_id"] == model_id) + env = MODEL_CONFIGS[model_name]["env_config"].copy() + + # Add model_id to env + env["MODEL_ID"] = model_id if HF_TOKEN is not None: - env["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN + env["HF_TOKEN"] = HF_TOKEN for var in ["MAX_BATCH_SIZE", "HF_SEQUENCE_LENGTH"]: if var in os.environ: @@ -198,7 +190,13 @@ def docker_launcher( logger.debug(f"Creating container with image {DOCKER_IMAGE}") logger.debug(f"Container environment: {env}") logger.debug(f"Container volumes: {volumes}") - + + # Log equivalent docker run command + env_str = ' '.join([f'-e {k}="{v}"' for k,v in env.items()]) + volume_str = ' '.join([f'-v {v}' for v in volumes]) + cmd_str = f'docker run -d --name {container_name} {env_str} {volume_str} --shm-size 16G --privileged --ipc host {DOCKER_IMAGE} {" ".join(args)}' + logger.debug(f"Equivalent docker run command:\n{cmd_str}") + container = client.containers.run( DOCKER_IMAGE, command=args, @@ -210,6 +208,7 @@ def docker_launcher( shm_size="16G", privileged=True, ipc_mode="host", + ports={"80/tcp": 8080} ) logger.info(f"Container {container_name} started successfully") @@ -245,7 +244,7 @@ def docker_launcher( logger.info(f"Stopping container {container_name}") container.stop() container.wait() - + container_output = container.logs().decode("utf-8") print(container_output, file=sys.stderr) diff --git a/text-generation-inference/integration-tests/test_gemma.py b/text-generation-inference/integration-tests/test_gemma.py deleted file mode 100644 index b52b1d6a..00000000 --- a/text-generation-inference/integration-tests/test_gemma.py +++ /dev/null @@ -1,87 +0,0 @@ -import os -import Levenshtein -import pytest - -MODEL_ID = "google/gemma-2b-it" -SEQUENCE_LENGTH = 1024 - -@pytest.fixture(scope="module") -def model_name_or_path(): - os.environ["HF_SEQUENCE_LENGTH"] = str(SEQUENCE_LENGTH) - yield MODEL_ID - - -@pytest.fixture(scope="module") -def tgi_service(launcher, model_name_or_path): - with launcher(model_name_or_path) as tgi_service: - yield tgi_service - - -@pytest.fixture(scope="module") -async def 
tgi_client(tgi_service): - await tgi_service.health(1000) - return tgi_service.client - -@pytest.mark.asyncio -async def test_model_single_request(tgi_client): - - # Bounded greedy decoding without input - response = await tgi_client.generate( - "What is Deep Learning?", - max_new_tokens=17, - decoder_input_details=True, - ) - assert response.details.generated_tokens == 17 - assert ( - response.generated_text == "\n\nDeep learning is a subfield of machine learning that allows computers to learn from data" - ) - - # Bounded greedy decoding with input - response = await tgi_client.generate( - "What is Deep Learning?", - max_new_tokens=17, - return_full_text=True, - decoder_input_details=True, - ) - assert response.details.generated_tokens == 17 - assert ( - response.generated_text - == "What is Deep Learning?\n\nDeep learning is a subfield of machine learning that allows computers to learn from data" - ) - - # Sampling - response = await tgi_client.generate( - "What is Deep Learning?", - do_sample=True, - top_k=50, - top_p=0.9, - repetition_penalty=1.2, - max_new_tokens=100, - seed=42, - decoder_input_details=True, - ) - print(f"\nGot sampling output with seed=42: {response.generated_text}") - - assert ( - 'Deep learning is a subfield of machine learning that focuses on mimicking the structure and function of the human brain' - in response.generated_text - ) - - -@pytest.mark.asyncio -async def test_model_multiple_requests(tgi_client, generate_load): - num_requests = 4 - responses = await generate_load( - tgi_client, - "What is Deep Learning?", - max_new_tokens=17, - n=num_requests, - ) - - assert len(responses) == 4 - expected = "\n\nDeep learning is a subfield of machine learning that uses artificial neural networks to learn" - for r in responses: - assert r.details.generated_tokens == 17 - # Compute the similarity with the expectation using the levenshtein distance - # We should not have more than two substitutions or additions - assert Levenshtein.distance(r.generated_text, expected) < 3 diff --git a/text-generation-inference/integration-tests/test_gpt2.py b/text-generation-inference/integration-tests/test_gpt2.py deleted file mode 100644 index 402c9956..00000000 --- a/text-generation-inference/integration-tests/test_gpt2.py +++ /dev/null @@ -1,86 +0,0 @@ -import os -import Levenshtein -import pytest - -MODEL_ID = "openai-community/gpt2" -SEQUENCE_LENGTH = 1024 - -@pytest.fixture(scope="module") -def model_name_or_path(): - os.environ["HF_SEQUENCE_LENGTH"] = str(SEQUENCE_LENGTH) - yield MODEL_ID - - -@pytest.fixture(scope="module") -def tgi_service(launcher, model_name_or_path): - with launcher(model_name_or_path) as tgi_service: - yield tgi_service - - -@pytest.fixture(scope="module") -async def tgi_client(tgi_service): - await tgi_service.health(1000) - return tgi_service.client - -@pytest.mark.asyncio -async def test_model_single_request(tgi_client): - - # Bounded greedy decoding without input - response = await tgi_client.generate( - "What is Deep Learning?", - max_new_tokens=17, - decoder_input_details=True, - ) - assert response.details.generated_tokens == 17 - assert ( - response.generated_text == "\n\nDeep learning is a new field of research that has been around for a while" - ) - - # Bounded greedy decoding with input - response = await tgi_client.generate( - "What is Deep Learning?", - max_new_tokens=17, - return_full_text=True, - decoder_input_details=True, - ) - assert response.details.generated_tokens == 17 - assert ( - response.generated_text - == "What is Deep 
Learning?\n\nDeep learning is a new field of research that has been around for a while" - ) - - # Sampling - response = await tgi_client.generate( - "What is Deep Learning?", - do_sample=True, - top_k=50, - top_p=0.9, - repetition_penalty=1.2, - max_new_tokens=100, - seed=42, - decoder_input_details=True, - ) - - assert ( - 'A lot of researchers have tried to make a "deep learning" approach that focuses only on what is being shown' - in response.generated_text - ) - - -@pytest.mark.asyncio -async def test_model_multiple_requests(tgi_client, generate_load): - num_requests = 4 - responses = await generate_load( - tgi_client, - "What is Deep Learning?", - max_new_tokens=17, - n=num_requests, - ) - - assert len(responses) == 4 - expected = "\n\nDeep learning is a technique that allows you to learn something from a single source" - for r in responses: - assert r.details.generated_tokens == 17 - # Compute the similarity with the expectation using the levenshtein distance - # We should not have more than two substitutions or additions - assert Levenshtein.distance(r.generated_text, expected) < 3 diff --git a/text-generation-inference/integration-tests/test_model.py b/text-generation-inference/integration-tests/test_model.py new file mode 100644 index 00000000..b9ae54e0 --- /dev/null +++ b/text-generation-inference/integration-tests/test_model.py @@ -0,0 +1,117 @@ +import os +from typing import Any, Dict + +import Levenshtein +import pytest + + +MODEL_CONFIGS = { + "gpt2": { + "model_id": "openai-community/gpt2", + "sequence_length": 1024, + "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a while", + "expected_sampling_output": 'The fundamental concepts of deep learning are the same as those used to train and understand your first language, or your first set of skills', + "expected_batch_output": "\n\nDeep learning is a technique that allows you to learn something from a single source", + "env_config": { + "LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug", + "MAX_BATCH_SIZE": "4", + "HF_HUB_ENABLE_HF_TRANSFER": "0", + "JETSTREAM_PT_DISABLE": "1", + "SKIP_WARMUP": "1", + } + }, + "gemma": { + "model_id": "google/gemma-2b-it", + "sequence_length": 1024, + "expected_greedy_output": "\n\nDeep learning is a subfield of machine learning that allows computers to learn from data", + "expected_sampling_output": "Deep learning is a subfield of machine learning that focuses on mimicking the structure and function of the human brain", + "expected_batch_output": "\n\nDeep learning is a subfield of machine learning that allows computers to learn from data", + "env_config": { + "LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug", + "MAX_BATCH_SIZE": "4", + "HF_HUB_ENABLE_HF_TRANSFER": "0", + "SKIP_WARMUP": "1", + } + } +} + +@pytest.fixture(scope="module", params=MODEL_CONFIGS.keys()) +def model_config(request) -> Dict[str, Any]: + """Fixture that provides model configurations for testing.""" + return MODEL_CONFIGS[request.param] + +@pytest.fixture(scope="module") +def model_name_or_path(model_config): + os.environ["HF_SEQUENCE_LENGTH"] = str(model_config["sequence_length"]) + yield model_config["model_id"] + +@pytest.fixture(scope="module") +def tgi_service(launcher, model_name_or_path): + with launcher(model_name_or_path) as tgi_service: + yield tgi_service + +@pytest.fixture(scope="module") +async def tgi_client(tgi_service): + await tgi_service.health(1000) + return tgi_service.client + 
+
+@pytest.fixture(scope="module")
+def expected_outputs(model_config):
+    return {
+        "greedy": model_config["expected_greedy_output"],
+        "sampling": model_config["expected_sampling_output"],
+        "batch": model_config["expected_batch_output"]
+    }
+
+@pytest.mark.asyncio
+async def test_model_single_request(tgi_client, expected_outputs):
+    # Bounded greedy decoding without input
+    response = await tgi_client.generate(
+        "What is Deep Learning?",
+        max_new_tokens=17,
+        decoder_input_details=True,
+    )
+    assert response.details.generated_tokens == 17
+    assert response.generated_text == expected_outputs["greedy"]
+
+    # Bounded greedy decoding with input
+    response = await tgi_client.generate(
+        "What is Deep Learning?",
+        max_new_tokens=17,
+        return_full_text=True,
+        decoder_input_details=True,
+    )
+    assert response.details.generated_tokens == 17
+    assert response.generated_text == f"What is Deep Learning?{expected_outputs['greedy']}"
+
+    # Sampling
+    response = await tgi_client.generate(
+        "What is Deep Learning?",
+        do_sample=True,
+        top_k=50,
+        top_p=0.9,
+        repetition_penalty=1.2,
+        max_new_tokens=100,
+        seed=42,
+        decoder_input_details=True,
+    )
+
+    assert expected_outputs["sampling"] in response.generated_text
+
+@pytest.mark.asyncio
+async def test_model_multiple_requests(tgi_client, generate_load, expected_outputs):
+    num_requests = 4
+    responses = await generate_load(
+        tgi_client,
+        "What is Deep Learning?",
+        max_new_tokens=17,
+        n=num_requests,
+    )
+
+    assert len(responses) == 4
+    expected = expected_outputs["batch"]
+    for r in responses:
+        assert r.details.generated_tokens == 17
+        # Compute the similarity with the expectation using the levenshtein distance
+        # We should not have more than two substitutions or additions
+        assert Levenshtein.distance(r.generated_text, expected) < 3

From 3a5a7f03e60ab87758265b040a40c3e177c7ec1c Mon Sep 17 00:00:00 2001
From: Baptiste
Date: Wed, 4 Dec 2024 10:14:54 +0000
Subject: [PATCH 5/6] refactor(tests): move run arguments into model config

---
 .../integration-tests/conftest.py   | 15 +++++----------
 .../integration-tests/test_model.py | 12 ++++++++++++
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/text-generation-inference/integration-tests/conftest.py b/text-generation-inference/integration-tests/conftest.py
index 1ef7e4c7..8e162991 100644
--- a/text-generation-inference/integration-tests/conftest.py
+++ b/text-generation-inference/integration-tests/conftest.py
@@ -146,16 +146,6 @@ def docker_launcher(
         logger.info(f"Starting docker launcher for model {model_id}")
         port = 8080
 
-        args = [
-            "--max-input-length", "512",
-            "--max-total-tokens", "1024",
-            "--max-batch-prefill-tokens", "512",
-            "--max-batch-total-tokens", "1024"
-        ]
-
-        if trust_remote_code:
-            args.append("--trust-remote-code")
-
         client = docker.from_env()
 
         container_name = f"tgi-tests-{model_id.split('/')[-1]}"
@@ -171,6 +161,11 @@ def docker_launcher(
             logger.error(f"Error handling existing container: {str(e)}")
 
         model_name = next(name for name, cfg in MODEL_CONFIGS.items() if cfg["model_id"] == model_id)
+
+        args = MODEL_CONFIGS[model_name]["args"].copy()
+        if trust_remote_code:
+            args.append("--trust-remote-code")
+
         env = MODEL_CONFIGS[model_name]["env_config"].copy()
 
         # Add model_id to env
diff --git a/text-generation-inference/integration-tests/test_model.py b/text-generation-inference/integration-tests/test_model.py
index b9ae54e0..45f0fef5 100644
--- a/text-generation-inference/integration-tests/test_model.py
+++ b/text-generation-inference/integration-tests/test_model.py
@@ -12,6 +12,12 @@
         "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a while",
         "expected_sampling_output": 'The fundamental concepts of deep learning are the same as those used to train and understand your first language, or your first set of skills',
         "expected_batch_output": "\n\nDeep learning is a technique that allows you to learn something from a single source",
+        "args": [
+            "--max-input-length", "512",
+            "--max-total-tokens", "1024",
+            "--max-batch-prefill-tokens", "512",
+            "--max-batch-total-tokens", "1024"
+        ],
         "env_config": {
             "LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug",
             "MAX_BATCH_SIZE": "4",
@@ -26,6 +32,12 @@
         "expected_greedy_output": "\n\nDeep learning is a subfield of machine learning that allows computers to learn from data",
         "expected_sampling_output": "Deep learning is a subfield of machine learning that focuses on mimicking the structure and function of the human brain",
         "expected_batch_output": "\n\nDeep learning is a subfield of machine learning that allows computers to learn from data",
+        "args": [
+            "--max-input-length", "512",
+            "--max-total-tokens", "1024",
+            "--max-batch-prefill-tokens", "512",
+            "--max-batch-total-tokens", "1024"
+        ],
         "env_config": {
             "LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug",
             "MAX_BATCH_SIZE": "4",

From 606a6131c23b4a4feb6840790e0466d10265615b Mon Sep 17 00:00:00 2001
From: Baptiste
Date: Wed, 4 Dec 2024 10:25:21 +0000
Subject: [PATCH 6/6] refactor(tests): move run arguments into model config

---
 text-generation-inference/integration-tests/conftest.py   | 7 ++++++-
 text-generation-inference/integration-tests/test_model.py | 4 ----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/text-generation-inference/integration-tests/conftest.py b/text-generation-inference/integration-tests/conftest.py
index 8e162991..3115e3dd 100644
--- a/text-generation-inference/integration-tests/conftest.py
+++ b/text-generation-inference/integration-tests/conftest.py
@@ -166,7 +166,12 @@ def docker_launcher(
         if trust_remote_code:
             args.append("--trust-remote-code")
 
-        env = MODEL_CONFIGS[model_name]["env_config"].copy()
+        env = {
+            "LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug",
+            "HF_HUB_ENABLE_HF_TRANSFER": "0"
+        }
+        env.update(MODEL_CONFIGS[model_name]["env_config"].copy())
+
         # Add model_id to env
         env["MODEL_ID"] = model_id
 
diff --git a/text-generation-inference/integration-tests/test_model.py b/text-generation-inference/integration-tests/test_model.py
index 45f0fef5..a9b44c75 100644
--- a/text-generation-inference/integration-tests/test_model.py
+++ b/text-generation-inference/integration-tests/test_model.py
@@ -19,9 +19,7 @@
             "--max-batch-total-tokens", "1024"
         ],
         "env_config": {
-            "LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug",
             "MAX_BATCH_SIZE": "4",
-            "HF_HUB_ENABLE_HF_TRANSFER": "0",
             "JETSTREAM_PT_DISABLE": "1",
             "SKIP_WARMUP": "1",
         }
@@ -39,9 +37,7 @@
             "--max-batch-total-tokens", "1024"
         ],
         "env_config": {
-            "LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug",
             "MAX_BATCH_SIZE": "4",
-            "HF_HUB_ENABLE_HF_TRANSFER": "0",
             "SKIP_WARMUP": "1",
         }
     }
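Note: the commands below are one possible way to run this integration-test suite against the image built by the tpu-tgi target, based on the environment variables conftest.py reads (DOCKER_IMAGE, HF_TOKEN). The exact pytest invocation and the host-side dependencies (pytest-asyncio, the Levenshtein package) are assumptions; the patches above do not pin them down.

# Assumed workflow on a TPU host with docker and pytest available:
make tpu-tgi                                         # builds huggingface/optimum-tpu:$(VERSION)-tgi and tags it :latest
export DOCKER_IMAGE=huggingface/optimum-tpu:latest   # conftest.py falls back to this value if unset
export HF_TOKEN=...                                  # forwarded to the container; needed for gated models such as google/gemma-2b-it
pytest -sv text-generation-inference/integration-tests/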