From bdd422df6aa16a2427943a14873847403a8d5582 Mon Sep 17 00:00:00 2001 From: Baptiste Date: Thu, 21 Nov 2024 20:33:38 +0000 Subject: [PATCH 1/6] fix(tests): fix broken GPT2 integration test --- Makefile | 6 +- .../integration-tests/conftest.py | 208 ++++++++++++++---- .../integration-tests/test_gpt2.py | 19 +- .../server/build-requirements.txt | 2 +- 4 files changed, 178 insertions(+), 57 deletions(-) diff --git a/Makefile b/Makefile index 7091ac00..97df108b 100644 --- a/Makefile +++ b/Makefile @@ -42,12 +42,14 @@ clean: rm -rf dist deps make -C text-generation-inference/server/ clean +# ulimit nofile=100000:100000 is required for TPUs +# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#privileged-mode tpu-tgi: docker build --rm -f text-generation-inference/docker/Dockerfile \ --build-arg VERSION=$(VERSION) \ --build-arg TGI_VERSION=$(TGI_VERSION) \ - --ulimit nofile=100000:100000 \ - -t huggingface/optimum-tpu:$(VERSION)-tgi . + --ulimit nofile=100000:100000 \ + -t huggingface/optimum-tpu:$(VERSION)-tgi . docker tag huggingface/optimum-tpu:$(VERSION)-tgi huggingface/optimum-tpu:latest tpu-tgi-ie: diff --git a/text-generation-inference/integration-tests/conftest.py b/text-generation-inference/integration-tests/conftest.py index 435b55eb..a6b23210 100644 --- a/text-generation-inference/integration-tests/conftest.py +++ b/text-generation-inference/integration-tests/conftest.py @@ -4,7 +4,10 @@ import shlex import subprocess import sys +import threading import time +import signal +import logging from tempfile import TemporaryDirectory from typing import List @@ -16,30 +19,81 @@ from text_generation.types import Response -DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tpu-tgi:latest") -HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None) +DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "huggingface/optimum-tpu:latest") +HF_TOKEN = os.getenv("HF_TOKEN", None) DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data") +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + + +def cleanup_handler(signum, frame): + logger.info("\nCleaning up containers due to shutdown, please wait...") + try: + client = docker.from_env() + containers = client.containers.list(filters={"name": "tgi-tests-"}) + for container in containers: + try: + container.stop() + container.remove() + logger.info(f"Successfully cleaned up container {container.name}") + except Exception as e: + logger.error(f"Error cleaning up container {container.name}: {str(e)}") + except Exception as e: + logger.error(f"Error during cleanup: {str(e)}") + sys.exit(1) + +signal.signal(signal.SIGINT, cleanup_handler) +signal.signal(signal.SIGTERM, cleanup_handler) + +def stream_container_logs(container): + """Stream container logs in a separate thread.""" + try: + for log in container.logs(stream=True, follow=True): + print("[TGI Server Logs] " + log.decode("utf-8"), end="", file=sys.stderr, flush=True) + except Exception as e: + logger.error(f"Error streaming container logs: {str(e)}") + class LauncherHandle: def __init__(self, port: int): - self.client = AsyncClient(f"http://localhost:{port}") + self.client = AsyncClient(f"http://localhost:{port}", timeout=600) + self.logger = logging.getLogger(self.__class__.__name__) def _inner_health(self): raise NotImplementedError async def health(self, timeout: int = 60): assert timeout > 0 - for _ in range(timeout): + start_time = time.time() + self.logger.info(f"Starting health check 
with timeout of {timeout}s") + + for attempt in range(timeout): if not self._inner_health(): + self.logger.error("Launcher crashed during health check") raise RuntimeError("Launcher crashed") try: await self.client.generate("test") + elapsed = time.time() - start_time + self.logger.info(f"Health check passed after {elapsed:.1f}s") return - except (ClientConnectorError, ClientOSError, ServerDisconnectedError): + except (ClientConnectorError, ClientOSError, ServerDisconnectedError) as e: + if attempt == timeout - 1: + self.logger.error(f"Health check failed after {timeout}s: {str(e)}") + raise RuntimeError(f"Health check failed: {str(e)}") + self.logger.debug(f"Connection attempt {attempt+1}/{timeout} failed: {str(e)}") time.sleep(1) - raise RuntimeError("Health check failed") + except Exception as e: + self.logger.error(f"Unexpected error during health check: {str(e)}") + # Get full traceback for debugging + import traceback + self.logger.error(f"Full traceback:\n{traceback.format_exc()}") + raise class ContainerLauncherHandle(LauncherHandle): @@ -49,8 +103,18 @@ def __init__(self, docker_client, container_name, port: int): self.container_name = container_name def _inner_health(self) -> bool: - container = self.docker_client.containers.get(self.container_name) - return container.status in ["running", "created"] + try: + container = self.docker_client.containers.get(self.container_name) + status = container.status + if status not in ["running", "created"]: + self.logger.warning(f"Container status is {status}") + # Get container logs for debugging + logs = container.logs().decode("utf-8") + self.logger.debug(f"Container logs:\n{logs}") + return status in ["running", "created"] + except Exception as e: + self.logger.error(f"Error checking container health: {str(e)}") + return False class ProcessLauncherHandle(LauncherHandle): @@ -73,8 +137,11 @@ def event_loop(): def data_volume(): tmpdir = TemporaryDirectory() yield tmpdir.name - # Cleanup the temporary directory using sudo as it contains root files created by the container - subprocess.run(shlex.split(f"sudo rm -rf {tmpdir.name}")) + try: + # Cleanup the temporary directory using sudo as it contains root files created by the container + subprocess.run(shlex.split(f"sudo rm -rf {tmpdir.name}"), check=True) + except subprocess.CalledProcessError as e: + logger.error(f"Error cleaning up temporary directory: {str(e)}") @pytest.fixture(scope="module") @@ -84,11 +151,12 @@ def docker_launcher( model_id: str, trust_remote_code: bool = False, ): + logger.info(f"Starting docker launcher for model {model_id}") # TODO: consider finding out how to forward a port in the container instead of leaving it to 80. # For now this is necessary because TPU dockers require to run with net=host and privileged mode. 
port = 80 - args = ["--model-id", model_id, "--env"] + args = ["--env"] if trust_remote_code: args.append("--trust-remote-code") @@ -99,15 +167,25 @@ def docker_launcher( try: container = client.containers.get(container_name) + logger.info(f"Stopping existing container {container_name}") container.stop() container.wait() except NotFound: pass + except Exception as e: + logger.error(f"Error handling existing container: {str(e)}") - env = {"LOG_LEVEL": "info,text_generation_router=debug"} + env = { + "LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug", + "MAX_BATCH_SIZE": "4", + "HF_HUB_ENABLE_HF_TRANSFER": "0", + "JETSTREAM_PT": "1", + "SKIP_WARMUP": "1", + "MODEL_ID": model_id, + } - if HUGGING_FACE_HUB_TOKEN is not None: - env["HUGGING_FACE_HUB_TOKEN"] = HUGGING_FACE_HUB_TOKEN + if HF_TOKEN is not None: + env["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN for var in ["MAX_BATCH_SIZE", "HF_SEQUENCE_LENGTH"]: if var in os.environ: @@ -115,31 +193,68 @@ def docker_launcher( volumes = [f"{data_volume}:/data"] - container = client.containers.run( - DOCKER_IMAGE, - command=args, - name=container_name, - environment=env, - auto_remove=False, - detach=True, - volumes=volumes, - shm_size="1G", - privileged=True, - network_mode="host", - ) - - yield ContainerLauncherHandle(client, container.name, port) - try: - container.stop() - container.wait() - except NotFound: - pass - - container_output = container.logs().decode("utf-8") - print(container_output, file=sys.stderr) - - container.remove() + # Add debug logging before container creation + logger.debug(f"Creating container with image {DOCKER_IMAGE}") + logger.debug(f"Container environment: {env}") + logger.debug(f"Container volumes: {volumes}") + + container = client.containers.run( + DOCKER_IMAGE, + command=args, + name=container_name, + environment=env, + auto_remove=False, + detach=True, + volumes=volumes, + shm_size="16G", + privileged=True, + ipc_mode="host", + ) + logger.info(f"Container {container_name} started successfully") + + # Start log streaming in a background thread + log_thread = threading.Thread( + target=stream_container_logs, + args=(container,), + daemon=True # This ensures the thread will be killed when the main program exits + ) + log_thread.start() + + # Add a small delay to allow container to initialize + time.sleep(2) + + # Check container status after creation + status = container.status + logger.debug(f"Initial container status: {status}") + if status not in ["running", "created"]: + logs = container.logs().decode("utf-8") + logger.error(f"Container failed to start properly. 
Logs:\n{logs}") + + yield ContainerLauncherHandle(client, container.name, port) + + except Exception as e: + logger.error(f"Error starting container: {str(e)}") + # Get full traceback for debugging + import traceback + logger.error(f"Full traceback:\n{traceback.format_exc()}") + raise + finally: + try: + container = client.containers.get(container_name) + logger.info(f"Stopping container {container_name}") + container.stop() + container.wait() + + container_output = container.logs().decode("utf-8") + print(container_output, file=sys.stderr) + + container.remove() + logger.info(f"Container {container_name} removed successfully") + except NotFound: + pass + except Exception as e: + logger.error(f"Error cleaning up container: {str(e)}") return docker_launcher @@ -147,10 +262,17 @@ def docker_launcher( @pytest.fixture(scope="module") def generate_load(): async def generate_load_inner(client: AsyncClient, prompt: str, max_new_tokens: int, n: int) -> List[Response]: - futures = [ - client.generate(prompt, max_new_tokens=max_new_tokens, decoder_input_details=True) for _ in range(n) - ] - - return await asyncio.gather(*futures) + try: + futures = [ + client.generate( + prompt, + max_new_tokens=max_new_tokens, + decoder_input_details=True, + ) for _ in range(n) + ] + return await asyncio.gather(*futures) + except Exception as e: + logger.error(f"Error generating load: {str(e)}") + raise return generate_load_inner diff --git a/text-generation-inference/integration-tests/test_gpt2.py b/text-generation-inference/integration-tests/test_gpt2.py index d200bd5d..402c9956 100644 --- a/text-generation-inference/integration-tests/test_gpt2.py +++ b/text-generation-inference/integration-tests/test_gpt2.py @@ -1,13 +1,10 @@ import os - import Levenshtein import pytest - MODEL_ID = "openai-community/gpt2" SEQUENCE_LENGTH = 1024 - @pytest.fixture(scope="module") def model_name_or_path(): os.environ["HF_SEQUENCE_LENGTH"] = str(SEQUENCE_LENGTH) @@ -22,14 +19,13 @@ def tgi_service(launcher, model_name_or_path): @pytest.fixture(scope="module") async def tgi_client(tgi_service): - await tgi_service.health(300) + await tgi_service.health(1000) return tgi_service.client - @pytest.mark.asyncio async def test_model_single_request(tgi_client): - # Greedy bounded without input + # Bounded greedy decoding without input response = await tgi_client.generate( "What is Deep Learning?", max_new_tokens=17, @@ -37,10 +33,10 @@ async def test_model_single_request(tgi_client): ) assert response.details.generated_tokens == 17 assert ( - response.generated_text == "\n\nDeep learning is a technique that allows you to learn something from a set of" + response.generated_text == "\n\nDeep learning is a new field of research that has been around for a while" ) - # Greedy bounded with input + # Bounded greedy decoding with input response = await tgi_client.generate( "What is Deep Learning?", max_new_tokens=17, @@ -50,7 +46,7 @@ async def test_model_single_request(tgi_client): assert response.details.generated_tokens == 17 assert ( response.generated_text - == "What is Deep Learning?\n\nDeep learning is a technique that allows you to learn something from a set of" + == "What is Deep Learning?\n\nDeep learning is a new field of research that has been around for a while" ) # Sampling @@ -64,8 +60,9 @@ async def test_model_single_request(tgi_client): seed=42, decoder_input_details=True, ) + assert ( - 'The deep neural networks that we create are essentially "miniature" neural networks that can easily be trained' + 'A lot of researchers have 
tried to make a "deep learning" approach that focuses only on what is being shown' in response.generated_text ) @@ -81,7 +78,7 @@ async def test_model_multiple_requests(tgi_client, generate_load): ) assert len(responses) == 4 - expected = "\n\nDeep learning is a technique that allows you to learn something from a set of" + expected = "\n\nDeep learning is a technique that allows you to learn something from a single source" for r in responses: assert r.details.generated_tokens == 17 # Compute the similarity with the expectation using the levenshtein distance diff --git a/text-generation-inference/server/build-requirements.txt b/text-generation-inference/server/build-requirements.txt index 5307dc5d..64b2b6e9 100644 --- a/text-generation-inference/server/build-requirements.txt +++ b/text-generation-inference/server/build-requirements.txt @@ -1,3 +1,3 @@ build grpcio-tools==1.62.1 -mypy-protobuf==3.2.0 +mypy-protobuf==3.2.0 \ No newline at end of file From 9e63ff23e66e3e86bad53b4622b1ff9f78b87fa0 Mon Sep 17 00:00:00 2001 From: Baptiste Date: Mon, 25 Nov 2024 12:24:08 +0000 Subject: [PATCH 2/6] feat(tests): add Gemma integration test --- .../integration-tests/test_gemma.py | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 text-generation-inference/integration-tests/test_gemma.py diff --git a/text-generation-inference/integration-tests/test_gemma.py b/text-generation-inference/integration-tests/test_gemma.py new file mode 100644 index 00000000..b52b1d6a --- /dev/null +++ b/text-generation-inference/integration-tests/test_gemma.py @@ -0,0 +1,87 @@ +import os +import Levenshtein +import pytest + +MODEL_ID = "google/gemma-2b-it" +SEQUENCE_LENGTH = 1024 + +@pytest.fixture(scope="module") +def model_name_or_path(): + os.environ["HF_SEQUENCE_LENGTH"] = str(SEQUENCE_LENGTH) + yield MODEL_ID + + +@pytest.fixture(scope="module") +def tgi_service(launcher, model_name_or_path): + with launcher(model_name_or_path) as tgi_service: + yield tgi_service + + +@pytest.fixture(scope="module") +async def tgi_client(tgi_service): + await tgi_service.health(1000) + return tgi_service.client + +@pytest.mark.asyncio +async def test_model_single_request(tgi_client): + + # Bounded greedy decoding without input + response = await tgi_client.generate( + "What is Deep Learning?", + max_new_tokens=17, + decoder_input_details=True, + ) + assert response.details.generated_tokens == 17 + assert ( + response.generated_text == "\n\nDeep learning is a subfield of machine learning that allows computers to learn from data" + ) + + # Bounded greedy decoding with input + response = await tgi_client.generate( + "What is Deep Learning?", + max_new_tokens=17, + return_full_text=True, + decoder_input_details=True, + ) + assert response.details.generated_tokens == 17 + assert ( + response.generated_text + == "What is Deep Learning?\n\nDeep learning is a subfield of machine learning that allows computers to learn from data" + ) + + # Sampling + response = await tgi_client.generate( + "What is Deep Learning?", + do_sample=True, + top_k=50, + top_p=0.9, + repetition_penalty=1.2, + max_new_tokens=100, + seed=42, + decoder_input_details=True, + ) + print(f"\nGot sampling output with seed=42: {response.generated_text}") + + assert ( + 'Deep learning is a subfield of machine learning that focuses on mimicking the structure and function of the human brain' + in response.generated_text + ) + + +@pytest.mark.asyncio +async def test_model_multiple_requests(tgi_client, generate_load): + num_requests = 4 + responses = await 
generate_load( + tgi_client, + "What is Deep Learning?", + max_new_tokens=17, + n=num_requests, + ) + + assert len(responses) == 4 + expected = "\n\nDeep learning is a subfield of machine learning that uses artificial neural networks to learn" + for r in responses: + assert r.details.generated_tokens == 17 + # Compute the similarity with the expectation using the levenshtein distance + # We should not have more than two substitutions or additions + assert Levenshtein.distance(r.generated_text, expected) < 3 From c822676d2ee37ac2120dd41af3cce2f4c2c66457 Mon Sep 17 00:00:00 2001 From: Baptiste Date: Mon, 2 Dec 2024 18:34:07 +0000 Subject: [PATCH 3/6] refactor(logging): migrate to Loguru --- .../integration-tests/conftest.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/text-generation-inference/integration-tests/conftest.py b/text-generation-inference/integration-tests/conftest.py index a6b23210..bbbf14d7 100644 --- a/text-generation-inference/integration-tests/conftest.py +++ b/text-generation-inference/integration-tests/conftest.py @@ -7,7 +7,6 @@ import threading import time import signal -import logging from tempfile import TemporaryDirectory from typing import List @@ -17,18 +16,20 @@ from docker.errors import NotFound from text_generation import AsyncClient from text_generation.types import Response +from loguru import logger DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "huggingface/optimum-tpu:latest") HF_TOKEN = os.getenv("HF_TOKEN", None) DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data") -# Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +# Configure loguru logger +logger.remove() # Remove default handler +logger.add( + sys.stderr, + format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}", + level="INFO" ) -logger = logging.getLogger(__name__) def cleanup_handler(signum, frame): @@ -62,7 +63,6 @@ def stream_container_logs(container): class LauncherHandle: def __init__(self, port: int): self.client = AsyncClient(f"http://localhost:{port}", timeout=600) - self.logger = logging.getLogger(self.__class__.__name__) def _inner_health(self): raise NotImplementedError @@ -70,29 +70,29 @@ def _inner_health(self): async def health(self, timeout: int = 60): assert timeout > 0 start_time = time.time() - self.logger.info(f"Starting health check with timeout of {timeout}s") + logger.info(f"Starting health check with timeout of {timeout}s") for attempt in range(timeout): if not self._inner_health(): - self.logger.error("Launcher crashed during health check") + logger.error("Launcher crashed during health check") raise RuntimeError("Launcher crashed") try: await self.client.generate("test") elapsed = time.time() - start_time - self.logger.info(f"Health check passed after {elapsed:.1f}s") + logger.info(f"Health check passed after {elapsed:.1f}s") return except (ClientConnectorError, ClientOSError, ServerDisconnectedError) as e: if attempt == timeout - 1: - self.logger.error(f"Health check failed after {timeout}s: {str(e)}") + logger.error(f"Health check failed after {timeout}s: {str(e)}") raise RuntimeError(f"Health check failed: {str(e)}") - self.logger.debug(f"Connection attempt {attempt+1}/{timeout} failed: {str(e)}") + logger.debug(f"Connection attempt {attempt+1}/{timeout} failed: {str(e)}") time.sleep(1) except Exception as e: - self.logger.error(f"Unexpected error during health check: {str(e)}") + logger.error(f"Unexpected error during health 
check: {str(e)}") # Get full traceback for debugging import traceback - self.logger.error(f"Full traceback:\n{traceback.format_exc()}") + logger.error(f"Full traceback:\n{traceback.format_exc()}") raise @@ -107,13 +107,13 @@ def _inner_health(self) -> bool: container = self.docker_client.containers.get(self.container_name) status = container.status if status not in ["running", "created"]: - self.logger.warning(f"Container status is {status}") + logger.warning(f"Container status is {status}") # Get container logs for debugging logs = container.logs().decode("utf-8") - self.logger.debug(f"Container logs:\n{logs}") + logger.debug(f"Container logs:\n{logs}") return status in ["running", "created"] except Exception as e: - self.logger.error(f"Error checking container health: {str(e)}") + logger.error(f"Error checking container health: {str(e)}") return False From 1af9edc15cc31aa913b7170033d68b86b0380098 Mon Sep 17 00:00:00 2001 From: Baptiste Date: Wed, 4 Dec 2024 09:59:20 +0000 Subject: [PATCH 4/6] fix(tests): fix broken connection to docker container --- Makefile | 1 - .../integration-tests/conftest.py | 57 +++++---- .../integration-tests/test_gemma.py | 87 ------------- .../integration-tests/test_gpt2.py | 86 ------------- .../integration-tests/test_model.py | 117 ++++++++++++++++++ 5 files changed, 145 insertions(+), 203 deletions(-) delete mode 100644 text-generation-inference/integration-tests/test_gemma.py delete mode 100644 text-generation-inference/integration-tests/test_gpt2.py create mode 100644 text-generation-inference/integration-tests/test_model.py diff --git a/Makefile b/Makefile index 97df108b..0220201d 100644 --- a/Makefile +++ b/Makefile @@ -66,7 +66,6 @@ style_check: ruff check . style: - ruff check . --fix # Utilities to release to PyPi build_dist_install_tools: diff --git a/text-generation-inference/integration-tests/conftest.py b/text-generation-inference/integration-tests/conftest.py index bbbf14d7..1ef7e4c7 100644 --- a/text-generation-inference/integration-tests/conftest.py +++ b/text-generation-inference/integration-tests/conftest.py @@ -2,11 +2,11 @@ import contextlib import os import shlex +import signal import subprocess import sys import threading import time -import signal from tempfile import TemporaryDirectory from typing import List @@ -14,17 +14,16 @@ import pytest from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError from docker.errors import NotFound +from loguru import logger +from test_model import MODEL_CONFIGS from text_generation import AsyncClient from text_generation.types import Response -from loguru import logger DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "huggingface/optimum-tpu:latest") HF_TOKEN = os.getenv("HF_TOKEN", None) DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data") -# Configure loguru logger -logger.remove() # Remove default handler logger.add( sys.stderr, format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}", @@ -62,7 +61,7 @@ def stream_container_logs(container): class LauncherHandle: def __init__(self, port: int): - self.client = AsyncClient(f"http://localhost:{port}", timeout=600) + self.client = AsyncClient(f"http://localhost:{port}", timeout=3600) def _inner_health(self): raise NotImplementedError @@ -71,7 +70,7 @@ async def health(self, timeout: int = 60): assert timeout > 0 start_time = time.time() logger.info(f"Starting health check with timeout of {timeout}s") - + for attempt in range(timeout): if not self._inner_health(): logger.error("Launcher crashed during health 
check") @@ -126,13 +125,6 @@ def _inner_health(self) -> bool: return self.process.poll() is None -@pytest.fixture(scope="module") -def event_loop(): - loop = asyncio.get_event_loop() - yield loop - loop.close() - - @pytest.fixture(scope="module") def data_volume(): tmpdir = TemporaryDirectory() @@ -145,18 +137,21 @@ def data_volume(): @pytest.fixture(scope="module") -def launcher(event_loop, data_volume): +def launcher(data_volume): @contextlib.contextmanager def docker_launcher( model_id: str, trust_remote_code: bool = False, ): logger.info(f"Starting docker launcher for model {model_id}") - # TODO: consider finding out how to forward a port in the container instead of leaving it to 80. - # For now this is necessary because TPU dockers require to run with net=host and privileged mode. - port = 80 + port = 8080 - args = ["--env"] + args = [ + "--max-input-length", "512", + "--max-total-tokens", "1024", + "--max-batch-prefill-tokens", "512", + "--max-batch-total-tokens", "1024" + ] if trust_remote_code: args.append("--trust-remote-code") @@ -175,17 +170,14 @@ def docker_launcher( except Exception as e: logger.error(f"Error handling existing container: {str(e)}") - env = { - "LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug", - "MAX_BATCH_SIZE": "4", - "HF_HUB_ENABLE_HF_TRANSFER": "0", - "JETSTREAM_PT": "1", - "SKIP_WARMUP": "1", - "MODEL_ID": model_id, - } + model_name = next(name for name, cfg in MODEL_CONFIGS.items() if cfg["model_id"] == model_id) + env = MODEL_CONFIGS[model_name]["env_config"].copy() + + # Add model_id to env + env["MODEL_ID"] = model_id if HF_TOKEN is not None: - env["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN + env["HF_TOKEN"] = HF_TOKEN for var in ["MAX_BATCH_SIZE", "HF_SEQUENCE_LENGTH"]: if var in os.environ: @@ -198,7 +190,13 @@ def docker_launcher( logger.debug(f"Creating container with image {DOCKER_IMAGE}") logger.debug(f"Container environment: {env}") logger.debug(f"Container volumes: {volumes}") - + + # Log equivalent docker run command + env_str = ' '.join([f'-e {k}="{v}"' for k,v in env.items()]) + volume_str = ' '.join([f'-v {v}' for v in volumes]) + cmd_str = f'docker run -d --name {container_name} {env_str} {volume_str} --shm-size 16G --privileged --ipc host {DOCKER_IMAGE} {" ".join(args)}' + logger.debug(f"Equivalent docker run command:\n{cmd_str}") + container = client.containers.run( DOCKER_IMAGE, command=args, @@ -210,6 +208,7 @@ def docker_launcher( shm_size="16G", privileged=True, ipc_mode="host", + ports={"80/tcp": 8080} ) logger.info(f"Container {container_name} started successfully") @@ -245,7 +244,7 @@ def docker_launcher( logger.info(f"Stopping container {container_name}") container.stop() container.wait() - + container_output = container.logs().decode("utf-8") print(container_output, file=sys.stderr) diff --git a/text-generation-inference/integration-tests/test_gemma.py b/text-generation-inference/integration-tests/test_gemma.py deleted file mode 100644 index b52b1d6a..00000000 --- a/text-generation-inference/integration-tests/test_gemma.py +++ /dev/null @@ -1,87 +0,0 @@ -import os -import Levenshtein -import pytest - -MODEL_ID = "google/gemma-2b-it" -SEQUENCE_LENGTH = 1024 - -@pytest.fixture(scope="module") -def model_name_or_path(): - os.environ["HF_SEQUENCE_LENGTH"] = str(SEQUENCE_LENGTH) - yield MODEL_ID - - -@pytest.fixture(scope="module") -def tgi_service(launcher, model_name_or_path): - with launcher(model_name_or_path) as tgi_service: - yield tgi_service - - -@pytest.fixture(scope="module") -async def 
tgi_client(tgi_service): - await tgi_service.health(1000) - return tgi_service.client - -@pytest.mark.asyncio -async def test_model_single_request(tgi_client): - - # Bounded greedy decoding without input - response = await tgi_client.generate( - "What is Deep Learning?", - max_new_tokens=17, - decoder_input_details=True, - ) - assert response.details.generated_tokens == 17 - assert ( - response.generated_text == "\n\nDeep learning is a subfield of machine learning that allows computers to learn from data" - ) - - # Bounded greedy decoding with input - response = await tgi_client.generate( - "What is Deep Learning?", - max_new_tokens=17, - return_full_text=True, - decoder_input_details=True, - ) - assert response.details.generated_tokens == 17 - assert ( - response.generated_text - == "What is Deep Learning?\n\nDeep learning is a subfield of machine learning that allows computers to learn from data" - ) - - # Sampling - response = await tgi_client.generate( - "What is Deep Learning?", - do_sample=True, - top_k=50, - top_p=0.9, - repetition_penalty=1.2, - max_new_tokens=100, - seed=42, - decoder_input_details=True, - ) - print(f"\nGot sampling output with seed=42: {response.generated_text}") - - assert ( - 'Deep learning is a subfield of machine learning that focuses on mimicking the structure and function of the human brain' - in response.generated_text - ) - - -@pytest.mark.asyncio -async def test_model_multiple_requests(tgi_client, generate_load): - num_requests = 4 - responses = await generate_load( - tgi_client, - "What is Deep Learning?", - max_new_tokens=17, - n=num_requests, - ) - - assert len(responses) == 4 - expected = "\n\nDeep learning is a subfield of machine learning that uses artificial neural networks to learn" - for r in responses: - assert r.details.generated_tokens == 17 - # Compute the similarity with the expectation using the levenshtein distance - # We should not have more than two substitutions or additions - assert Levenshtein.distance(r.generated_text, expected) < 3 diff --git a/text-generation-inference/integration-tests/test_gpt2.py b/text-generation-inference/integration-tests/test_gpt2.py deleted file mode 100644 index 402c9956..00000000 --- a/text-generation-inference/integration-tests/test_gpt2.py +++ /dev/null @@ -1,86 +0,0 @@ -import os -import Levenshtein -import pytest - -MODEL_ID = "openai-community/gpt2" -SEQUENCE_LENGTH = 1024 - -@pytest.fixture(scope="module") -def model_name_or_path(): - os.environ["HF_SEQUENCE_LENGTH"] = str(SEQUENCE_LENGTH) - yield MODEL_ID - - -@pytest.fixture(scope="module") -def tgi_service(launcher, model_name_or_path): - with launcher(model_name_or_path) as tgi_service: - yield tgi_service - - -@pytest.fixture(scope="module") -async def tgi_client(tgi_service): - await tgi_service.health(1000) - return tgi_service.client - -@pytest.mark.asyncio -async def test_model_single_request(tgi_client): - - # Bounded greedy decoding without input - response = await tgi_client.generate( - "What is Deep Learning?", - max_new_tokens=17, - decoder_input_details=True, - ) - assert response.details.generated_tokens == 17 - assert ( - response.generated_text == "\n\nDeep learning is a new field of research that has been around for a while" - ) - - # Bounded greedy decoding with input - response = await tgi_client.generate( - "What is Deep Learning?", - max_new_tokens=17, - return_full_text=True, - decoder_input_details=True, - ) - assert response.details.generated_tokens == 17 - assert ( - response.generated_text - == "What is Deep 
Learning?\n\nDeep learning is a new field of research that has been around for a while" - ) - - # Sampling - response = await tgi_client.generate( - "What is Deep Learning?", - do_sample=True, - top_k=50, - top_p=0.9, - repetition_penalty=1.2, - max_new_tokens=100, - seed=42, - decoder_input_details=True, - ) - - assert ( - 'A lot of researchers have tried to make a "deep learning" approach that focuses only on what is being shown' - in response.generated_text - ) - - -@pytest.mark.asyncio -async def test_model_multiple_requests(tgi_client, generate_load): - num_requests = 4 - responses = await generate_load( - tgi_client, - "What is Deep Learning?", - max_new_tokens=17, - n=num_requests, - ) - - assert len(responses) == 4 - expected = "\n\nDeep learning is a technique that allows you to learn something from a single source" - for r in responses: - assert r.details.generated_tokens == 17 - # Compute the similarity with the expectation using the levenshtein distance - # We should not have more than two substitutions or additions - assert Levenshtein.distance(r.generated_text, expected) < 3 diff --git a/text-generation-inference/integration-tests/test_model.py b/text-generation-inference/integration-tests/test_model.py new file mode 100644 index 00000000..b9ae54e0 --- /dev/null +++ b/text-generation-inference/integration-tests/test_model.py @@ -0,0 +1,117 @@ +import os +from typing import Any, Dict + +import Levenshtein +import pytest + + +MODEL_CONFIGS = { + "gpt2": { + "model_id": "openai-community/gpt2", + "sequence_length": 1024, + "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a while", + "expected_sampling_output": 'The fundamental concepts of deep learning are the same as those used to train and understand your first language, or your first set of skills', + "expected_batch_output": "\n\nDeep learning is a technique that allows you to learn something from a single source", + "env_config": { + "LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug", + "MAX_BATCH_SIZE": "4", + "HF_HUB_ENABLE_HF_TRANSFER": "0", + "JETSTREAM_PT_DISABLE": "1", + "SKIP_WARMUP": "1", + } + }, + "gemma": { + "model_id": "google/gemma-2b-it", + "sequence_length": 1024, + "expected_greedy_output": "\n\nDeep learning is a subfield of machine learning that allows computers to learn from data", + "expected_sampling_output": "Deep learning is a subfield of machine learning that focuses on mimicking the structure and function of the human brain", + "expected_batch_output": "\n\nDeep learning is a subfield of machine learning that allows computers to learn from data", + "env_config": { + "LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug", + "MAX_BATCH_SIZE": "4", + "HF_HUB_ENABLE_HF_TRANSFER": "0", + "SKIP_WARMUP": "1", + } + } +} + +@pytest.fixture(scope="module", params=MODEL_CONFIGS.keys()) +def model_config(request) -> Dict[str, Any]: + """Fixture that provides model configurations for testing.""" + return MODEL_CONFIGS[request.param] + +@pytest.fixture(scope="module") +def model_name_or_path(model_config): + os.environ["HF_SEQUENCE_LENGTH"] = str(model_config["sequence_length"]) + yield model_config["model_id"] + +@pytest.fixture(scope="module") +def tgi_service(launcher, model_name_or_path): + with launcher(model_name_or_path) as tgi_service: + yield tgi_service + +@pytest.fixture(scope="module") +async def tgi_client(tgi_service): + await tgi_service.health(1000) + return tgi_service.client + 
+
+@pytest.fixture(scope="module")
+def expected_outputs(model_config):
+    return {
+        "greedy": model_config["expected_greedy_output"],
+        "sampling": model_config["expected_sampling_output"],
+        "batch": model_config["expected_batch_output"]
+    }
+
+@pytest.mark.asyncio
+async def test_model_single_request(tgi_client, expected_outputs):
+    # Bounded greedy decoding without input
+    response = await tgi_client.generate(
+        "What is Deep Learning?",
+        max_new_tokens=17,
+        decoder_input_details=True,
+    )
+    assert response.details.generated_tokens == 17
+    assert response.generated_text == expected_outputs["greedy"]
+
+    # Bounded greedy decoding with input
+    response = await tgi_client.generate(
+        "What is Deep Learning?",
+        max_new_tokens=17,
+        return_full_text=True,
+        decoder_input_details=True,
+    )
+    assert response.details.generated_tokens == 17
+    assert response.generated_text == f"What is Deep Learning?{expected_outputs['greedy']}"
+
+    # Sampling
+    response = await tgi_client.generate(
+        "What is Deep Learning?",
+        do_sample=True,
+        top_k=50,
+        top_p=0.9,
+        repetition_penalty=1.2,
+        max_new_tokens=100,
+        seed=42,
+        decoder_input_details=True,
+    )
+
+    assert expected_outputs["sampling"] in response.generated_text
+
+@pytest.mark.asyncio
+async def test_model_multiple_requests(tgi_client, generate_load, expected_outputs):
+    num_requests = 4
+    responses = await generate_load(
+        tgi_client,
+        "What is Deep Learning?",
+        max_new_tokens=17,
+        n=num_requests,
+    )
+
+    assert len(responses) == 4
+    expected = expected_outputs["batch"]
+    for r in responses:
+        assert r.details.generated_tokens == 17
+        # Compute the similarity with the expectation using the levenshtein distance
+        # We should not have more than two substitutions or additions
+        assert Levenshtein.distance(r.generated_text, expected) < 3

From 3a5a7f03e60ab87758265b040a40c3e177c7ec1c Mon Sep 17 00:00:00 2001
From: Baptiste
Date: Wed, 4 Dec 2024 10:14:54 +0000
Subject: [PATCH 5/6] refactor(tests): move run arguments into model config

---
 .../integration-tests/conftest.py   | 15 +++++----------
 .../integration-tests/test_model.py | 12 ++++++++++++
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/text-generation-inference/integration-tests/conftest.py b/text-generation-inference/integration-tests/conftest.py
index 1ef7e4c7..8e162991 100644
--- a/text-generation-inference/integration-tests/conftest.py
+++ b/text-generation-inference/integration-tests/conftest.py
@@ -146,16 +146,6 @@ def docker_launcher(
         logger.info(f"Starting docker launcher for model {model_id}")
         port = 8080
 
-        args = [
-            "--max-input-length", "512",
-            "--max-total-tokens", "1024",
-            "--max-batch-prefill-tokens", "512",
-            "--max-batch-total-tokens", "1024"
-        ]
-
-        if trust_remote_code:
-            args.append("--trust-remote-code")
-
         client = docker.from_env()
 
         container_name = f"tgi-tests-{model_id.split('/')[-1]}"
@@ -171,6 +161,11 @@ def docker_launcher(
             logger.error(f"Error handling existing container: {str(e)}")
 
         model_name = next(name for name, cfg in MODEL_CONFIGS.items() if cfg["model_id"] == model_id)
+
+        args = MODEL_CONFIGS[model_name]["args"].copy()
+        if trust_remote_code:
+            args.append("--trust-remote-code")
+
         env = MODEL_CONFIGS[model_name]["env_config"].copy()
 
         # Add model_id to env
diff --git a/text-generation-inference/integration-tests/test_model.py b/text-generation-inference/integration-tests/test_model.py
index b9ae54e0..45f0fef5 100644
--- a/text-generation-inference/integration-tests/test_model.py
+++ b/text-generation-inference/integration-tests/test_model.py
@@ -12,6 +12,12 @@
         "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a while",
         "expected_sampling_output": 'The fundamental concepts of deep learning are the same as those used to train and understand your first language, or your first set of skills',
         "expected_batch_output": "\n\nDeep learning is a technique that allows you to learn something from a single source",
+        "args": [
+            "--max-input-length", "512",
+            "--max-total-tokens", "1024",
+            "--max-batch-prefill-tokens", "512",
+            "--max-batch-total-tokens", "1024"
+        ],
         "env_config": {
             "LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug",
             "MAX_BATCH_SIZE": "4",
@@ -26,6 +32,12 @@
         "expected_greedy_output": "\n\nDeep learning is a subfield of machine learning that allows computers to learn from data",
         "expected_sampling_output": "Deep learning is a subfield of machine learning that focuses on mimicking the structure and function of the human brain",
         "expected_batch_output": "\n\nDeep learning is a subfield of machine learning that allows computers to learn from data",
+        "args": [
+            "--max-input-length", "512",
+            "--max-total-tokens", "1024",
+            "--max-batch-prefill-tokens", "512",
+            "--max-batch-total-tokens", "1024"
+        ],
         "env_config": {
             "LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug",
             "MAX_BATCH_SIZE": "4",

From 606a6131c23b4a4feb6840790e0466d10265615b Mon Sep 17 00:00:00 2001
From: Baptiste
Date: Wed, 4 Dec 2024 10:25:21 +0000
Subject: [PATCH 6/6] refactor(tests): move run arguments into model config

---
 text-generation-inference/integration-tests/conftest.py   | 7 ++++++-
 text-generation-inference/integration-tests/test_model.py | 4 ----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/text-generation-inference/integration-tests/conftest.py b/text-generation-inference/integration-tests/conftest.py
index 8e162991..3115e3dd 100644
--- a/text-generation-inference/integration-tests/conftest.py
+++ b/text-generation-inference/integration-tests/conftest.py
@@ -166,7 +166,12 @@ def docker_launcher(
         if trust_remote_code:
             args.append("--trust-remote-code")
 
-        env = MODEL_CONFIGS[model_name]["env_config"].copy()
+        env = {
+            "LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug",
+            "HF_HUB_ENABLE_HF_TRANSFER": "0"
+        }
+        env.update(MODEL_CONFIGS[model_name]["env_config"].copy())
+
         # Add model_id to env
         env["MODEL_ID"] = model_id
 
diff --git a/text-generation-inference/integration-tests/test_model.py b/text-generation-inference/integration-tests/test_model.py
index 45f0fef5..a9b44c75 100644
--- a/text-generation-inference/integration-tests/test_model.py
+++ b/text-generation-inference/integration-tests/test_model.py
@@ -19,9 +19,7 @@
             "--max-batch-total-tokens", "1024"
         ],
         "env_config": {
-            "LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug",
             "MAX_BATCH_SIZE": "4",
-            "HF_HUB_ENABLE_HF_TRANSFER": "0",
             "JETSTREAM_PT_DISABLE": "1",
             "SKIP_WARMUP": "1",
         }
@@ -39,9 +37,7 @@
             "--max-batch-total-tokens", "1024"
         ],
         "env_config": {
-            "LOG_LEVEL": "info,text_generation_router,text_generation_launcher=debug",
             "MAX_BATCH_SIZE": "4",
-            "HF_HUB_ENABLE_HF_TRANSFER": "0",
             "SKIP_WARMUP": "1",
         }
     }
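Note: the commands below are one possible way to run this integration-test suite against the image built by the tpu-tgi target, based on the environment variables conftest.py reads (DOCKER_IMAGE, HF_TOKEN). The exact pytest invocation and the host-side dependencies (pytest-asyncio, the Levenshtein package) are assumptions; the patches above do not pin them down.

# Assumed workflow on a TPU host with docker and pytest available:
make tpu-tgi                                         # builds huggingface/optimum-tpu:$(VERSION)-tgi and tags it :latest
export DOCKER_IMAGE=huggingface/optimum-tpu:latest   # conftest.py falls back to this value if unset
export HF_TOKEN=...                                  # forwarded to the container; needed for gated models such as google/gemma-2b-it
pytest -sv text-generation-inference/integration-tests/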