
Commit

update
ZeldaHuang committed Dec 19, 2024
1 parent fc18695 commit abb36ab
Showing 4 changed files with 10 additions and 5 deletions.
9 changes: 7 additions & 2 deletions Makefile
@@ -22,7 +22,7 @@ install:
.PHONY: lint
lint: check_pylint_installed check_pytest_installed
@pylint --rcfile=.pylintrc -s n --jobs=128 ./llumnix

@pylint --rcfile=.pylintrc \
--disable=protected-access,super-init-not-called,unused-argument,redefined-outer-name,invalid-name \
-s n --jobs=128 ./tests
@@ -61,22 +61,27 @@ test: check_pytest_installed

.PHONY: unit_test
unit_test: check_pytest_installed
@ray stop
@pytest -v --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings

.PHONY: offline_test
offline_test:
@ray stop
@python examples/offline_inference.py

.PHONY: e2e_test
e2e_test:
@ray stop
@pytest -v -x -s --tb=long ./tests/e2e_test/test_e2e.py

.PHONY: bench_test
bench_test:
@ray stop
@pytest -v -x -s --tb=long ./tests/e2e_test/test_bench.py

.PHONY: migration_test
migration_test:
@ray stop
@pytest -v -x -s --tb=long ./tests/e2e_test/test_migration.py

####################################### test end ########################################
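Each test target now begins with "ray stop", so a Ray cluster left behind by an earlier (possibly crashed) run cannot leak state into the next one. A minimal sketch of the same clean-slate idea as a session-scoped pytest fixture, assuming only that the ray CLI is on PATH (the Makefile targets above do this directly instead):

import subprocess

import pytest

@pytest.fixture(scope="session", autouse=True)
def clean_ray_cluster():
    # Stop any Ray processes left over from a previous run so the session
    # starts against a fresh cluster; check=False tolerates the case where
    # no cluster is currently running.
    subprocess.run(["ray", "stop"], check=False)
    yield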
2 changes: 1 addition & 1 deletion benchmark/benchmark_serving.py
@@ -102,7 +102,7 @@ async def query_model_vllm(prompt, verbose, ip_ports):
if verbose:
print('Done')

- output = await resp.json(content_type='text/plain')
+ output = await resp.json()
# necessary for latency calc
output['response_len'] = expected_response_len
if verbose and 'generated_text' in output:
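Context for this change: aiohttp's resp.json() validates the response Content-Type header and raises aiohttp.ContentTypeError unless the server sends application/json, so the removed content_type='text/plain' argument was a workaround for a server that mislabeled its JSON body. Dropping it suggests the server now sends the proper header, letting the default check suffice; the same fix appears in tests/e2e_test/test_e2e.py below. A small illustrative sketch of the behavior, with a hypothetical URL and payload:

import aiohttp

async def fetch_generation(url, payload):
    async with aiohttp.ClientSession() as session:
        async with session.post(url, json=payload) as resp:
            try:
                # Default: requires Content-Type: application/json.
                return await resp.json()
            except aiohttp.ContentTypeError:
                # Fallback for servers that mislabel JSON bodies, e.g. as
                # text/plain; content_type=None disables the check entirely.
                return await resp.json(content_type=None)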
2 changes: 1 addition & 1 deletion llumnix/entrypoints/bladellm/client.py
@@ -62,7 +62,7 @@ async def background_process_outputs(self):
del self.request_streams[request_id]

async def _add_request(self, request: ServerRequest) -> LLMResponse:
- if request.sampling_params.n > 1:
+ if request.sampling_params.n > 1 or request.sampling_params.use_beam_search:
return error_resp(request.id, err_code=400, err_msg="Unsupported feature: multiple sequence decoding in Llumnix.")

llumnix_id = random.randint(0, 2147483647)  # (1 << 31) - 1
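The tightened guard rejects beam search along with n > 1: both ask the engine for multiple output sequences per prompt, which this Llumnix entrypoint does not support and answers with HTTP 400. A standalone sketch of the predicate, using a hypothetical stand-in type (the real ServerRequest and its sampling params come from BladeLLM):

from dataclasses import dataclass

@dataclass
class SamplingParams:
    n: int = 1
    use_beam_search: bool = False

def is_multi_sequence(params):
    # Both n > 1 and beam search request multiple output sequences per
    # prompt, which the entrypoint rejects with err_code=400.
    return params.n > 1 or params.use_beam_search

assert is_multi_sequence(SamplingParams(n=2))
assert is_multi_sequence(SamplingParams(use_beam_search=True))
assert not is_multi_sequence(SamplingParams())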
2 changes: 1 addition & 1 deletion tests/e2e_test/test_e2e.py
@@ -36,7 +36,7 @@ async def get_llumnix_response(prompt, sampling_params, ip_ports):

async with aiohttp.ClientSession(timeout=timeout) as session:
async with session.post(f'http://{ip_ports}/generate', json=request) as resp:
- output = await resp.json('text/plain')
+ output = await resp.json()
return output

prompts = [
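A hedged usage sketch of the helper above; the prompt, sampling parameters, and address are illustrative, not values taken from the test suite:

import asyncio

async def main():
    output = await get_llumnix_response(
        "What is the capital of France?",
        {"temperature": 0.0},   # illustrative sampling_params
        "127.0.0.1:8000",       # illustrative ip_ports
    )
    # The /generate payload carries the generated text under
    # 'generated_text' (see the benchmark client above).
    print(output.get("generated_text"))

# asyncio.run(main())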
