diff --git a/benchmark/benchmark_serving.py b/benchmark/benchmark_serving.py
index d9250a0e..bad79ee6 100644
--- a/benchmark/benchmark_serving.py
+++ b/benchmark/benchmark_serving.py
@@ -102,7 +102,7 @@ async def query_model_vllm(prompt, verbose, ip_ports):
             if verbose:
                 print('Done')
 
-            output = await resp.json()
+            output = await resp.json(content_type='text/plain')
             # necessary for latency calc
             output['response_len'] = expected_response_len
             if verbose and 'generated_text' in output:
diff --git a/tests/e2e_test/test_e2e.py b/tests/e2e_test/test_e2e.py
index cc6b404d..3e928611 100644
--- a/tests/e2e_test/test_e2e.py
+++ b/tests/e2e_test/test_e2e.py
@@ -36,7 +36,7 @@ async def get_llumnix_response(prompt, sampling_params, ip_ports):
 
     async with aiohttp.ClientSession(timeout=timeout) as session:
         async with session.post(f'http://{ip_ports}/generate', json=request) as resp:
-            output = await resp.json()
+            output = await resp.json(content_type='text/plain')
             return output
 
 prompts = [
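
Context for the change (not part of the patch): `aiohttp.ClientResponse.json()` raises `ContentTypeError` unless the response's `Content-Type` header is `application/json`, so relaxing the check with `content_type='text/plain'` (or disabling it with `content_type=None`) is needed when the server labels a JSON body with a non-JSON content type, which this patch suggests the `/generate` endpoint does. A minimal sketch of the failure mode and the workaround, using a hypothetical throwaway local server in place of the real endpoint:

```python
import asyncio

import aiohttp
from aiohttp import web


# Hypothetical stand-in for the /generate endpoint: it returns a JSON body
# but labels it text/plain, the situation the patch works around.
async def generate(request):
    return web.Response(text='{"generated_text": "hello"}', content_type='text/plain')


async def main():
    app = web.Application()
    app.router.add_post('/generate', generate)
    runner = web.AppRunner(app)
    await runner.setup()
    await web.TCPSite(runner, '127.0.0.1', 8081).start()

    async with aiohttp.ClientSession() as session:
        async with session.post('http://127.0.0.1:8081/generate', json={}) as resp:
            try:
                # Default check expects Content-Type: application/json.
                await resp.json()
            except aiohttp.ContentTypeError as exc:
                print('strict check fails:', exc.message)

        async with session.post('http://127.0.0.1:8081/generate', json={}) as resp:
            # Relaxing the expected content type lets the body be decoded.
            output = await resp.json(content_type='text/plain')
            print('relaxed check succeeds:', output)

    await runner.cleanup()


asyncio.run(main())
```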