diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py index ff1db7b4b..ed922bfd7 100644 --- a/benchmark/profile_restful_api.py +++ b/benchmark/profile_restful_api.py @@ -14,7 +14,7 @@ def get_streaming_response(prompt: str, api_url: str, - instance_id: int, + session_id: int, request_output_len: int, stream: bool = True, sequence_start: bool = True, @@ -24,7 +24,7 @@ def get_streaming_response(prompt: str, pload = { 'prompt': prompt, 'stream': stream, - 'instance_id': instance_id, + 'session_id': session_id, 'request_output_len': request_output_len, 'sequence_start': sequence_start, 'sequence_end': sequence_end, @@ -36,7 +36,7 @@ def get_streaming_response(prompt: str, stream=stream) for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, - delimiter=b'\0'): + delimiter=b'\n'): if chunk: data = json.loads(chunk.decode('utf-8')) output = data['text'] diff --git a/docs/en/restful_api.md b/docs/en/restful_api.md index c5a4a0de0..cb70e2637 100644 --- a/docs/en/restful_api.md +++ b/docs/en/restful_api.md @@ -22,7 +22,7 @@ from typing import Iterable, List def get_streaming_response(prompt: str, api_url: str, - instance_id: int, + session_id: int, request_output_len: int, stream: bool = True, sequence_start: bool = True, @@ -32,7 +32,7 @@ def get_streaming_response(prompt: str, pload = { 'prompt': prompt, 'stream': stream, - 'instance_id': instance_id, + 'session_id': session_id, 'request_output_len': request_output_len, 'sequence_start': sequence_start, 'sequence_end': sequence_end, @@ -41,7 +41,7 @@ def get_streaming_response(prompt: str, response = requests.post( api_url, headers=headers, json=pload, stream=stream) for chunk in response.iter_lines( - chunk_size=8192, decode_unicode=False, delimiter=b'\0'): + chunk_size=8192, decode_unicode=False, delimiter=b'\n'): if chunk: data = json.loads(chunk.decode('utf-8')) output = data['text'] @@ -91,7 +91,7 @@ curl http://{server_ip}:{server_port}/generate \ -H "Content-Type: application/json" \ -d '{ "prompt": "Hello! How are you?", - "instance_id": 1, + "session_id": 1, "sequence_start": true, "sequence_end": true }' @@ -146,11 +146,10 @@ python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True 2. When OOM appears on the server side, please reduce the number of `instance_num` when launching the service. -3. When the request with the same `instance_id` to `generate` got a empty return value and a negative `tokens`, please consider setting `sequence_start=false` for the second question and the same for the afterwards. +3. When a request with the same `session_id` to `generate` gets an empty return value and a negative `tokens`, please consider setting `sequence_start=false` for the second question and all subsequent ones. 4. Requests were previously being handled sequentially rather than concurrently. To resolve this issue, - - kindly provide unique instance_id values when calling the `generate` API or else your requests may be associated with client IP addresses - - additionally, setting `stream=true` enables processing multiple requests simultaneously + - kindly provide unique session_id values when calling the `generate` API; otherwise, your requests may be associated with client IP addresses 5. Both the `generate` api and `v1/chat/completions` support engaging in multiple rounds of conversation, where the input `prompt` or `messages` consists of either single strings or entire chat histories. These inputs are interpreted using multi-turn dialogue modes.
However, if you want to turn this mode off and manage the chat history on the client side, please set the parameter `sequence_end: true` when utilizing the `generate` function, or specify `renew_session: true` when making use of `v1/chat/completions` diff --git a/docs/zh_cn/restful_api.md b/docs/zh_cn/restful_api.md index ab35ead12..2b56fa0f2 100644 --- a/docs/zh_cn/restful_api.md +++ b/docs/zh_cn/restful_api.md @@ -24,7 +24,7 @@ from typing import Iterable, List def get_streaming_response(prompt: str, api_url: str, - instance_id: int, + session_id: int, request_output_len: int, stream: bool = True, sequence_start: bool = True, @@ -34,7 +34,7 @@ def get_streaming_response(prompt: str, pload = { 'prompt': prompt, 'stream': stream, - 'instance_id': instance_id, + 'session_id': session_id, 'request_output_len': request_output_len, 'sequence_start': sequence_start, 'sequence_end': sequence_end, @@ -43,7 +43,7 @@ def get_streaming_response(prompt: str, response = requests.post( api_url, headers=headers, json=pload, stream=stream) for chunk in response.iter_lines( - chunk_size=8192, decode_unicode=False, delimiter=b'\0'): + chunk_size=8192, decode_unicode=False, delimiter=b'\n'): if chunk: data = json.loads(chunk.decode('utf-8')) output = data['text'] @@ -93,7 +93,7 @@ curl http://{server_ip}:{server_port}/generate \ -H "Content-Type: application/json" \ -d '{ "prompt": "Hello! How are you?", - "instance_id": 1, + "session_id": 1, "sequence_start": true, "sequence_end": true }' @@ -148,12 +148,11 @@ python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True 2. 当服务端显存 OOM 时,可以适当减小启动服务时的 `instance_num` 个数 -3. 当同一个 `instance_id` 的请求给 `generate` 函数后,出现返回空字符串和负值的 `tokens`,应该是第二次问话没有设置 `sequence_start=false` +3. 当同一个 `session_id` 的请求给 `generate` 函数后,出现返回空字符串和负值的 `tokens`,应该是第二次问话没有设置 `sequence_start=false` 4. 如果感觉请求不是并发地被处理,而是一个一个地处理,请设置好以下参数: - - 不同的 instance_id 传入 `generate` api。否则,我们将自动绑定会话 id 为请求端的 ip 地址编号。 - - 设置 `stream=true` 使模型在前向传播时可以允许其他请求进入被处理 + - 不同的 session_id 传入 `generate` api。否则,我们将自动绑定会话 id 为请求端的 ip 地址编号。 5.
`generate` api 和 `v1/chat/completions` 均支持多轮对话。`messages` 或者 `prompt` 参数既可以是一个简单字符串表示用户的单词提问,也可以是一段对话历史。 两个 api 都是默认开启多轮对话的,如果你想关闭这个功能,然后在客户端管理会话记录,请设置 `sequence_end: true` 传入 `generate`,或者设置 diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 40f87ac0e..e2c4b3684 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -47,14 +47,31 @@ def __init__(self, model_path, instance_num=32, tp=1) -> None: self.starts = [None] * instance_num self.steps = {} + def stop_session(self, session_id: int): + instance_id = session_id % self.instance_num + input_ids = self.tokenizer.encode('') + for outputs in self.generators[instance_id].stream_infer( + session_id, + input_ids, + request_output_len=0, + sequence_start=False, + sequence_end=False, + stop=True): + pass + self.available[instance_id] = True + @contextmanager - def safe_run(self, instance_id: int, stop: bool = False): + def safe_run(self, instance_id: int, session_id: Optional[int] = None): self.available[instance_id] = False - yield + try: + yield + except (Exception, asyncio.CancelledError) as e: # noqa + self.stop_session(session_id) self.available[instance_id] = True - async def get_embeddings(self, prompt): - prompt = self.model.get_prompt(prompt) + async def get_embeddings(self, prompt, do_prerpocess=False): + if do_prerpocess: + prompt = self.model.get_prompt(prompt) input_ids = self.tokenizer.encode(prompt) return input_ids @@ -68,7 +85,7 @@ async def get_generator(self, instance_id: int, stop: bool = False): async def generate( self, messages, - instance_id, + session_id, stream_response=True, sequence_start=True, sequence_end=False, @@ -85,7 +102,7 @@ async def generate( Args: messages (str | List): chat history or prompt - instance_id (int): actually request host ip + session_id (int): the session id stream_response (bool): whether return responses streamingly request_output_len (int): output token nums sequence_start (bool): indicator for starting a sequence @@ -102,8 +119,7 @@ async def generate( 1.0 means no penalty ignore_eos (bool): indicator for ignoring eos """ - session_id = instance_id - instance_id %= self.instance_num + instance_id = session_id % self.instance_num if str(session_id) not in self.steps: self.steps[str(session_id)] = 0 if step != 0: @@ -119,7 +135,7 @@ async def generate( finish_reason) else: generator = await self.get_generator(instance_id, stop) - with self.safe_run(instance_id): + with self.safe_run(instance_id, session_id): response_size = 0 async for outputs in generator.async_stream_infer( session_id=session_id, @@ -188,14 +204,14 @@ async def generate_openai( instance_id %= self.instance_num sequence_start = False generator = await self.get_generator(instance_id) - self.available[instance_id] = False if renew_session: # renew a session empty_input_ids = self.tokenizer.encode('') for outputs in generator.stream_infer(session_id=session_id, input_ids=[empty_input_ids], request_output_len=0, sequence_start=False, - sequence_end=True): + sequence_end=True, + stop=True): pass self.steps[str(session_id)] = 0 if str(session_id) not in self.steps: @@ -212,31 +228,31 @@ async def generate_openai( yield GenOut('', self.steps[str(session_id)], len(input_ids), 0, finish_reason) else: - response_size = 0 - async for outputs in generator.async_stream_infer( - session_id=session_id, - input_ids=[input_ids], - stream_output=stream_response, - request_output_len=request_output_len, - sequence_start=(sequence_start), - sequence_end=False, -
step=self.steps[str(session_id)], - stop=stop, - top_k=top_k, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - ignore_eos=ignore_eos, - random_seed=seed if sequence_start else None): - res, tokens = outputs[0] - # decode res - response = self.tokenizer.decode(res.tolist(), - offset=response_size) - # response, history token len, input token len, gen token len - yield GenOut(response, self.steps[str(session_id)], - len(input_ids), tokens, finish_reason) - response_size = tokens - - # update step - self.steps[str(session_id)] += len(input_ids) + tokens - self.available[instance_id] = True + with self.safe_run(instance_id, session_id): + response_size = 0 + async for outputs in generator.async_stream_infer( + session_id=session_id, + input_ids=[input_ids], + stream_output=stream_response, + request_output_len=request_output_len, + sequence_start=(sequence_start), + sequence_end=False, + step=self.steps[str(session_id)], + stop=stop, + top_k=top_k, + top_p=top_p, + temperature=temperature, + repetition_penalty=repetition_penalty, + ignore_eos=ignore_eos, + random_seed=seed if sequence_start else None): + res, tokens = outputs[0] + # decode res + response = self.tokenizer.decode(res.tolist(), + offset=response_size) + # response, history len, input len, generation len + yield GenOut(response, self.steps[str(session_id)], + len(input_ids), tokens, finish_reason) + response_size = tokens + + # update step + self.steps[str(session_id)] += len(input_ids) + tokens diff --git a/lmdeploy/serve/gradio/app.py b/lmdeploy/serve/gradio/app.py index 954a5bcd3..71db7a274 100644 --- a/lmdeploy/serve/gradio/app.py +++ b/lmdeploy/serve/gradio/app.py @@ -12,6 +12,7 @@ from lmdeploy.serve.gradio.css import CSS from lmdeploy.serve.openai.api_client import (get_model_list, get_streaming_response) +from lmdeploy.serve.openai.api_server import ip2id from lmdeploy.serve.turbomind.chatbot import Chatbot THEME = gr.themes.Soft( @@ -37,7 +38,7 @@ def chat_stream(state_chatbot: Sequence, llama_chatbot: Chatbot, instruction = state_chatbot[-1][0] session_id = threading.current_thread().ident if request is not None: - session_id = int(request.kwargs['client']['host'].replace('.', '')) + session_id = ip2id(request.kwargs['client']['host']) bot_response = llama_chatbot.stream_infer( session_id, instruction, f'{session_id}-{len(state_chatbot)}') @@ -166,7 +167,7 @@ def chat_stream_restful( """ session_id = threading.current_thread().ident if request is not None: - session_id = int(request.kwargs['client']['host'].replace('.', '')) + session_id = ip2id(request.kwargs['client']['host']) bot_summarized_response = '' state_chatbot = state_chatbot + [(instruction, None)] @@ -176,7 +177,7 @@ def chat_stream_restful( for response, tokens, finish_reason in get_streaming_response( instruction, f'{InterFace.restful_api_url}/generate', - instance_id=session_id, + session_id=session_id, request_output_len=512, sequence_start=(len(state_chatbot) == 1), sequence_end=False): @@ -212,12 +213,12 @@ def reset_restful_func(instruction_txtbox: gr.Textbox, state_chatbot: gr.State, session_id = threading.current_thread().ident if request is not None: - session_id = int(request.kwargs['client']['host'].replace('.', '')) + session_id = ip2id(request.kwargs['client']['host']) # end the session for response, tokens, finish_reason in get_streaming_response( '', f'{InterFace.restful_api_url}/generate', - instance_id=session_id, + session_id=session_id, request_output_len=0, sequence_start=False, sequence_end=True): @@ 
-241,11 +242,11 @@ def cancel_restful_func(state_chatbot: gr.State, cancel_btn: gr.Button, """ session_id = threading.current_thread().ident if request is not None: - session_id = int(request.kwargs['client']['host'].replace('.', '')) + session_id = ip2id(request.kwargs['client']['host']) # end the session for out in get_streaming_response('', f'{InterFace.restful_api_url}/generate', - instance_id=session_id, + session_id=session_id, request_output_len=0, sequence_start=False, sequence_end=False, @@ -259,7 +260,7 @@ def cancel_restful_func(state_chatbot: gr.State, cancel_btn: gr.Button, messages.append(dict(role='assistant', content=qa[1])) for out in get_streaming_response(messages, f'{InterFace.restful_api_url}/generate', - instance_id=session_id, + session_id=session_id, request_output_len=0, sequence_start=True, sequence_end=False): @@ -346,7 +347,7 @@ async def chat_stream_local( """ session_id = threading.current_thread().ident if request is not None: - session_id = int(request.kwargs['client']['host'].replace('.', '')) + session_id = ip2id(request.kwargs['client']['host']) bot_summarized_response = '' state_chatbot = state_chatbot + [(instruction, None)] @@ -391,7 +392,7 @@ async def reset_local_func(instruction_txtbox: gr.Textbox, session_id = threading.current_thread().ident if request is not None: - session_id = int(request.kwargs['client']['host'].replace('.', '')) + session_id = ip2id(request.kwargs['client']['host']) # end the session async for out in InterFace.async_engine.generate('', session_id, @@ -419,7 +420,7 @@ async def cancel_local_func(state_chatbot: gr.State, cancel_btn: gr.Button, """ session_id = threading.current_thread().ident if request is not None: - session_id = int(request.kwargs['client']['host'].replace('.', '')) + session_id = ip2id(request.kwargs['client']['host']) # end the session async for out in InterFace.async_engine.generate('', session_id, diff --git a/lmdeploy/serve/openai/api_client.py b/lmdeploy/serve/openai/api_client.py index 449b8a294..a8718331b 100644 --- a/lmdeploy/serve/openai/api_client.py +++ b/lmdeploy/serve/openai/api_client.py @@ -17,7 +17,7 @@ def get_model_list(api_url: str): def get_streaming_response(prompt: str, api_url: str, - instance_id: int, + session_id: int, request_output_len: int = 512, stream: bool = True, sequence_start: bool = True, @@ -28,7 +28,7 @@ def get_streaming_response(prompt: str, pload = { 'prompt': prompt, 'stream': stream, - 'instance_id': instance_id, + 'session_id': session_id, 'request_output_len': request_output_len, 'sequence_start': sequence_start, 'sequence_end': sequence_end, @@ -41,7 +41,7 @@ def get_streaming_response(prompt: str, stream=stream) for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, - delimiter=b'\0'): + delimiter=b'\n'): if chunk: data = json.loads(chunk.decode('utf-8')) output = data.pop('text', '') @@ -62,12 +62,20 @@ def main(restful_api_url: str, session_id: int = 0): while True: prompt = input_prompt() if prompt == 'exit': + for output, tokens, finish_reason in get_streaming_response( + '', + f'{restful_api_url}/generate', + session_id=session_id, + request_output_len=0, + sequence_start=(nth_round == 1), + sequence_end=True): + pass exit(0) else: for output, tokens, finish_reason in get_streaming_response( prompt, f'{restful_api_url}/generate', - instance_id=session_id, + session_id=session_id, request_output_len=512, sequence_start=(nth_round == 1), sequence_end=False): diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py 
index 647c36609..94271c4b9 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -import json import os import time from http import HTTPStatus @@ -7,7 +6,7 @@ import fire import uvicorn -from fastapi import BackgroundTasks, FastAPI, Request +from fastapi import FastAPI, Request from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, StreamingResponse @@ -16,8 +15,8 @@ ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, ChatMessage, DeltaMessage, EmbeddingsRequest, - EmbeddingsResponse, ErrorResponse, GenerateRequest, ModelCard, ModelList, - ModelPermission, UsageInfo) + EmbeddingsResponse, ErrorResponse, GenerateRequest, GenerateResponse, + ModelCard, ModelList, ModelPermission, UsageInfo) os.environ['TM_LOG_LEVEL'] = 'ERROR' @@ -73,6 +72,16 @@ async def check_request(request) -> Optional[JSONResponse]: return ret +def ip2id(host_ip: str): + """Convert host ip address to session id.""" + if '.' in host_ip: # IPv4 + return int(host_ip.replace('.', '')[-8:]) + if ':' in host_ip: # IPv6 + return int(host_ip.replace(':', '')[-8:], 16) + print('Warning, could not get session id from ip, set it 0') + return 0 + + @app.post('/v1/chat/completions') async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Request = None): @@ -106,19 +115,18 @@ async def chat_completions_v1(request: ChatCompletionRequest, - presence_penalty (replaced with repetition_penalty) - frequency_penalty (replaced with repetition_penalty) """ - instance_id = int(raw_request.client.host.replace('.', '')) - + session_id = ip2id(raw_request.client.host) error_check_ret = await check_request(request) if error_check_ret is not None: return error_check_ret model_name = request.model - request_id = str(instance_id) + request_id = str(session_id) created_time = int(time.time()) result_generator = VariableInterface.async_engine.generate_openai( request.messages, - instance_id, + session_id, True, # always use stream to enable batching request.renew_session, request_output_len=request.max_tokens if request.max_tokens else 512, @@ -128,15 +136,6 @@ async def chat_completions_v1(request: ChatCompletionRequest, repetition_penalty=request.repetition_penalty, ignore_eos=request.ignore_eos) - async def abort_request() -> None: - async for _ in VariableInterface.async_engine.generate_openai( - request.messages, - instance_id, - True, - request.renew_session, - stop=True): - pass - def create_stream_response_json( index: int, text: str, @@ -181,12 +180,8 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: # Streaming response if request.stream: - background_tasks = BackgroundTasks() - # Abort the request if the client disconnects. - background_tasks.add_task(abort_request) return StreamingResponse(completion_stream_generator(), - media_type='text/event-stream', - background=background_tasks) + media_type='text/event-stream') # Non-streaming response final_res = None @@ -194,7 +189,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: async for res in result_generator: if await raw_request.is_disconnected(): # Abort the request if the client disconnects. 
- await abort_request() + VariableInterface.async_engine.stop_session(session_id) return create_error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected') final_res = res @@ -257,7 +252,7 @@ async def generate(request: GenerateRequest, raw_request: Request = None): The request should be a JSON object with the following fields: - prompt: the prompt to use for the generation. - - instance_id: determine which instance will be called. If not specified + - session_id: determine which instance will be called. If not specified with a value other than -1, using host ip directly. - sequence_start (bool): indicator for starting a sequence. - sequence_end (bool): indicator for ending a sequence @@ -275,13 +270,13 @@ async def generate(request: GenerateRequest, raw_request: Request = None): 1.0 means no penalty - ignore_eos (bool): indicator for ignoring eos """ - if request.instance_id == -1: - instance_id = int(raw_request.client.host.replace('.', '')) - request.instance_id = instance_id + if request.session_id == -1: + session_id = ip2id(raw_request.client.host) + request.session_id = session_id generation = VariableInterface.async_engine.generate( request.prompt, - request.instance_id, + request.session_id, stream_response=True, # always use stream to enable batching sequence_start=request.sequence_start, sequence_end=request.sequence_end, @@ -296,21 +291,26 @@ async def generate(request: GenerateRequest, raw_request: Request = None): # Streaming case async def stream_results() -> AsyncGenerator[bytes, None]: async for out in generation: - ret = { - 'text': out.response, - 'tokens': out.generate_token_len, - 'finish_reason': out.finish_reason - } - yield (json.dumps(ret) + '\0').encode('utf-8') + chunk = GenerateResponse(text=out.response, + tokens=out.generate_token_len, + finish_reason=out.finish_reason) + data = chunk.model_dump_json() + yield f'{data}\n' if request.stream: - return StreamingResponse(stream_results()) + return StreamingResponse(stream_results(), + media_type='text/event-stream') else: ret = {} text = '' tokens = 0 finish_reason = None async for out in generation: + if await raw_request.is_disconnected(): + # Abort the request if the client disconnects. + VariableInterface.async_engine.stop_session(request.session_id) + return create_error_response(HTTPStatus.BAD_REQUEST, + 'Client disconnected') text += out.response tokens = out.generate_token_len finish_reason = out.finish_reason diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py index 8f2919a1a..b4eeadff7 100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -190,7 +190,7 @@ class EmbeddingsResponse(BaseModel): class GenerateRequest(BaseModel): """Generate request.""" prompt: Union[str, List[Dict[str, str]]] - instance_id: int = -1 + session_id: int = -1 sequence_start: bool = True sequence_end: bool = False stream: bool = False @@ -201,3 +201,10 @@ class GenerateRequest(BaseModel): temperature: float = 0.8 repetition_penalty: float = 1.0 ignore_eos: bool = False + + +class GenerateResponse(BaseModel): + """Generate response.""" + text: str + tokens: int + finish_reason: Optional[Literal['stop', 'length']] = None
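For reference, here is a minimal client-side sketch of the renamed `/generate` interface. It only restates what the diff already shows (the `GenerateRequest` fields and the newline-delimited `GenerateResponse` stream); the server URL is a placeholder and assumes a running `lmdeploy.serve.openai.api_server` instance. Reusing the same `session_id` with `sequence_start=True` only on the first round keeps the multi-turn history on the server, and `sequence_end=True` closes the session.

```python
import json

import requests

SERVER_URL = 'http://0.0.0.0:23333'  # placeholder, point it at your api_server


def chat_once(prompt: str, session_id: int, first_round: bool,
              last_round: bool):
    """Send one turn of a conversation and print the streamed reply."""
    pload = {
        'prompt': prompt,
        'stream': True,
        'session_id': session_id,       # keep the same id across rounds
        'request_output_len': 512,
        'sequence_start': first_round,  # True only on the first round
        'sequence_end': last_round,     # True when closing the session
    }
    response = requests.post(f'{SERVER_URL}/generate', json=pload, stream=True)
    # each chunk is a newline-delimited JSON object, see GenerateResponse
    for chunk in response.iter_lines(chunk_size=8192,
                                     decode_unicode=False,
                                     delimiter=b'\n'):
        if chunk:
            data = json.loads(chunk.decode('utf-8'))
            print(data['text'], end='', flush=True)
    print()


if __name__ == '__main__':
    chat_once('Hello! How are you?', session_id=1, first_round=True,
              last_round=False)
    chat_once('Tell me more.', session_id=1, first_round=False,
              last_round=True)
```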
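The session-to-instance mapping can also be illustrated in isolation. `ip2id` below is copied from the new `api_server.py` code (it derives a fallback id from the client address when `session_id == -1`), and `AsyncEngine` then picks an engine instance with `session_id % instance_num`; the `instance_num` value used here is just the constructor default from the diff.

```python
def ip2id(host_ip: str) -> int:
    """Convert host ip address to session id (as added in api_server.py)."""
    if '.' in host_ip:  # IPv4: keep the last 8 digits
        return int(host_ip.replace('.', '')[-8:])
    if ':' in host_ip:  # IPv6: parse the last 8 hex digits
        return int(host_ip.replace(':', '')[-8:], 16)
    print('Warning, could not get session id from ip, set it 0')
    return 0


if __name__ == '__main__':
    instance_num = 32  # default of AsyncEngine.__init__ in the diff
    for host in ('192.168.1.17', 'fe80::1ff:fe23:4567:890a'):
        session_id = ip2id(host)
        instance_id = session_id % instance_num
        print(f'{host} -> session {session_id} -> instance {instance_id}')
```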
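Similarly, a hedged sketch of the `renew_session` flag on `v1/chat/completions` that the docs refer to: sending `renew_session: true` makes the server clear its cached history for this session before answering, so the chat history can be managed on the client side instead. The URL and model id below are placeholders; the served model id can be read from the `v1/models` endpoint.

```python
import requests

SERVER_URL = 'http://0.0.0.0:23333'  # placeholder, point it at your api_server

payload = {
    'model': 'llama',  # placeholder, query /v1/models for the served model id
    'messages': [{'role': 'user', 'content': 'Hello! How are you?'}],
    'renew_session': True,  # drop any server-side history for this session
    'stream': False,
}
response = requests.post(f'{SERVER_URL}/v1/chat/completions', json=payload)
print(response.json()['choices'][0]['message']['content'])
```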