fastapi-test.py
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
import asyncio

app = FastAPI(
    title="Ask Me Anything API",
    description="""
End-to-end LlamaIndex streaming RAG with lightweight models, trading off accuracy against latency:
1. EMBEDDINGS MODEL: INT4-quantized BAAI/bge-base-en-v1.5
2. LLM MODEL: phi3-mini-128k-instruct
3. RERANKING MODEL: BAAI/bge-reranker-base
Note: Swagger UI buffers until the full response completes; use the example curl command below for a live, non-buffered stream.
""",
)
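
# The curl command the description refers to; a minimal sketch assuming the
# default host/port from the __main__ block below. The -N flag disables curl's
# output buffering so each part is printed as soon as it is yielded:
#
#   curl -N -X POST http://localhost:8000/query \
#        -H "Content-Type: application/json" \
#        -d '{"question": "What is RAG?"}'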


class QueryRequest(BaseModel):
    question: str


async def some_async_processing_function(question: str):
    # Simulate an async task with parts of a response being generated over time
    for i in range(30):
        await asyncio.sleep(1)  # Simulate a delay (e.g., I/O-bound operation)
        yield f"Part {i+1}: Processed question: {question}\n"


@app.post("/query")
async def query_llamaindex(request: QueryRequest):
    question = request.question

    async def response_stream():
        async for part in some_async_processing_function(question):
            yield part

    return StreamingResponse(response_stream(), media_type="text/plain")


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
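
# Client-side consumption sketch (an illustration, not part of this file's API):
# the same non-buffered behaviour as `curl -N`, via httpx's streaming interface.
#
#   import httpx
#
#   with httpx.stream("POST", "http://localhost:8000/query",
#                     json={"question": "What is RAG?"}) as r:
#       for chunk in r.iter_text():
#           print(chunk, end="", flush=True)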