diff --git a/.env-example b/.env-example
index 550a8b1..44ae65c 100644
--- a/.env-example
+++ b/.env-example
@@ -1,3 +1,2 @@
-CHAIN_TYPE=map_reduce
 OPENAI_API_KEY=""
-OPENAI_API_BASE=""
+OPENAI_BASE_URL=""
diff --git a/Dockerfile b/Dockerfile
index 6599760..dd5fa17 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,6 +4,6 @@
 COPY . /src
 WORKDIR /src
 
-RUN pip install -r requirements-frozen.txt
+RUN pip install -r requirements.txt
 
 CMD ["python3", "app.py"]
\ No newline at end of file
diff --git a/app.py b/app.py
index 98d450e..a56677b 100644
--- a/app.py
+++ b/app.py
@@ -3,38 +3,159 @@
 import time
 
 import uvicorn
-from langchain_openai import OpenAI, ChatOpenAI
-from langchain.chains.summarize import load_summarize_chain
-from langchain.chains import AnalyzeDocumentChain
-from langchain.text_splitter import CharacterTextSplitter
+from openai import OpenAI
 from sse_starlette import EventSourceResponse
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
 from typing import List, Literal, Optional, Union
+from langdetect import detect
 
 dotenv.load_dotenv()
 
 DEFAULTS = {
-    'CHAIN_TYPE': "map_reduce"
+    'HTTPX_TIMEOUT': 60,
+    'TEMPERATURE': 0,
+    'MAX_TOKENS': 4096
 }
 
+summary_refine_prompt_template = """\
+Your job is to produce a final summary.
+We have provided an existing summary up to a certain point: {answer}
+We have the opportunity to refine the existing summary (only if needed) with some more context below.
+------------
+{text}
+------------
+Given the new context, refine the original summary.
+If the context isn't useful, return the original summary.
+The summary must be written in {language}.
+"""
+
+summary_prompt_template = """Write a concise summary of the following,
+and write the summary in {language}.
+
+
+"{text}"
+
+
+CONCISE SUMMARY:"""
+
+todo_refine_prompt_template = """\
+Your job is to produce a final todo list.
+We have provided an existing todo list up to a certain point: {answer}
+We have the opportunity to refine the existing todo list (only if needed) with some more context below.
+------------
+{text}
+------------
+Given the new context, refine the original todo list.
+If the context isn't useful, return the original todo list.
+The todo list must be written in {language}.
+""" + +todo_prompt_template = """Write a concise todo list of the following, +and the language of todo list must keep in {language}: + + +"{text}" + + +CONCISE TODO LIST:""" + def get_env(key): return os.environ.get(key, DEFAULTS.get(key)) -def summarize(content: str, chain_type: str, model_name: str): - llm = ChatOpenAI(temperature=0, model_name=model_name) - text_splitter = CharacterTextSplitter( - chunk_size=1500, - chunk_overlap=0, - length_function=len, - ) - summary_chain = load_summarize_chain(llm, chain_type=chain_type) - summarize_document_chain = AnalyzeDocumentChain(combine_docs_chain=summary_chain, text_splitter=text_splitter) - summary_text = summarize_document_chain.invoke(content) - return summary_text +def make_todo_list(content: str, model_name: str): + client = OpenAI() + + language = detect(content) + length = len(content) + chunk_size = 1500 + start_idx = 0 + end_idx = 0 + times = 1 + answer = None + while end_idx < length: + end_idx = start_idx + chunk_size + if end_idx >= length: + end_idx = length + + text = content[start_idx:end_idx] + text_nolines = text.replace("\n", "\\n") + print(f"idx=[{start_idx}, {end_idx}], text: {text_nolines}") + start_idx = end_idx + + if times == 1: + content = todo_prompt_template.format(text=text, language=language) + else: + content = todo_refine_prompt_template.format(answer=answer, text=text, language=language) + + messages = [{ + "role": "user", + "content": content + }] + params = dict( + messages=messages, + stream=False, + model=model_name, + temperature=get_env("TEMPERATURE"), + max_tokens=get_env("MAX_TOKENS"), + timeout=get_env("HTTPX_TIMEOUT") + ) + + chat_completion = client.chat.completions.create(**params) + answer = chat_completion.choices[0].message.content + print(f"Todo times: {times}, answer: {answer}") + times = times + 1 + + return answer + + +def summarize(content: str, model_name: str): + client = OpenAI() + + language = detect(content) + length = len(content) + chunk_size = 1500 + start_idx = 0 + end_idx = 0 + times = 1 + answer = None + while end_idx < length: + end_idx = start_idx + chunk_size + if end_idx >= length: + end_idx = length + + text = content[start_idx:end_idx] + text_nolines = text.replace("\n", "\\n") + print(f"idx=[{start_idx}, {end_idx}], text: {text_nolines}") + start_idx = end_idx + + if times == 1: + content = summary_prompt_template.format(text=text, language=language) + else: + content = summary_refine_prompt_template.format(answer=answer, text=text, language=language) + + messages = [{ + "role": "user", + "content": content + }] + params = dict( + messages=messages, + stream=False, + model=model_name, + temperature=get_env("TEMPERATURE"), + max_tokens=get_env("MAX_TOKENS"), + timeout=get_env("HTTPX_TIMEOUT") + ) + + chat_completion = client.chat.completions.create(**params) + answer = chat_completion.choices[0].message.content + print(f"Summarize times: {times}, answer: {answer}") + times = times + 1 + + return answer app = FastAPI() @@ -87,7 +208,7 @@ class ChatCompletionResponse(BaseModel): created: Optional[int] = Field(default_factory=lambda: int(time.time())) -def predict(query: str, model_id: str, chain_type: str): +def predict(query: str, model_id: str): choice_data = ChatCompletionResponseStreamChoice( index=0, delta=DeltaMessage(role="assistant"), @@ -97,10 +218,29 @@ def predict(query: str, model_id: str, chain_type: str): choice_data], object="chat.completion.chunk") yield "{}".format(chunk.json(exclude_unset=True)) - summary = summarize(query, chain_type, model_id) + summary 
+    choice_data = ChatCompletionResponseStreamChoice(
+        index=0,
+        delta=DeltaMessage(content=f"Summary:\n {summary}", role="assistant"),
+        finish_reason=None
+    )
+    chunk = ChatCompletionResponse(model=model_id, choices=[
+        choice_data], object="chat.completion.chunk")
+    yield "{}".format(chunk.json(exclude_unset=True))
+
+    todo_list = make_todo_list(query, model_id)
+    choice_data = ChatCompletionResponseStreamChoice(
+        index=0,
+        delta=DeltaMessage(content=f"\n\nTodo List:\n {todo_list}", role="assistant"),
+        finish_reason=None
+    )
+    chunk = ChatCompletionResponse(model=model_id, choices=[
+        choice_data], object="chat.completion.chunk")
+    yield "{}".format(chunk.json(exclude_unset=True))
+
     choice_data = ChatCompletionResponseStreamChoice(
         index=0,
-        delta=DeltaMessage(content=summary['output_text'], role="assistant"),
+        delta=DeltaMessage(content=f"\n\nTranscription:\n {query}", role="assistant"),
         finish_reason=None
     )
     chunk = ChatCompletionResponse(model=model_id, choices=[
@@ -123,8 +263,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
     if request.messages[-1].role != "user":
         raise HTTPException(status_code=400, detail="Invalid request")
     user_content = request.messages[-1].content
-    chain_type = get_env("CHAIN_TYPE")
-    generate = predict(user_content, request.model, chain_type)
+    generate = predict(user_content, request.model)
     return EventSourceResponse(generate, media_type="text/event-stream")
 
 
diff --git a/requirements.txt b/requirements.txt
index 4f25f50..121f2c9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,3 @@
-langchain==0.1.14
-langchain-community==0.0.31
-langchain-openai==0.1.1
 openai==1.14.0
 python-dotenv==1.0.1
 fastapi==0.110.0
@@ -12,3 +9,4 @@ httpx==0.27.0
 httpx-ws==0.5.2
 pydantic==2.6.4
 pydantic_core==2.16.3
+langdetect==1.0.9
diff --git a/tests.py b/tests.py
new file mode 100644
index 0000000..7195cc8
--- /dev/null
+++ b/tests.py
@@ -0,0 +1,24 @@
+from app import summarize, make_todo_list
+
+if __name__ == '__main__':
+
+    content = """
+Key concepts
+Text generation models
+OpenAI's text generation models (often referred to as generative pre-trained transformers or "GPT" models for short), like GPT-4 and GPT-3.5, have been trained to understand natural and formal language. Models like GPT-4 allows text outputs in response to their inputs. The inputs to these models are also referred to as "prompts". Designing a prompt is essentially how you "program" a model like GPT-4, usually by providing instructions or some examples of how to successfully complete a task. Models like GPT-4 can be used across a great variety of tasks including content or code generation, summarization, conversation, creative writing, and more. Read more in our introductory text generation guide and in our prompt engineering guide.
+
+Assistants
+Assistants refer to entities, which in the case of the OpenAI API are powered by large language models like GPT-4, that are capable of performing tasks for users. These assistants operate based on the instructions embedded within the context window of the model. They also usually have access to tools which allows the assistants to perform more complex tasks like running code or retrieving information from a file. Read more about assistants in our Assistants API Overview.
+
+Embeddings
+An embedding is a vector representation of a piece of data (e.g. some text) that is meant to preserve aspects of its content and/or its meaning. Chunks of data that are similar in some way will tend to have embeddings that are closer together than unrelated data. OpenAI offers text embedding models that take as input a text string and produce as output an embedding vector. Embeddings are useful for search, clustering, recommendations, anomaly detection, classification, and more. Read more about embeddings in our embeddings guide.
+
+Tokens
+Text generation and embeddings models process text in chunks called tokens. Tokens represent commonly occurring sequences of characters. For example, the string " tokenization" is decomposed as " token" and "ization", while a short and common word like " the" is represented as a single token. Note that in a sentence, the first token of each word typically starts with a space character. Check out our tokenizer tool to test specific strings and see how they are translated into tokens. As a rough rule of thumb, 1 token is approximately 4 characters or 0.75 words for English text.
+
+One limitation to keep in mind is that for a text generation model the prompt and the generated output combined must be no more than the model's maximum context length. For embeddings models (which do not output tokens), the input must be shorter than the model's maximum context length. The maximum context lengths for each text generation and embeddings model can be found in the model index.
+    """
+    answer = summarize(content, "gpt-3.5-turbo")
+    print(answer)
+    answer = make_todo_list(content, "gpt-3.5-turbo")
+    print(answer)
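
Usage sketch (editor's note, not part of the diff): after this change, predict() streams the summary, the todo list, and the original transcription as OpenAI-style chat.completion.chunk events over SSE, so any OpenAI-compatible client can consume the endpoint. The snippet below is a minimal illustration only; the /v1/chat/completions path, the localhost:8000 address, and the "[DONE]" terminator are assumptions, since the route decorator and the uvicorn startup code sit outside the hunks shown above.

import json
import httpx  # already pinned in requirements.txt

payload = {
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "<transcription text to summarize>"}],
    "stream": True,
}

# Stream the SSE response and print each delta as it arrives
with httpx.stream("POST", "http://localhost:8000/v1/chat/completions", json=payload, timeout=None) as resp:
    for line in resp.iter_lines():
        if not line.startswith("data:"):
            continue  # skip blank keep-alive lines and SSE comments
        data = line[len("data:"):].strip()
        if data == "[DONE]":  # some OpenAI-compatible servers send a terminator
            break
        chunk = json.loads(data)
        delta = chunk["choices"][0]["delta"]
        if delta.get("content"):
            print(delta["content"], end="", flush=True)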