From f790f4a505311a6cafadc8445bb00fea3758f118 Mon Sep 17 00:00:00 2001 From: FangYin Cheng Date: Sat, 7 Oct 2023 21:12:00 +0800 Subject: [PATCH 1/2] feat(model): llama.cpp support new GGUF file format --- .env.template | 2 +- .../install/llm/llama/llama_cpp.md | 12 +- .../install/llm/llama/llama_cpp.po | 188 +++++++++--------- pilot/configs/model_config.py | 3 +- pilot/model/adapter.py | 6 +- pilot/model/cluster/manager_base.py | 2 +- pilot/model/cluster/worker/manager.py | 50 ++++- pilot/model/cluster/worker/remote_manager.py | 2 +- pilot/server/dbgpt_server.py | 6 - setup.py | 24 ++- 10 files changed, 167 insertions(+), 128 deletions(-) diff --git a/.env.template b/.env.template index 65fbb1391..c09418b4b 100644 --- a/.env.template +++ b/.env.template @@ -44,7 +44,7 @@ QUANTIZE_8bit=True ## llama-2-70b must be 8 # llama_cpp_n_gqa=8 ## Model path -# llama_cpp_model_path=/data/models/TheBloke/vicuna-7B-v1.5-GGML/vicuna-7b-v1.5.ggmlv3.q4_0.bin +# llama_cpp_model_path=/data/models/TheBloke/vicuna-13B-v1.5-GGUF/vicuna-13b-v1.5.Q4_K_M.gguf #*******************************************************************# #** EMBEDDING SETTINGS **# diff --git a/docs/getting_started/install/llm/llama/llama_cpp.md b/docs/getting_started/install/llm/llama/llama_cpp.md index ebb3b8220..8295884ba 100644 --- a/docs/getting_started/install/llm/llama/llama_cpp.md +++ b/docs/getting_started/install/llm/llama/llama_cpp.md @@ -8,19 +8,19 @@ DB-GPT already supports [llama.cpp](https://github.com/ggerganov/llama.cpp) via ### Preparing Model Files -To use llama.cpp, you need to prepare a ggml format model file, and there are two common ways to obtain it, you can choose either: +To use llama.cpp, you need to prepare a gguf format model file, and there are two common ways to obtain it, you can choose either: 1. Download a pre-converted model file. -Suppose you want to use [Vicuna 7B v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5), you can download the file already converted from [TheBloke/vicuna-7B-v1.5-GGML](https://huggingface.co/TheBloke/vicuna-7B-v1.5-GGML), only one file is needed. Download it to the `models` directory and rename it to `ggml-model-q4_0.bin`. +Suppose you want to use [Vicuna 13B v1.5](https://huggingface.co/lmsys/vicuna-13b-v1.5), you can download the file already converted from [TheBloke/vicuna-13B-v1.5-GGUF](https://huggingface.co/TheBloke/vicuna-13B-v1.5-GGUF), only one file is needed. Download it to the `models` directory and rename it to `ggml-model-q4_0.gguf`. ```bash -wget https://huggingface.co/TheBloke/vicuna-7B-v1.5-GGML/resolve/main/vicuna-7b-v1.5.ggmlv3.q4_K_M.bin -O models/ggml-model-q4_0.bin +wget https://huggingface.co/TheBloke/vicuna-13B-v1.5-GGUF/resolve/main/vicuna-13b-v1.5.Q4_K_M.gguf -O models/ggml-model-q4_0.gguf ``` 2. Convert It Yourself -You can convert the model file yourself according to the instructions in [llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp#prepare-data--run), and put the converted file in the models directory and rename it to `ggml-model-q4_0.bin`. +You can convert the model file yourself according to the instructions in [llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp#prepare-data--run), and put the converted file in the models directory and rename it to `ggml-model-q4_0.gguf`. ### Installing Dependencies @@ -46,9 +46,9 @@ Then you can run it according to [Run](https://db-gpt.readthedocs.io/en/latest/g In DB-GPT, the model configuration can be done through `{model name}_{config key}`. 
-| Environment Variable Key | default | Prompt Template Name| +| Environment Variable Key | default | Description | |----------|-----------| ----------- | -| llama_cpp_prompt_template | None | Prompt template name, now support: `zero_shot, vicuna_v1.1, llama-2,baichuan-chat`, If None, the prompt template is automatically determined from model path。 | +| llama_cpp_prompt_template | None | Prompt template name, now support: `zero_shot, vicuna_v1.1,alpaca,llama-2,baichuan-chat,internlm-chat`, If None, the prompt template is automatically determined from model path。 | | llama_cpp_model_path | None | Model path | | llama_cpp_n_gpu_layers | 1000000000 |Number of layers to offload to the GPU, Set this to 1000000000 to offload all layers to the GPU. If your GPU VRAM is not enough, you can set a low number, eg: `10` | | llama_cpp_n_threads | None | Number of threads to use. If None, the number of threads is automatically determined | diff --git a/docs/locales/zh_CN/LC_MESSAGES/getting_started/install/llm/llama/llama_cpp.po b/docs/locales/zh_CN/LC_MESSAGES/getting_started/install/llm/llama/llama_cpp.po index 1d66e5224..cd7d4689b 100644 --- a/docs/locales/zh_CN/LC_MESSAGES/getting_started/install/llm/llama/llama_cpp.po +++ b/docs/locales/zh_CN/LC_MESSAGES/getting_started/install/llm/llama/llama_cpp.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: DB-GPT 👏👏 0.3.5\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-08-21 16:59+0800\n" +"POT-Creation-Date: 2023-10-07 20:28+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: zh_CN\n" @@ -20,274 +20,275 @@ msgstr "" "Generated-By: Babel 2.12.1\n" #: ../../getting_started/install/llm/llama/llama_cpp.md:1 -#: 24d5c21cd8b44f1d8585ba5c83e34acc +#: 95a9a605d97346fb98e0c0977524d354 msgid "llama.cpp" msgstr "llama.cpp" #: ../../getting_started/install/llm/llama/llama_cpp.md:5 -#: 56969ff863d949aa8df55d3bdb6957e7 +#: ebe3be273a42492d9832512554b4b7dc msgid "" "DB-GPT already supports " "[llama.cpp](https://github.com/ggerganov/llama.cpp) via [llama-cpp-" "python](https://github.com/abetlen/llama-cpp-python)." msgstr "" +"DB-GPT已经通过[llama-cpp-python](https://github.com/abetlen/llama-cpp-" +"python)支持[llama.cpp](https://github.com/ggerganov/llama.cpp)。" #: ../../getting_started/install/llm/llama/llama_cpp.md:7 -#: afe223eafcc641779e1580cac574c34a +#: 97a4f6f95d6845258e3753803fc117a3 msgid "Running llama.cpp" msgstr "运行 llama.cpp" #: ../../getting_started/install/llm/llama/llama_cpp.md:9 -#: 0eaf98a036434eecb2af1fa89f045620 +#: 40fcdf93fe3d4542bbd84ed2d5a82623 msgid "Preparing Model Files" msgstr "准备模型文件" #: ../../getting_started/install/llm/llama/llama_cpp.md:11 -#: 4f45be5d9658451fb95f1d5d31dc8778 +#: f10bd034d24640d3b83572d50b2a9f71 msgid "" -"To use llama.cpp, you need to prepare a ggml format model file, and there" +"To use llama.cpp, you need to prepare a gguf format model file, and there" " are two common ways to obtain it, you can choose either:" -msgstr "使用llama.cpp, 你需要准备ggml格式的文件,你可以通过以下两种方法获取" +msgstr "使用 llama.cpp,你需要准备 gguf 格式的文件,你可以通过以下两种方法获取" #: ../../getting_started/install/llm/llama/llama_cpp.md:13 -#: 9934596e0f6e466aae63cefbb019e0ec +#: fb143586b13849f0bb2b6ae0c9408e95 msgid "Download a pre-converted model file." -msgstr "Download a pre-converted model file." 
+msgstr "下载已转换的模型文件" #: ../../getting_started/install/llm/llama/llama_cpp.md:15 -#: 33fef76961064a5ca4c86c57111c8bd3 +#: a6e89c960ebd4778b8fc72d3d43e9543 msgid "" -"Suppose you want to use [Vicuna 7B v1.5](https://huggingface.co/lmsys" -"/vicuna-7b-v1.5), you can download the file already converted from " -"[TheBloke/vicuna-7B-v1.5-GGML](https://huggingface.co/TheBloke/vicuna-" -"7B-v1.5-GGML), only one file is needed. Download it to the `models` " -"directory and rename it to `ggml-model-q4_0.bin`." +"Suppose you want to use [Vicuna 13B v1.5](https://huggingface.co/lmsys" +"/vicuna-13b-v1.5), you can download the file already converted from " +"[TheBloke/vicuna-13B-v1.5-GGUF](https://huggingface.co/TheBloke/vicuna-" +"13B-v1.5-GGUF), only one file is needed. Download it to the `models` " +"directory and rename it to `ggml-model-q4_0.gguf`." msgstr "" -"假设您想使用[Vicuna 7B v1.5](https://huggingface.co/lmsys/vicuna-" -"7b-v1.5)您可以从[TheBloke/vicuna-" -"7B-v1.5-GGML](https://huggingface.co/TheBloke/vicuna-" -"7B-v1.5-GGML)下载已转换的文件,只需要一个文件。将其下载到models目录并将其重命名为ggml-model-q4_0.bin。" +"假设您想使用[Vicuna 13B v1.5](https://huggingface.co/lmsys/vicuna-" +"13b-v1.5)您可以从[TheBloke/vicuna-" +"13B-v1.5-GGUF](https://huggingface.co/TheBloke/vicuna-" +"13B-v1.5-GGUF)下载已转换的文件,只需要一个文件。将其下载到models目录并将其重命名为 `ggml-" +"model-q4_0.gguf`。" #: ../../getting_started/install/llm/llama/llama_cpp.md:21 -#: 65fed5b7e95b4205b2b94596a21b6fe8 +#: 380ebad2c5a04210a48c5d7a9913413d msgid "Convert It Yourself" -msgstr "Convert It Yourself" +msgstr "自行转换" #: ../../getting_started/install/llm/llama/llama_cpp.md:23 -#: 1421761d320046f79f725e64bd7d854c +#: cf39ca73d9c6456794fb240b164b7cbb msgid "" "You can convert the model file yourself according to the instructions in " "[llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp" "#prepare-data--run), and put the converted file in the models directory " -"and rename it to `ggml-model-q4_0.bin`." +"and rename it to `ggml-model-q4_0.gguf`." msgstr "" "您可以根据[llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp" -"#prepare-data--run)中的说明自己转换模型文件,然后将转换后的文件放入models目录中,并将其重命名为ggml-" -"model-q4_0.bin。" +"#prepare-data--run)中的说明自行转换模型文件,并把转换后的文件放在models目录中,并重命名为`ggml-" +"model-q4_0.gguf`。" #: ../../getting_started/install/llm/llama/llama_cpp.md:25 -#: 850b1f8ef6be49b192e01c1b7d8f1f26 +#: 363cbf1c0b4e4029982519238f776958 msgid "Installing Dependencies" msgstr "安装依赖" #: ../../getting_started/install/llm/llama/llama_cpp.md:27 -#: b323ee4799d745cc9c0a449bd37c371a +#: a98c36e3d7df40f3a816c0ee451b6114 msgid "" "llama.cpp is an optional dependency in DB-GPT, and you can manually " "install it using the following command:" -msgstr "llama.cpp在DB-GPT中是可选安装项, 你可以通过一下命令进行安装" +msgstr "llama.cpp在DB-GPT中是可选安装项, 你可以通过以下命令进行安装" #: ../../getting_started/install/llm/llama/llama_cpp.md:33 -#: 75b75c84ffb7476d8501a28bb2719615 +#: b0038a8ba36647c6a62eef907cb6d304 msgid "Modifying the Configuration File" msgstr "修改配置文件" #: ../../getting_started/install/llm/llama/llama_cpp.md:35 -#: d1f8b3e1ad3441f2aafbfe2519113c2c +#: d2002da716744122a44ab4ed2e47e680 msgid "Next, you can directly modify your `.env` file to enable llama.cpp." msgstr "修改`.env`文件使用llama.cpp" #: ../../getting_started/install/llm/llama/llama_cpp.md:42 -#: 2ddcab3834f646e58a8b3316abf6ce3a +#: 97a5fb5d4ed649f5aa0bbb97c32d54b0 msgid "" "Then you can run it according to [Run](https://db-" "gpt.readthedocs.io/en/latest/getting_started/install/deploy/deploy.html#run)." 
msgstr "" -"然后你可以通过[Run](https://db-" -"gpt.readthedocs.io/en/latest/getting_started/install/deploy/deploy.html#run).来运行" +"然后你可以根据[运行]" +"(https://db-gpt.readthedocs.io/projects/db-gpt-docs-zh-cn/zh_CN/latest/getting_started/install/deploy/deploy.html#run)来运行" #: ../../getting_started/install/llm/llama/llama_cpp.md:45 -#: bb9f222d22534827a9fa164b2126d192 +#: 0e3771b6aaa141f89c813507f3317bda msgid "More Configurations" msgstr "更多配置文件" #: ../../getting_started/install/llm/llama/llama_cpp.md:47 -#: 14d016ad5bad451888d01e24f0ca86d9 +#: 0802ba524cd1458298fe6f90ae7f2da1 msgid "" "In DB-GPT, the model configuration can be done through `{model " "name}_{config key}`." -msgstr "" -"In DB-GPT, the model configuration can be done through `{model " -"name}_{config key}`." +msgstr "在DB-GPT中,模型配置可以通过`{模型名称}_{配置名}` 来配置。" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: a1bf4c1f49bd4d97ac45d4f3aff442c6 +#: d461d379a523424fb5885e393498ee14 msgid "Environment Variable Key" -msgstr "Environment Variable Key" +msgstr "环境变量键" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: 92692a38219c432fadffb8b3825ce678 +#: 0263477d0ddb4914baa0d3584b751086 msgid "default" -msgstr "default" +msgstr "默认值" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: 72b2d251aa2e4ca09c335b58e1a08de3 -msgid "Prompt Template Name" -msgstr "Prompt Template Name" +#: e5188d0ded6540a0bddb46d480f8b7ac +msgid "Description" +msgstr "描述" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: 85a9f89eeb9a4b70b56913354e947329 +#: 213b27d0e53d4858b7576dc4f2ab4d7f msgid "llama_cpp_prompt_template" msgstr "llama_cpp_prompt_template" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: 17e9750fbb824dfdaaed5415f6406e35 602016763bb2470d8a8ef700e576407b -#: 790caafd5c4c4cecbb4c190745fb994c ceb6c41315ab4c5798ab3c64ee8693eb -#: cfafab69a2684e27bd55aadfdd4c1575 +#: 1cb0320826564a89a3e2f51177f8a6ed 23d93dc7d88e431ba31ff64d239a412f +#: 833d5012411a4ad58b04d50a40a29184 95aa2102191946919158ae668b2e3599 +#: becdd178292a48138dcb445ba3c2a6ec msgid "None" msgstr "None" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: 5d02f2d1d5834b1e9e5d6982247fd6c9 +#: ac835806c79640aa8cd39edb11d7667c msgid "" -"Prompt template name, now support: `zero_shot, vicuna_v1.1, llama-2" -",baichuan-chat`, If None, the prompt template is automatically determined" -" from model path。" +"Prompt template name, now support: `zero_shot, vicuna_v1.1,alpaca,llama-2" +",baichuan-chat,internlm-chat`, If None, the prompt template is " +"automatically determined from model path。" msgstr "" -"Prompt template 现在可以支持`zero_shot, vicuna_v1.1, llama-2,baichuan-chat`, " -"如果是None, the prompt template可以自动选择模型路径" +"Prompt template 现在可以支持`zero_shot, vicuna_v1.1,alpaca,llama-2,baichuan-" +"chat,internlm-chat`, 如果是None, 可以根据模型路径来自动获取模型 Prompt template" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: 2a95bc11386f45498b3585b194f24c17 +#: 41bce5a6bbf2417f8bc40e71c59405ad msgid "llama_cpp_model_path" msgstr "llama_cpp_model_path" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: c02db8a50e7a4df0acb6b75798a3ad4b +#: 15df4d19645b40e7a209827f9a325b8f msgid "Model path" -msgstr "Model path" +msgstr "模型路径" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: 6c92b2ec52634728bcc421670cdda70b +#: caf9ddbfb787418d8b167746e3febe8c msgid "llama_cpp_n_gpu_layers" msgstr "llama_cpp_n_gpu_layers" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: 9f1e1b763a0b40d28efd734fe20e1ba7 +#: e12e0ed2c01e4d12b41d5da533073c53 msgid "1000000000" 
msgstr "1000000000" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: 0f511b7907594c1f9c9818638764f209 +#: 1f4a868d3fed4ac78bfa48e13b3a59dc msgid "" "Number of layers to offload to the GPU, Set this to 1000000000 to offload" " all layers to the GPU. If your GPU VRAM is not enough, you can set a low" " number, eg: `10`" -msgstr "要将层数转移到GPU上,将其设置为1000000000以将所有层转移到GPU上。如果您的GPU VRAM不足,可以设置较低的数字,例如:10。" +msgstr "要将多少网络层转移到GPU上,将其设置为1000000000以将所有层转移到GPU上。如果您的 GPU 内存不足,可以设置较低的数字,例如:10。" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: 1ffdfa4eb78d4127b302b6d703852692 +#: 306e083489e24f819d67f38e2f155f0f msgid "llama_cpp_n_threads" msgstr "llama_cpp_n_threads" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: f14379e7ea16476da403d5085b67db1c +#: 0490a543f67f4ecd8588541399846951 msgid "" "Number of threads to use. If None, the number of threads is automatically" " determined" msgstr "要使用的线程数量。如果为None,则线程数量将自动确定。" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: 41cc1035f6e340e19848452d48a161db +#: 2ad3f09e1f894e30ae512e1cd803af52 msgid "llama_cpp_n_batch" msgstr "llama_cpp_n_batch" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: 993c3b9218ee4299beae53bd75a01001 +#: c495776868394df5b311087dfc7c55dd msgid "512" msgstr "512" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: 0e11d38c9b58478cacdade34de146320 +#: b5e69dc488cc4ae78ee9daefcf73c290 msgid "Maximum number of prompt tokens to batch together when calling llama_eval" msgstr "在调用llama_eval时,批处理在一起的prompt tokens的最大数量" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: 24f5381956d34569aabee4a5d832388b +#: 516cfc3ed00c4a6181f37a4649c9f041 msgid "llama_cpp_n_gqa" msgstr "llama_cpp_n_gqa" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: 07d05844541c452caaa8d5bf56c3f8a1 +#: 51847a305c4341af8614a2ceb7aa658f msgid "Grouped-query attention. Must be 8 for llama-2 70b." -msgstr "对于llama-2 70b模型,Grouped-query attention必须为8。" +msgstr "对于 llama-2 70B 模型,Grouped-query attention 必须为8。" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: 40a1b9750d854bb19dc18b7d530beccf +#: 8261108709f341dab19e4fece7682c0c msgid "llama_cpp_rms_norm_eps" msgstr "llama_cpp_rms_norm_eps" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: 6018ee183b9548eabf91e9fc683e7c24 +#: 72cc3d9988414f489ddefe3afb332e83 msgid "5e-06" msgstr "5e-06" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: eb273c6bcf2c4c47808024008ce230dc +#: ebc1baebf57e4009b0fdfa68eb055d80 msgid "5e-6 is a good value for llama-2 models." msgstr "对于llama-2模型来说,5e-6是一个不错的值。" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: f70f3e935b764b6f9544d201ba2aaa05 +#: 0cc1199e293741f087c795230d9c8dda msgid "llama_cpp_cache_capacity" msgstr "llama_cpp_cache_capacity" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: 70035ec5be244eda9fe93be3df2c66df +#: 7d13612da75046b1a3fc0877e229bb91 msgid "Maximum cache capacity. Examples: 2000MiB, 2GiB" -msgstr "cache capacity最大值. Examples: 2000MiB, 2GiB" +msgstr "模型缓存最大值. 
例如: 2000MiB, 2GiB" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: 164c31b005ae4979938d9bc67e7f2759 +#: 53332858d3a8472f8eb59d845c594ffd msgid "llama_cpp_prefer_cpu" msgstr "llama_cpp_prefer_cpu" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: 28f890f6bee3412e94aeb1326367326e +#: 7ff31fe3233a4243840584bc069654cd msgid "False" msgstr "False" #: ../../getting_started/install/llm/llama/llama_cpp.md -#: f8f27b6323384431ba064a720f39f997 +#: 62d1dbd4f8254141a697448a7a5f6701 msgid "" "If a GPU is available, it will be preferred by default, unless " "prefer_cpu=False is configured." -msgstr "如果有可用的GPU,默认情况下会优先使用GPU,除非配置了prefer_cpu=False。" +msgstr "如果有可用的GPU,默认情况下会优先使用GPU,除非配置了 prefer_cpu=False。" #: ../../getting_started/install/llm/llama/llama_cpp.md:61 -#: 0471e56c790047bab422aa47edad0a15 +#: 8de97de28d1a40c3b852a1268255ebed msgid "GPU Acceleration" msgstr "GPU 加速" #: ../../getting_started/install/llm/llama/llama_cpp.md:63 -#: e95ad40d29004455bebeec8a1a7248c8 +#: 8bce74c0ddb5486190ff4d36fd5358be msgid "" "GPU acceleration is supported by default. If you encounter any issues, " "you can uninstall the dependent packages with the following command:" msgstr "默认情况下支持GPU加速。如果遇到任何问题,您可以使用以下命令卸载相关的依赖包" #: ../../getting_started/install/llm/llama/llama_cpp.md:68 -#: c0caf1420e43437589693ddec96bd50f +#: 1f3fe88521614d499cb1d046f8d3c125 msgid "" "Then install `llama-cpp-python` according to the instructions in [llama-" "cpp-python](https://github.com/abetlen/llama-cpp-" @@ -297,24 +298,24 @@ msgstr "" "python/blob/main/README.md).安装`llama-cpp-python`" #: ../../getting_started/install/llm/llama/llama_cpp.md:71 -#: fe082f65b4e9416c97b18e5005bc0a59 +#: fc83106f0a0e4ddfb3c058bec62f4568 msgid "Mac Usage" -msgstr "Mac Usage" +msgstr "Mac 使用" #: ../../getting_started/install/llm/llama/llama_cpp.md:73 -#: 6f30d3fa399f434189fcb03d28a42d2d +#: dcf5904a444342c8a768c4da8b777828 msgid "" "Special attention, if you are using Apple Silicon (M1) Mac, it is highly " "recommended to install arm64 architecture python support, for example:" -msgstr "特别注意:如果您正在使用苹果芯片(M1)的Mac电脑,强烈建议安装arm64架构的Python支持,例如:" +msgstr "特别注意:如果您正在使用苹果芯片(M1)的Mac电脑,强烈建议安装 arm64 架构的 Python 支持,例如:" #: ../../getting_started/install/llm/llama/llama_cpp.md:80 -#: 74602bede3c5472fbabc7de47eb2ff7a +#: 547369c011a9412589dad1fac7ac3ef9 msgid "Windows Usage" msgstr "Windows使用" #: ../../getting_started/install/llm/llama/llama_cpp.md:82 -#: ae78332a348b44cb847723a998b98048 +#: 506fda57977f4aa8b9fe427e3c66f4d7 msgid "" "The use under the Windows platform has not been rigorously tested and " "verified, and you are welcome to use it. If you have any problems, you " @@ -323,8 +324,8 @@ msgid "" "information) directly." msgstr "" "在Windows平台上的使用尚未经过严格的测试和验证,欢迎您使用。如果您有任何问题,可以创建一个[issue](https://github.com" -"/eosphoros-ai/DB-GPT/issues)或者[contact us](https://github.com/eosphoros-" -"ai/DB-GPT/tree/main#contact-information) directly." +"/eosphoros-ai/DB-GPT/issues)或者直接[联系我们](https://github.com/eosphoros-ai" +"/DB-GPT/tree/main#cntact-information)。" #~ msgid "" #~ "DB-GPT is now supported by " @@ -337,3 +338,6 @@ msgstr "" #~ "cpp-python) through " #~ "[llama.cpp](https://github.com/ggerganov/llama.cpp)." 
+#~ msgid "Prompt Template Name" +#~ msgstr "Prompt Template Name" + diff --git a/pilot/configs/model_config.py b/pilot/configs/model_config.py index 216a6f03f..927272cb1 100644 --- a/pilot/configs/model_config.py +++ b/pilot/configs/model_config.py @@ -70,7 +70,8 @@ def get_device() -> str: "baichuan2-13b": os.path.join(MODEL_PATH, "Baichuan2-13B-Chat"), # (Llama2 based) We only support WizardLM-13B-V1.2 for now, which is trained from Llama-2 13b, see https://huggingface.co/WizardLM/WizardLM-13B-V1.2 "wizardlm-13b": os.path.join(MODEL_PATH, "WizardLM-13B-V1.2"), - "llama-cpp": os.path.join(MODEL_PATH, "ggml-model-q4_0.bin"), + # wget https://huggingface.co/TheBloke/vicuna-13B-v1.5-GGUF/resolve/main/vicuna-13b-v1.5.Q4_K_M.gguf -O models/ggml-model-q4_0.gguf + "llama-cpp": os.path.join(MODEL_PATH, "ggml-model-q4_0.gguf"), # https://huggingface.co/internlm/internlm-chat-7b-v1_1, 7b vs 7b-v1.1: https://github.com/InternLM/InternLM/issues/288 "internlm-7b": os.path.join(MODEL_PATH, "internlm-chat-7b"), "internlm-7b-8k": os.path.join(MODEL_PATH, "internlm-chat-7b-8k"), diff --git a/pilot/model/adapter.py b/pilot/model/adapter.py index 760e8722d..0ed42abc9 100644 --- a/pilot/model/adapter.py +++ b/pilot/model/adapter.py @@ -382,14 +382,14 @@ def _parse_model_path(model_path: str) -> Tuple[bool, str]: # Just support local model return False, None if not path.is_file(): - model_paths = list(path.glob("*ggml*.bin")) + model_paths = list(path.glob("*ggml*.gguf")) if not model_paths: return False model_path = str(model_paths[0]) logger.warn( - f"Model path {model_path} is not single file, use first *gglm*.bin model file: {model_path}" + f"Model path {model_path} is not single file, use first *gglm*.gguf model file: {model_path}" ) - if not re.fullmatch(".*ggml.*\.bin", model_path): + if not re.fullmatch(".*ggml.*\.gguf", model_path): return False, None return True, model_path diff --git a/pilot/model/cluster/manager_base.py b/pilot/model/cluster/manager_base.py index ce37755f6..10c351fa6 100644 --- a/pilot/model/cluster/manager_base.py +++ b/pilot/model/cluster/manager_base.py @@ -33,7 +33,7 @@ async def start(self): """Start worker manager""" @abstractmethod - async def stop(self): + async def stop(self, ignore_exception: bool = False): """Stop worker manager""" @abstractmethod diff --git a/pilot/model/cluster/worker/manager.py b/pilot/model/cluster/worker/manager.py index 72d9c32d4..b7b9515c5 100644 --- a/pilot/model/cluster/worker/manager.py +++ b/pilot/model/cluster/worker/manager.py @@ -115,14 +115,30 @@ async def start(self): for listener in self.start_listeners: listener(self) - async def stop(self): + async def stop(self, ignore_exception: bool = False): if not self.run_data.stop_event.is_set(): logger.info("Stop all workers") self.run_data.stop_event.clear() stop_tasks = [] - stop_tasks.append(self._stop_all_worker(apply_req=None)) + stop_tasks.append( + self._stop_all_worker(apply_req=None, ignore_exception=ignore_exception) + ) if self.deregister_func: - stop_tasks.append(self.deregister_func(self.run_data)) + # If ignore_exception is True, use exception handling to ignore any exceptions raised from self.deregister_func + if ignore_exception: + + async def safe_deregister_func(run_data): + try: + await self.deregister_func(run_data) + except Exception as e: + logger.warning( + f"Stop worker, ignored exception from deregister_func: {e}" + ) + + stop_tasks.append(safe_deregister_func(self.run_data)) + else: + stop_tasks.append(self.deregister_func(self.run_data)) + await 
asyncio.gather(*stop_tasks) def after_start(self, listener: Callable[["WorkerManager"], None]): @@ -424,7 +440,7 @@ async def _start_worker(worker_run_data: WorkerRunData): ) async def _stop_all_worker( - self, apply_req: WorkerApplyRequest + self, apply_req: WorkerApplyRequest, ignore_exception: bool = False ) -> WorkerApplyOutput: start_time = time.time() @@ -441,7 +457,19 @@ async def _stop_worker(worker_run_data: WorkerRunData): and self.register_func and self.deregister_func ): - await self.deregister_func(worker_run_data) + _deregister_func = self.deregister_func + if ignore_exception: + + async def safe_deregister_func(run_data): + try: + await self.deregister_func(run_data) + except Exception as e: + logger.warning( + f"Stop worker, ignored exception from deregister_func: {e}" + ) + + _deregister_func = safe_deregister_func + await _deregister_func(worker_run_data) await self._apply_worker(apply_req, _stop_worker) timecost = time.time() - start_time @@ -487,8 +515,8 @@ def __init__(self, worker_manager: WorkerManager = None) -> None: async def start(self): return await self.worker_manager.start() - async def stop(self): - return await self.worker_manager.stop() + async def stop(self, ignore_exception: bool = False): + return await self.worker_manager.stop(ignore_exception=ignore_exception) def after_start(self, listener: Callable[["WorkerManager"], None]): if listener is not None: @@ -631,7 +659,9 @@ async def api_model_shutdown(request: WorkerStartupRequest): return await worker_manager.model_shutdown(request) -def _setup_fastapi(worker_params: ModelWorkerParameters, app=None): +def _setup_fastapi( + worker_params: ModelWorkerParameters, app=None, ignore_exception: bool = False +): if not app: app = FastAPI() if worker_params.standalone: @@ -666,7 +696,7 @@ async def start_worker_manager(): @app.on_event("shutdown") async def startup_event(): - await worker_manager.stop() + await worker_manager.stop(ignore_exception=ignore_exception) return app @@ -837,7 +867,7 @@ def initialize_worker_manager_in_client( worker_params.register = True worker_params.port = local_port logger.info(f"Worker params: {worker_params}") - _setup_fastapi(worker_params, app) + _setup_fastapi(worker_params, app, ignore_exception=True) _start_local_worker(worker_manager, worker_params) worker_manager.after_start(start_listener) _start_local_embedding_worker( diff --git a/pilot/model/cluster/worker/remote_manager.py b/pilot/model/cluster/worker/remote_manager.py index 7f9de4d62..3aa9673bb 100644 --- a/pilot/model/cluster/worker/remote_manager.py +++ b/pilot/model/cluster/worker/remote_manager.py @@ -17,7 +17,7 @@ async def start(self): for listener in self.start_listeners: listener(self) - async def stop(self): + async def stop(self, ignore_exception: bool = False): pass async def _fetch_from_worker( diff --git a/pilot/server/dbgpt_server.py b/pilot/server/dbgpt_server.py index ab9747e36..0a5b19933 100644 --- a/pilot/server/dbgpt_server.py +++ b/pilot/server/dbgpt_server.py @@ -44,11 +44,6 @@ CFG = Config() -def signal_handler(): - print("in order to avoid chroma db atexit problem") - os._exit(0) - - def swagger_monkey_patch(*args, **kwargs): return get_swagger_ui_html( *args, @@ -176,7 +171,6 @@ def run_uvicorn(param: WebWerverParameters): port=param.port, log_level=logging_str_to_uvicorn_level(param.log_level), ) - signal.signal(signal.SIGINT, signal_handler()) def run_webserver(param: WebWerverParameters = None): diff --git a/setup.py b/setup.py index 062ae2048..9d913b096 100644 --- a/setup.py +++ b/setup.py 
@@ -15,6 +15,9 @@ long_description = fh.read() BUILD_NO_CACHE = os.getenv("BUILD_NO_CACHE", "false").lower() == "true" +LLAMA_CPP_GPU_ACCELERATION = ( + os.getenv("LLAMA_CPP_GPU_ACCELERATION", "true").lower() == "true" +) def parse_requirements(file_name: str) -> List[str]: @@ -249,21 +252,29 @@ def llama_cpp_python_cuda_requires(): if not cuda_version: print("CUDA not support, use cpu version") return + if not LLAMA_CPP_GPU_ACCELERATION: + print("Disable GPU acceleration") + return + # Supports GPU acceleration device = "cu" + cuda_version.replace(".", "") os_type, cpu_avx = get_cpu_avx_support() + print(f"OS: {os_type}, cpu avx: {cpu_avx}") supported_os = [OSType.WINDOWS, OSType.LINUX] if os_type not in supported_os: print( f"llama_cpp_python_cuda just support in os: {[r._value_ for r in supported_os]}" ) return - if cpu_avx == AVXType.AVX2 or AVXType.AVX512: - cpu_avx = AVXType.AVX - cpu_avx = cpu_avx._value_ + cpu_device = "" + if cpu_avx == AVXType.AVX2 or cpu_avx == AVXType.AVX512: + cpu_device = "avx" + else: + cpu_device = "basic" + device += cpu_device base_url = "https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui" - llama_cpp_version = "0.1.77" + llama_cpp_version = "0.2.10" py_version = "cp310" - os_pkg_name = "linux_x86_64" if os_type == OSType.LINUX else "win_amd64" + os_pkg_name = "manylinux_2_31_x86_64" if os_type == OSType.LINUX else "win_amd64" extra_index_url = f"{base_url}/llama_cpp_python_cuda-{llama_cpp_version}+{device}-{py_version}-{py_version}-{os_pkg_name}.whl" extra_index_url, _ = encode_url(extra_index_url) print(f"Install llama_cpp_python_cuda from {extra_index_url}") @@ -298,7 +309,7 @@ def core_requires(): "langchain>=0.0.286", "SQLAlchemy", "pymysql", - "duckdb", + "duckdb==0.8.1", "duckdb-engine", "jsonschema", # TODO move transformers to default @@ -312,7 +323,6 @@ def knowledge_requires(): """ setup_spec.extras["knowledge"] = [ "spacy==3.5.3", - # "chromadb==0.3.22", "chromadb==0.4.10", "markdown", "bs4", From 2c170746525cf5d9ee278a62e8b7a715b5db480f Mon Sep 17 00:00:00 2001 From: FangYin Cheng Date: Sat, 7 Oct 2023 21:29:53 +0800 Subject: [PATCH 2/2] chore: Not cache package in local file default --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9d913b096..ee95f1338 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ with open("README.md", mode="r", encoding="utf-8") as fh: long_description = fh.read() -BUILD_NO_CACHE = os.getenv("BUILD_NO_CACHE", "false").lower() == "true" +BUILD_NO_CACHE = os.getenv("BUILD_NO_CACHE", "true").lower() == "true" LLAMA_CPP_GPU_ACCELERATION = ( os.getenv("LLAMA_CPP_GPU_ACCELERATION", "true").lower() == "true" )
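Note from review, not part of the patch: the change set moves llama.cpp support from GGML `.bin` files to GGUF `.gguf` files, and `_parse_model_path` in `pilot/model/adapter.py` now only picks up files matching `*ggml*.gguf`. A quick way to confirm that a downloaded or renamed file really is GGUF before pointing `llama_cpp_model_path` at it is to check the four-byte `GGUF` magic at the start of the file. Below is a minimal standalone sketch; `looks_like_gguf` and `matches_dbgpt_glob` are hypothetical helper names used for illustration, not DB-GPT APIs.

```python
import re
from pathlib import Path

GGUF_MAGIC = b"GGUF"  # every GGUF file begins with these four bytes


def looks_like_gguf(path: str) -> bool:
    """Return True if the file starts with the GGUF magic header."""
    p = Path(path)
    if not p.is_file():
        return False
    with p.open("rb") as f:
        return f.read(4) == GGUF_MAGIC


def matches_dbgpt_glob(path: str) -> bool:
    """Mirror the `*ggml*.gguf` pattern that adapter.py globs for."""
    return re.fullmatch(r".*ggml.*\.gguf", path) is not None


if __name__ == "__main__":
    # Path used in the updated docs and in model_config.py above.
    model_file = "models/ggml-model-q4_0.gguf"
    print(f"GGUF magic present: {looks_like_gguf(model_file)}")
    print(f"Matches DB-GPT file pattern: {matches_dbgpt_glob(model_file)}")
```

An old GGML `.bin` that has merely been renamed to `ggml-model-q4_0.gguf` passes the filename pattern but fails the magic check, a mismatch that would otherwise only surface as a load error from the newer llama-cpp-python 0.2.x wheels this patch switches to.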