Bump the versions, new models support (#463)
* up the versions

* fixing starcoder2 flash sa

* integrate groq / cerebras to the self-hosting (#466)

* qwen2.5 models

* upd README.md

* a warning

* get rid of the autogptq models

* version 1.8.0

* version 1.8.0

* deprecated versions in the readme

* add completion support for the passthrough models

* add multiline_code_completion_default_model

* _select_default_lora_if_exists for multiline_code_completion_default_model
_add_results_for_passthrough_provider fix

* rm deepseek-coder-v2/16b/instruct
MAX_JOBS=8

* gpt-4 is unavailable
JegernOUTT authored Dec 2, 2024
1 parent 31ed965 commit 1b094ba
Showing 19 changed files with 414 additions and 88 deletions.
16 changes: 10 additions & 6 deletions Dockerfile.base
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
+FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

ENV INSTALL_OPTIONAL=TRUE
ENV MAX_JOBS=8
@@ -13,24 +13,28 @@ RUN DEBIAN_FRONTEND="noninteractive" TZ=Etc/UTC apt-get install -y \
    ruby-full \
    ruby-bundler \
    build-essential \
-    cmake \
+    pkg-config \
    libicu-dev \
    zlib1g-dev \
    libcurl4-openssl-dev \
    libssl-dev \
    && rm -rf /var/lib/{apt,dpkg,cache,log}
+RUN DEBIAN_FRONTEND="noninteractive" TZ=Etc/UTC apt remove cmake -y
+RUN pip install cmake --upgrade

RUN git clone https://github.com/smallcloudai/linguist.git /tmp/linguist \
    && cd /tmp/linguist \
    && bundle install \
    && rake build_gem
ENV PATH="${PATH}:/tmp/linguist/bin"

-RUN pip install --no-cache-dir torch==2.3.0 --index-url https://download.pytorch.org/whl/cu118
-RUN pip install --no-cache-dir xformers==0.0.26.post1 --index-url https://download.pytorch.org/whl/cu118
+RUN pip install --no-cache-dir torch==2.5.0
+RUN pip install --no-cache-dir xformers==v0.0.28.post2
RUN pip install ninja
-RUN VLLM_INSTALL_PUNICA_KERNELS=1 pip install -v --no-build-isolation git+https://github.com/smallcloudai/vllm@refact_v0.4.2_06052024
+RUN pip install setuptools_scm
+ENV CMAKE_ARGS="-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=60;61;70;75;80;86;89;90+PTX"
+RUN pip install -v --no-build-isolation git+https://github.com/smallcloudai/vllm@refact_v0.6.3_2adb440

-# there is no prebuild auto-gptq with torch 2.3.0 support
+# there is no prebuild auto-gptq with torch 2.5.0 support
ENV TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0;7.5;8.0;8.6;8.9;9.0+PTX"
RUN BUILD_CUDA_EXT=1 pip install -v --no-build-isolation git+https://github.com/PanQiWei/[email protected]
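
Note that the torch and xformers pins above no longer point at the cu118 wheel index, so they now resolve against the default CUDA 12.x builds matching the new base image. A minimal sanity check to run inside the rebuilt container (a hypothetical snippet, not part of this commit) could look like:

# Hypothetical post-build check, not part of this commit: confirm the pinned
# torch build matches the CUDA 12.4 base image and can see the GPU.
import torch

print(torch.__version__)          # expected to start with "2.5.0", per the pin above
print(torch.version.cuda)         # expected to report a 12.x CUDA runtime
print(torch.cuda.is_available())  # True when the container is started with GPU access
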
44 changes: 29 additions & 15 deletions README.md
@@ -103,21 +103,35 @@ Extensions > Refact.ai Assistant > Settings > Infurl

## Supported models

-| Model                                                                                               | Completion | Chat | Fine-tuning | [Deprecated](## "Will be removed in next versions") |
-|-----------------------------------------------------------------------------------------------------|------------|------|-------------|-----------------------------------------------------|
-| [Refact/1.6B](https://huggingface.co/smallcloudai/Refact-1_6B-fim)                                  | +          |      | +           |                                                     |
-| [starcoder2/3b/base](https://huggingface.co/bigcode/starcoder2-3b)                                  | +          |      | +           |                                                     |
-| [starcoder2/7b/base](https://huggingface.co/bigcode/starcoder2-7b)                                  | +          |      | +           |                                                     |
-| [starcoder2/15b/base](https://huggingface.co/bigcode/starcoder2-15b)                                | +          |      | +           |                                                     |
-| [deepseek-coder/1.3b/base](https://huggingface.co/deepseek-ai/deepseek-coder-1.3b-base)             | +          |      | +           |                                                     |
-| [deepseek-coder/5.7b/mqa-base](https://huggingface.co/deepseek-ai/deepseek-coder-5.7bmqa-base)      | +          |      | +           |                                                     |
-| [magicoder/6.7b](https://huggingface.co/TheBloke/Magicoder-S-DS-6.7B-GPTQ)                          |            | +    |             |                                                     |
-| [mistral/7b/instruct-v0.1](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GPTQ)           |            | +    |             |                                                     |
-| [mixtral/8x7b/instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)           |            | +    |             |                                                     |
-| [deepseek-coder/6.7b/instruct](https://huggingface.co/TheBloke/deepseek-coder-6.7B-instruct-GPTQ)   |            | +    |             |                                                     |
-| [deepseek-coder/33b/instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct)       |            | +    |             |                                                     |
-| [stable/3b/code](https://huggingface.co/stabilityai/stable-code-3b)                                 | +          |      |             |                                                     |
-| [llama3/8b/instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)                    |            | +    |             |                                                     |
+| Model                                                                                                     | Completion | Chat | Fine-tuning | [Deprecated](## "Will be removed in next versions") |
+|-----------------------------------------------------------------------------------------------------------|------------|------|-------------|-----------------------------------------------------|
+| [Refact/1.6B](https://huggingface.co/smallcloudai/Refact-1_6B-fim)                                        | +          |      | +           |                                                     |
+| [starcoder2/3b/base](https://huggingface.co/bigcode/starcoder2-3b)                                        | +          |      | +           |                                                     |
+| [starcoder2/7b/base](https://huggingface.co/bigcode/starcoder2-7b)                                        | +          |      | +           |                                                     |
+| [starcoder2/15b/base](https://huggingface.co/bigcode/starcoder2-15b)                                      | +          |      | +           |                                                     |
+| [deepseek-coder/1.3b/base](https://huggingface.co/deepseek-ai/deepseek-coder-1.3b-base)                   | +          |      | +           |                                                     |
+| [deepseek-coder/5.7b/mqa-base](https://huggingface.co/deepseek-ai/deepseek-coder-5.7bmqa-base)            | +          |      | +           |                                                     |
+| [magicoder/6.7b](https://huggingface.co/TheBloke/Magicoder-S-DS-6.7B-GPTQ)                                |            | +    |             | +                                                   |
+| [mistral/7b/instruct-v0.1](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GPTQ)                 |            | +    |             | +                                                   |
+| [mixtral/8x7b/instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)                 |            | +    |             |                                                     |
+| [deepseek-coder/6.7b/instruct](https://huggingface.co/TheBloke/deepseek-coder-6.7B-instruct-GPTQ)         |            | +    |             | +                                                   |
+| [deepseek-coder/33b/instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct)             |            | +    |             |                                                     |
+| [stable/3b/code](https://huggingface.co/stabilityai/stable-code-3b)                                       | +          |      |             |                                                     |
+| [llama3/8b/instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)                          | +          | +    |             |                                                     |
+| [llama3.1/8b/instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)                      | +          | +    |             |                                                     |
+| [llama3.2/1b/instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)                           | +          | +    |             |                                                     |
+| [llama3.2/3b/instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)                           | +          | +    |             |                                                     |
+| [qwen2.5/coder/0.5b/base](https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B)                                 | +          |      | +           |                                                     |
+| [qwen2.5/coder/1.5b/base](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B)                                 | +          |      | +           |                                                     |
+| [qwen2.5/coder/3b/base](https://huggingface.co/Qwen/Qwen2.5-Coder-3B)                                     | +          |      | +           |                                                     |
+| [qwen2.5/coder/7b/base](https://huggingface.co/Qwen/Qwen2.5-Coder-7B)                                     | +          |      | +           |                                                     |
+| [qwen2.5/coder/14b/base](https://huggingface.co/Qwen/Qwen2.5-Coder-14B)                                   | +          |      | +           |                                                     |
+| [qwen2.5/coder/32b/base](https://huggingface.co/Qwen/Qwen2.5-Coder-32B)                                   | +          |      | +           |                                                     |
+| [qwen2.5/coder/1.5b/instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct)                    | +          | +    |             |                                                     |
+| [qwen2.5/coder/3b/instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct)                        | +          | +    |             |                                                     |
+| [qwen2.5/coder/7b/instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct)                        | +          | +    |             |                                                     |
+| [qwen2.5/coder/14b/instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct)                      | +          | +    |             |                                                     |
+| [qwen2.5/coder/32b/instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct)                      | +          | +    |             |                                                     |

## Usage

121 changes: 117 additions & 4 deletions refact_known_models/huggingface.py
@@ -22,6 +22,7 @@
        "required_memory_mb": 8000,
        "T": 4096, # in fact this model allows 16k context, but we have 4k context at max in hf inference
        "filter_caps": ["chat"],
+        "deprecated": True
    },
    "mistral/7b/instruct-v0.1": {
        "backend": "autogptq",
@@ -30,6 +31,7 @@
        "required_memory_mb": 8000,
        "T": 4096, # in fact this model allows 8k context, but we have 4k context at max in hf inference
        "filter_caps": ["chat"],
+        "deprecated": True
    },
    "mixtral/8x7b/instruct-v0.1": {
        "backend": "transformers",
@@ -50,6 +52,7 @@
        "required_memory_mb": 8000,
        "T": 4096, # in fact this model allows 16k context, but we have 4k context at max in hf inference
        "filter_caps": ["chat"],
+        "deprecated": True
    },
    "deepseek-coder/33b/instruct": {
        "backend": "transformers",
@@ -113,16 +116,126 @@
        },
        "required_memory_mb": 20000,
        "T": 8192,
-        "filter_caps": ["chat"],
+        "filter_caps": ["completion", "chat"],
    },
+    "llama3.1/8b/instruct": {
+        "backend": "transformers",
+        "model_path": "meta-llama/Llama-3.1-8B-Instruct",
+        "model_class_kwargs": {
+            "torch_dtype": "bf16",
+        },
+        "required_memory_mb": 20000,
+        "T": 16384, # in fact this model can handle 128K context
+        "filter_caps": ["completion", "chat"],
+    },
+    "llama3.2/3b/instruct": {
+        "backend": "transformers",
+        "model_path": "meta-llama/Llama-3.2-3B-Instruct",
+        "model_class_kwargs": {
+            "torch_dtype": "bf16",
+        },
+        "required_memory_mb": 12000,
+        "T": 16384, # in fact this model can handle 128K context
+        "filter_caps": ["completion", "chat"],
+    },
-    "deepseek-coder-v2/16b/instruct": {
+    "llama3.2/1b/instruct": {
        "backend": "transformers",
-        "model_path": "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
+        "model_path": "meta-llama/Llama-3.2-1B-Instruct",
        "model_class_kwargs": {
            "torch_dtype": "bf16",
        },
-        "required_memory_mb": 80000,
+        "required_memory_mb": 8000,
        "T": 16384, # in fact this model can handle 128K context
        "filter_caps": ["completion", "chat"],
    },
+    # qwen 2.5-coder instruct models
+    "qwen2.5/coder/32b/instruct": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-32B-Instruct",
+        "model_class_kwargs": {},
+        "required_memory_mb": 45000,
+        "T": 32768,
+        "filter_caps": ["completion", "chat"],
+    },
+    "qwen2.5/coder/14b/instruct": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-14B-Instruct",
+        "model_class_kwargs": {},
+        "required_memory_mb": 45000,
+        "T": 32768,
+        "filter_caps": ["completion", "chat"],
+    },
+    "qwen2.5/coder/7b/instruct": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-7B-Instruct",
+        "model_class_kwargs": {},
+        "required_memory_mb": 45000,
+        "T": 32768,
+        "filter_caps": ["completion", "chat"],
+    },
+    "qwen2.5/coder/3b/instruct": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-3B-Instruct",
+        "model_class_kwargs": {},
+        "required_memory_mb": 45000,
+        "T": 32768,
+        "filter_caps": ["completion", "chat"],
+    },
+    "qwen2.5/coder/1.5b/instruct": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-1.5B-Instruct",
+        "model_class_kwargs": {},
+        "required_memory_mb": 45000,
+        "T": 32768,
+        "filter_caps": ["completion", "chat"],
+    },
+    # qwen 2.5-coder completion models
+    "qwen2.5/coder/32b/base": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-32B",
+        "model_class_kwargs": {},
+        "required_memory_mb": 45000,
+        "T": 32768,
+        "filter_caps": ["completion", "finetune"],
+    },
+    "qwen2.5/coder/14b/base": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-14B",
+        "model_class_kwargs": {},
+        "required_memory_mb": 35000,
+        "T": 32768,
+        "filter_caps": ["completion", "finetune"],
+    },
+    "qwen2.5/coder/7b/base": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-7B",
+        "model_class_kwargs": {},
+        "required_memory_mb": 20000,
+        "T": 32768,
+        "filter_caps": ["completion", "finetune"],
+    },
+    "qwen2.5/coder/3b/base": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-3B",
+        "model_class_kwargs": {},
+        "required_memory_mb": 15000,
+        "T": 32768,
+        "filter_caps": ["completion", "finetune"],
+    },
+    "qwen2.5/coder/1.5b/base": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-1.5B",
+        "model_class_kwargs": {},
+        "required_memory_mb": 10000,
+        "T": 32768,
+        "filter_caps": ["completion", "finetune"],
+    },
+    "qwen2.5/coder/0.5b/base": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-0.5B",
+        "model_class_kwargs": {},
+        "required_memory_mb": 7000,
+        "T": 32768,
+        "filter_caps": ["completion", "finetune"],
+    },
}
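
Every entry above shares the same record shape: backend, model_path, model_class_kwargs, required_memory_mb, a context size T, filter_caps, and an optional deprecated flag. As a hedged sketch of how such a registry can be consumed (the helper below is an illustrative assumption, not code from this commit), capability filtering that skips deprecated models might look like:

# Hypothetical helper, not part of this commit: filter a registry of records
# shaped like the entries above by capability, skipping deprecated models.
from typing import Any, Dict, List

def models_with_cap(db: Dict[str, Dict[str, Any]], cap: str,
                    include_deprecated: bool = False) -> List[str]:
    return [
        name for name, rec in db.items()
        if cap in rec.get("filter_caps", [])
        and (include_deprecated or not rec.get("deprecated", False))
    ]

# e.g. models_with_cap(db, "finetune") would list the qwen2.5-coder base models
# added here, while the GPTQ chat models newly marked "deprecated": True would
# drop out of models_with_cap(db, "chat").
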