This repository has been archived by the owner on May 28, 2024. It is now read-only.

Increase input length, reduce batch size (#107)
Signed-off-by: Antoni Baum <[email protected]>
Yard1 authored Jun 3, 2023
1 parent 31e287f commit 5f1fa11
Showing 11 changed files with 28 additions and 11 deletions.
2 changes: 1 addition & 1 deletion aviary/backend/llm/predictor.py
@@ -76,7 +76,7 @@ def init_model(
     # will raise CUDA errors if use_kernel=True.
     batch_size = max_batch_size or 1
     prompt = [WARMUP_PROMPT] * (
-        int(llm_config.max_input_words / (len(WARMUP_PROMPT) + 1)) + 1
+        int(llm_config.max_input_words / (len(WARMUP_PROMPT.split()) + 1)) + 1
     )
     prompt = " ".join(prompt)
     logger.info(
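The one-line fix above swaps character counting for word counting. A minimal sketch, using a hypothetical WARMUP_PROMPT and max_input_words (the real values live in the aviary codebase), shows the effect: the old expression divided a word budget by a character count, so the joined warm-up prompt came out far shorter than max_input_words.

```python
# Hypothetical values for illustration; the real WARMUP_PROMPT and
# llm_config.max_input_words are defined in the aviary codebase.
WARMUP_PROMPT = "Write a short story."  # 20 characters, 4 words
max_input_words = 800

# Old: len(WARMUP_PROMPT) counts characters (20 here), so the prompt
# was repeated only int(800 / 21) + 1 = 39 times.
reps_before = int(max_input_words / (len(WARMUP_PROMPT) + 1)) + 1

# New: len(WARMUP_PROMPT.split()) counts words (4 here), giving
# int(800 / 5) + 1 = 161 repetitions, so the warm-up text actually
# approaches the configured word limit.
reps_after = int(max_input_words / (len(WARMUP_PROMPT.split()) + 1)) + 1

warmup = " ".join([WARMUP_PROMPT] * reps_after)
```

With the old formula this warm-up text is roughly 156 words; with the fix it is 644, much closer to the 800-word input limit the configs advertise.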
4 changes: 3 additions & 1 deletion models/CarperAI--stable-vicuna-13b-delta.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: CarperAI/stable-vicuna-13b-delta
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/restricted/models--CarperAI--stable-vicuna-13b-delta/main-safetensors/
@@ -27,9 +28,10 @@ model_config:
       torch_compile:
         backend: inductor
         mode: max-autotune
+      max_tokens: 1536
     pipeline: default
   generation:
-    max_batch_size: 12
+    max_batch_size: 8
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
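One way to read the three new numbers in these configs together (a back-of-envelope interpretation, not something the commit states): max_tokens bounds the whole sequence, so after reserving max_new_tokens for generated output, the remaining budget is what max_input_words: 800 must fit into, at an assumed ~1.28 tokens per English word.

```python
# Back-of-envelope check of the new limits; the tokens-per-word ratio
# is an assumed rule of thumb, not a value taken from the commit.
TOKENS_PER_WORD = 1.28  # assumption
max_tokens = 1536       # total sequence budget from the config
max_new_tokens = 512    # reserved for generated output

input_token_budget = max_tokens - max_new_tokens  # 1024 tokens for input
input_word_estimate = round(input_token_budget / TOKENS_PER_WORD)  # ~800
```

Under that assumption the 800-word input limit just fits the 1024 input tokens left over, which is consistent with the smaller batch sizes: longer sequences per request leave less GPU memory for concurrent requests.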
4 changes: 3 additions & 1 deletion models/OpenAssistant--oasst-sft-7-llama-30b-xor.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: OpenAssistant/oasst-sft-7-llama-30b-xor
+  max_input_words: 800
   model_description: "Open Assistant is a project meant to give everyone access to a great chat based large language model.\nWe believe that by doing this we will create a revolution in innovation in language. In the same way that stable-diffusion helped the world make art and images in new ways we hope Open Assistant can help improve the world by improving language itself."
   initialization:
     s3_mirror_config:
@@ -28,9 +29,10 @@ model_config:
       torch_compile:
         backend: inductor
         mode: max-autotune
+      max_tokens: 1536
     pipeline: default
   generation:
-    max_batch_size: 6
+    max_batch_size: 4
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
4 changes: 3 additions & 1 deletion models/amazon--LightGPT.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: amazon/LightGPT
+  max_input_words: 800
   initialization:
     runtime_env:
       pip:
@@ -26,9 +27,10 @@ model_config:
       from_pretrained_kwargs:
         use_cache: true
         use_kernel: true
+      max_tokens: 1536
     pipeline: default
   generation:
-    max_batch_size: 26
+    max_batch_size: 18
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
4 changes: 3 additions & 1 deletion models/databricks--dolly-v2-12b.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: databricks/dolly-v2-12b
+  max_input_words: 800
   model_description: "Databricks’ dolly-v2-12b, an instruction-following large language model trained on the Databricks machine learning platform that is licensed for commercial use. Based on pythia-12b, Dolly is trained on ~15k instruction/response fine tuning records databricks-dolly-15k generated by Databricks employees in capability domains from the InstructGPT paper, including brainstorming, classification, closed QA, generation, information extraction, open QA and summarization. dolly-v2-12b is not a state-of-the-art model, but does exhibit surprisingly high quality instruction following behavior not characteristic of the foundation model on which it is based.\n\nDolly v2 is also available in these smaller models sizes:\n\ndolly-v2-7b, a 6.9 billion parameter based on pythia-6.9b\ndolly-v2-3b, a 2.8 billion parameter based on pythia-2.8b\nPlease refer to the dolly GitHub repo for tips on running inference for various GPU configurations."
   initialization:
     s3_mirror_config:
@@ -24,9 +25,10 @@ model_config:
       from_pretrained_kwargs:
         use_cache: true
         use_kernel: true
+      max_tokens: 1536
     pipeline: default
   generation:
-    max_batch_size: 6
+    max_batch_size: 4
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
4 changes: 3 additions & 1 deletion models/h2oai--h2ogpt-oasst1-512-12b.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: h2oai/h2ogpt-oasst1-512-12b
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--h2oai--h2ogpt-oasst1-512-12b/main-safetensors/
@@ -24,9 +25,10 @@ model_config:
         trust_remote_code: true
         use_cache: true
         use_kernel: true
+      max_tokens: 1536
     pipeline: default
   generation:
-    max_batch_size: 6
+    max_batch_size: 4
     generate_kwargs:
       do_sample: true
       num_beams: 1
4 changes: 3 additions & 1 deletion models/lmsys--vicuna-13b-delta-v1.1.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: lmsys/vicuna-13b-delta-v1.1
+  max_input_words: 800
   model_description: "Vicuna is an open-source chatbot trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT. It is an auto-regressive language model, based on the transformer architecture."
   initialization:
     s3_mirror_config:
@@ -28,9 +29,10 @@ model_config:
       torch_compile:
         backend: inductor
         mode: max-autotune
+      max_tokens: 1536
     pipeline: default
   generation:
-    max_batch_size: 12
+    max_batch_size: 6
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
3 changes: 2 additions & 1 deletion models/mosaicml--mpt-7b-chat.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: mosaicml/mpt-7b-chat
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--mosaicml--mpt-7b-chat/main-safetensors/
@@ -29,7 +30,7 @@ model_config:
         mode: max-autotune
     pipeline: default
   generation:
-    max_batch_size: 22
+    max_batch_size: 8
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
3 changes: 2 additions & 1 deletion models/mosaicml--mpt-7b-instruct.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: mosaicml/mpt-7b-instruct
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--mosaicml--mpt-7b-instruct/main-safetensors/
@@ -29,7 +30,7 @@ model_config:
         mode: max-autotune
     pipeline: default
   generation:
-    max_batch_size: 22
+    max_batch_size: 8
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
3 changes: 2 additions & 1 deletion models/mosaicml--mpt-7b-storywriter.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: mosaicml/mpt-7b-storywriter
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--mosaicml--mpt-7b-storywriter/main-safetensors/
@@ -29,7 +30,7 @@ model_config:
         mode: max-autotune
     pipeline: default
   generation:
-    max_batch_size: 12
+    max_batch_size: 8
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
4 changes: 3 additions & 1 deletion models/stabilityai--stablelm-tuned-alpha-7b.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: stabilityai/stablelm-tuned-alpha-7b
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--stabilityai--stablelm-tuned-alpha-7b/main-safetensors/
@@ -23,9 +24,10 @@ model_config:
       from_pretrained_kwargs:
         use_cache: true
         use_kernel: true
+      max_tokens: 1536
     pipeline: default
   generation:
-    max_batch_size: 14
+    max_batch_size: 8
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
