From 5f1fa11c6284e32d37c097e1e237c3635b055059 Mon Sep 17 00:00:00 2001
From: Antoni Baum
Date: Fri, 2 Jun 2023 17:09:19 -0700
Subject: [PATCH] Increase input length, reduce batch size (#107)

Signed-off-by: Antoni Baum
---
 aviary/backend/llm/predictor.py                      | 2 +-
 models/CarperAI--stable-vicuna-13b-delta.yaml        | 4 +++-
 models/OpenAssistant--oasst-sft-7-llama-30b-xor.yaml | 4 +++-
 models/amazon--LightGPT.yaml                         | 4 +++-
 models/databricks--dolly-v2-12b.yaml                 | 4 +++-
 models/h2oai--h2ogpt-oasst1-512-12b.yaml             | 4 +++-
 models/lmsys--vicuna-13b-delta-v1.1.yaml             | 4 +++-
 models/mosaicml--mpt-7b-chat.yaml                    | 3 ++-
 models/mosaicml--mpt-7b-instruct.yaml                | 3 ++-
 models/mosaicml--mpt-7b-storywriter.yaml             | 3 ++-
 models/stabilityai--stablelm-tuned-alpha-7b.yaml     | 4 +++-
 11 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/aviary/backend/llm/predictor.py b/aviary/backend/llm/predictor.py
index 30c9dc21..1d2552ab 100644
--- a/aviary/backend/llm/predictor.py
+++ b/aviary/backend/llm/predictor.py
@@ -76,7 +76,7 @@ def init_model(
     # will raise CUDA errors if use_kernel=True.
     batch_size = max_batch_size or 1
     prompt = [WARMUP_PROMPT] * (
-        int(llm_config.max_input_words / (len(WARMUP_PROMPT) + 1)) + 1
+        int(llm_config.max_input_words / (len(WARMUP_PROMPT.split()) + 1)) + 1
     )
     prompt = " ".join(prompt)
     logger.info(
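The predictor change above fixes the warmup-prompt sizing: the old code divided `max_input_words` by `len(WARMUP_PROMPT)`, which counts *characters*, so the repeated warmup prompt ended up far shorter than the configured input cap and warmup never exercised the true maximum length. A minimal sketch of the arithmetic, using a hypothetical `WARMUP_PROMPT` value (the real constant lives elsewhere in the aviary backend):

```python
# Sketch of the warmup sizing before and after this patch.
# WARMUP_PROMPT below is an assumed placeholder, not aviary's actual constant.
WARMUP_PROMPT = "Write a short story."  # assumption: 20 characters, 4 words
max_input_words = 800                   # new value from the YAMLs in this patch

# Before: divides by the prompt's *character* count -> too few repetitions.
n_old = int(max_input_words / (len(WARMUP_PROMPT) + 1)) + 1          # 39
# After: divides by the prompt's *word* count, as the name implies.
n_new = int(max_input_words / (len(WARMUP_PROMPT.split()) + 1)) + 1  # 161

words_per_repeat = len(WARMUP_PROMPT.split())
print(n_old * words_per_repeat)  # 156 words: warmup far below the 800-word cap
print(n_new * words_per_repeat)  # 644 words: much closer to the real maximum
```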
diff --git a/models/CarperAI--stable-vicuna-13b-delta.yaml b/models/CarperAI--stable-vicuna-13b-delta.yaml
index 6535425d..2955d524 100644
--- a/models/CarperAI--stable-vicuna-13b-delta.yaml
+++ b/models/CarperAI--stable-vicuna-13b-delta.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: CarperAI/stable-vicuna-13b-delta
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/restricted/models--CarperAI--stable-vicuna-13b-delta/main-safetensors/
@@ -27,9 +28,10 @@ model_config:
     torch_compile:
       backend: inductor
       mode: max-autotune
+  max_tokens: 1536
   pipeline: default
   generation:
-    max_batch_size: 12
+    max_batch_size: 8
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
diff --git a/models/OpenAssistant--oasst-sft-7-llama-30b-xor.yaml b/models/OpenAssistant--oasst-sft-7-llama-30b-xor.yaml
index 89a57b75..57868050 100644
--- a/models/OpenAssistant--oasst-sft-7-llama-30b-xor.yaml
+++ b/models/OpenAssistant--oasst-sft-7-llama-30b-xor.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: OpenAssistant/oasst-sft-7-llama-30b-xor
+  max_input_words: 800
   model_description: "Open Assistant is a project meant to give everyone access to a great chat based large language model.\nWe believe that by doing this we will create a revolution in innovation in language. In the same way that stable-diffusion helped the world make art and images in new ways we hope Open Assistant can help improve the world by improving language itself."
   initialization:
     s3_mirror_config:
@@ -28,9 +29,10 @@ model_config:
     torch_compile:
       backend: inductor
       mode: max-autotune
+  max_tokens: 1536
   pipeline: default
   generation:
-    max_batch_size: 6
+    max_batch_size: 4
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
diff --git a/models/amazon--LightGPT.yaml b/models/amazon--LightGPT.yaml
index 69d626d6..5bf11d55 100644
--- a/models/amazon--LightGPT.yaml
+++ b/models/amazon--LightGPT.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: amazon/LightGPT
+  max_input_words: 800
   initialization:
     runtime_env:
       pip:
@@ -26,9 +27,10 @@ model_config:
     from_pretrained_kwargs:
       use_cache: true
     use_kernel: true
+  max_tokens: 1536
   pipeline: default
   generation:
-    max_batch_size: 26
+    max_batch_size: 18
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
diff --git a/models/databricks--dolly-v2-12b.yaml b/models/databricks--dolly-v2-12b.yaml
index 903cb393..c8c71745 100644
--- a/models/databricks--dolly-v2-12b.yaml
+++ b/models/databricks--dolly-v2-12b.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: databricks/dolly-v2-12b
+  max_input_words: 800
   model_description: "Databricks’ dolly-v2-12b, an instruction-following large language model trained on the Databricks machine learning platform that is licensed for commercial use. Based on pythia-12b, Dolly is trained on ~15k instruction/response fine tuning records databricks-dolly-15k generated by Databricks employees in capability domains from the InstructGPT paper, including brainstorming, classification, closed QA, generation, information extraction, open QA and summarization. dolly-v2-12b is not a state-of-the-art model, but does exhibit surprisingly high quality instruction following behavior not characteristic of the foundation model on which it is based.\n\nDolly v2 is also available in these smaller models sizes:\n\ndolly-v2-7b, a 6.9 billion parameter based on pythia-6.9b\ndolly-v2-3b, a 2.8 billion parameter based on pythia-2.8b\nPlease refer to the dolly GitHub repo for tips on running inference for various GPU configurations."
   initialization:
     s3_mirror_config:
@@ -24,9 +25,10 @@ model_config:
     from_pretrained_kwargs:
       use_cache: true
     use_kernel: true
+  max_tokens: 1536
   pipeline: default
   generation:
-    max_batch_size: 6
+    max_batch_size: 4
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
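Across these model YAMLs the pattern is the same: `max_input_words: 800` enters `model_config`, the total sequence budget becomes `max_tokens: 1536`, and `max_new_tokens` stays at 512. A back-of-envelope consistency check, a sketch only, since the actual tokens-per-word ratio depends on each model's tokenizer:

```python
# Rough check that the new limits fit together; the tokens-per-word
# ratio is a tokenizer-dependent assumption, not a value set in this patch.
max_input_words = 800   # new model_config value
max_tokens = 1536       # new total sequence budget (input + generated)
max_new_tokens = 512    # unchanged generate_kwargs value

input_token_budget = max_tokens - max_new_tokens  # 1024 tokens left for input
implied_ratio = input_token_budget / max_input_words
print(f"{input_token_budget} input tokens for {max_input_words} words "
      f"-> ~{implied_ratio:.2f} tokens per word of headroom")
```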
diff --git a/models/h2oai--h2ogpt-oasst1-512-12b.yaml b/models/h2oai--h2ogpt-oasst1-512-12b.yaml
index e0c0c279..639d3732 100644
--- a/models/h2oai--h2ogpt-oasst1-512-12b.yaml
+++ b/models/h2oai--h2ogpt-oasst1-512-12b.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: h2oai/h2ogpt-oasst1-512-12b
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--h2oai--h2ogpt-oasst1-512-12b/main-safetensors/
@@ -24,9 +25,10 @@ model_config:
     from_pretrained_kwargs:
       trust_remote_code: true
       use_cache: true
     use_kernel: true
+  max_tokens: 1536
   pipeline: default
   generation:
-    max_batch_size: 6
+    max_batch_size: 4
     generate_kwargs:
       do_sample: true
       num_beams: 1
diff --git a/models/lmsys--vicuna-13b-delta-v1.1.yaml b/models/lmsys--vicuna-13b-delta-v1.1.yaml
index b852f974..23207a15 100644
--- a/models/lmsys--vicuna-13b-delta-v1.1.yaml
+++ b/models/lmsys--vicuna-13b-delta-v1.1.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: lmsys/vicuna-13b-delta-v1.1
+  max_input_words: 800
   model_description: "Vicuna is an open-source chatbot trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT. It is an auto-regressive language model, based on the transformer architecture."
   initialization:
     s3_mirror_config:
@@ -28,9 +29,10 @@ model_config:
     torch_compile:
       backend: inductor
       mode: max-autotune
+  max_tokens: 1536
   pipeline: default
   generation:
-    max_batch_size: 12
+    max_batch_size: 6
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
diff --git a/models/mosaicml--mpt-7b-chat.yaml b/models/mosaicml--mpt-7b-chat.yaml
index 0388ce04..49b98341 100644
--- a/models/mosaicml--mpt-7b-chat.yaml
+++ b/models/mosaicml--mpt-7b-chat.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: mosaicml/mpt-7b-chat
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--mosaicml--mpt-7b-chat/main-safetensors/
@@ -29,7 +30,7 @@ model_config:
       mode: max-autotune
   pipeline: default
   generation:
-    max_batch_size: 22
+    max_batch_size: 8
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
diff --git a/models/mosaicml--mpt-7b-instruct.yaml b/models/mosaicml--mpt-7b-instruct.yaml
index c9c5dc65..d1ac2e09 100644
--- a/models/mosaicml--mpt-7b-instruct.yaml
+++ b/models/mosaicml--mpt-7b-instruct.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: mosaicml/mpt-7b-instruct
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--mosaicml--mpt-7b-instruct/main-safetensors/
@@ -29,7 +30,7 @@ model_config:
      mode: max-autotune
   pipeline: default
   generation:
-    max_batch_size: 22
+    max_batch_size: 8
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
diff --git a/models/mosaicml--mpt-7b-storywriter.yaml b/models/mosaicml--mpt-7b-storywriter.yaml
index 78495167..60fe421c 100644
--- a/models/mosaicml--mpt-7b-storywriter.yaml
+++ b/models/mosaicml--mpt-7b-storywriter.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: mosaicml/mpt-7b-storywriter
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--mosaicml--mpt-7b-storywriter/main-safetensors/
@@ -29,7 +30,7 @@ model_config:
       mode: max-autotune
   pipeline: default
   generation:
-    max_batch_size: 12
+    max_batch_size: 8
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
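The smaller `max_batch_size` values compensate for the longer sequences: per-request memory during generation is dominated by the KV cache, which grows linearly with sequence length, so a fixed GPU budget supports proportionally fewer concurrent requests. A hedged back-of-envelope, with an assumed 7B-class model shape and an assumed previous sequence cap (the old cap is not visible in this diff):

```python
# Rough KV-cache estimate showing why longer inputs force smaller batches.
# The model shape and the old sequence cap are assumptions for illustration.
n_layers, n_heads, head_dim = 32, 32, 128  # assumed 7B-class configuration
bytes_per_elem = 2                         # fp16

def kv_cache_bytes(batch_size: int, seq_len: int) -> int:
    # 2x for keys and values, cached per layer for every token in the batch.
    return 2 * n_layers * n_heads * head_dim * seq_len * batch_size * bytes_per_elem

old = kv_cache_bytes(batch_size=22, seq_len=1024)  # e.g. mpt-7b-chat before
new = kv_cache_bytes(batch_size=8, seq_len=1536)   # and after this patch
print(f"old ~{old / 2**30:.1f} GiB, new ~{new / 2**30:.1f} GiB of KV cache")
```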
diff --git a/models/stabilityai--stablelm-tuned-alpha-7b.yaml b/models/stabilityai--stablelm-tuned-alpha-7b.yaml
index cb368f5d..3774f6b1 100644
--- a/models/stabilityai--stablelm-tuned-alpha-7b.yaml
+++ b/models/stabilityai--stablelm-tuned-alpha-7b.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: stabilityai/stablelm-tuned-alpha-7b
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--stabilityai--stablelm-tuned-alpha-7b/main-safetensors/
@@ -23,9 +24,10 @@ model_config:
     from_pretrained_kwargs:
       use_cache: true
     use_kernel: true
+  max_tokens: 1536
   pipeline: default
   generation:
-    max_batch_size: 14
+    max_batch_size: 8
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
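Finally, a small script one could run over the updated `models/` YAMLs to confirm the new settings are present and mutually consistent; the field paths follow the hunks above, but the checks themselves are illustrative assumptions rather than part of this patch:

```python
# Hypothetical sanity check for the updated model YAMLs (not part of the patch).
import sys
import yaml  # PyPI package: pyyaml

def check(path: str) -> None:
    with open(path) as f:
        cfg = yaml.safe_load(f)["model_config"]
    generation = cfg["generation"]
    assert cfg["max_input_words"] > 0
    assert generation["max_batch_size"] > 0
    # The MPT configs in this patch gain no max_tokens key, so guard for it.
    max_new = generation["generate_kwargs"]["max_new_tokens"]
    if "max_tokens" in cfg:
        assert max_new < cfg["max_tokens"], f"{path}: no room left for input"
    print(f"{path}: ok")

if __name__ == "__main__":
    for path in sys.argv[1:]:
        check(path)
```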