From 5f1fa11c6284e32d37c097e1e237c3635b055059 Mon Sep 17 00:00:00 2001
From: Antoni Baum
Date: Fri, 2 Jun 2023 17:09:19 -0700
Subject: [PATCH] Increase input length, reduce batch size (#107)

Signed-off-by: Antoni Baum
---
 aviary/backend/llm/predictor.py                      | 2 +-
 models/CarperAI--stable-vicuna-13b-delta.yaml        | 4 +++-
 models/OpenAssistant--oasst-sft-7-llama-30b-xor.yaml | 4 +++-
 models/amazon--LightGPT.yaml                         | 4 +++-
 models/databricks--dolly-v2-12b.yaml                 | 4 +++-
 models/h2oai--h2ogpt-oasst1-512-12b.yaml             | 4 +++-
 models/lmsys--vicuna-13b-delta-v1.1.yaml             | 4 +++-
 models/mosaicml--mpt-7b-chat.yaml                    | 3 ++-
 models/mosaicml--mpt-7b-instruct.yaml                | 3 ++-
 models/mosaicml--mpt-7b-storywriter.yaml             | 3 ++-
 models/stabilityai--stablelm-tuned-alpha-7b.yaml     | 4 +++-
 11 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/aviary/backend/llm/predictor.py b/aviary/backend/llm/predictor.py
index 30c9dc21..1d2552ab 100644
--- a/aviary/backend/llm/predictor.py
+++ b/aviary/backend/llm/predictor.py
@@ -76,7 +76,7 @@ def init_model(
     # will raise CUDA errors if use_kernel=True.
     batch_size = max_batch_size or 1
     prompt = [WARMUP_PROMPT] * (
-        int(llm_config.max_input_words / (len(WARMUP_PROMPT) + 1)) + 1
+        int(llm_config.max_input_words / (len(WARMUP_PROMPT.split()) + 1)) + 1
     )
     prompt = " ".join(prompt)
     logger.info(
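The predictor change above fixes the warmup-prompt sizing: the old code divided `max_input_words` by `len(WARMUP_PROMPT)`, which counts *characters*, so the repeated warmup prompt ended up far shorter than the configured input cap and warmup never exercised the true maximum length. A minimal sketch of the arithmetic, using a hypothetical `WARMUP_PROMPT` value (the real constant lives elsewhere in the aviary backend):

```python
# Sketch of the warmup sizing before and after this patch.
# WARMUP_PROMPT below is an assumed placeholder, not aviary's actual constant.
WARMUP_PROMPT = "Write a short story."  # assumption: 20 characters, 4 words
max_input_words = 800                   # new value from the YAMLs in this patch

# Before: divides by the prompt's *character* count -> too few repetitions.
n_old = int(max_input_words / (len(WARMUP_PROMPT) + 1)) + 1          # 39
# After: divides by the prompt's *word* count, as the name implies.
n_new = int(max_input_words / (len(WARMUP_PROMPT.split()) + 1)) + 1  # 161

words_per_repeat = len(WARMUP_PROMPT.split())
print(n_old * words_per_repeat)  # 156 words: warmup far below the 800-word cap
print(n_new * words_per_repeat)  # 644 words: much closer to the real maximum
```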
diff --git a/models/CarperAI--stable-vicuna-13b-delta.yaml b/models/CarperAI--stable-vicuna-13b-delta.yaml
index 6535425d..2955d524 100644
--- a/models/CarperAI--stable-vicuna-13b-delta.yaml
+++ b/models/CarperAI--stable-vicuna-13b-delta.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: CarperAI/stable-vicuna-13b-delta
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/restricted/models--CarperAI--stable-vicuna-13b-delta/main-safetensors/
@@ -27,9 +28,10 @@ model_config:
     torch_compile:
       backend: inductor
       mode: max-autotune
+  max_tokens: 1536
   pipeline: default
   generation:
-    max_batch_size: 12
+    max_batch_size: 8
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
diff --git a/models/OpenAssistant--oasst-sft-7-llama-30b-xor.yaml b/models/OpenAssistant--oasst-sft-7-llama-30b-xor.yaml
index 89a57b75..57868050 100644
--- a/models/OpenAssistant--oasst-sft-7-llama-30b-xor.yaml
+++ b/models/OpenAssistant--oasst-sft-7-llama-30b-xor.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: OpenAssistant/oasst-sft-7-llama-30b-xor
+  max_input_words: 800
   model_description: "Open Assistant is a project meant to give everyone access to a great chat based large language model.\nWe believe that by doing this we will create a revolution in innovation in language. In the same way that stable-diffusion helped the world make art and images in new ways we hope Open Assistant can help improve the world by improving language itself."
   initialization:
     s3_mirror_config:
@@ -28,9 +29,10 @@ model_config:
     torch_compile:
       backend: inductor
       mode: max-autotune
+  max_tokens: 1536
   pipeline: default
   generation:
-    max_batch_size: 6
+    max_batch_size: 4
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
diff --git a/models/amazon--LightGPT.yaml b/models/amazon--LightGPT.yaml
index 69d626d6..5bf11d55 100644
--- a/models/amazon--LightGPT.yaml
+++ b/models/amazon--LightGPT.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: amazon/LightGPT
+  max_input_words: 800
   initialization:
     runtime_env:
       pip:
@@ -26,9 +27,10 @@ model_config:
     from_pretrained_kwargs:
       use_cache: true
     use_kernel: true
+  max_tokens: 1536
   pipeline: default
   generation:
-    max_batch_size: 26
+    max_batch_size: 18
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
diff --git a/models/databricks--dolly-v2-12b.yaml b/models/databricks--dolly-v2-12b.yaml
index 903cb393..c8c71745 100644
--- a/models/databricks--dolly-v2-12b.yaml
+++ b/models/databricks--dolly-v2-12b.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: databricks/dolly-v2-12b
+  max_input_words: 800
   model_description: "Databricks’ dolly-v2-12b, an instruction-following large language model trained on the Databricks machine learning platform that is licensed for commercial use. Based on pythia-12b, Dolly is trained on ~15k instruction/response fine tuning records databricks-dolly-15k generated by Databricks employees in capability domains from the InstructGPT paper, including brainstorming, classification, closed QA, generation, information extraction, open QA and summarization. dolly-v2-12b is not a state-of-the-art model, but does exhibit surprisingly high quality instruction following behavior not characteristic of the foundation model on which it is based.\n\nDolly v2 is also available in these smaller models sizes:\n\ndolly-v2-7b, a 6.9 billion parameter based on pythia-6.9b\ndolly-v2-3b, a 2.8 billion parameter based on pythia-2.8b\nPlease refer to the dolly GitHub repo for tips on running inference for various GPU configurations."
   initialization:
     s3_mirror_config:
@@ -24,9 +25,10 @@ model_config:
     from_pretrained_kwargs:
       use_cache: true
     use_kernel: true
+  max_tokens: 1536
   pipeline: default
   generation:
-    max_batch_size: 6
+    max_batch_size: 4
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
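Across these model YAMLs the pattern is the same: `max_input_words: 800` enters `model_config`, the total sequence budget becomes `max_tokens: 1536`, and `max_new_tokens` stays at 512. A back-of-envelope consistency check, a sketch only, since the actual tokens-per-word ratio depends on each model's tokenizer:

```python
# Rough check that the new limits fit together; the tokens-per-word
# ratio is a tokenizer-dependent assumption, not a value set in this patch.
max_input_words = 800   # new model_config value
max_tokens = 1536       # new total sequence budget (input + generated)
max_new_tokens = 512    # unchanged generate_kwargs value

input_token_budget = max_tokens - max_new_tokens  # 1024 tokens left for input
implied_ratio = input_token_budget / max_input_words
print(f"{input_token_budget} input tokens for {max_input_words} words "
      f"-> ~{implied_ratio:.2f} tokens per word of headroom")
```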
diff --git a/models/h2oai--h2ogpt-oasst1-512-12b.yaml b/models/h2oai--h2ogpt-oasst1-512-12b.yaml
index e0c0c279..639d3732 100644
--- a/models/h2oai--h2ogpt-oasst1-512-12b.yaml
+++ b/models/h2oai--h2ogpt-oasst1-512-12b.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: h2oai/h2ogpt-oasst1-512-12b
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--h2oai--h2ogpt-oasst1-512-12b/main-safetensors/
@@ -24,9 +25,10 @@ model_config:
     from_pretrained_kwargs:
       trust_remote_code: true
       use_cache: true
     use_kernel: true
+  max_tokens: 1536
   pipeline: default
   generation:
-    max_batch_size: 6
+    max_batch_size: 4
     generate_kwargs:
       do_sample: true
       num_beams: 1
diff --git a/models/lmsys--vicuna-13b-delta-v1.1.yaml b/models/lmsys--vicuna-13b-delta-v1.1.yaml
index b852f974..23207a15 100644
--- a/models/lmsys--vicuna-13b-delta-v1.1.yaml
+++ b/models/lmsys--vicuna-13b-delta-v1.1.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: lmsys/vicuna-13b-delta-v1.1
+  max_input_words: 800
   model_description: "Vicuna is an open-source chatbot trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT. It is an auto-regressive language model, based on the transformer architecture."
   initialization:
     s3_mirror_config:
@@ -28,9 +29,10 @@ model_config:
     torch_compile:
       backend: inductor
       mode: max-autotune
+  max_tokens: 1536
   pipeline: default
   generation:
-    max_batch_size: 12
+    max_batch_size: 6
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
diff --git a/models/mosaicml--mpt-7b-chat.yaml b/models/mosaicml--mpt-7b-chat.yaml
index 0388ce04..49b98341 100644
--- a/models/mosaicml--mpt-7b-chat.yaml
+++ b/models/mosaicml--mpt-7b-chat.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: mosaicml/mpt-7b-chat
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--mosaicml--mpt-7b-chat/main-safetensors/
@@ -29,7 +30,7 @@ model_config:
       mode: max-autotune
   pipeline: default
   generation:
-    max_batch_size: 22
+    max_batch_size: 8
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
diff --git a/models/mosaicml--mpt-7b-instruct.yaml b/models/mosaicml--mpt-7b-instruct.yaml
index c9c5dc65..d1ac2e09 100644
--- a/models/mosaicml--mpt-7b-instruct.yaml
+++ b/models/mosaicml--mpt-7b-instruct.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: mosaicml/mpt-7b-instruct
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--mosaicml--mpt-7b-instruct/main-safetensors/
@@ -29,7 +30,7 @@ model_config:
      mode: max-autotune
   pipeline: default
   generation:
-    max_batch_size: 22
+    max_batch_size: 8
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
diff --git a/models/mosaicml--mpt-7b-storywriter.yaml b/models/mosaicml--mpt-7b-storywriter.yaml
index 78495167..60fe421c 100644
--- a/models/mosaicml--mpt-7b-storywriter.yaml
+++ b/models/mosaicml--mpt-7b-storywriter.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: mosaicml/mpt-7b-storywriter
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--mosaicml--mpt-7b-storywriter/main-safetensors/
@@ -29,7 +30,7 @@ model_config:
       mode: max-autotune
   pipeline: default
   generation:
-    max_batch_size: 12
+    max_batch_size: 8
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
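The smaller `max_batch_size` values compensate for the longer sequences: per-request memory during generation is dominated by the KV cache, which grows linearly with sequence length, so a fixed GPU budget supports proportionally fewer concurrent requests. A hedged back-of-envelope, with an assumed 7B-class model shape and an assumed previous sequence cap (the old cap is not visible in this diff):

```python
# Rough KV-cache estimate showing why longer inputs force smaller batches.
# The model shape and the old sequence cap are assumptions for illustration.
n_layers, n_heads, head_dim = 32, 32, 128  # assumed 7B-class configuration
bytes_per_elem = 2                         # fp16

def kv_cache_bytes(batch_size: int, seq_len: int) -> int:
    # 2x for keys and values, cached per layer for every token in the batch.
    return 2 * n_layers * n_heads * head_dim * seq_len * batch_size * bytes_per_elem

old = kv_cache_bytes(batch_size=22, seq_len=1024)  # e.g. mpt-7b-chat before
new = kv_cache_bytes(batch_size=8, seq_len=1536)   # and after this patch
print(f"old ~{old / 2**30:.1f} GiB, new ~{new / 2**30:.1f} GiB of KV cache")
```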
diff --git a/models/stabilityai--stablelm-tuned-alpha-7b.yaml b/models/stabilityai--stablelm-tuned-alpha-7b.yaml
index cb368f5d..3774f6b1 100644
--- a/models/stabilityai--stablelm-tuned-alpha-7b.yaml
+++ b/models/stabilityai--stablelm-tuned-alpha-7b.yaml
@@ -14,6 +14,7 @@ deployment_config:
     accelerator_type_cpu: 0.01
 model_config:
   model_id: stabilityai/stablelm-tuned-alpha-7b
+  max_input_words: 800
   initialization:
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--stabilityai--stablelm-tuned-alpha-7b/main-safetensors/
@@ -23,9 +24,10 @@ model_config:
     from_pretrained_kwargs:
       use_cache: true
     use_kernel: true
+  max_tokens: 1536
   pipeline: default
   generation:
-    max_batch_size: 14
+    max_batch_size: 8
     generate_kwargs:
       do_sample: true
       max_new_tokens: 512
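Finally, a small script one could run over the updated `models/` YAMLs to confirm the new settings are present and mutually consistent; the field paths follow the hunks above, but the checks themselves are illustrative assumptions rather than part of this patch:

```python
# Hypothetical sanity check for the updated model YAMLs (not part of the patch).
import sys
import yaml  # PyPI package: pyyaml

def check(path: str) -> None:
    with open(path) as f:
        cfg = yaml.safe_load(f)["model_config"]
    generation = cfg["generation"]
    assert cfg["max_input_words"] > 0
    assert generation["max_batch_size"] > 0
    # The MPT configs in this patch gain no max_tokens key, so guard for it.
    max_new = generation["generate_kwargs"]["max_new_tokens"]
    if "max_tokens" in cfg:
        assert max_new < cfg["max_tokens"], f"{path}: no room left for input"
    print(f"{path}: ok")

if __name__ == "__main__":
    for path in sys.argv[1:]:
        check(path)
```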