Skip to content

Commit

Permalink
[Minor] Revert change in offline inference example (vllm-project#10545)
Browse files Browse the repository at this point in the history
Signed-off-by: Woosuk Kwon <[email protected]>
Signed-off-by: Tyler Michael Smith <[email protected]>
  • Loading branch information
WoosukKwon authored and tlrmchlsmth committed Nov 23, 2024
1 parent 0fac2ac commit 95e7212
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 78 deletions.
98 changes: 20 additions & 78 deletions examples/offline_inference.py
Original file line number Diff line number Diff line change
@@ -1,80 +1,22 @@
from dataclasses import asdict

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser


def get_prompts(num_prompts: int):
# The default sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]

if num_prompts != len(prompts):
prompts = (prompts * ((num_prompts // len(prompts)) + 1))[:num_prompts]

return prompts


def main(args):
# Create prompts
prompts = get_prompts(args.num_prompts)

# Create a sampling params object.
sampling_params = SamplingParams(n=args.n,
temperature=args.temperature,
top_p=args.top_p,
top_k=args.top_k,
max_tokens=args.max_tokens)

# Create an LLM.
# The default model is 'facebook/opt-125m'
engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**asdict(engine_args))

# Generate texts from the prompts.
# The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == '__main__':
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
group = parser.add_argument_group("SamplingParams options")
group.add_argument("--num-prompts",
type=int,
default=4,
help="Number of prompts used for inference")
group.add_argument("--max-tokens",
type=int,
default=16,
help="Generated output length for sampling")
group.add_argument('--n',
type=int,
default=1,
help='Number of generated sequences per prompt')
group.add_argument('--temperature',
type=float,
default=0.8,
help='Temperature for text generation')
group.add_argument('--top-p',
type=float,
default=0.95,
help='top_p for text generation')
group.add_argument('--top-k',
type=int,
default=-1,
help='top_k for text generation')

args = parser.parse_args()
main(args)
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="facebook/opt-125m")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
80 changes: 80 additions & 0 deletions examples/offline_inference_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from dataclasses import asdict

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser


def get_prompts(num_prompts: int):
# The default sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]

if num_prompts != len(prompts):
prompts = (prompts * ((num_prompts // len(prompts)) + 1))[:num_prompts]

return prompts


def main(args):
# Create prompts
prompts = get_prompts(args.num_prompts)

# Create a sampling params object.
sampling_params = SamplingParams(n=args.n,
temperature=args.temperature,
top_p=args.top_p,
top_k=args.top_k,
max_tokens=args.max_tokens)

# Create an LLM.
# The default model is 'facebook/opt-125m'
engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**asdict(engine_args))

# Generate texts from the prompts.
# The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == '__main__':
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
group = parser.add_argument_group("SamplingParams options")
group.add_argument("--num-prompts",
type=int,
default=4,
help="Number of prompts used for inference")
group.add_argument("--max-tokens",
type=int,
default=16,
help="Generated output length for sampling")
group.add_argument('--n',
type=int,
default=1,
help='Number of generated sequences per prompt')
group.add_argument('--temperature',
type=float,
default=0.8,
help='Temperature for text generation')
group.add_argument('--top-p',
type=float,
default=0.95,
help='top_p for text generation')
group.add_argument('--top-k',
type=int,
default=-1,
help='top_k for text generation')

args = parser.parse_args()
main(args)

0 comments on commit 95e7212

Please sign in to comment.