Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Misc] Rename manager arguments #9

Merged
merged 9 commits into from
Aug 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 19 additions & 19 deletions benchmark/benchmark_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,8 @@ def calculate_cdf(latencies):
print(f"{hist=}")
print(f"{cumsum=}")

def plot_latency_cdf(req_latencies, prefill_latencies, decode_latencies, results_filename):
fig_filename = os.path.splitext(results_filename)[0] + "_latency.png"
def plot_latency_cdf(req_latencies, prefill_latencies, decode_latencies, log_filename):
fig_filename = os.path.splitext(log_filename)[0] + "_latency.png"
fig, (ax_req, ax_prefill, ax_decode) = plt.subplots(1, 3, figsize=(3*7, 4.8))

def plot_single(ax, latencies, is_prefill=False):
Expand Down Expand Up @@ -286,8 +286,8 @@ def plot_single(ax, latencies, is_prefill=False):
plt.suptitle(fig_filename_title, fontsize=6)
fig.savefig(fig_filename)

def plot_len_cdf(prompt_lens, response_lens, total_tokens, results_filename):
fig_filename = os.path.splitext(results_filename)[0] + "_len.png"
def plot_len_cdf(prompt_lens, response_lens, total_tokens, log_filename):
fig_filename = os.path.splitext(log_filename)[0] + "_len.png"
fig, (ax_prompt, ax_response, ax_total) = plt.subplots(1, 3, figsize=(3*7, 4.8))

def plot_single(ax, lens, x_label_str, title_str):
Expand Down Expand Up @@ -328,8 +328,8 @@ def plot_single(ax, lens, x_label_str, title_str):
plt.suptitle(fig_filename_title, fontsize=6)
fig.savefig(fig_filename)

def plot_instance(results_filename_0):
current_dir = os.path.dirname(os.path.abspath(results_filename_0))
def plot_instance(log_filename_0):
current_dir = os.path.dirname(os.path.abspath(log_filename_0))
log_files = glob.glob(os.path.join(current_dir, '*.log_instance.csv'))
log_files.sort(key=os.path.getmtime, reverse=True)
df_0 = pd.read_csv(log_files[0]).sort_values(by=["timestamp"])
Expand All @@ -347,15 +347,15 @@ def plot_instance(results_filename_0):
fig, ax = plt.subplots()
ax.plot(timestamp_list_0, instance_num_list_0, color="red", label=f"instance_num(avg {avg_instance_num} /s)")
ax.legend(loc='upper left')
fig_filename = os.path.splitext(results_filename_0)[0] + "_instance.png"
fig_filename = os.path.splitext(log_filename_0)[0] + "_instance.png"
index1 = fig_filename.rfind('/')
index2 = fig_filename.rfind('/', 0, index1)
fig_filename_title = fig_filename[index2 + 1:]
plt.suptitle(fig_filename_title, fontsize=6)
fig.savefig(fig_filename)
return avg_instance_num

def save_all_latencies_npy(all_token_latencies:List[np.ndarray], results_filename):
def save_all_latencies_npy(all_token_latencies:List[np.ndarray], log_filename):
dtype = [('timestamp',float),('latency',float)]
all_lat_pairs = []
for arr in all_token_latencies:
Expand All @@ -364,7 +364,7 @@ def save_all_latencies_npy(all_token_latencies:List[np.ndarray], results_filenam
all_lat_pairs.append((pair[0],pair[1]))
all_lat_pairs = np.array(all_lat_pairs,dtype=dtype)
all_lat_pairs = np.sort(all_lat_pairs,order='timestamp')
np.save(os.path.splitext(results_filename)[0], all_lat_pairs)
np.save(os.path.splitext(log_filename)[0], all_lat_pairs)

class MeasureLatency:
def __init__(self):
Expand Down Expand Up @@ -423,7 +423,7 @@ async def benchmark(
prompts: List[str],
allow_variable_generation_length: bool,
verbose: bool,
results_filename: str,
log_filename: str,
ip_ports: List[int],
distribution: str,
qps: float,
Expand Down Expand Up @@ -475,9 +475,9 @@ async def benchmark(
m._latencies, m._per_token_latencies, m._inference_latencies, m._request_ids, m._decode_latencies, m._request_lens,
log_latencies, fail_on_response_failure)
calculate_cdf(m._latencies)
plot_latency_cdf(m._latencies, m._prefill_token_latencies, m._decode_token_latencies, results_filename)
save_all_latencies_npy(m._all_latencies, results_filename)
# avg_instance_num = plot_instance(results_filename)
plot_latency_cdf(m._latencies, m._prefill_token_latencies, m._decode_token_latencies, log_filename)
save_all_latencies_npy(m._all_latencies, log_filename)
# avg_instance_num = plot_instance(log_filename)
avg_instance_num = 0.0
return throughput, m._prefill_token_latencies, m._decode_token_latencies, m._inference_latencies, avg_instance_num, m._latencies, m._request_ids, m._decode_latencies, m._request_lens, m._all_decode_latencies

Expand Down Expand Up @@ -655,7 +655,7 @@ def main():
parser.add_argument('-v', '--verbose', action='store_true')
parser.add_argument('--backend', type=GenerationBackend,
choices=[e.name for e in GenerationBackend], default='vLLM')
parser.add_argument('--results_filename', type=str, default='benchmark.log')
parser.add_argument('--log_filename', type=str, default='benchmark.log')
parser.add_argument('--ip_ports', nargs='+', required=True, help='List of ip:port')
parser.add_argument('--random_prompt_lens_mean', type=int)
parser.add_argument('--random_prompt_lens_range', type=int)
Expand Down Expand Up @@ -692,7 +692,7 @@ def main():
# parser.add_argument('--calculate_begin_ratio', type=float, default=0.5)
# parser.add_argument('--calculate_end_ratio', type=float, default=0.8)

parser.add_argument('--enable_migrate', type=int ,default=0)
parser.add_argument('--enable_migration', type=int ,default=0)
parser.add_argument('--priority_ratio', type=float ,default=0.0)

args = parser.parse_args()
Expand Down Expand Up @@ -757,7 +757,7 @@ def main():

print('total tokens', sorted(list(total_tokens)))

plot_len_cdf(prompt_lens, response_lens, total_tokens, args.results_filename)
plot_len_cdf(prompt_lens, response_lens, total_tokens, args.log_filename)

prompts = list(zip(prompts, prompt_lens, response_lens))

Expand All @@ -767,19 +767,19 @@ def main():
prompts,
args.allow_variable_generation_length,
args.verbose,
args.results_filename,
args.log_filename,
args.ip_ports,
args.distribution,
args.qps,
args.coefficient_variation,
args.log_latencies,
args.fail_on_response_failure,
))
file_name = os.path.splitext(args.results_filename)[0] + "_latency_info.json"
file_name = os.path.splitext(args.log_filename)[0] + "_latency_info.json"
results = []
import datetime
current_time = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
file_name = os.path.splitext(args.results_filename)[0] + "_latency_info.json"
file_name = os.path.splitext(args.log_filename)[0] + "_latency_info.json"
try:
with open(file_name, 'r') as f:
results = json.load(f)
Expand Down
89 changes: 44 additions & 45 deletions docs/Arguments.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,70 +9,74 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
[--fixed-node-init]
[--initial-instances INITIAL_INSTANCES]
[--load-metric {consumed_speed,used_ratio}]
[--polling-interval POLLING_INTERVAL]
[--dispatch-policy {balanced,load,queue}]
[--enable-migrate]
[--check-migrate-frequency CHECK_MIGRATE_FREQUENCY]
[--check-migrate-policy {balanced,prefill_constrained,prefill_relaxed}]
[--enable-migration]
[--pair-migration-frequency PAIR_MIGRATION_FREQUENCY]
[--pair-migration-policy {balanced,prefill_constrained,prefill_relaxed}]
[--migrate-out-threshold MIGRATE_OUT_THRESHOLD]
[--migrate-policy {LCFS,SJF,LJF}]
[--enable-prefill-migrate ENABLE_PREFILL_MIGRATE]
[--request-migration-policy {LCFS,SJF,LJF}]
[--enable-defrag ENABLE_DEFRAG]
[--enable-scaling]
[--min-instances MIN_INSTANCES]
[--max-instances MAX_INSTANCES]
[--scaling-interval SCALING_INTERVAL]
[--scale-policy {max_load,avg_load}]
[--scaling-policy {max_load,avg_load}]
[--scale-up-threshold SCALE_UP_THRESHOLD]
[--scale-down-threshold SCALE_DOWN_THRESHOLD]
[--disable-log-requests-manager]
[--record-instance-info]
[--results-filename RESULTS_FILENAME]
[--gpu-type GPU_TYPE]
[--log-instance-info]
[--log-filename LOG_FILENAME]
[--profiling-result-file-path PROFILING_RESULT_FILE_PATH]
[--polling-interval POLLING_INTERVAL]
[--gpu-type GPU_TYPE]
[--migration-backend {gloo,rpc}]
[--migration-cache-blocks MIGRATION_CACHE_BLOCKS]
[--last-stage-max-blocks LAST_STAGE_MAX_BLOCKS]
[--max-stages MAX_STAGES]
```

`--fixed-node-init`
- Place llumlet and workers on the current node.
- Fix the placement of the instance to the current node.

`--initial-instances`
- Number of model instances.
- Number of model instances created at initialization.
- Default: 1

`--load-metric`
- Load metric.
- Instance load metric.
- Possible choices: consumed_speed, used_ratio
- Default: "consumed_speed"

`--polling-interval`
- Time interval(s) to update instance info and pair migration.
- Default: 0.1

`--dispatch-policy`
- Dispatch policy.
- Request dispatch policy.
- Possible choices: balanced, load, queue
- Default: "load"

`--enable-migrate`
- Enable migrate request between instances.
`--enable-migration`
- Enable migrating requests between instances.

`--check-migrate-frequency`
- Check migrate frequency.
`--pair-migration-frequency`
- Pair migration frequency.
- Default: 1

`--check-migrate-policy`
- Check migrate policy.
`--pair-migration-policy`
- Pair migration policy.

`--migrate-out-threshold`
- Migrate out load threshold.
- Migrate out instance load threshold.
- Default: 3.0

`--migrate-policy`
- Migrate policy.
`--request-migration-policy`
- Request migration policy.
- Possible choices: LCFS, SJF, LJF
- Default: "LCFS"
- Default: "SJF"

`--enable-prefill-migrate`
- Enable prefill migrate.
`--enable-defrag`
- Enable defragmentation.
- Default: False

`--enable-scaling`
Expand All @@ -90,49 +94,44 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
- Interval time of check scaling.
- Default: 10

`--scale-policy`
- Scale policy.
`--scaling-policy`
- Scaling policy.
- Possible choices: max_load, avg_load
- Default: "max_load"

`--scale-up-threshold`
- Scaling up threshold.
- Scale up threshold.
- Default: 4

`--scale-down-threshold`
- Scaling down threshold.
- Scale down threshold.
- Default: 100

`--disable-log-requests-manager`
- Disable logging requests in manager.
- Default: False

`--record-instance-info`
- Enable recording instance-info data to a csv file.
- Default: False

`--results-filename`
- Results filename.
`--log-instance-info`
- Enable logging instance info.

`--gpu-type`
- GPU type specified when using simulator.
- Default: "a10"
`--log-filename`
- Log filename.
- Default: "server.log"

`--profiling-result-file-path`
- Profiling result file path.
- Default: ""

`--polling-interval`
- Time interval(s) to update instance info/migration.
- Default: 0.1
`--gpu-type`
- GPU type specified when using simulator.
- Default: "a10"

`--migration-backend`
- Communication backend during migration.
- Communication backend of migration.
- Possible choices: gloo, rpc
- Default: "rpc"

`--migration-cache-blocks`
- Cache blocks num during migration.
- Number of cache blocks in migration.
- Default: 512

`--last-stage-max-blocks`
Expand Down
Loading
Loading