Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Misc] Rename manager arguments #9

Merged
merged 9 commits into from
Aug 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 19 additions & 19 deletions benchmark/benchmark_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,8 @@ def calculate_cdf(latencies):
print(f"{hist=}")
print(f"{cumsum=}")

def plot_latency_cdf(req_latencies, prefill_latencies, decode_latencies, results_filename):
fig_filename = os.path.splitext(results_filename)[0] + "_latency.png"
def plot_latency_cdf(req_latencies, prefill_latencies, decode_latencies, log_filename):
fig_filename = os.path.splitext(log_filename)[0] + "_latency.png"
fig, (ax_req, ax_prefill, ax_decode) = plt.subplots(1, 3, figsize=(3*7, 4.8))

def plot_single(ax, latencies, is_prefill=False):
Expand Down Expand Up @@ -286,8 +286,8 @@ def plot_single(ax, latencies, is_prefill=False):
plt.suptitle(fig_filename_title, fontsize=6)
fig.savefig(fig_filename)

def plot_len_cdf(prompt_lens, response_lens, total_tokens, results_filename):
fig_filename = os.path.splitext(results_filename)[0] + "_len.png"
def plot_len_cdf(prompt_lens, response_lens, total_tokens, log_filename):
fig_filename = os.path.splitext(log_filename)[0] + "_len.png"
fig, (ax_prompt, ax_response, ax_total) = plt.subplots(1, 3, figsize=(3*7, 4.8))

def plot_single(ax, lens, x_label_str, title_str):
Expand Down Expand Up @@ -328,8 +328,8 @@ def plot_single(ax, lens, x_label_str, title_str):
plt.suptitle(fig_filename_title, fontsize=6)
fig.savefig(fig_filename)

def plot_instance(results_filename_0):
current_dir = os.path.dirname(os.path.abspath(results_filename_0))
def plot_instance(log_filename_0):
current_dir = os.path.dirname(os.path.abspath(log_filename_0))
log_files = glob.glob(os.path.join(current_dir, '*.log_instance.csv'))
log_files.sort(key=os.path.getmtime, reverse=True)
df_0 = pd.read_csv(log_files[0]).sort_values(by=["timestamp"])
Expand All @@ -347,15 +347,15 @@ def plot_instance(results_filename_0):
fig, ax = plt.subplots()
ax.plot(timestamp_list_0, instance_num_list_0, color="red", label=f"instance_num(avg {avg_instance_num} /s)")
ax.legend(loc='upper left')
fig_filename = os.path.splitext(results_filename_0)[0] + "_instance.png"
fig_filename = os.path.splitext(log_filename_0)[0] + "_instance.png"
index1 = fig_filename.rfind('/')
index2 = fig_filename.rfind('/', 0, index1)
fig_filename_title = fig_filename[index2 + 1:]
plt.suptitle(fig_filename_title, fontsize=6)
fig.savefig(fig_filename)
return avg_instance_num

def save_all_latencies_npy(all_token_latencies:List[np.ndarray], results_filename):
def save_all_latencies_npy(all_token_latencies:List[np.ndarray], log_filename):
dtype = [('timestamp',float),('latency',float)]
all_lat_pairs = []
for arr in all_token_latencies:
Expand All @@ -364,7 +364,7 @@ def save_all_latencies_npy(all_token_latencies:List[np.ndarray], results_filenam
all_lat_pairs.append((pair[0],pair[1]))
all_lat_pairs = np.array(all_lat_pairs,dtype=dtype)
all_lat_pairs = np.sort(all_lat_pairs,order='timestamp')
np.save(os.path.splitext(results_filename)[0], all_lat_pairs)
np.save(os.path.splitext(log_filename)[0], all_lat_pairs)

class MeasureLatency:
def __init__(self):
Expand Down Expand Up @@ -423,7 +423,7 @@ async def benchmark(
prompts: List[str],
allow_variable_generation_length: bool,
verbose: bool,
results_filename: str,
log_filename: str,
ip_ports: List[int],
distribution: str,
qps: float,
Expand Down Expand Up @@ -475,9 +475,9 @@ async def benchmark(
m._latencies, m._per_token_latencies, m._inference_latencies, m._request_ids, m._decode_latencies, m._request_lens,
log_latencies, fail_on_response_failure)
calculate_cdf(m._latencies)
plot_latency_cdf(m._latencies, m._prefill_token_latencies, m._decode_token_latencies, results_filename)
save_all_latencies_npy(m._all_latencies, results_filename)
# avg_instance_num = plot_instance(results_filename)
plot_latency_cdf(m._latencies, m._prefill_token_latencies, m._decode_token_latencies, log_filename)
save_all_latencies_npy(m._all_latencies, log_filename)
# avg_instance_num = plot_instance(log_filename)
avg_instance_num = 0.0
return throughput, m._prefill_token_latencies, m._decode_token_latencies, m._inference_latencies, avg_instance_num, m._latencies, m._request_ids, m._decode_latencies, m._request_lens, m._all_decode_latencies

Expand Down Expand Up @@ -655,7 +655,7 @@ def main():
parser.add_argument('-v', '--verbose', action='store_true')
parser.add_argument('--backend', type=GenerationBackend,
choices=[e.name for e in GenerationBackend], default='vLLM')
parser.add_argument('--results_filename', type=str, default='benchmark.log')
parser.add_argument('--log_filename', type=str, default='benchmark.log')
parser.add_argument('--ip_ports', nargs='+', required=True, help='List of ip:port')
parser.add_argument('--random_prompt_lens_mean', type=int)
parser.add_argument('--random_prompt_lens_range', type=int)
Expand Down Expand Up @@ -692,7 +692,7 @@ def main():
# parser.add_argument('--calculate_begin_ratio', type=float, default=0.5)
# parser.add_argument('--calculate_end_ratio', type=float, default=0.8)

parser.add_argument('--enable_migrate', type=int ,default=0)
parser.add_argument('--enable_migration', type=int ,default=0)
parser.add_argument('--priority_ratio', type=float ,default=0.0)

args = parser.parse_args()
Expand Down Expand Up @@ -757,7 +757,7 @@ def main():

print('total tokens', sorted(list(total_tokens)))

plot_len_cdf(prompt_lens, response_lens, total_tokens, args.results_filename)
plot_len_cdf(prompt_lens, response_lens, total_tokens, args.log_filename)

prompts = list(zip(prompts, prompt_lens, response_lens))

Expand All @@ -767,19 +767,19 @@ def main():
prompts,
args.allow_variable_generation_length,
args.verbose,
args.results_filename,
args.log_filename,
args.ip_ports,
args.distribution,
args.qps,
args.coefficient_variation,
args.log_latencies,
args.fail_on_response_failure,
))
file_name = os.path.splitext(args.results_filename)[0] + "_latency_info.json"
file_name = os.path.splitext(args.log_filename)[0] + "_latency_info.json"
results = []
import datetime
current_time = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
file_name = os.path.splitext(args.results_filename)[0] + "_latency_info.json"
file_name = os.path.splitext(args.log_filename)[0] + "_latency_info.json"
try:
with open(file_name, 'r') as f:
results = json.load(f)
Expand Down
89 changes: 44 additions & 45 deletions docs/Arguments.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,70 +9,74 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
[--fixed-node-init]
[--initial-instances INITIAL_INSTANCES]
[--load-metric {consumed_speed,used_ratio}]
[--polling-interval POLLING_INTERVAL]
[--dispatch-policy {balanced,load,queue}]
[--enable-migrate]
[--check-migrate-frequency CHECK_MIGRATE_FREQUENCY]
[--check-migrate-policy {balanced,prefill_constrained,prefill_relaxed}]
[--enable-migration]
[--pair-migration-frequency PAIR_MIGRATION_FREQUENCY]
[--pair-migration-policy {balanced,prefill_constrained,prefill_relaxed}]
[--migrate-out-threshold MIGRATE_OUT_THRESHOLD]
[--migrate-policy {LCFS,SJF,LJF}]
[--enable-prefill-migrate ENABLE_PREFILL_MIGRATE]
[--request-migration-policy {LCFS,SJF,LJF}]
[--enable-defrag ENABLE_DEFRAG]
[--enable-scaling]
[--min-instances MIN_INSTANCES]
[--max-instances MAX_INSTANCES]
[--scaling-interval SCALING_INTERVAL]
[--scale-policy {max_load,avg_load}]
[--scaling-policy {max_load,avg_load}]
[--scale-up-threshold SCALE_UP_THRESHOLD]
[--scale-down-threshold SCALE_DOWN_THRESHOLD]
[--disable-log-requests-manager]
[--record-instance-info]
[--results-filename RESULTS_FILENAME]
[--gpu-type GPU_TYPE]
[--log-instance-info]
[--log-filename LOG_FILENAME]
[--profiling-result-file-path PROFILING_RESULT_FILE_PATH]
[--polling-interval POLLING_INTERVAL]
[--gpu-type GPU_TYPE]
[--migration-backend {gloo,rpc}]
[--migration-cache-blocks MIGRATION_CACHE_BLOCKS]
[--last-stage-max-blocks LAST_STAGE_MAX_BLOCKS]
[--max-stages MAX_STAGES]
```

`--fixed-node-init`
- Place llumlet and workers on the current node.
- Fix the placement of the instance to the current node.

`--initial-instances`
- Number of model instances.
- Number of model instances created at initialization.
- Default: 1

`--load-metric`
- Load metric.
- Instance load metric.
- Possible choices: consumed_speed, used_ratio
- Default: "consumed_speed"

`--polling-interval`
- Time interval(s) to update instance info and pair migration.
- Default: 0.1

`--dispatch-policy`
- Dispatch policy.
- Request dispatch policy.
- Possible choices: balanced, load, queue
- Default: "load"

`--enable-migrate`
- Enable migrate request between instances.
`--enable-migration`
- Enable migrating requests between instances.

`--check-migrate-frequency`
- Check migrate frequency.
`--pair-migration-frequency`
- Pair migration frequency.
- Default: 1

`--check-migrate-policy`
- Check migrate policy.
`--pair-migration-policy`
- Pair migration policy.

`--migrate-out-threshold`
- Migrate out load threshold.
- Migrate out instance load threshold.
- Default: 3.0

`--migrate-policy`
- Migrate policy.
`--request-migration-policy`
- Request migration policy.
- Possible choices: LCFS, SJF, LJF
- Default: "LCFS"
- Default: "SJF"

`--enable-prefill-migrate`
- Enable prefill migrate.
`--enable-defrag`
- Enable defragmentation.
- Default: False

`--enable-scaling`
Expand All @@ -90,49 +94,44 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
- Interval time of check scaling.
- Default: 10

`--scale-policy`
- Scale policy.
`--scaling-policy`
- Scaling policy.
- Possible choices: max_load, avg_load
- Default: "max_load"

`--scale-up-threshold`
- Scaling up threshold.
- Scale up threshold.
- Default: 4

`--scale-down-threshold`
- Scaling down threshold.
- Scale down threshold.
- Default: 100

`--disable-log-requests-manager`
- Disable logging requests in manager.
- Default: False

`--record-instance-info`
- Enable recording instance-info data to a csv file.
- Default: False

`--results-filename`
- Results filename.
`--log-instance-info`
- Enable logging instance info.

`--gpu-type`
- GPU type specified when using simulator.
- Default: "a10"
`--log-filename`
- Log filename.
- Default: "server.log"

`--profiling-result-file-path`
- Profiling result file path.
- Default: ""

`--polling-interval`
- Time interval(s) to update instance info/migration.
- Default: 0.1
`--gpu-type`
- GPU type specified when using simulator.
- Default: "a10"

`--migration-backend`
- Communication backend during migration.
- Communication backend of migration.
- Possible choices: gloo, rpc
- Default: "rpc"

`--migration-cache-blocks`
- Cache blocks num during migration.
- Number of cache blocks in migration.
- Default: 512

`--last-stage-max-blocks`
Expand Down
Loading
Loading