Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Misc] Change TODO id #8

Merged
Merged 1 commit on Jul 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llumnix/backends/profiling.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -182,7 +182,7 @@ def model_decode(x, a, b, c):
def get_latency_mem(backend_type: BackendType, profiling_database: ProfilingDatabase, gpu_type: str, **backend_args):
assert BackendType.is_sim_backend(backend_type)
if backend_type == BackendType.SIM_VLLM:
# TODO(ziming) support multi-lora, more device, vision language model
# TODO(ZeldaHuang): support multi-lora, more device, vision language model
model_config = backend_args.get("model_config")
_ = backend_args.get("cache_config")
parallel_config = backend_args.get("parallel_config")
Expand Down
2 changes: 1 addition & 1 deletion llumnix/backends/vllm/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.last_inference_latency = 0
self.migration_bandwidth = self.latency_mem.migration_bandwidth
# TODO(ziming) add swap bandwidth
# TODO(ZeldaHuang): add swap bandwidth

self.cache_block_size = get_cache_block_size(
self.cache_config.block_size, self.model_config, self.parallel_config)
Expand Down
4 changes: 2 additions & 2 deletions llumnix/backends/vllm/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

logger = init_logger(__name__)

# TODO(ziming) adapt prefix cache and sliding window, now use v1 manager
# TODO(ZeldaHuang): adapt prefix cache and sliding window, now use v1 manager
class BlockManagerLlumnix(BlockSpaceManagerV1):
def get_free_blocks(self, num_required_blocks: int) -> BlockTable:
num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
Expand Down Expand Up @@ -87,7 +87,7 @@ def get_last_running_request(self) -> Optional[MigratingRequest]:

@scheduler_lock
def get_longest_running_request(self) -> Optional[MigratingRequest]:
# TODO(ziming) use for loop find request
# TODO(ZeldaHuang): use for loop find request
sorted_running = sorted(self.running, key=lambda seq_group: seq_group.get_seqs()[0].get_len())
for seq_group in reversed(sorted_running):
if seq_group not in self.prefilling_seq_groups:
Expand Down
2 changes: 1 addition & 1 deletion llumnix/entrypoints/llumnix_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

logger = init_logger(__name__)

# TODO(yiwang): Set the values through tests.
# TODO(s5u13b): Set the values through tests.
MAX_RESTARTS = 30
RESTART_INTERVALS = 1
MAX_TASK_RETRIES = 300
Expand Down
2 changes: 1 addition & 1 deletion llumnix/entrypoints/vllm/api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ async def generate_benchmark(request: Request) -> Response:

global num_finished_request
if log_requests:
# TODO(yiwang): Use logger.
# TODO(s5u13b): Use logger.
print(f"Finished request {request_id}.")
num_finished_request += 1
print(f"num_finished_request {num_finished_request}.")
Expand Down
2 changes: 1 addition & 1 deletion llumnix/global_scheduler/scale_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def get_empty_instance_info(self) -> InstanceInfo:
dummy_intance_info = InstanceInfo()
dummy_intance_info.instance_id = -1
dummy_intance_info.step_id = -1
# TODO(yiwang): Should be changed for proactive auto-scaling.
# TODO(s5u13b): Should be changed for proactive auto-scaling.
dummy_intance_info.num_total_gpu_block = np.inf
dummy_intance_info.num_available_gpu_block = np.inf
dummy_intance_info.num_free_gpu_block = np.inf
Expand Down
2 changes: 1 addition & 1 deletion llumnix/llumlet/local_migration_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def __init__(self, migrate_policy: str, backend_engine: BackendInterface) -> Non
self.backend_engine = backend_engine

def get_migrate_out_request(self) -> Optional[MigratingRequest]:
# TODO(yiwang): remove the if-else codes
# TODO(s5u13b): remove the if-else codes
migrate_out_request: MigratingRequest = None
if self.migrate_policy == 'LCFS':
migrate_out_request = self.backend_engine.get_last_running_request()
Expand Down
2 changes: 1 addition & 1 deletion llumnix/llumlet/migration_coordinator.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def migrate_out_onestage(self, migrate_in_ray_actor: "ray.actor.ActorHandle", m
# do stage send/recv
migrate_out_request.stage_timestamps.append(time.time())
migrate_out_request.stage_num_blocks_list.append(stage_block_num)
# TODO(ziming) send_blocks in migrate_in_pre_alloc/migrate_in_last_stage
# TODO(ZeldaHuang): send_blocks in migrate_in_pre_alloc/migrate_in_last_stage
self.backend_engine.send_blocks(migrate_in_ray_actor, src_blocks, dst_blocks)
if not is_last_stage and self.backend_engine.should_abort_migration(migrate_out_request.backend_request, \
migrate_out_request.stage_timestamps[-1]):
Expand Down
Loading